Differential boundary

Differential boundary#

import numpy as np
import pandas as pd
import cooler
from scipy.signal import find_peaks
from scipy.stats import norm
import xarray as xr
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
       'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
       'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 25000
indir = '/data/hba/domain_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/domain_majortype/diff/{group_name}/'
chrom_size_path = '/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
chrom_sizes = chrom_sizes.iloc[:22]

naive differential boundary sc#

from statsmodels.sandbox.stats.multicomp import multipletests as FDR
from scipy.stats import chi2_contingency

def diff_bound(bound_count_ct, cell_count_ct):
    tmp = cell_count_ct[:,None] - bound_count_ct
    stats = np.zeros(bound_count_ct.shape[1])
    pv = np.ones(bound_count_ct.shape[1])
    binfilter = np.logical_and(bound_count_ct.sum(axis=0)>0, tmp.sum(axis=0)>0)
    for i in range(bound_count_ct.shape[1]):
        if binfilter[i]:
            contig = [bound_count_ct[:,i], tmp[:,i]]
            stats[i], pv[i], _, _ = chi2_contingency(contig)
    fdr = FDR(pv, 0.01, 'fdr_bh')[1]
    return stats, fdr

def shuffle_ct(i):
    global cell_count_ct, sc_border, leg
    np.random.seed(i)
    label = np.random.permutation(sc_border.obs[f'{ct_key}'])
    bound_count_ct = np.array([sc_border.X[label==xx].getnnz(axis=0) for xx in leg])
    bound_prob_ct = bound_count_ct / cell_count_ct[:,None]
    return diff_bound(bound_count_ct, cell_count_ct)[0]

def diff_bound_bulk(ins_count):
    stats = np.zeros(ins_count.shape[2])
    pv = np.ones(ins_count.shape[2])
    binfilter = (ins_count.min(axis=(0,1))>0)
    for i in range(ins_count.shape[2]):
        if binfilter[i]:
            stats[i], pv[i], _, _ = chi2_contingency(ins_count[:,:,i])
    fdr = FDR(pv, 0.01, 'fdr_bh')[1]
    return stats, fdr
bound_count_ct = pd.read_hdf(f'{indir}MajorType_boundcount.hdf', key='data').loc[leg]
cell_count_ct = pd.read_csv(f'{indir}MajorType_cellcount.csv.gz', index_col=0, header=0, squeeze=True).loc[leg]
bound_prob_ct = (bound_count_ct / cell_count_ct[:,None]).T
ins_count = xr.open_dataset(f'{indir}MajorType_impute.insulation.nc')
ins_count = ins_count.sel({'bin': (ins_count['bin_chrom']!='chrX')})
ins_count['ratio'] = (ins_count.sel({'type':'inter'})['__xarray_dataarray_variable__'] / ins_count.sel({'type':'intra'}))['__xarray_dataarray_variable__']
ins_count
<xarray.Dataset>
Dimensions:                        (cell: 29, bin: 115009, type: 2)
Coordinates:
  * cell                           (cell) object 'Amy' 'ASC' ... 'Vip' 'VLMC'
  * bin                            (bin) int64 0 1 2 3 ... 115006 115007 115008
  * type                           (type) object 'inter' 'intra'
    bin_chrom                      (bin) object 'chr1' 'chr1' ... 'chr22'
    bin_start                      (bin) int32 ...
    bin_end                        (bin) int32 ...
Data variables:
    __xarray_dataarray_variable__  (cell, bin, type) float64 ...
    ratio                          (cell, bin) float64 1.0 0.0 ... 0.03701
ins = ins_count['ratio'].to_pandas().loc[leg]
ins.shape
(21, 115009)
binall = ins_count[['bin_chrom', 'bin_start', 'bin_end']].to_pandas()
binall.columns = binall.columns.str.split('_').str[1]
binall.index = binall['chrom'] + '_' + (binall['start'] // res).astype(str)
bkl = pd.read_csv(f'{indir}../loop_majortype/M1C.rowsumpb1000.blf50.merged.bed', sep='\t', header=None, index_col=None)
binall['bklfilter'] = True
for c in chrom_sizes.index:
    chrfilter = (binall['chrom']==c)
    tmp = binall.loc[chrfilter.values]
    tmp.iloc[:10, -1] = False
    tmp.iloc[-10:, -1] = False
    for xx,yy in bkl.loc[bkl[0]==c, [1,2]].values // res:
        tmp.iloc[max([0,xx-2]):(yy+2), -1] = False
    binall.loc[chrfilter] = tmp.copy()

print(binall['bklfilter'].sum())
100643
chi2sc, fdr_sc = diff_bound(bound_count_ct.values, cell_count_ct.values)
ave = np.mean(chi2sc[chi2sc>0])
stdev = np.std(chi2sc[chi2sc>0])
binall['chi2filter'] = (((chi2sc - ave) / stdev)>norm.isf(0.025))
binall['ins_lm'] = 0
for xx in leg:
    sel = []
    for c in chrom_sizes.index:
        idx = np.where(binall['chrom']==c)[0]
        if len(idx)>0:
            data = -ins.loc[xx, idx]
            peaks, _ = find_peaks(data, distance=5)
            sel.append(idx.min() + peaks)
    sel = np.concatenate(sel)
    binall.loc[binall.index[sel], 'ins_lm'] = 1

binall['probdiff'] = (bound_prob_ct.max(axis=1) - bound_prob_ct.min(axis=1)).values
binall['chi2_sc'] = chi2sc.copy()
binall['insfc'] = ((ins.max(axis=0)+0.01) / (ins.min(axis=0)+0.01)).values
sel = []
thres = np.min(chi2sc[fdr_sc<1e-3])
for c in chrom_sizes.index:
    idx = np.where(binall['chrom']==c)[0]
    if len(idx)>0:
        data = chi2sc[idx]
        peaks, _ = find_peaks(data, height=thres, distance=5)
        sel.append(idx.min() + peaks)
        
sel = np.concatenate(sel)

binall['diff_sc'] = 0
binall.loc[binall.index[sel], 'diff_sc'] = 1
binall.loc[:, binall.dtypes=='category'] = binall.loc[:, binall.dtypes=='category'].astype(str)
binall.to_hdf(f'{outdir}bin_stats.hdf', key='data')
print((binall['chi2filter'] 
       & binall['diff_sc'] 
       & binall['bklfilter'] 
       & binall['ins_lm'] 
       & (binall['probdiff']>0.05) 
       #& (binall['insfc']>1.2)
      ).sum())
1720