Differential boundary#
import numpy as np
import pandas as pd
import cooler
from scipy.signal import find_peaks
from scipy.stats import norm
import xarray as xr
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy',
'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7',
'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx',
'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc',
'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7',
'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx',
'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'],
'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'],
'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'],
'sub': ['SubCtx'],
'glia': ['ASC', 'ODC', 'OPC'],
'mgc': ['MGC'],
'smc': ['PC'],
'endo': ['EC'],
'fibro': ['VLMC'],
}
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 25000
indir = '/data/hba/domain_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/domain_majortype/diff/{group_name}/'
chrom_size_path = '/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
chrom_sizes = chrom_sizes.iloc[:22]
naive differential boundary sc#
from statsmodels.sandbox.stats.multicomp import multipletests as FDR
from scipy.stats import chi2_contingency
def diff_bound(bound_count_ct, cell_count_ct):
tmp = cell_count_ct[:,None] - bound_count_ct
stats = np.zeros(bound_count_ct.shape[1])
pv = np.ones(bound_count_ct.shape[1])
binfilter = np.logical_and(bound_count_ct.sum(axis=0)>0, tmp.sum(axis=0)>0)
for i in range(bound_count_ct.shape[1]):
if binfilter[i]:
contig = [bound_count_ct[:,i], tmp[:,i]]
stats[i], pv[i], _, _ = chi2_contingency(contig)
fdr = FDR(pv, 0.01, 'fdr_bh')[1]
return stats, fdr
def shuffle_ct(i):
global cell_count_ct, sc_border, leg
np.random.seed(i)
label = np.random.permutation(sc_border.obs[f'{ct_key}'])
bound_count_ct = np.array([sc_border.X[label==xx].getnnz(axis=0) for xx in leg])
bound_prob_ct = bound_count_ct / cell_count_ct[:,None]
return diff_bound(bound_count_ct, cell_count_ct)[0]
def diff_bound_bulk(ins_count):
stats = np.zeros(ins_count.shape[2])
pv = np.ones(ins_count.shape[2])
binfilter = (ins_count.min(axis=(0,1))>0)
for i in range(ins_count.shape[2]):
if binfilter[i]:
stats[i], pv[i], _, _ = chi2_contingency(ins_count[:,:,i])
fdr = FDR(pv, 0.01, 'fdr_bh')[1]
return stats, fdr
bound_count_ct = pd.read_hdf(f'{indir}MajorType_boundcount.hdf', key='data').loc[leg]
cell_count_ct = pd.read_csv(f'{indir}MajorType_cellcount.csv.gz', index_col=0, header=0, squeeze=True).loc[leg]
bound_prob_ct = (bound_count_ct / cell_count_ct[:,None]).T
ins_count = xr.open_dataset(f'{indir}MajorType_impute.insulation.nc')
ins_count = ins_count.sel({'bin': (ins_count['bin_chrom']!='chrX')})
ins_count['ratio'] = (ins_count.sel({'type':'inter'})['__xarray_dataarray_variable__'] / ins_count.sel({'type':'intra'}))['__xarray_dataarray_variable__']
ins_count
<xarray.Dataset> Dimensions: (cell: 29, bin: 115009, type: 2) Coordinates: * cell (cell) object 'Amy' 'ASC' ... 'Vip' 'VLMC' * bin (bin) int64 0 1 2 3 ... 115006 115007 115008 * type (type) object 'inter' 'intra' bin_chrom (bin) object 'chr1' 'chr1' ... 'chr22' bin_start (bin) int32 ... bin_end (bin) int32 ... Data variables: __xarray_dataarray_variable__ (cell, bin, type) float64 ... ratio (cell, bin) float64 1.0 0.0 ... 0.03701
ins = ins_count['ratio'].to_pandas().loc[leg]
ins.shape
(21, 115009)
binall = ins_count[['bin_chrom', 'bin_start', 'bin_end']].to_pandas()
binall.columns = binall.columns.str.split('_').str[1]
binall.index = binall['chrom'] + '_' + (binall['start'] // res).astype(str)
bkl = pd.read_csv(f'{indir}../loop_majortype/M1C.rowsumpb1000.blf50.merged.bed', sep='\t', header=None, index_col=None)
binall['bklfilter'] = True
for c in chrom_sizes.index:
chrfilter = (binall['chrom']==c)
tmp = binall.loc[chrfilter.values]
tmp.iloc[:10, -1] = False
tmp.iloc[-10:, -1] = False
for xx,yy in bkl.loc[bkl[0]==c, [1,2]].values // res:
tmp.iloc[max([0,xx-2]):(yy+2), -1] = False
binall.loc[chrfilter] = tmp.copy()
print(binall['bklfilter'].sum())
100643
chi2sc, fdr_sc = diff_bound(bound_count_ct.values, cell_count_ct.values)
ave = np.mean(chi2sc[chi2sc>0])
stdev = np.std(chi2sc[chi2sc>0])
binall['chi2filter'] = (((chi2sc - ave) / stdev)>norm.isf(0.025))
binall['ins_lm'] = 0
for xx in leg:
sel = []
for c in chrom_sizes.index:
idx = np.where(binall['chrom']==c)[0]
if len(idx)>0:
data = -ins.loc[xx, idx]
peaks, _ = find_peaks(data, distance=5)
sel.append(idx.min() + peaks)
sel = np.concatenate(sel)
binall.loc[binall.index[sel], 'ins_lm'] = 1
binall['probdiff'] = (bound_prob_ct.max(axis=1) - bound_prob_ct.min(axis=1)).values
binall['chi2_sc'] = chi2sc.copy()
binall['insfc'] = ((ins.max(axis=0)+0.01) / (ins.min(axis=0)+0.01)).values
sel = []
thres = np.min(chi2sc[fdr_sc<1e-3])
for c in chrom_sizes.index:
idx = np.where(binall['chrom']==c)[0]
if len(idx)>0:
data = chi2sc[idx]
peaks, _ = find_peaks(data, height=thres, distance=5)
sel.append(idx.min() + peaks)
sel = np.concatenate(sel)
binall['diff_sc'] = 0
binall.loc[binall.index[sel], 'diff_sc'] = 1
binall.loc[:, binall.dtypes=='category'] = binall.loc[:, binall.dtypes=='category'].astype(str)
binall.to_hdf(f'{outdir}bin_stats.hdf', key='data')
print((binall['chi2filter']
& binall['diff_sc']
& binall['bklfilter']
& binall['ins_lm']
& (binall['probdiff']>0.05)
#& (binall['insfc']>1.2)
).sum())
1720