Differential compartment calling with dcHiC#
import os
import cooler
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy',
'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7',
'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx',
'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc',
'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7',
'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx',
'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'],
'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'],
'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'],
'sub': ['SubCtx'],
'glia': ['ASC', 'ODC', 'OPC'],
'mgc': ['MGC'],
'smc': ['PC'],
'endo': ['EC'],
'fibro': ['VLMC'],
}
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 100000
indir = '/data/hba/compartment_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/compartment_majortype/diff/{group_name}/'
comp = pd.read_hdf(f'{indir}comp_raw_mergerawpca.hdf')
comp
L23_IT | L4_IT | L56_NP | L5_ET | L5_IT | L6_CT | L6_IT | L6_IT_Car3 | L6b | Lamp5 | ... | CHD7 | Foxp2 | ASC | ODC | OPC | MGC | EC | PC | VLMC | merged | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
chr1-18 | 4.532953 | 6.074816 | 4.745961 | 1.262724 | 5.044535 | 5.241165 | 4.385525 | 4.745368 | 3.601142 | 7.021493 | ... | 6.350878 | 5.443851 | 6.038124 | 6.479689 | 8.545128 | 10.942541 | 6.193613 | 7.818554 | 7.010916 | 7.573896 |
chr1-19 | 4.742626 | 6.077094 | 5.227821 | 1.421160 | 5.186667 | 5.666524 | 4.640529 | 4.969541 | 4.122677 | 7.254596 | ... | 7.296734 | 5.656314 | 6.461882 | 6.942686 | 9.275956 | 11.769346 | 6.530445 | 7.721279 | 6.664307 | 7.503558 |
chr1-20 | 4.882112 | 6.625833 | 4.720595 | 1.552144 | 5.603995 | 5.743913 | 4.805314 | 5.319397 | 4.070691 | 6.953638 | ... | 7.294498 | 5.555722 | 6.827351 | 7.392001 | 9.471356 | 11.617200 | 6.113659 | 7.430872 | 6.832435 | 7.445576 |
chr1-21 | 4.409264 | 6.373844 | 4.495035 | 1.347600 | 5.239480 | 5.593628 | 4.654892 | 4.584552 | 3.944247 | 7.050535 | ... | 6.668277 | 5.345105 | 6.756874 | 7.566949 | 8.932059 | 12.227214 | 6.573919 | 7.774136 | 6.692109 | 7.487518 |
chr1-22 | 4.792674 | 6.327627 | 4.606086 | 1.239676 | 5.004744 | 5.532068 | 4.451870 | 4.469162 | 3.724550 | 6.561917 | ... | 6.683668 | 5.515938 | 6.677850 | 7.454593 | 9.066686 | 11.302973 | 6.756169 | 7.914874 | 6.736316 | 7.543835 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
chr22-498 | -0.167683 | 0.066275 | -0.193310 | -0.642383 | -0.007180 | 0.075564 | 0.063709 | -0.003458 | -0.012046 | 0.369045 | ... | 0.219480 | -0.721090 | -0.508691 | 2.386674 | -0.335253 | 2.522928 | 0.128222 | 0.416390 | -0.067209 | 1.139465 |
chr22-499 | 0.374943 | 0.841765 | -0.211688 | -0.876948 | 0.673747 | 0.835991 | 0.760642 | 0.462583 | 0.422578 | 0.794520 | ... | 0.490132 | -0.535043 | 0.742954 | 4.393894 | 0.173314 | 4.390028 | 1.213310 | 1.763509 | 1.703635 | 2.549865 |
chr22-500 | -0.451213 | 0.072616 | -0.240891 | -0.884130 | 0.192176 | 0.114169 | 0.411017 | -0.222203 | -0.067791 | 0.034812 | ... | 0.084431 | -0.478074 | 0.289802 | -0.538458 | -1.168219 | 1.133707 | -0.305873 | -2.942465 | 1.063949 | 0.564305 |
chr22-501 | 0.402952 | 1.649726 | -0.046996 | -0.659747 | 1.195990 | 0.905608 | 1.054437 | 0.106239 | 0.437586 | 1.438901 | ... | 1.076683 | -0.238773 | 0.962307 | 1.622971 | 0.114526 | 4.747499 | 0.257000 | -0.837970 | 1.536778 | 2.990559 |
chr22-502 | 1.284506 | 1.998990 | -0.104788 | -0.350169 | 1.331522 | 1.420527 | 1.456426 | 0.676795 | 0.721309 | 1.474490 | ... | 1.110074 | 0.780029 | 1.769825 | 4.341375 | 1.826384 | 5.515240 | 3.505156 | 3.665332 | 1.979127 | 5.275064 |
24745 rows × 30 columns
binall = np.load(f'{indir}binfilter_raw.npy', allow_pickle=True)
print(np.sum([xx.sum() for xx in binall[:-1]]))
24745
res = 100000
binall = pd.DataFrame(index=comp.index)
binall['chrom'] = binall.index.str.split('-').str[0]
binall['start'] = binall.index.str.split('-').str[1].astype(int) * res
binall['end'] = binall['start'] + res
binall
chrom | start | end | |
---|---|---|---|
chr1-18 | chr1 | 1800000 | 1900000 |
chr1-19 | chr1 | 1900000 | 2000000 |
chr1-20 | chr1 | 2000000 | 2100000 |
chr1-21 | chr1 | 2100000 | 2200000 |
chr1-22 | chr1 | 2200000 | 2300000 |
... | ... | ... | ... |
chr22-498 | chr22 | 49800000 | 49900000 |
chr22-499 | chr22 | 49900000 | 50000000 |
chr22-500 | chr22 | 50000000 | 50100000 |
chr22-501 | chr22 | 50100000 | 50200000 |
chr22-502 | chr22 | 50200000 | 50300000 |
24745 rows × 3 columns
chrom_size_path = f'/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
for xx in leg:
os.makedirs(f'{outdir}{xx}_100Kb_pca/intra_pca/{xx}_100Kb_mat/', exist_ok=True)
tmp = binall.copy()
tmp['pc'] = comp[xx]
for c in chrom_sizes.index[:-3]:
tmp.loc[tmp['chrom']==c].to_csv(f'{outdir}{xx}_100Kb_pca/intra_pca/{xx}_100Kb_mat/{c}.pc.bedGraph', sep='\t', header=False, index=False)
tmp = pd.DataFrame(index=leg)
tmp['matrix_path'] = '.'
tmp['bed_path'] = '.'
tmp['sample'] = tmp.index + '_100Kb'
tmp['group'] = tmp.index
tmp.to_csv(f'{outdir}input.txt', sep='\t', header=False, index=False)
!Rscript ~/software/dcHiC/dchicf.r --file input.txt --pcatype analyze --dirovwt T --diffdir .