Differential compartment calling with dcHiC

Differential compartment calling with dcHiC#

import os
import cooler
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
       'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
       'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 100000
indir = '/data/hba/compartment_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/compartment_majortype/diff/{group_name}/'
comp = pd.read_hdf(f'{indir}comp_raw_mergerawpca.hdf')
comp
L23_IT L4_IT L56_NP L5_ET L5_IT L6_CT L6_IT L6_IT_Car3 L6b Lamp5 ... CHD7 Foxp2 ASC ODC OPC MGC EC PC VLMC merged
chr1-18 4.532953 6.074816 4.745961 1.262724 5.044535 5.241165 4.385525 4.745368 3.601142 7.021493 ... 6.350878 5.443851 6.038124 6.479689 8.545128 10.942541 6.193613 7.818554 7.010916 7.573896
chr1-19 4.742626 6.077094 5.227821 1.421160 5.186667 5.666524 4.640529 4.969541 4.122677 7.254596 ... 7.296734 5.656314 6.461882 6.942686 9.275956 11.769346 6.530445 7.721279 6.664307 7.503558
chr1-20 4.882112 6.625833 4.720595 1.552144 5.603995 5.743913 4.805314 5.319397 4.070691 6.953638 ... 7.294498 5.555722 6.827351 7.392001 9.471356 11.617200 6.113659 7.430872 6.832435 7.445576
chr1-21 4.409264 6.373844 4.495035 1.347600 5.239480 5.593628 4.654892 4.584552 3.944247 7.050535 ... 6.668277 5.345105 6.756874 7.566949 8.932059 12.227214 6.573919 7.774136 6.692109 7.487518
chr1-22 4.792674 6.327627 4.606086 1.239676 5.004744 5.532068 4.451870 4.469162 3.724550 6.561917 ... 6.683668 5.515938 6.677850 7.454593 9.066686 11.302973 6.756169 7.914874 6.736316 7.543835
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
chr22-498 -0.167683 0.066275 -0.193310 -0.642383 -0.007180 0.075564 0.063709 -0.003458 -0.012046 0.369045 ... 0.219480 -0.721090 -0.508691 2.386674 -0.335253 2.522928 0.128222 0.416390 -0.067209 1.139465
chr22-499 0.374943 0.841765 -0.211688 -0.876948 0.673747 0.835991 0.760642 0.462583 0.422578 0.794520 ... 0.490132 -0.535043 0.742954 4.393894 0.173314 4.390028 1.213310 1.763509 1.703635 2.549865
chr22-500 -0.451213 0.072616 -0.240891 -0.884130 0.192176 0.114169 0.411017 -0.222203 -0.067791 0.034812 ... 0.084431 -0.478074 0.289802 -0.538458 -1.168219 1.133707 -0.305873 -2.942465 1.063949 0.564305
chr22-501 0.402952 1.649726 -0.046996 -0.659747 1.195990 0.905608 1.054437 0.106239 0.437586 1.438901 ... 1.076683 -0.238773 0.962307 1.622971 0.114526 4.747499 0.257000 -0.837970 1.536778 2.990559
chr22-502 1.284506 1.998990 -0.104788 -0.350169 1.331522 1.420527 1.456426 0.676795 0.721309 1.474490 ... 1.110074 0.780029 1.769825 4.341375 1.826384 5.515240 3.505156 3.665332 1.979127 5.275064

24745 rows × 30 columns

binall = np.load(f'{indir}binfilter_raw.npy', allow_pickle=True)
print(np.sum([xx.sum() for xx in binall[:-1]]))
24745
res = 100000
binall = pd.DataFrame(index=comp.index)
binall['chrom'] = binall.index.str.split('-').str[0]
binall['start'] = binall.index.str.split('-').str[1].astype(int) * res
binall['end'] = binall['start'] + res
binall
chrom start end
chr1-18 chr1 1800000 1900000
chr1-19 chr1 1900000 2000000
chr1-20 chr1 2000000 2100000
chr1-21 chr1 2100000 2200000
chr1-22 chr1 2200000 2300000
... ... ... ...
chr22-498 chr22 49800000 49900000
chr22-499 chr22 49900000 50000000
chr22-500 chr22 50000000 50100000
chr22-501 chr22 50100000 50200000
chr22-502 chr22 50200000 50300000

24745 rows × 3 columns

chrom_size_path = f'/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
for xx in leg:
    os.makedirs(f'{outdir}{xx}_100Kb_pca/intra_pca/{xx}_100Kb_mat/', exist_ok=True)
    tmp = binall.copy()
    tmp['pc'] = comp[xx]
    for c in chrom_sizes.index[:-3]:
        tmp.loc[tmp['chrom']==c].to_csv(f'{outdir}{xx}_100Kb_pca/intra_pca/{xx}_100Kb_mat/{c}.pc.bedGraph', sep='\t', header=False, index=False)
tmp = pd.DataFrame(index=leg)
tmp['matrix_path'] = '.'
tmp['bed_path'] = '.'
tmp['sample'] = tmp.index + '_100Kb'
tmp['group'] = tmp.index
tmp.to_csv(f'{outdir}input.txt', sep='\t', header=False, index=False)
!Rscript ~/software/dcHiC/dchicf.r --file input.txt --pcatype analyze --dirovwt T --diffdir .