Load loop matrices and stats

Load loop matrices and stats#

import cooler
import numpy as np
import pandas as pd
from scipy.sparse import triu
from scipy.stats import pearsonr, zscore, norm
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor, as_completed
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
       'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
       'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'cnu': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
leg['neu'] = leg['exc'] + leg['inh'] + leg['cnu'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
ctgroup = []
if '_' in group_name:
    for xx in group_name.split('_'):
        ctgroup.append(leg[xx])
else:
    for xx in leg[group_name]:
        ctgroup.append([xx])
        
leg = pd.Index(np.concatenate(ctgroup))
legname = leg.map(leg2name)
res = 10000
indir = '/home/jzhou_salk_edu/sky_workdir/hba/loop_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/loop_majortype/diff/{group_name}/'
chrom_size_path = f'{indir}hg38_with_chrl.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)

Load Loop Position and ANOVA#

loopall = [pd.read_csv(f'{indir}/{ct}/{ct}/{ct}.loop.bedpe', sep='\t', index_col=None, header=None) for ct in leg]
loopall = pd.concat(loopall, axis=0)
loopall = loopall.drop([6], axis=1).drop_duplicates(subset=[0,1,4]).sort_values([0,1,4])
loopall = pd.concat([loopall[(loopall[0]==c).values] for c in chrom_sizes.index[:-1]])
loopall.index = np.arange(loopall.shape[0])
loopall
0 1 2 3 4 5
0 chr1 900000 910000 chr1 960000 970000
1 chr1 900000 910000 chr1 970000 980000
2 chr1 910000 920000 chr1 970000 980000
3 chr1 910000 920000 chr1 980000 990000
4 chr1 910000 920000 chr1 990000 1000000
... ... ... ... ... ... ...
2873610 chr22 50570000 50580000 chr22 50670000 50680000
2873611 chr22 50580000 50590000 chr22 50670000 50680000
2873612 chr22 50590000 50600000 chr22 50670000 50680000
2873613 chr22 50600000 50610000 chr22 50670000 50680000
2873614 chr22 50610000 50620000 chr22 50670000 50680000

2873615 rows × 6 columns

loopall.to_csv(f'{outdir}merged_loop.bedpe', sep='\t', index=False, header=False)
loopall.to_hdf(f'{outdir}merged_loop.hdf', key='data')
for c in chrom_sizes.index[:-1]:
    loopfilter = (loopall[0]==c)
    looptmp = loopall.loc[loopfilter, [1,4]].values // res
    for matrix in ['Q', 'E', 'T']:
        cool = cooler.Cooler(f'{outdir}majortype_{matrix}pv.cool')
        pv = triu(cool.matrix(balance=False, sparse=True).fetch(c)).tocsr()
        loopall.loc[loopfilter, f'{matrix}anova'] = pv[(looptmp[:,0], looptmp[:,1])].A1
    print(c)
chr1
chr2
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr20
chr21
chr22
loopall.to_hdf(f'{outdir}merged_loop.hdf', key='data')

Load Loop Q#

loopall = pd.read_hdf(f'{outdir}merged_loop.hdf', key='data')
loopall
0 1 2 3 4 5 Qanova Eanova Tanova
0 chr1 900000 910000 chr1 960000 970000 3.750881 6.097476 2.068213
1 chr1 900000 910000 chr1 970000 980000 3.322128 6.001146 2.007495
2 chr1 910000 920000 chr1 970000 980000 3.293559 5.439024 2.229271
3 chr1 910000 920000 chr1 980000 990000 2.704021 5.648575 2.289167
4 chr1 910000 920000 chr1 990000 1000000 2.819877 5.675182 1.669268
... ... ... ... ... ... ... ... ... ...
2873610 chr22 50570000 50580000 chr22 50670000 50680000 1.646674 11.822375 1.625390
2873611 chr22 50580000 50590000 chr22 50670000 50680000 2.256175 11.555016 1.335000
2873612 chr22 50590000 50600000 chr22 50670000 50680000 3.531459 11.165133 1.195543
2873613 chr22 50600000 50610000 chr22 50670000 50680000 4.896728 11.926161 2.028210
2873614 chr22 50610000 50620000 chr22 50670000 50680000 5.475712 13.031472 3.153314

2873615 rows × 9 columns

def load_Q(ct, m):
    tmp = []
    cool_file = cooler.Cooler(f'{indir}{ct}/{ct}/{ct}.{m}.cool').matrix(balance=False, sparse=True)
    for c in chrom_sizes.index[:-1]:
        mat = cool_file.fetch(c).tocsr()
        tmp.append(mat[(loopall.loc[loopall[0]==c, 1].values // res, loopall.loc[loopall[0]==c, 4].values // res)].A1)
        # print(ct, c)
    return [ct, np.concatenate(tmp)]
cpu = 5
with ProcessPoolExecutor(cpu) as executor:
    futures = []
    for xx in leg:
        future = executor.submit(
            load_Q,
            ct=xx,
            m='Q'
        )
        futures.append(future)

    loopq = []
    for future in as_completed(futures):
        tmp = future.result()
        loopq.append(pd.DataFrame(tmp[1], columns=[tmp[0]]))
        print(f'{tmp[0]} finished')
        
L23_IT finished
L6_IT_Car3 finished
L5_IT finished
L6_IT finished
L4_IT finished
L56_NP finished
L6_CT finished
L6b finished
Amy finished
Lamp5 finished
Lamp5_LHX6 finished
Sncg finished
Vip finished
Pvalb finished
Pvalb_ChC finished
Sst finished
CHD7 finished
MSN_D1 finished
MSN_D2 finished
Foxp2 finished
SubCtx finished
loopq = pd.concat(loopq, axis=1)
loopq = loopq[leg]
loopq.to_hdf(f'{outdir}loop_Q.hdf', key='data')
cpu = 5
with ProcessPoolExecutor(cpu) as executor:
    futures = []
    for xx in leg:
        future = executor.submit(
            load_Q,
            ct=xx,
            m='T'
        )
        futures.append(future)

    loopt = []
    for future in as_completed(futures):
        tmp = future.result()
        loopt.append(pd.DataFrame(tmp[1], columns=[tmp[0]]))
        print(f'{tmp[0]} finished')
        
L5_IT finished
L23_IT finished
L6_IT finished
L4_IT finished
L6_IT_Car3 finished
L56_NP finished
L6_CT finished
L6b finished
Amy finished
Lamp5 finished
Lamp5_LHX6 finished
Sncg finished
Vip finished
Pvalb finished
Pvalb_ChC finished
Sst finished
CHD7 finished
MSN_D1 finished
MSN_D2 finished
Foxp2 finished
SubCtx finished
loopt = pd.concat(loopt, axis=1)
loopt = loopt[leg]
loopt.to_hdf(f'{outdir}loop_T.hdf', key='data')
loopq = pd.read_hdf(f'{outdir}loop_Q.hdf', key='data')
loopt = pd.read_hdf(f'{outdir}loop_T.hdf', key='data')
fig, axes = plt.subplots(1, 2, figsize=(4, 2), dpi=300)
ax = axes[0]
sns.histplot(zscore(np.log10(loopall['Tanova'])), bins=100, binrange=(-3,4), ax=ax)
ax = axes[1]
sns.histplot(zscore(loopall['Tanova']), bins=100, binrange=(-1,4), ax=ax)
plt.tight_layout()
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../../_images/2a920ef2ad558b77109344afab2e00274be1f8f19fbcc163d9f46d8005f80228.png
fig, axes = plt.subplots(1, 2, figsize=(4, 2), dpi=300)
ax = axes[0]
sns.histplot(zscore(np.log10(loopall['Qanova'])), bins=100, binrange=(-3,4), ax=ax)
ax = axes[1]
sns.histplot(zscore(loopall['Qanova']), bins=100, binrange=(-1,4), ax=ax)
plt.tight_layout()
../../_images/a6e138b6b7d595f57c17ae7cea2b12d46a994315fd1281316b533f454f591ac1.png
thres1 = norm.isf(0.025)
thres2 = norm.isf(0.15)
print(thres1, thres2)
1.9599639845400545 1.0364333894937898
selb = ((zscore(loopall['Qanova'])>thres2) & (zscore(loopall['Tanova'])>thres2))
tmpq = loopq.loc[selb].values
tmpq = zscore(tmpq, axis=1)
tmpt = loopt.loc[selb].values
tmpt = zscore(tmpt, axis=1)
np.random.seed(0)
sel = np.random.choice(np.arange(len(tmpq)), 2000, False)
cg = sns.clustermap(tmpq[sel], cmap='bwr', vmin=-3, vmax=3, metric='cosine', xticklabels=leg, yticklabels=[], figsize=(6,6))
../../_images/a643c6c77833217a12155c1e16a59e1b7b9c7773d45902ec3aeaab05b4120e0b.png
rorder = cg.dendrogram_row.reordered_ind.copy()
corder = cg.dendrogram_col.reordered_ind.copy()
fig, axes = plt.subplots(1, 2, sharey='all', figsize=(6, 5), dpi=300)

ax = axes[0]
ax.imshow(tmpq[np.ix_(sel[rorder], corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('Q', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_yticks([])
ax.set_ylabel(f'{tmpq.shape[0]} Diff Loop')

ax = axes[1]
ax.imshow(tmpt[np.ix_(sel[rorder], corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('T', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)

plt.tight_layout()
../../_images/9de93cd62f9d72e5c593f7c08b54141b0b87ab02cd775acd17424796de72ad6c.png