Plot correlation between boundary and mC/ATAC/RNA#

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, zscore
import anndata
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib import cm as cm
import seaborn as sns
from matplotlib.colors import LogNorm

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
       'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
       'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 25000
indir = '/data/hba/domain_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/domain_majortype/diff/{group_name}/'
boundcg = pd.read_hdf(f'{outdir}bound_mCG.hdf', key='data')
boundch = pd.read_hdf(f'{outdir}bound_mCH.hdf', key='data')
boundatac = pd.read_hdf(f'{outdir}bound_ATAC.hdf', key='data')
legatac = boundatac.columns
bound_count_ct = pd.read_hdf(f'{indir}MajorType_boundcount.hdf', key='data').loc[leg]
cell_count_ct = pd.read_csv(f'{indir}MajorType_cellcount.csv.gz', index_col=0, header=0, squeeze=True).loc[leg]
bound_prob_ct = (bound_count_ct / cell_count_ct[:,None]).T
binall = pd.read_hdf('bin_stats.hdf', key='data')
selb = (binall['chi2filter'] & binall['diff_sc'] & binall['bklfilter'] & binall['ins_lm'] & (binall['probdiff']>0.05))
print(sum(selb))
1720
tmpcg = boundcg.astype(float).loc[selb].values
tmpcg = zscore(tmpcg, axis=1)
tmpch = boundch.astype(float).loc[selb].values
tmpch = zscore(tmpch, axis=1)
tmpatac = boundatac.astype(float).loc[selb].values
tmpatac = zscore(tmpatac, axis=1)
tmp3c = bound_prob_ct.loc[selb].values
tmp3c = zscore(tmp3c, axis=1)
cg = sns.clustermap(tmp3c, cmap='bwr', vmin=-3, vmax=3, metric='cosine', xticklabels=leg, yticklabels=[], figsize=(6,6))
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../../_images/c07e1dfb7fcf3523c4b136d6d9aaf5f1f02d9613fc6ab727debe1b7625fbced0.png
rorder = cg.dendrogram_row.reordered_ind.copy()
corder = cg.dendrogram_col.reordered_ind.copy()
idxatac = np.arange(len(leg))[leg.isin(legatac)]
corderatac = [xx for xx in corder if xx in idxatac]
fig, axes = plt.subplots(1, 4, sharey='all', figsize=(12,5), dpi=300)

ax = axes[0]
ax.imshow(tmp3c[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('Boundary Prob', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_yticks([])
ax.set_ylabel(f'{tmp3c.shape[0]} Diff Boundary')

ax = axes[1]
ax.imshow(tmpatac[np.ix_(rorder, corderatac)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('ATAC', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(legatac)))
ax.set_xticklabels(legname[corderatac], rotation=90)

ax = axes[2]
ax.imshow(tmpcg[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('mCG', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)

ax = axes[3]
ax.imshow(tmpch[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('mCH', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)


plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_diffbound_3C_Epi.pdf', transparent=True)
../../_images/bc9cdb0c340db949a8323c447ec01851a4f512563799d7b374917c802138213d.png
fig, ax = plt.subplots(figsize=(2,2), dpi=300)
sns.violinplot([binall.loc[selb, 'mCG_corr'], 
                binall.loc[selb, 'mCH_corr'], 
                binall.loc[selb, 'ATAC_corr']], ax=ax)
ax.set_xticklabels(['mCG', 'mCH', 'ATAC'])
ax.set_ylabel('PCC with Comp Score')
# plt.savefig(f'majortype_{group_name}_diffbound_epicorr.pdf', transparent=True)
Text(0, 0.5, 'PCC with Comp Score')
../../_images/72846962162491388524e454017fb887c4a0692ab0e78528209418827b1167fd.png
print(binall.loc[selb, 'mCG_corr'].median(), 
      binall.loc[selb, 'mCH_corr'].median(),
      binall.loc[selb, 'ATAC_corr'].median(),
      selb.sum()
     )
-0.23891224792565977 -0.3763989220670755 0.2403007736648618 1720

TSS stats vs corr#

binall = binall.loc[binall['bklfilter'] & (binall['probdiff']>0)]
binall
chrom start end bklfilter chi2filter ins_lm probdiff chi2_sc insfc diff_sc mCG_corr mCH_corr ATAC_corr
chr1_34 chr1 850000 875000 True False 0 0.001558 11.570785 0.281549 0 0.484103 0.355741 -0.332018
chr1_35 chr1 875000 900000 True False 0 0.007333 17.110812 0.258256 0 -0.663989 0.268799 0.316048
chr1_36 chr1 900000 925000 True False 0 0.019086 36.136662 0.247746 0 -0.188664 0.174081 -0.036786
chr1_37 chr1 925000 950000 True False 0 0.022342 45.028810 0.244715 0 -0.539621 -0.432163 0.519192
chr1_38 chr1 950000 975000 True False 0 0.021296 28.239344 0.252528 0 0.000574 -0.178776 0.467298
... ... ... ... ... ... ... ... ... ... ... ... ... ...
chr22_2018 chr22 50450000 50475000 True False 0 0.032081 45.797538 0.407091 0 0.418948 -0.172765 -0.349786
chr22_2019 chr22 50475000 50500000 True False 0 0.047843 54.166764 0.393158 0 0.330239 -0.480359 0.063941
chr22_2020 chr22 50500000 50525000 True False 1 0.050676 59.499545 0.374385 0 0.191173 -0.305744 0.146097
chr22_2021 chr22 50525000 50550000 True False 1 0.058912 61.681386 0.356571 1 -0.222545 -0.253223 0.409311
chr22_2022 chr22 50550000 50575000 True False 0 0.038865 32.520812 0.340293 0 0.255620 0.021299 -0.030856

100624 rows × 13 columns

expr = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/cluster_expr.hdf')
stats = pd.read_hdf(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/{group_name}_deg_stats.hdf')
genefilter = (stats['stats']>0)
expr = expr.loc[:, genefilter]
stats = stats.loc[genefilter]
data = pd.read_csv(f'/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.5kbin.TSS.slop2k.txt', sep='\t', header=None, index_col=None)
data = data.loc[(data[1] % res)!=10000]
data['bin_id'] = data[0] + '_' + ((data[1] + 10000) // res).astype(str)
data = data.rename({6:'gene_id'}, axis=1)
data = data[['bin_id', 'gene_id']].drop_duplicates()
data = data.loc[(data['bin_id'].isin(binall.index)) & (data['gene_id'].isin(stats.index))]
data['chi2'] = binall.loc[data['bin_id'].values, 'chi2_sc'].values
data['kruskal'] = stats.loc[data['gene_id'].values, 'stats'].values
tmp3c = bound_prob_ct.loc[data['bin_id'].values].values
tmp3c = zscore(tmp3c, axis=1)
tmprna = expr.loc[leg, data['gene_id'].values].T.values
tmprna = zscore(tmprna, axis=1)
data['corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(tmp3c, tmprna)]
fig, ax = plt.subplots(figsize=(3.5,3), dpi=300)
sns.histplot(data, x='chi2', y='kruskal', bins=100, ax=ax, log_scale=(10,10))
<AxesSubplot:xlabel='chi2', ylabel='kruskal'>
../../_images/b9a9959e0d4f64ab88737589692aefbdef2174a7df3d370cae1365b19611ae62.png
data['chi2_group'], groupM = pd.cut(zscore(np.log10(data['chi2'])), 100, labels=False, retbins=True)
data['kruskal_group'], groupK = pd.cut(zscore(np.log10(data['kruskal'])), 100, labels=False, retbins=True)
tmp = data.groupby(['chi2_group','kruskal_group'])['corr'].mean()
groupcorr = np.zeros((100, 100)) / 0
groupcorr[(tmp.index.get_level_values('kruskal_group'), tmp.index.get_level_values('chi2_group'))] = tmp.values
tmp = data.groupby(['chi2_group','kruskal_group'])['corr'].count()
groupcount = np.zeros((100, 100))
groupcount[(tmp.index.get_level_values('kruskal_group'), tmp.index.get_level_values('chi2_group'))] = tmp.values + 1
cmap = mpl.cm.bwr
cmap.set_bad('black', 1.0)

fig, axes = plt.subplots(1, 2, figsize=(6,3), sharex='all', sharey='all', dpi=300)
ax = axes[0]
plot = ax.imshow(groupcorr, vmin=-0.8, vmax=0.8, cmap=cmap)

cbar = fig.colorbar(plot, ax=ax, shrink=0.5, fraction=0.1)
cbar.set_ticks([-0.8, 0.8])
ax.set_xlabel('Zscore Chi2')
ax.set_ylabel('Zscore Kruskal')

ax = axes[1]
plot = ax.imshow(groupcount, cmap='Reds', norm=LogNorm(vmin=1))
fig.colorbar(plot, ax=ax, shrink=0.5, fraction=0.1)
ax.set_xticks(np.arange(-0.5, 100.5, 20))
ax.set_yticks(np.arange(-0.5, 100.5, 20))
ax.set_xticklabels(np.around(groupM[np.arange(0,100.5,20).astype(int)], decimals=1))
ax.set_yticklabels(np.around(groupK[np.arange(0,100.5,20).astype(int)], decimals=1))

ax.set_xlabel('Zscore Chi2')
plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_bound_rna_stats_tss_corr.pdf', transparent=True)
../../_images/9e0c107c9620edd355fb0215375fb71b837473ba4567f782b38a37c17a612382.png

Differential Loop vs All Gene TSS#

corr_all = []
selp = data['bin_id'].isin(selb.index[selb])
data = data.loc[selp]
tmp3c = tmp3c[selp]
tmprna = tmprna[selp]
cg = sns.clustermap(tmp3c, cmap='bwr', vmin=-3, vmax=3, metric='cosine', xticklabels=leg, yticklabels=[], figsize=(6,6))
../../_images/6aff77143c016f83488772556d7124323911253fa9137e28cff58352473c6138.png
rorder = cg.dendrogram_row.reordered_ind.copy()
corder = cg.dendrogram_col.reordered_ind.copy()
fig, axes = plt.subplots(1, 2, sharey='all', figsize=(6,5), dpi=300)

ax = axes[0]
ax.imshow(tmp3c[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('Boundary Prob', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_yticks([])
ax.set_ylabel(f'{np.unique(data["bin_id"]).shape[0]} Diff Bound')

ax = axes[1]
ax.imshow(tmprna[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('RNA', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_ylabel(f'{np.unique(data["gene_id"]).shape[0]} Gene')

plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_diffbound_3C_RNA_alltss.pdf', transparent=True)
../../_images/3b8cece6d1baaf515509f2eae1ac809dcbfa2cfa9d3a34a8d2bdf5ec9c590b2c.png
corr_all.append(data['corr'].values)

Differential Loop vs DEG TSS#

from scipy.stats import rankdata

deg = np.zeros(genefilter.shape[0])
for i in range(len(leg)-1):
    for j in range(i+1, len(leg)):
        tmp = np.load(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/DEG/{leg[i]}-{leg[j]}.npz')
        # deg[np.logical_and(np.abs(tmp['fc'])>1, tmp['fdr']<1e-3)] = 1
        rank = rankdata(tmp['fdr'])
        deg[rank<=100] = 1

print(deg.sum())
1131.0
tmp = anndata.read_h5ad('/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/cell_29000_rna.h5ad')
tmp.var['DEG'] = deg.copy()
# ((tmp.var['chrom'].isin(chrom_sizes.index)) & (tmp.var['DEG']==1) & genefilter).sum()
selp = data['gene_id'].isin(genefilter.index[deg==1])
data = data.loc[selp]
tmp3c = tmp3c[selp]
tmprna = tmprna[selp]
cg = sns.clustermap(tmp3c, cmap='bwr', vmin=-3, vmax=3, metric='cosine', xticklabels=leg, yticklabels=[], figsize=(6,6))
../../_images/2e9e85877e454e0e51c1072fba65bb0485e216b59ee80374a4c6108a1ad5fdf9.png
rorder = cg.dendrogram_row.reordered_ind.copy()
corder = cg.dendrogram_col.reordered_ind.copy()
fig, axes = plt.subplots(1, 2, sharey='all', figsize=(6,5), dpi=300)

ax = axes[0]
ax.imshow(tmp3c[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('Boundary Prob', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_yticks([])
ax.set_ylabel(f'{np.unique(data["bin_id"]).shape[0]} Diff Comp')

ax = axes[1]
ax.imshow(tmprna[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('RNA', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_ylabel(f'{np.unique(data["gene_id"]).shape[0]} DEG')

plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_diffbound_3C_RNA_degtss.pdf', transparent=True)
../../_images/28040bf74a23e506f9ec4f27c5379f684c58cbd59ff3c04300d5cee6731db36a.png
corr_all.append(data['corr'].values)

Gene body stats vs corr#

expr = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/cluster_expr.hdf')
stats = pd.read_hdf(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/{group_name}_deg_stats.hdf')
genefilter = (stats['stats']>0)
expr = expr.loc[:, genefilter]
stats = stats.loc[genefilter]
data = pd.read_csv(f'/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.5kbin.gene.slop2k.txt', sep='\t', header=None, index_col=None)
data = data.loc[(data[1] % res)!=10000]
data['bin_id'] = data[0] + '_' + ((data[1] + 10000) // res).astype(str)
data = data.rename({6:'gene_id'}, axis=1)
data = data[['bin_id', 'gene_id']].drop_duplicates()
data = data.loc[(data['bin_id'].isin(binall.index)) & (data['gene_id'].isin(stats.index))]
data['chi2'] = binall.loc[data['bin_id'].values, 'chi2_sc'].values
data['kruskal'] = stats.loc[data['gene_id'].values, 'stats'].values
tmp3c = bound_prob_ct.loc[data['bin_id'].values].values
tmp3c = zscore(tmp3c, axis=1)
tmprna = expr.loc[leg, data['gene_id'].values].T.values
tmprna = zscore(tmprna, axis=1)
data['corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(tmp3c, tmprna)]
fig, ax = plt.subplots(figsize=(3.5,3), dpi=300)
sns.histplot(data, x='chi2', y='kruskal', bins=100, ax=ax, log_scale=(10,10))
<AxesSubplot:xlabel='chi2', ylabel='kruskal'>
../../_images/9ba9ec710e38669342b81af9e8a8ffaa6342b61f8e986e61fcee80fb45c0eaac.png
data['chi2_group'], groupM = pd.cut(zscore(np.log10(data['chi2'])), 100, labels=False, retbins=True)
data['kruskal_group'], groupK = pd.cut(zscore(np.log10(data['kruskal'])), 100, labels=False, retbins=True)
tmp = data.groupby(['chi2_group','kruskal_group'])['corr'].mean()
groupcorr = np.zeros((100, 100)) / 0
groupcorr[(tmp.index.get_level_values('kruskal_group'), tmp.index.get_level_values('chi2_group'))] = tmp.values
tmp = data.groupby(['chi2_group','kruskal_group'])['corr'].count()
groupcount = np.zeros((100, 100))
groupcount[(tmp.index.get_level_values('kruskal_group'), tmp.index.get_level_values('chi2_group'))] = tmp.values + 1
cmap = mpl.cm.bwr
cmap.set_bad('black', 1.0)

fig, axes = plt.subplots(1, 2, figsize=(6,3), sharex='all', sharey='all', dpi=300)
ax = axes[0]
plot = ax.imshow(groupcorr, vmin=-0.8, vmax=0.8, cmap=cmap)

cbar = fig.colorbar(plot, ax=ax, shrink=0.5, fraction=0.1)
cbar.set_ticks([-0.8, 0.8])
ax.set_xlabel('Zscore Chi2')
ax.set_ylabel('Zscore Kruskal')

ax = axes[1]
plot = ax.imshow(groupcount, cmap='Reds', norm=LogNorm(vmin=1))
fig.colorbar(plot, ax=ax, shrink=0.5, fraction=0.1)
ax.set_xticks(np.arange(-0.5, 100.5, 20))
ax.set_yticks(np.arange(-0.5, 100.5, 20))
ax.set_xticklabels(np.around(groupM[np.arange(0,100.5,20).astype(int)], decimals=1))
ax.set_yticklabels(np.around(groupK[np.arange(0,100.5,20).astype(int)], decimals=1))

ax.set_xlabel('Zscore Chi2')
plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_bound_rna_stats_genebody_corr.pdf', transparent=True)
../../_images/bebc9b36ee13679fb5d5651bc679882b6092a017ea838d0e7a00ef90230e8660.png

Differential Loop vs All Gene Body#

selp = data['bin_id'].isin(selb.index[selb])
data = data.loc[selp]
tmp3c = tmp3c[selp]
tmprna = tmprna[selp]
cg = sns.clustermap(tmp3c, cmap='bwr', vmin=-3, vmax=3, metric='cosine', xticklabels=leg, yticklabels=[], figsize=(6,6))
../../_images/5b0de176866e063d2440e8446357acc4942227dd4ea709c9b0b8c4853798a0a7.png
rorder = cg.dendrogram_row.reordered_ind.copy()
corder = cg.dendrogram_col.reordered_ind.copy()
fig, axes = plt.subplots(1, 2, sharey='all', figsize=(6,5), dpi=300)

ax = axes[0]
ax.imshow(tmp3c[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('Boundary Prob', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_yticks([])
ax.set_ylabel(f'{np.unique(data["bin_id"]).shape[0]} Diff Comp')

ax = axes[1]
ax.imshow(tmprna[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('RNA', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_ylabel(f'{np.unique(data["gene_id"]).shape[0]} Gene')

plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_diffbound_3C_RNA_allbody.pdf', transparent=True)
../../_images/9defb668e7a15adfff32c865db46cead7ae273eaac4bc8a1c083bed79ec41bed.png
corr_all.append(data['corr'].values)

Differential Loop vs DEG Gene Body#

from scipy.stats import rankdata

deg = np.zeros(genefilter.shape[0])
for i in range(len(leg)-1):
    for j in range(i+1, len(leg)):
        tmp = np.load(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/DEG/{leg[i]}-{leg[j]}.npz')
        # deg[np.logical_and(np.abs(tmp['fc'])>1, tmp['fdr']<1e-3)] = 1
        rank = rankdata(tmp['fdr'])
        deg[rank<=100] = 1

print(deg.sum())
1131.0
selp = data['gene_id'].isin(genefilter.index[deg==1])
data = data.loc[selp]
tmp3c = tmp3c[selp]
tmprna = tmprna[selp]
cg = sns.clustermap(tmp3c, cmap='bwr', vmin=-3, vmax=3, metric='cosine', xticklabels=leg, yticklabels=[], figsize=(6,6))
../../_images/5a2724f42227a7634fc8b436be6b7cd33dd058c044a4d730c2c89d38be7d45b1.png
rorder = cg.dendrogram_row.reordered_ind.copy()
corder = cg.dendrogram_col.reordered_ind.copy()
fig, axes = plt.subplots(1, 2, sharey='all', figsize=(6,5), dpi=300)

ax = axes[0]
ax.imshow(tmp3c[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('Boundary Prob', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_yticks([])
ax.set_ylabel(f'{np.unique(data["bin_id"]).shape[0]} Diff Comp')

ax = axes[1]
ax.imshow(tmprna[np.ix_(rorder, corder)], cmap='bwr', aspect='auto', vmin=-3, vmax=3, interpolation='none')
ax.set_title('RNA', fontsize=10)
# sns.despine(ax=ax, left=True, bottom=True)
ax.set_xticks(np.arange(len(leg)))
ax.set_xticklabels(legname[corder], rotation=90)
ax.set_ylabel(f'{np.unique(data["gene_id"]).shape[0]} DEG')

plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_diffbound_3C_RNA_degbody.pdf', transparent=True)
../../_images/550b4bbadd027bea44d5457ccb519fda7031c2ea81bbe0f7c20f79d973d1dea9.png
corr_all.append(data['corr'].values)
np.save(f'{outdir}corr_diffbound_rna.npy', corr_all)
corr_all = np.load(f'{outdir}corr_diffbound_rna.npy', allow_pickle=True)[[0,2,1,3]]
xleg = [f'{xx}\n(n={yy.shape[0]})' for xx,yy in zip(['All Gene TSS', 'All Gene Body', 'DEG TSS', 'DEG Body'], corr_all)]
fig, ax = plt.subplots(figsize=(4.5,2), dpi=300)
sns.violinplot(corr_all, ax=ax)
ax.set_xticklabels(xleg)
ax.set_ylabel('PCC with Comp Score')
# plt.tight_layout()
# plt.savefig(f'majortype_{group_name}_diffbound_rnacorr.pdf', transparent=True)
Text(0, 0.5, 'PCC with Comp Score')
../../_images/57fce0d0cd8a4e2ec58c7f893cef54737b3a35579edb924df89920ca9fde7a46.png
[[xx.shape, np.median(xx)] for xx in corr_all]
[[(710,), 0.36908752998291405],
 [(1716,), 0.47118865132083876],
 [(84,), 0.8121020961570051],
 [(417,), 0.7439593052340755]]