RNA correlated compartment

RNA correlated compartment#

import numpy as np
import pandas as pd
import cooler
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
indir = f'/home/jzhou_salk_edu/sky_workdir/hba/compartment_majortype/diff/neu/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/'
# no L5ET
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       # 'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
res = 100000
comp = pd.read_csv(f'{indir}DifferentialResult/fdr_result/differential.intra_sample_combined.pcQnm.bedGraph', sep='\t', header=0, index_col=None)
comp.index = comp['chr'] + '_' + (comp['start'] // res).astype(str)
comp
chr start end L23_IT_100Kb L4_IT_100Kb L5_IT_100Kb L6_IT_100Kb L6_IT_Car3_100Kb L56_NP_100Kb L6_CT_100Kb ... Sst CHD7 MSN_D1 MSN_D2 Foxp2 SubCtx sample_maha pval padj dist_clust
chr10_2 chr10 200000 300000 1.20141 1.40148 1.12274 0.91856 0.71971 1.45364 1.35924 ... 1.60003 1.79399 1.18581 1.14140 1.64998 1.77294 78.797991 6.274326e-09 1.654675e-08 1
chr10_3 chr10 300000 400000 1.67618 1.43001 1.33450 1.32691 0.51945 1.22911 1.53759 ... 1.69831 1.69831 1.61143 1.15231 1.94190 2.06830 101.685104 6.282661e-13 1.940153e-12 1
chr10_4 chr10 400000 500000 1.27442 1.33094 1.21215 1.43974 0.96214 1.59233 1.40148 ... 1.58100 1.61143 1.64171 1.62984 1.69065 1.68152 28.864611 9.045962e-02 1.563036e-01 1
chr10_5 chr10 500000 600000 1.43001 1.39806 1.69428 1.60003 0.57439 1.61143 1.37993 ... 1.76052 1.79952 1.58100 1.83248 1.55484 2.17604 110.773702 1.418987e-14 4.640257e-14 1
chr10_6 chr10 600000 700000 1.43350 1.69065 1.44978 1.66232 1.23573 1.26750 1.73916 ... 1.90078 1.82624 1.85399 1.42406 1.60286 1.35578 8.253318 9.900547e-01 1.000000e+00 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
chr9_1367 chr9 136700000 136800000 2.53781 2.49765 2.43707 2.53781 2.64364 2.50284 2.61332 ... 2.49765 2.30178 2.55214 2.61332 2.42730 2.37386 0.208439 1.000000e+00 1.000000e+00 1
chr9_1368 chr9 136800000 136900000 2.25818 2.34290 2.30178 2.27476 2.20722 2.28698 2.49765 ... 2.35196 2.29048 2.25515 2.25515 2.17299 2.22961 1.756256 1.000000e+00 1.000000e+00 1
chr9_1369 chr9 136900000 137000000 2.26494 2.13128 2.34837 2.37084 2.33576 2.35196 2.46547 ... 2.39138 2.22605 2.35521 2.31751 2.40634 2.33914 0.318975 1.000000e+00 1.000000e+00 1
chr9_1370 chr9 137000000 137100000 2.35196 2.04449 2.36778 2.18888 2.30471 2.28174 2.42333 ... 2.35761 2.25818 2.30178 2.18888 2.30759 2.36778 0.952000 1.000000e+00 1.000000e+00 1
chr9_1372 chr9 137200000 137300000 2.33276 2.13811 2.29494 2.32970 2.46547 2.41199 2.52050 ... 2.46547 2.46252 2.38687 2.35196 2.32339 2.21984 0.170553 1.000000e+00 1.000000e+00 1

24745 rows × 49 columns

binall = comp[['chr', 'start', 'end', 'sample_maha', 'pval', 'padj']]
binall = binall.rename({'chr':'chrom'}, axis=1)
comp = comp[leg]
expr = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/cluster_expr.hdf').loc[leg]
from scipy.stats import rankdata

deg = np.zeros(expr.shape[1])
for i in range(len(leg)-1):
    for j in range(i+1, len(leg)):
        tmp = np.load(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/DEG/{leg[i]}-{leg[j]}.npz')
        # deg[np.logical_and(np.abs(tmp['fc'])>1, tmp['fdr']<1e-3)] = 1
        rank = rankdata(tmp['fdr'])
        deg[rank<=100] = 1

print(deg.sum())
1131.0
chrom_size_path = f'/data/hba/loop_majortype/hg38_with_chrl.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
gene_meta = pd.read_csv('/home/jzhou_salk_edu/sky_workdir/hba/ref/gencode.v33.bed', sep='\t', index_col=None, header=None)
gene_meta.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name', 'strand']
gene_meta = gene_meta.set_index('gene_id').loc[expr.columns[deg==1]]
gene_meta = gene_meta.loc[gene_meta['chrom'].isin(chrom_sizes.index[:22])]
gene_meta
chrom start end gene_name strand
gene
ENSG00000002746.15 chr7 43112629 43566001 HECW1 +
ENSG00000005108.16 chr7 11370365 11832198 THSD7A -
ENSG00000006128.12 chr7 97732084 97740472 TAC1 +
ENSG00000006468.14 chr7 13891229 13991425 ETV1 -
ENSG00000007237.18 chr17 9910609 10198551 AC005747.1 -
... ... ... ... ... ...
ENSG00000286954.1 chr6 22663507 22675493 AL033539.2 +
ENSG00000287172.1 chr2 76185020 76399490 AC073091.3 +
ENSG00000287694.1 chr16 76277288 76819624 AC106741.2 +
ENSG00000287722.1 chr13 53207831 53801489 AL356295.1 +
ENSG00000287912.1 chr11 81175201 81431520 AP003464.1 +

1099 rows × 5 columns

import joblib
# from qnorm import quantile_normalize
from scipy.stats import norm
from statsmodels.sandbox.stats.multicomp import multipletests as FDR
from tqdm import tqdm
from ALLCools.mcds.correlation import corr_array
def shuffle_corr_norm(rna_data, dmr_data):
    
    shuffle_rna_data = rna_data.copy()
    for col, data in shuffle_rna_data.items():
        n_gene = shuffle_rna_data.shape[0]
        shuffle_rna_data[col] = shuffle_rna_data[col].sample(n_gene).values
    
    if dmr_data.shape[0] > 50000:
        shuffle_dmr_data = dmr_data.sample(50000).copy()
    else:
        shuffle_dmr_data = dmr_data.copy()
    for col, data in shuffle_dmr_data.items():
        n_dmr = shuffle_dmr_data.shape[0]
        shuffle_dmr_data[col] = shuffle_dmr_data[col].sample(n_dmr).values

    # shuffle corr
    shuffle_corr = corr_array(shuffle_rna_data, shuffle_dmr_data)
    mu, std = norm.fit(shuffle_corr.ravel())
    return mu, std, shuffle_corr.ravel()
null_mu, null_std, shuffle_corr = shuffle_corr_norm(expr.loc[:,  expr.columns.isin(gene_meta.index)].T, comp)
null_mu, null_std
(3.360401866366818e-06, 0.22354448644380281)
shuffle_corr.shape
(27194755,)
gene_slop = 5000000
binall['pos'] = binall[['start', 'end']].mean(axis=1)
gene_records = []
for gene, row in tqdm(gene_meta.iterrows(), total=gene_meta.shape[0]):
    gene_rna = expr[[gene]].T
    
    dmr_chrom = row['chrom']
    dmr_start = row['start'] - gene_slop
    dmr_end = row['end'] + gene_slop
    sel_dmr = (binall['chrom']==dmr_chrom) & (binall['pos'] > dmr_start) & (binall['pos'] < dmr_end)
    gene_dmr = comp.loc[sel_dmr]

    gene_corr = corr_array(gene_rna, gene_dmr).ravel()
    gene_corr = pd.Series(gene_corr, index=gene_dmr.index)

    # pvalue = norm.sf(gene_corr.values, null_mu, null_std)
    # pvalue[pvalue > 0.5] = 1 - pvalue[pvalue > 0.5]
    # pvalue *= 2  # two tailed
    # perform multi-test correction and  q-value
    # _, q, *_ = fdrcorrection(pvalue)

    gene_corr.name = 'corr'
    gene_corr = gene_corr.reset_index()
    gene_corr['gene'] = gene
    # gene_corr["q"] = q
    
    # minimum filter
    # gene_corr = gene_corr[
    #     (gene_corr["q"] < min_q) & (gene_corr["corr"].abs() > min_corr)
    # ].set_index("dmr")

    # gene_records[gene] = gene_corr
    gene_records.append(gene_corr)
    
100%|██████████| 1099/1099 [00:03<00:00, 293.25it/s]
gene_records = pd.concat(gene_records, axis=0)
gene_records.index = np.arange(gene_records.shape[0])
gene_records
index corr gene
0 chr7_381 0.216611 ENSG00000002746.15
1 chr7_382 0.309620 ENSG00000002746.15
2 chr7_383 0.413378 ENSG00000002746.15
3 chr7_384 0.452824 ENSG00000002746.15
4 chr7_385 0.422503 ENSG00000002746.15
... ... ... ...
105394 chr11_859 0.110637 ENSG00000287912.1
105395 chr11_860 0.011490 ENSG00000287912.1
105396 chr11_861 0.264604 ENSG00000287912.1
105397 chr11_862 0.210742 ENSG00000287912.1
105398 chr11_863 0.257057 ENSG00000287912.1

105399 rows × 3 columns

fig, ax = plt.subplots()
sns.distplot(np.random.choice(gene_records['corr'], 50000), ax=ax)
sns.distplot(np.random.choice(shuffle_corr, 50000), ax=ax)
<AxesSubplot:ylabel='Density'>
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../../_images/1f32bf6937d8d2d6297fbf95c6ce91e84d1400da5cc816e905e6490809b723a7.png
t1 = rankdata(np.concatenate((gene_records['corr'].values, shuffle_corr)))[:gene_records.shape[0]]
t2 = rankdata(gene_records['corr'].values)
gene_records['FDRneg'] = (t1 - t2) / len(shuffle_corr) / t2 * gene_records.shape[0]
gene_records['FDRpos'] = (len(shuffle_corr) - t1 + t2) / len(shuffle_corr) / (gene_records.shape[0] - t2) * gene_records.shape[0]
threspos = gene_records.loc[gene_records['FDRpos']<0.1, 'corr'].min()
thresneg = gene_records.loc[gene_records['FDRneg']<0.1, 'corr'].max()
print(threspos, thresneg)
0.5029179099915155 -0.6172767127895147
gene_meta[['TSS', 'TES']] = gene_meta[['start', 'end']]
selg = (gene_meta['strand']=='-')
gene_meta.loc[selg, ['TSS', 'TES']] = gene_meta.loc[selg, ['TES', 'TSS']].values
gene_records['TSSdist'] = binall.loc[gene_records['index'], 'pos'].values - gene_meta.loc[gene_records['gene'], 'TSS'].values
selg = (gene_meta.loc[gene_records['gene'], 'strand']=='-')
gene_records.loc[selg.values, 'TSSdist'] = -gene_records.loc[selg.values, 'TSSdist'].values

gene_records['TESdist'] = binall.loc[gene_records['index'], 'pos'].values - gene_meta.loc[gene_records['gene'], 'TES'].values
selg = (gene_meta.loc[gene_records['gene'], 'strand']=='-')
gene_records.loc[selg.values, 'TESdist'] = -gene_records.loc[selg.values, 'TESdist'].values
gene_records['coord'] = 0
selp = (gene_records['TSSdist']<=0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TSSdist'] / res * 4 - 100

selp = (gene_records['TESdist']>=0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TESdist'] / res * 4 + 100

selp = (gene_records['TESdist']<0) & (gene_records['TSSdist']>0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TSSdist'] / (gene_records.loc[selp, 'TSSdist'] - gene_records.loc[selp, 'TESdist']) * 200 - 100
gene_records.to_hdf(f'{outdir}DEG_neu_comp_5M_corr.hdf', key='data')
gene_records = pd.read_hdf(f'{outdir}DEG_neu_comp_5M_corr.hdf', key='data')
thres = np.max(np.abs([thresneg, threspos]))
gene_records['group'] = gene_records['coord']//2
proppos = (gene_records.loc[gene_records['corr']>thres, 'group'].value_counts() / gene_records['group'].value_counts()).sort_index().fillna(0)
propneg = (gene_records.loc[gene_records['corr']<-thres, 'group'].value_counts() / gene_records['group'].value_counts()).sort_index().fillna(0)
fig, ax = plt.subplots(figsize=(3, 1.5), dpi=300)
ax.plot(np.arange(300), proppos, c='C0', label='Pos')
ax.plot(np.arange(300), propneg, c='C1', label='Neg')
ax.set_xticks([0, 100, 200, 300])
ax.set_xticklabels(['-5M', 'TSS', 'TES', '+5M'])
ax.legend()
plt.tight_layout()
# plt.savefig('proportion_corr_comp.pdf', transparent=True)
../../_images/fcca90801d1925d1256b2b3dbb33d19f7484a302f90d203a368ec657b2cea8d8.png
tmp = gene_records.loc[(gene_records['corr']>thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
np.savetxt(f'{outdir}gene_compposcorr_genebody.csv.gz', tmp, delimiter='\n', fmt='%s')

tmp = gene_records.loc[(gene_records['corr']<-thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])

tmp = gene_records.loc[(gene_records['corr'].abs()>thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
450 0.4094631483166515
12 0.01091901728844404
462 0.42038216560509556
fig, axes = plt.subplots(2, 1, sharex='all', figsize=(3,3), dpi=300)
ax = axes[0]
tmp = gene_records.loc[gene_records['corr']>thres]
sns.histplot(data=tmp, x='coord', y='corr', bins=300, ax=ax, cbar=True, cbar_kws=dict(ticks=[0,4], fraction=0.2, shrink=0.5))
ax.set_yticks(np.arange(0.6, 1.05, 0.1))

ax = axes[1]
tmp = gene_records.loc[gene_records['corr']<-thres]
# tmp = gene_records.loc[gene_records['corr']<-threspos]
sns.histplot(data=tmp, x='coord', y='corr', bins=300, ax=ax, cbar=True, cbar_kws=dict(ticks=[0,3], fraction=0.2, shrink=0.5))
# ax.set_yticks([-0.6, -0.7, -0.8, -0.9])
ax.set_yticks(np.arange(-1.0, -0.55, 0.1))

ax.set_xticks([-300, -100, 100, 300])
ax.set_xticklabels(['-5M', 'TSS', 'TES', '+5M'])

# plt.savefig('DEG_comp_sigcorr.pdf', transparent=True)
[Text(-300, 0, '-5M'),
 Text(-100, 0, 'TSS'),
 Text(100, 0, 'TES'),
 Text(300, 0, '+5M')]
../../_images/f9caeeaa6173adbe85586e37ee2aff6f2d9e17b35d721feb0374362316a57450.png