RNA correlated compartment

RNA correlated compartment#

import numpy as np
import pandas as pd
import cooler
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'

indir = f'/home/jzhou_salk_edu/sky_workdir/hba/compartment_majortype/diff/neu/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/'

# no L5ET
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       # 'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
res = 100000

comp = pd.read_csv(f'{indir}DifferentialResult/fdr_result/differential.intra_sample_combined.pcQnm.bedGraph', sep='\t', header=0, index_col=None)
comp.index = comp['chr'] + '_' + (comp['start'] // res).astype(str)
comp

	chr	start	end	L23_IT_100Kb	L4_IT_100Kb	L5_IT_100Kb	L6_IT_100Kb	L6_IT_Car3_100Kb	L56_NP_100Kb	L6_CT_100Kb	...	Sst	CHD7	MSN_D1	MSN_D2	Foxp2	SubCtx	sample_maha	pval	padj	dist_clust
chr10_2	chr10	200000	300000	1.20141	1.40148	1.12274	0.91856	0.71971	1.45364	1.35924	...	1.60003	1.79399	1.18581	1.14140	1.64998	1.77294	78.797991	6.274326e-09	1.654675e-08	1
chr10_3	chr10	300000	400000	1.67618	1.43001	1.33450	1.32691	0.51945	1.22911	1.53759	...	1.69831	1.69831	1.61143	1.15231	1.94190	2.06830	101.685104	6.282661e-13	1.940153e-12	1
chr10_4	chr10	400000	500000	1.27442	1.33094	1.21215	1.43974	0.96214	1.59233	1.40148	...	1.58100	1.61143	1.64171	1.62984	1.69065	1.68152	28.864611	9.045962e-02	1.563036e-01	1
chr10_5	chr10	500000	600000	1.43001	1.39806	1.69428	1.60003	0.57439	1.61143	1.37993	...	1.76052	1.79952	1.58100	1.83248	1.55484	2.17604	110.773702	1.418987e-14	4.640257e-14	1
chr10_6	chr10	600000	700000	1.43350	1.69065	1.44978	1.66232	1.23573	1.26750	1.73916	...	1.90078	1.82624	1.85399	1.42406	1.60286	1.35578	8.253318	9.900547e-01	1.000000e+00	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
chr9_1367	chr9	136700000	136800000	2.53781	2.49765	2.43707	2.53781	2.64364	2.50284	2.61332	...	2.49765	2.30178	2.55214	2.61332	2.42730	2.37386	0.208439	1.000000e+00	1.000000e+00	1
chr9_1368	chr9	136800000	136900000	2.25818	2.34290	2.30178	2.27476	2.20722	2.28698	2.49765	...	2.35196	2.29048	2.25515	2.25515	2.17299	2.22961	1.756256	1.000000e+00	1.000000e+00	1
chr9_1369	chr9	136900000	137000000	2.26494	2.13128	2.34837	2.37084	2.33576	2.35196	2.46547	...	2.39138	2.22605	2.35521	2.31751	2.40634	2.33914	0.318975	1.000000e+00	1.000000e+00	1
chr9_1370	chr9	137000000	137100000	2.35196	2.04449	2.36778	2.18888	2.30471	2.28174	2.42333	...	2.35761	2.25818	2.30178	2.18888	2.30759	2.36778	0.952000	1.000000e+00	1.000000e+00	1
chr9_1372	chr9	137200000	137300000	2.33276	2.13811	2.29494	2.32970	2.46547	2.41199	2.52050	...	2.46547	2.46252	2.38687	2.35196	2.32339	2.21984	0.170553	1.000000e+00	1.000000e+00	1

24745 rows × 49 columns

binall = comp[['chr', 'start', 'end', 'sample_maha', 'pval', 'padj']]
binall = binall.rename({'chr':'chrom'}, axis=1)
comp = comp[leg]

expr = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/cluster_expr.hdf').loc[leg]

from scipy.stats import rankdata

deg = np.zeros(expr.shape[1])
for i in range(len(leg)-1):
    for j in range(i+1, len(leg)):
        tmp = np.load(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/DEG/{leg[i]}-{leg[j]}.npz')
        # deg[np.logical_and(np.abs(tmp['fc'])>1, tmp['fdr']<1e-3)] = 1
        rank = rankdata(tmp['fdr'])
        deg[rank<=100] = 1

print(deg.sum())

1131.0

chrom_size_path = f'/data/hba/loop_majortype/hg38_with_chrl.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)

gene_meta = pd.read_csv('/home/jzhou_salk_edu/sky_workdir/hba/ref/gencode.v33.bed', sep='\t', index_col=None, header=None)
gene_meta.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name', 'strand']
gene_meta = gene_meta.set_index('gene_id').loc[expr.columns[deg==1]]
gene_meta = gene_meta.loc[gene_meta['chrom'].isin(chrom_sizes.index[:22])]
gene_meta

	chrom	start	end	gene_name	strand
gene
ENSG00000002746.15	chr7	43112629	43566001	HECW1	+
ENSG00000005108.16	chr7	11370365	11832198	THSD7A	-
ENSG00000006128.12	chr7	97732084	97740472	TAC1	+
ENSG00000006468.14	chr7	13891229	13991425	ETV1	-
ENSG00000007237.18	chr17	9910609	10198551	AC005747.1	-
...	...	...	...	...	...
ENSG00000286954.1	chr6	22663507	22675493	AL033539.2	+
ENSG00000287172.1	chr2	76185020	76399490	AC073091.3	+
ENSG00000287694.1	chr16	76277288	76819624	AC106741.2	+
ENSG00000287722.1	chr13	53207831	53801489	AL356295.1	+
ENSG00000287912.1	chr11	81175201	81431520	AP003464.1	+

1099 rows × 5 columns

import joblib
# from qnorm import quantile_normalize
from scipy.stats import norm
from statsmodels.sandbox.stats.multicomp import multipletests as FDR
from tqdm import tqdm
from ALLCools.mcds.correlation import corr_array

def shuffle_corr_norm(rna_data, dmr_data):
    
    shuffle_rna_data = rna_data.copy()
    for col, data in shuffle_rna_data.items():
        n_gene = shuffle_rna_data.shape[0]
        shuffle_rna_data[col] = shuffle_rna_data[col].sample(n_gene).values
    
    if dmr_data.shape[0] > 50000:
        shuffle_dmr_data = dmr_data.sample(50000).copy()
    else:
        shuffle_dmr_data = dmr_data.copy()
    for col, data in shuffle_dmr_data.items():
        n_dmr = shuffle_dmr_data.shape[0]
        shuffle_dmr_data[col] = shuffle_dmr_data[col].sample(n_dmr).values

    # shuffle corr
    shuffle_corr = corr_array(shuffle_rna_data, shuffle_dmr_data)
    mu, std = norm.fit(shuffle_corr.ravel())
    return mu, std, shuffle_corr.ravel()

null_mu, null_std, shuffle_corr = shuffle_corr_norm(expr.loc[:,  expr.columns.isin(gene_meta.index)].T, comp)
null_mu, null_std

(3.360401866366818e-06, 0.22354448644380281)

shuffle_corr.shape

(27194755,)

gene_slop = 5000000

binall['pos'] = binall[['start', 'end']].mean(axis=1)

gene_records = []
for gene, row in tqdm(gene_meta.iterrows(), total=gene_meta.shape[0]):
    gene_rna = expr[[gene]].T
    
    dmr_chrom = row['chrom']
    dmr_start = row['start'] - gene_slop
    dmr_end = row['end'] + gene_slop
    sel_dmr = (binall['chrom']==dmr_chrom) & (binall['pos'] > dmr_start) & (binall['pos'] < dmr_end)
    gene_dmr = comp.loc[sel_dmr]

    gene_corr = corr_array(gene_rna, gene_dmr).ravel()
    gene_corr = pd.Series(gene_corr, index=gene_dmr.index)

    # pvalue = norm.sf(gene_corr.values, null_mu, null_std)
    # pvalue[pvalue > 0.5] = 1 - pvalue[pvalue > 0.5]
    # pvalue *= 2  # two tailed
    # perform multi-test correction and  q-value
    # _, q, *_ = fdrcorrection(pvalue)

    gene_corr.name = 'corr'
    gene_corr = gene_corr.reset_index()
    gene_corr['gene'] = gene
    # gene_corr["q"] = q
    
    # minimum filter
    # gene_corr = gene_corr[
    #     (gene_corr["q"] < min_q) & (gene_corr["corr"].abs() > min_corr)
    # ].set_index("dmr")

    # gene_records[gene] = gene_corr
    gene_records.append(gene_corr)
    

100%|██████████| 1099/1099 [00:03<00:00, 293.25it/s]

gene_records = pd.concat(gene_records, axis=0)
gene_records.index = np.arange(gene_records.shape[0])
gene_records

	index	corr	gene
0	chr7_381	0.216611	ENSG00000002746.15
1	chr7_382	0.309620	ENSG00000002746.15
2	chr7_383	0.413378	ENSG00000002746.15
3	chr7_384	0.452824	ENSG00000002746.15
4	chr7_385	0.422503	ENSG00000002746.15
...	...	...	...
105394	chr11_859	0.110637	ENSG00000287912.1
105395	chr11_860	0.011490	ENSG00000287912.1
105396	chr11_861	0.264604	ENSG00000287912.1
105397	chr11_862	0.210742	ENSG00000287912.1
105398	chr11_863	0.257057	ENSG00000287912.1

105399 rows × 3 columns

fig, ax = plt.subplots()
sns.distplot(np.random.choice(gene_records['corr'], 50000), ax=ax)
sns.distplot(np.random.choice(shuffle_corr, 50000), ax=ax)

<AxesSubplot:ylabel='Density'>

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../../_images/1f32bf6937d8d2d6297fbf95c6ce91e84d1400da5cc816e905e6490809b723a7.png

t1 = rankdata(np.concatenate((gene_records['corr'].values, shuffle_corr)))[:gene_records.shape[0]]
t2 = rankdata(gene_records['corr'].values)

gene_records['FDRneg'] = (t1 - t2) / len(shuffle_corr) / t2 * gene_records.shape[0]
gene_records['FDRpos'] = (len(shuffle_corr) - t1 + t2) / len(shuffle_corr) / (gene_records.shape[0] - t2) * gene_records.shape[0]

threspos = gene_records.loc[gene_records['FDRpos']<0.1, 'corr'].min()
thresneg = gene_records.loc[gene_records['FDRneg']<0.1, 'corr'].max()
print(threspos, thresneg)

0.5029179099915155 -0.6172767127895147

gene_meta[['TSS', 'TES']] = gene_meta[['start', 'end']]
selg = (gene_meta['strand']=='-')
gene_meta.loc[selg, ['TSS', 'TES']] = gene_meta.loc[selg, ['TES', 'TSS']].values

gene_records['TSSdist'] = binall.loc[gene_records['index'], 'pos'].values - gene_meta.loc[gene_records['gene'], 'TSS'].values
selg = (gene_meta.loc[gene_records['gene'], 'strand']=='-')
gene_records.loc[selg.values, 'TSSdist'] = -gene_records.loc[selg.values, 'TSSdist'].values

gene_records['TESdist'] = binall.loc[gene_records['index'], 'pos'].values - gene_meta.loc[gene_records['gene'], 'TES'].values
selg = (gene_meta.loc[gene_records['gene'], 'strand']=='-')
gene_records.loc[selg.values, 'TESdist'] = -gene_records.loc[selg.values, 'TESdist'].values

gene_records['coord'] = 0
selp = (gene_records['TSSdist']<=0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TSSdist'] / res * 4 - 100

selp = (gene_records['TESdist']>=0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TESdist'] / res * 4 + 100

selp = (gene_records['TESdist']<0) & (gene_records['TSSdist']>0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TSSdist'] / (gene_records.loc[selp, 'TSSdist'] - gene_records.loc[selp, 'TESdist']) * 200 - 100

gene_records.to_hdf(f'{outdir}DEG_neu_comp_5M_corr.hdf', key='data')

gene_records = pd.read_hdf(f'{outdir}DEG_neu_comp_5M_corr.hdf', key='data')

thres = np.max(np.abs([thresneg, threspos]))
gene_records['group'] = gene_records['coord']//2
proppos = (gene_records.loc[gene_records['corr']>thres, 'group'].value_counts() / gene_records['group'].value_counts()).sort_index().fillna(0)
propneg = (gene_records.loc[gene_records['corr']<-thres, 'group'].value_counts() / gene_records['group'].value_counts()).sort_index().fillna(0)

fig, ax = plt.subplots(figsize=(3, 1.5), dpi=300)
ax.plot(np.arange(300), proppos, c='C0', label='Pos')
ax.plot(np.arange(300), propneg, c='C1', label='Neg')
ax.set_xticks([0, 100, 200, 300])
ax.set_xticklabels(['-5M', 'TSS', 'TES', '+5M'])
ax.legend()
plt.tight_layout()
# plt.savefig('proportion_corr_comp.pdf', transparent=True)

../../_images/fcca90801d1925d1256b2b3dbb33d19f7484a302f90d203a368ec657b2cea8d8.png

tmp = gene_records.loc[(gene_records['corr']>thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
np.savetxt(f'{outdir}gene_compposcorr_genebody.csv.gz', tmp, delimiter='\n', fmt='%s')

tmp = gene_records.loc[(gene_records['corr']<-thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])

tmp = gene_records.loc[(gene_records['corr'].abs()>thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])

0.4094631483166515
0.01091901728844404
0.42038216560509556

fig, axes = plt.subplots(2, 1, sharex='all', figsize=(3,3), dpi=300)
ax = axes[0]
tmp = gene_records.loc[gene_records['corr']>thres]
sns.histplot(data=tmp, x='coord', y='corr', bins=300, ax=ax, cbar=True, cbar_kws=dict(ticks=[0,4], fraction=0.2, shrink=0.5))
ax.set_yticks(np.arange(0.6, 1.05, 0.1))

ax = axes[1]
tmp = gene_records.loc[gene_records['corr']<-thres]
# tmp = gene_records.loc[gene_records['corr']<-threspos]
sns.histplot(data=tmp, x='coord', y='corr', bins=300, ax=ax, cbar=True, cbar_kws=dict(ticks=[0,3], fraction=0.2, shrink=0.5))
# ax.set_yticks([-0.6, -0.7, -0.8, -0.9])
ax.set_yticks(np.arange(-1.0, -0.55, 0.1))

ax.set_xticks([-300, -100, 100, 300])
ax.set_xticklabels(['-5M', 'TSS', 'TES', '+5M'])

# plt.savefig('DEG_comp_sigcorr.pdf', transparent=True)

[Text(-300, 0, '-5M'),
 Text(-100, 0, 'TSS'),
 Text(100, 0, 'TES'),
 Text(300, 0, '+5M')]

../../_images/f9caeeaa6173adbe85586e37ee2aff6f2d9e17b35d721feb0374362316a57450.png