RNA correlated boundary

RNA correlated boundary#

import numpy as np
import pandas as pd
import cooler
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
indir = '/data/hba/domain_majortype/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/'
# no L5ET
leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       # 'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
res = 25000
bound_count_ct = pd.read_hdf(f'{indir}MajorType_boundcount.hdf')
cell_count_ct = pd.read_csv(f'{indir}MajorType_cellcount.csv.gz', header=0, index_col=0)['count_cell']
bound_prob_ct = bound_count_ct.loc[leg] / cell_count_ct.loc[leg][:, None]
expr = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/cluster_expr.hdf').loc[leg]
from scipy.stats import rankdata

deg = np.zeros(expr.shape[1])
for i in range(len(leg)-1):
    for j in range(i+1, len(leg)):
        tmp = np.load(f'/home/jzhou_salk_edu/sky_workdir/hba/rna_majortype/DEG/{leg[i]}-{leg[j]}.npz')
        # deg[np.logical_and(np.abs(tmp['fc'])>1, tmp['fdr']<1e-3)] = 1
        rank = rankdata(tmp['fdr'])
        deg[rank<=100] = 1

print(deg.sum())
1131.0
chrom_size_path = f'/data/hba/loop_majortype/hg38_with_chrl.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
gene_meta = pd.read_csv('/home/jzhou_salk_edu/sky_workdir/hba/ref/gencode.v33.bed', sep='\t', index_col=None, header=None)
gene_meta.columns = ['chrom', 'start', 'end', 'gene_id', 'gene_name', 'strand']
gene_meta = gene_meta.set_index('gene_id').loc[expr.columns[deg==1]]
gene_meta = gene_meta.loc[gene_meta['chrom'].isin(chrom_sizes.index[:22])]
gene_meta
chrom start end gene_name strand
gene
ENSG00000002746.15 chr7 43112629 43566001 HECW1 +
ENSG00000005108.16 chr7 11370365 11832198 THSD7A -
ENSG00000006128.12 chr7 97732084 97740472 TAC1 +
ENSG00000006468.14 chr7 13891229 13991425 ETV1 -
ENSG00000007237.18 chr17 9910609 10198551 AC005747.1 -
... ... ... ... ... ...
ENSG00000286954.1 chr6 22663507 22675493 AL033539.2 +
ENSG00000287172.1 chr2 76185020 76399490 AC073091.3 +
ENSG00000287694.1 chr16 76277288 76819624 AC106741.2 +
ENSG00000287722.1 chr13 53207831 53801489 AL356295.1 +
ENSG00000287912.1 chr11 81175201 81431520 AP003464.1 +

1099 rows × 5 columns

binall = pd.DataFrame(index=bound_count_ct.columns)
binall['chrom'] = binall.index.str.split('_').str[0]
binall['start'] = binall.index.str.split('_').str[1].astype(int) * res
binall['end'] = binall['start'] + res
binall
chrom start end
chr1_0 chr1 0 25000
chr1_1 chr1 25000 50000
chr1_2 chr1 50000 75000
chr1_3 chr1 75000 100000
chr1_4 chr1 100000 125000
... ... ... ...
chr22_2028 chr22 50700000 50725000
chr22_2029 chr22 50725000 50750000
chr22_2030 chr22 50750000 50775000
chr22_2031 chr22 50775000 50800000
chr22_2032 chr22 50800000 50825000

115009 rows × 3 columns

import joblib
# from qnorm import quantile_normalize
from scipy.stats import norm
from statsmodels.sandbox.stats.multicomp import multipletests as FDR
from tqdm import tqdm
from ALLCools.mcds.correlation import corr_array
bkl = pd.read_csv(f'/data/hba/loop_majortype/M1C.rowsumpb1000.blf50.merged.bed', sep='\t', header=None, index_col=None)
binall['bklfilter'] = True
for c in chrom_sizes.index[:-1]:
    chrfilter = (binall['chrom']==c)
    tmp = binall.loc[chrfilter]
    tmp.iloc[:10, -1] = False
    tmp.iloc[-10:, -1] = False
    for xx,yy in bkl.loc[bkl[0]==c, [1,2]].values // res:
        tmp.iloc[max([0,xx-2]):(yy+2), -1] = False
    binall.loc[chrfilter] = tmp.copy()

print(binall['bklfilter'].sum())
100643
bound_prob_ct = bound_prob_ct.loc[:, binall['bklfilter']]
binall = binall.loc[binall['bklfilter']]
def shuffle_corr_norm(rna_data, dmr_data):
    
    shuffle_rna_data = rna_data.copy()
    for col, data in shuffle_rna_data.items():
        n_gene = shuffle_rna_data.shape[0]
        shuffle_rna_data[col] = shuffle_rna_data[col].sample(n_gene).values
    
    if dmr_data.shape[0] > 50000:
        shuffle_dmr_data = dmr_data.sample(50000).copy()
    else:
        shuffle_dmr_data = dmr_data.copy()
    for col, data in shuffle_dmr_data.items():
        n_dmr = shuffle_dmr_data.shape[0]
        shuffle_dmr_data[col] = shuffle_dmr_data[col].sample(n_dmr).values

    # shuffle corr
    shuffle_corr = corr_array(shuffle_rna_data, shuffle_dmr_data)
    mu, std = norm.fit(shuffle_corr.ravel())
    return mu, std, shuffle_corr.ravel()
null_mu, null_std, shuffle_corr = shuffle_corr_norm(expr.loc[:,  expr.columns.isin(gene_meta.index)].T, bound_prob_ct.T)
null_mu, null_std
(0.0019941576523659988, 0.2235632728434467)
shuffle_corr.shape
(54950000,)
gene_slop = 5000000
gene_records = []
for gene, row in tqdm(gene_meta.iterrows(), total=gene_meta.shape[0]):
    gene_rna = expr[[gene]].T
    
    dmr_chrom = row['chrom']
    dmr_start = row['start'] - gene_slop
    dmr_end = row['end'] + gene_slop
    sel_dmr = (binall['chrom']==dmr_chrom) & (binall['start'] > dmr_start) & (binall['start'] < dmr_end)
    gene_dmr = bound_prob_ct.T.loc[sel_dmr]

    gene_corr = corr_array(gene_rna, gene_dmr).ravel()
    gene_corr = pd.Series(gene_corr, index=gene_dmr.index)

    # pvalue = norm.sf(gene_corr.values, null_mu, null_std)
    # pvalue[pvalue > 0.5] = 1 - pvalue[pvalue > 0.5]
    # pvalue *= 2  # two tailed
    # perform multi-test correction and  q-value
    # _, q, *_ = fdrcorrection(pvalue)

    gene_corr.name = 'corr'
    gene_corr = gene_corr.reset_index()
    gene_corr['gene'] = gene
    # gene_corr["q"] = q
    
    # minimum filter
    # gene_corr = gene_corr[
    #     (gene_corr["q"] < min_q) & (gene_corr["corr"].abs() > min_corr)
    # ].set_index("dmr")

    # gene_records[gene] = gene_corr
    gene_records.append(gene_corr)
    
100%|██████████| 1099/1099 [00:21<00:00, 51.51it/s]
gene_records = pd.concat(gene_records, axis=0)
gene_records.index = np.arange(gene_records.shape[0])
gene_records
index corr gene
0 chr7_1525 0.044345 ENSG00000002746.15
1 chr7_1526 -0.298720 ENSG00000002746.15
2 chr7_1527 -0.408159 ENSG00000002746.15
3 chr7_1528 -0.556742 ENSG00000002746.15
4 chr7_1529 -0.246539 ENSG00000002746.15
... ... ... ...
424261 chr11_3453 -0.259785 ENSG00000287912.1
424262 chr11_3454 -0.141301 ENSG00000287912.1
424263 chr11_3455 0.278937 ENSG00000287912.1
424264 chr11_3456 0.347317 ENSG00000287912.1
424265 chr11_3457 0.187560 ENSG00000287912.1

424266 rows × 3 columns

fig, ax = plt.subplots()
sns.distplot(np.random.choice(gene_records['corr'], 50000), ax=ax)
sns.distplot(np.random.choice(shuffle_corr, 50000), ax=ax)
<AxesSubplot:ylabel='Density'>
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../../_images/831f2e50f735ff1fa052f26845672703d384e8e6d14b8d4c7448348b0d82d9df.png
t1 = rankdata(np.concatenate((gene_records['corr'].values, shuffle_corr)))[:gene_records.shape[0]]
t2 = rankdata(gene_records['corr'].values)
gene_records['FDRneg'] = (t1 - t2) / len(shuffle_corr) / t2 * gene_records.shape[0]
gene_records['FDRpos'] = (len(shuffle_corr) - t1 + t2) / len(shuffle_corr) / (gene_records.shape[0] - t2) * gene_records.shape[0]
threspos = gene_records.loc[gene_records['FDRpos']<0.1, 'corr'].min()
thresneg = gene_records.loc[gene_records['FDRneg']<0.1, 'corr'].max()
print(threspos, thresneg)
0.681722215979567 -0.514844193782862
gene_meta[['TSS', 'TES']] = gene_meta[['start', 'end']]
selg = (gene_meta['strand']=='-')
gene_meta.loc[selg, ['TSS', 'TES']] = gene_meta.loc[selg, ['TES', 'TSS']].values
gene_records['TSSdist'] = binall.loc[gene_records['index'], 'start'].values - gene_meta.loc[gene_records['gene'], 'TSS'].values
selg = (gene_meta.loc[gene_records['gene'], 'strand']=='-')
gene_records.loc[selg.values, 'TSSdist'] = -gene_records.loc[selg.values, 'TSSdist'].values

gene_records['TESdist'] = binall.loc[gene_records['index'], 'start'].values - gene_meta.loc[gene_records['gene'], 'TES'].values
selg = (gene_meta.loc[gene_records['gene'], 'strand']=='-')
gene_records.loc[selg.values, 'TESdist'] = -gene_records.loc[selg.values, 'TESdist'].values
gene_records['coord'] = 0
selp = (gene_records['TSSdist']<=0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TSSdist'] / res - 100

selp = (gene_records['TESdist']>=0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TESdist'] / res + 100

selp = (gene_records['TESdist']<0) & (gene_records['TSSdist']>0)
gene_records.loc[selp, 'coord'] = gene_records.loc[selp, 'TSSdist'] / (gene_records.loc[selp, 'TSSdist'] - gene_records.loc[selp, 'TESdist']) * 200 - 100
gene_records.to_hdf(f'{outdir}DEG_neu_bound_5M_corr.hdf', key='data')
gene_records = pd.read_hdf(f'{outdir}DEG_neu_bound_5M_corr.hdf', key='data')
thres = np.max(np.abs([thresneg, threspos]))
gene_records['group'] = gene_records['coord']//2
proppos = (gene_records.loc[gene_records['corr']>thres, 'group'].value_counts() / gene_records['group'].value_counts()).sort_index().fillna(0)
propneg = (gene_records.loc[gene_records['corr']<-thres, 'group'].value_counts() / gene_records['group'].value_counts()).sort_index().fillna(0)
fig, ax = plt.subplots(figsize=(3, 1.5), dpi=300)
ax.plot(np.arange(300), proppos, c='C0', label='Pos')
ax.plot(np.arange(300), propneg, c='C1', label='Neg')
ax.set_xticks([0, 100, 200, 300])
ax.set_xticklabels(['-5M', 'TSS', 'TES', '+5M'])
ax.legend()
plt.tight_layout()
# plt.savefig('proportion_corr_bound.pdf', transparent=True)
../../_images/22836423b44a6158f24f5292ea82e8622c1ebc6823069ccfccefd6fe11ce97d0.png
tmp = gene_records.loc[(gene_records['corr']>thres) & (gene_records['TSSdist']>-res) & (gene_records['TSSdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
np.savetxt(f'{outdir}gene_boundposcorr_tss.csv.gz', tmp, delimiter='\n', fmt='%s')

tmp = gene_records.loc[(gene_records['corr']>thres) & (gene_records['TESdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
np.savetxt(f'{outdir}gene_boundposcorr_tes.csv.gz', tmp, delimiter='\n', fmt='%s')

tmp = gene_records.loc[(gene_records['corr']>thres) & (((gene_records['TSSdist']>-res) & (gene_records['TSSdist']<res)) | ((gene_records['TESdist']>-res) & (gene_records['TESdist']<res))), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
np.savetxt(f'{outdir}gene_boundposcorr_tsstes.csv.gz', tmp, delimiter='\n', fmt='%s')
285 0.25932666060054593
271 0.24658780709736125
466 0.4240218380345769
tmp = gene_records.loc[(gene_records['corr']>thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
np.savetxt(f'{outdir}gene_boundposcorr_genebody.csv.gz', tmp, delimiter='\n', fmt='%s')

tmp = gene_records.loc[(gene_records['corr']<-thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])

tmp = gene_records.loc[(gene_records['corr'].abs()>thres) & (gene_records['TSSdist']>-res) & (gene_records['TESdist']<res), 'gene'].unique()
print(tmp.shape[0], tmp.shape[0]/gene_records['gene'].unique().shape[0])
591 0.5377616014558689
224 0.20382165605095542
674 0.6132848043676069
fig, axes = plt.subplots(2, 1, sharex='all', figsize=(3,3), dpi=300)
ax = axes[0]
tmp = gene_records.loc[gene_records['corr']>threspos]
sns.histplot(data=tmp, x='coord', y='corr', bins=300, ax=ax, cbar=True, cbar_kws=dict(ticks=[0,7], fraction=0.2, shrink=0.5))
ax.set_yticks([0.7, 0.8, 0.9, 1.0])

ax = axes[1]
# tmp = gene_records.loc[gene_records['corr']<thresneg]
tmp = gene_records.loc[gene_records['corr']<-threspos]
sns.histplot(data=tmp, x='coord', y='corr', bins=300, ax=ax, cbar=True, cbar_kws=dict(ticks=[0,3], fraction=0.2, shrink=0.5))
# ax.set_yticks([-0.6, -0.7, -0.8, -0.9])
ax.set_yticks([-0.7, -0.8, -0.9, -1.0])

ax.set_xticks([-300, -100, 100, 300])
ax.set_xticklabels(['-5M', 'TSS', 'TES', '+5M'])

# plt.savefig('DEG_domain_sigcorr.pdf', transparent=True)
[Text(-300, 0, '-5M'),
 Text(-100, 0, 'TSS'),
 Text(100, 0, 'TES'),
 Text(300, 0, '+5M')]
../../_images/2cd0a3566249de7f32bd2e97fa8a918880887e1a3b73810634949f0a7691d6e4.png