Correlation between compartment and other modalities

Contents

Correlation between compartment and other modalities#

import cooler
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.stats import zscore, pearsonr, norm

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'

leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
       'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
       'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}

leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']

group_name = 'neu'

leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 100000

indir = f'/home/jzhou_salk_edu/sky_workdir/hba/compartment_majortype/diff/{group_name}/'

comp = pd.read_csv(f'{indir}DifferentialResult/fdr_result/differential.intra_sample_combined.pcQnm.bedGraph', sep='\t', header=0, index_col=None)
comp.index = comp['chr'] + '_' + (comp['start'] // res).astype(str)
comp

	chr	start	end	L23_IT_100Kb	L4_IT_100Kb	L5_IT_100Kb	L6_IT_100Kb	L6_IT_Car3_100Kb	L56_NP_100Kb	L6_CT_100Kb	...	Sst	CHD7	MSN_D1	MSN_D2	Foxp2	SubCtx	sample_maha	pval	padj	dist_clust
chr10_2	chr10	200000	300000	1.20141	1.40148	1.12274	0.91856	0.71971	1.45364	1.35924	...	1.60003	1.79399	1.18581	1.14140	1.64998	1.77294	78.797991	6.274326e-09	1.654675e-08	1
chr10_3	chr10	300000	400000	1.67618	1.43001	1.33450	1.32691	0.51945	1.22911	1.53759	...	1.69831	1.69831	1.61143	1.15231	1.94190	2.06830	101.685104	6.282661e-13	1.940153e-12	1
chr10_4	chr10	400000	500000	1.27442	1.33094	1.21215	1.43974	0.96214	1.59233	1.40148	...	1.58100	1.61143	1.64171	1.62984	1.69065	1.68152	28.864611	9.045962e-02	1.563036e-01	1
chr10_5	chr10	500000	600000	1.43001	1.39806	1.69428	1.60003	0.57439	1.61143	1.37993	...	1.76052	1.79952	1.58100	1.83248	1.55484	2.17604	110.773702	1.418987e-14	4.640257e-14	1
chr10_6	chr10	600000	700000	1.43350	1.69065	1.44978	1.66232	1.23573	1.26750	1.73916	...	1.90078	1.82624	1.85399	1.42406	1.60286	1.35578	8.253318	9.900547e-01	1.000000e+00	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
chr9_1367	chr9	136700000	136800000	2.53781	2.49765	2.43707	2.53781	2.64364	2.50284	2.61332	...	2.49765	2.30178	2.55214	2.61332	2.42730	2.37386	0.208439	1.000000e+00	1.000000e+00	1
chr9_1368	chr9	136800000	136900000	2.25818	2.34290	2.30178	2.27476	2.20722	2.28698	2.49765	...	2.35196	2.29048	2.25515	2.25515	2.17299	2.22961	1.756256	1.000000e+00	1.000000e+00	1
chr9_1369	chr9	136900000	137000000	2.26494	2.13128	2.34837	2.37084	2.33576	2.35196	2.46547	...	2.39138	2.22605	2.35521	2.31751	2.40634	2.33914	0.318975	1.000000e+00	1.000000e+00	1
chr9_1370	chr9	137000000	137100000	2.35196	2.04449	2.36778	2.18888	2.30471	2.28174	2.42333	...	2.35761	2.25818	2.30178	2.18888	2.30759	2.36778	0.952000	1.000000e+00	1.000000e+00	1
chr9_1372	chr9	137200000	137300000	2.33276	2.13811	2.29494	2.32970	2.46547	2.41199	2.52050	...	2.46547	2.46252	2.38687	2.35196	2.32339	2.21984	0.170553	1.000000e+00	1.000000e+00	1

24745 rows × 49 columns

binall = comp[['chr', 'start', 'end', 'sample_maha', 'pval', 'padj']]
comp = comp[leg]

from ALLCools.mcds import MCDS
from ALLCools.mcds.utilities import calculate_posterior_mc_frac

mcds = MCDS.open('/data/hba/mc_majortype/MajorType.mcds', var_dim='chrom5k')
mcds['chrom100k'] = mcds['chrom5k_chrom'].to_pandas().astype(str) + '_' + (mcds['chrom5k_start'] // res).to_pandas().astype(str)
mcds

<xarray.MCDS>
Dimensions:        (cell: 40, chrom5k: 617669, count_type: 2, mc_type: 2)
Coordinates:
  * cell           (cell) <U10 'Amy' 'ASC' 'CA1' 'CA2' ... 'THM_MB' 'Vip' 'VLMC'
  * chrom5k        (chrom5k) object 'chr1_0' 'chr1_1' ... 'chrY_11445'
    chrom5k_chrom  (chrom5k) <U5 'chr1' 'chr1' 'chr1' ... 'chrY' 'chrY' 'chrY'
    chrom5k_end    (chrom5k) int64 5000 10000 15000 ... 57225000 57227415
    chrom5k_start  (chrom5k) int64 0 5000 10000 ... 57215000 57220000 57225000
  * count_type     (count_type) <U3 'mc' 'cov'
  * mc_type        (mc_type) <U3 'CGN' 'CHN'
Data variables:
    chrom5k_da     (cell, chrom5k, mc_type, count_type) uint32 dask.array<chunksize=(5, 77209, 1, 1), meta=np.ndarray>
    chrom100k      (chrom5k) object 'chr1_0' 'chr1_0' ... 'chrY_572' 'chrY_572'
Attributes:
    obs_dim:  cell
    var_dim:  chrom5k

mCG#

mc = mcds['chrom5k_da'].sel(count_type='mc', mc_type='CGN').to_pandas().T
mc['chrom100k'] = mcds['chrom100k'].to_pandas()
mc = mc.groupby('chrom100k').sum().T
cov = mcds['chrom5k_da'].sel(count_type='cov', mc_type='CGN').to_pandas().T
cov['chrom100k'] = mcds['chrom100k'].to_pandas()
cov = cov.groupby('chrom100k').sum().T

binfilter = ['_'.join(xx.split('_')[:-1]) for xx in mc.columns]
binfilter = [(len(xx)<6) and (xx not in ['chrM','chrX','chrY']) for xx in binfilter]
print(np.sum(binfilter))
mc = mc.loc[leg, binfilter]
cov = cov.loc[leg, binfilter]
print(mc.shape, cov.shape)

28760
(21, 28760) (21, 28760)

mcg = calculate_posterior_mc_frac(mc.values, cov.values)
mcg = pd.DataFrame(mcg, index=leg, columns=mc.columns)
mcg = mcg[binall.index].T

mCH#

mc = mcds['chrom5k_da'].sel(count_type='mc', mc_type='CHN').to_pandas().T
mc['chrom100k'] = mcds['chrom100k'].to_pandas()
mc = mc.groupby('chrom100k').sum().T
cov = mcds['chrom5k_da'].sel(count_type='cov', mc_type='CHN').to_pandas().T
cov['chrom100k'] = mcds['chrom100k'].to_pandas()
cov = cov.groupby('chrom100k').sum().T

binfilter = ['_'.join(xx.split('_')[:-1]) for xx in mc.columns]
binfilter = [(len(xx)<6) and (xx not in ['chrM','chrX','chrY']) for xx in binfilter]
print(np.sum(binfilter))
mc = mc.loc[leg, binfilter]
cov = cov.loc[leg, binfilter]
print(mc.shape, cov.shape)

28760
(21, 28760) (21, 28760)

mch = calculate_posterior_mc_frac(mc.values, cov.values)
mch = pd.DataFrame(mch, index=leg, columns=mc.columns)
mch = mch[binall.index].T

ATAC#

sig = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/atac_majortype/cluster_atac_signal.hdf')
cov = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/atac_majortype/cluster_atac_cov.hdf')

bins = pd.DataFrame(index=sig.columns)
bins['chrom'] = bins.index.str.split('_').str[0]
bins['start'] = (bins.index.str.split('_').str[1].astype(int) - 1) * 5000
bins['chrom100k'] = bins['chrom'] + '_' + (bins['start'] // res).astype(str)

sig = sig.groupby(by=bins['chrom100k'], axis=1).sum()
cov = cov.groupby(by=bins['chrom100k']).sum()
atac = (sig/cov).fillna(0)

legatac = leg[leg.isin(atac.index)]
atac = atac.loc[legatac, binall.index].T
atac = atac / atac.sum(axis=0)

outdir = './'

mcg.to_hdf(f'{outdir}comp_mCG.hdf', key='data')
mch.to_hdf(f'{outdir}comp_mCH.hdf', key='data')
atac.to_hdf(f'{outdir}comp_ATAC.hdf', key='data')

binall['mCG_corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(comp.values, mcg.values)]
binall['mCH_corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(comp.values, mch.values)]
if atac.shape[1]==comp.shape[1]:
    binall['ATAC_corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(comp.values, atac.values)]

binall['logPadj'] = -np.log10(binall['padj'])
binall.loc[binall['logPadj']>300, 'logPadj'] = 300

binall['mCG_std'] = np.std(mcg, axis=1)
binall['mCH_std'] = np.std(mch, axis=1)
binall['ATAC_std'] = np.std(atac, axis=1)
binall['comp_std'] = np.std(comp, axis=1)
binall

	chr	start	end	sample_maha	pval	padj	mCG_corr	mCH_corr	ATAC_corr	logPadj	mCG_std	mCH_std	ATAC_std	comp_std
chr10_2	chr10	200000	300000	78.797991	6.274326e-09	1.654675e-08	0.189020	-0.307897	0.083061	7.781287	0.020524	0.069741	0.000007	0.372115
chr10_3	chr10	300000	400000	101.685104	6.282661e-13	1.940153e-12	0.287263	-0.255900	-0.157650	11.712164	0.032419	0.084277	0.000013	0.345404
chr10_4	chr10	400000	500000	28.864611	9.045962e-02	1.563036e-01	-0.492743	-0.395472	0.361907	0.806031	0.028772	0.080842	0.000012	0.285201
chr10_5	chr10	500000	600000	110.773702	1.418987e-14	4.640257e-14	-0.021354	-0.314666	0.287532	13.333458	0.034190	0.098258	0.000012	0.322221
chr10_6	chr10	600000	700000	8.253318	9.900547e-01	1.000000e+00	-0.100094	-0.142201	0.179416	-0.000000	0.040324	0.077744	0.000017	0.236699
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
chr9_1367	chr9	136700000	136800000	0.208439	1.000000e+00	1.000000e+00	-0.064657	0.237518	0.096892	-0.000000	0.023485	0.057986	0.000010	0.094490
chr9_1368	chr9	136800000	136900000	1.756256	1.000000e+00	1.000000e+00	0.344439	0.447092	-0.465422	-0.000000	0.041249	0.046669	0.000012	0.119900
chr9_1369	chr9	136900000	137000000	0.318975	1.000000e+00	1.000000e+00	0.525077	0.260201	-0.377476	-0.000000	0.028921	0.059762	0.000011	0.095886
chr9_1370	chr9	137000000	137100000	0.952000	1.000000e+00	1.000000e+00	0.361288	0.461071	-0.361203	-0.000000	0.042212	0.047043	0.000013	0.119908
chr9_1372	chr9	137200000	137300000	0.170553	1.000000e+00	1.000000e+00	0.490722	0.239300	-0.064113	-0.000000	0.014158	0.042930	0.000012	0.098449

24745 rows × 14 columns

binall.to_hdf(f'{outdir}bin_stats.hdf', key='data')

fig, axes = plt.subplots(1, 2, figsize=(4, 2), dpi=300)
ax = axes[0]
sns.histplot(zscore(np.log10(binall['sample_maha'])), bins=100, ax=ax)
ax = axes[1]
sns.histplot(zscore(binall['sample_maha']), bins=100, binrange=(-1,3), ax=ax)
plt.tight_layout()

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../../_images/ff685045357d2c201a81a9aca3f62e83b4961cdc15357850f6f9941f5d493c43.png

print(np.sum(zscore(binall['sample_maha'])>norm.isf(0.025)))

fig, axes = plt.subplots(1, 2, figsize=(4,2), sharey='all', dpi=300)
ax = axes[0]
sns.histplot(binall, x='comp_std', y='sample_maha', bins=100, ax=ax, log_scale=(False, 10))
ax = axes[1]
sns.histplot(binall, x='logPadj', y='sample_maha', bins=100, ax=ax, log_scale=(False, 10))

<AxesSubplot:xlabel='logPadj', ylabel='sample_maha'>

../../_images/8790d579ad88ba0bbe82ec95bcc39fbda735c4b81d9d1af60da6a47990b7412d.png

fig, axes = plt.subplots(1, 2, figsize=(4,2), sharex='all', sharey='all', dpi=300)
ax = axes[0]
sns.histplot(binall, x='mCG_corr', y='comp_std', bins=100, ax=ax)
ax = axes[1]
sns.histplot(binall, x='mCH_corr', y='comp_std', bins=100, ax=ax)
ax.set_xticks([-1, 0, 1])
# plt.savefig(f'majortype_{group_name}_diffcomp_stdcorr.pdf', transparent=True)

[<matplotlib.axis.XTick at 0x7f509c6ad690>,
 <matplotlib.axis.XTick at 0x7f509c6a8390>,
 <matplotlib.axis.XTick at 0x7f509c64e190>]

../../_images/9456f96c3af0b221788cbabd287c71dc9bebc7219263c6b7bf44cd60aa15110b.png

fig, axes = plt.subplots(1, 2, figsize=(4,2), sharex='all', sharey='all', dpi=300)
ax = axes[0]
sns.histplot(binall, x='mCG_corr', y='sample_maha', bins=100, log_scale=(False, 10), ax=ax)
ax = axes[1]
sns.histplot(binall, x='mCH_corr', y='sample_maha', bins=100, log_scale=(False, 10), ax=ax)
ax.set_xticks([-1, 0, 1])
# plt.savefig(f'majortype_{group_name}_diffcomp_statscorr.pdf', transparent=True)

[<matplotlib.axis.XTick at 0x7f509c603210>,
 <matplotlib.axis.XTick at 0x7f509c600250>,
 <matplotlib.axis.XTick at 0x7f509c626350>]

../../_images/e1930836b74ca65848eae2611ca0faa384c1d74034c890cc76a650578f13dbee.png