Correlation between boundary and other modalities

Contents

Correlation between boundary and other modalities#

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib import cm as cm
import seaborn as sns
from matplotlib.colors import LogNorm

from ALLCools.mcds import MCDS
from ALLCools.mcds.utilities import calculate_posterior_mc_frac

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'

leg = ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
       'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
       'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
legname = ['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
       'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
       'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
       'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
      ]
leg2name = {xx:yy for xx,yy in zip(leg, legname)}

leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']

group_name = 'neu'

leg = pd.Index(leg[group_name])
legname = leg.map(leg2name)
res = 25000

indir = '/data/hba/domain_majortype/'
outdir = f'/home/jzhou_salk_edu/sky_workdir/hba/domain_majortype/diff/{group_name}/'

binall = pd.read_hdf(f'{outdir}bin_stats.hdf', key='data')

mcds = MCDS.open('/data/hba/mc_majortype/MajorType.mcds', var_dim='chrom5k')
mcds['chrom10k_even'] = mcds['chrom5k_chrom'].to_pandas().astype(str) + '_' + (mcds['chrom5k_start'] // 10000 * 2).to_pandas().astype(str)
mcds['chrom10k_odd'] = mcds['chrom5k_chrom'].to_pandas().astype(str) + '_' + ((mcds['chrom5k_start'] + 5000) // 10000 * 2 - 1).to_pandas().astype(str)
mcds

<xarray.MCDS>
Dimensions:        (cell: 40, chrom5k: 617669, count_type: 2, mc_type: 2)
Coordinates:
  * cell           (cell) <U10 'Amy' 'ASC' 'CA1' 'CA2' ... 'THM_MB' 'Vip' 'VLMC'
  * chrom5k        (chrom5k) object 'chr1_0' 'chr1_1' ... 'chrY_11445'
    chrom5k_chrom  (chrom5k) <U5 'chr1' 'chr1' 'chr1' ... 'chrY' 'chrY' 'chrY'
    chrom5k_end    (chrom5k) int64 5000 10000 15000 ... 57225000 57227415
    chrom5k_start  (chrom5k) int64 0 5000 10000 ... 57215000 57220000 57225000
  * count_type     (count_type) <U3 'mc' 'cov'
  * mc_type        (mc_type) <U3 'CGN' 'CHN'
Data variables:
    chrom5k_da     (cell, chrom5k, mc_type, count_type) uint32 dask.array<chunksize=(5, 77209, 1, 1), meta=np.ndarray>
    chrom10k_even  (chrom5k) object 'chr1_0' 'chr1_0' ... 'chrY_11444'
    chrom10k_odd   (chrom5k) object 'chr1_-1' 'chr1_1' ... 'chrY_11445'
Attributes:
    obs_dim:  cell
    var_dim:  chrom5k

mCG#

boundcg = pd.DataFrame(index=binall.index, columns=leg)

mc = mcds['chrom5k_da'].sel(count_type='mc', mc_type='CGN').to_pandas().T
mc['chrom10k'] = mcds['chrom10k_odd'].to_pandas()
mc = mc.groupby('chrom10k').sum().T
cov = mcds['chrom5k_da'].sel(count_type='cov', mc_type='CGN').to_pandas().T
cov['chrom10k'] = mcds['chrom10k_odd'].to_pandas()
cov = cov.groupby('chrom10k').sum().T

binfilter = ['_'.join(xx.split('_')[:-1]) for xx in mc.columns]
binfilter = [(len(xx)<6) and (xx not in ['chrM','chrX','chrY']) for xx in binfilter]
print(np.sum(binfilter))
mc = mc.loc[leg, binfilter]
cov = cov.loc[leg, binfilter]
print(mc.shape, cov.shape)

287523
(21, 287523) (21, 287523)

ratio = calculate_posterior_mc_frac(mc.values, cov.values)
ratio = pd.DataFrame(ratio, index=leg, columns=mc.columns)
ratio

chrom10k	chr10_-1	chr10_1	chr10_10001	chr10_10003	chr10_10005	chr10_10007	chr10_10009	chr10_1001	chr10_10011	chr10_10013	...	chr9_9983	chr9_9985	chr9_9987	chr9_9989	chr9_999	chr9_9991	chr9_9993	chr9_9995	chr9_9997	chr9_9999
L23_IT	1.0	0.860457	1.081336	0.925333	1.009707	1.112735	1.108767	1.073129	0.951386	0.582522	...	1.0	1.0	1.0	1.0	1.103275	1.0	1.0	1.0	1.0	1.0
L4_IT	1.0	0.823696	1.075727	0.914209	0.981365	1.105991	1.093843	1.056361	0.940196	0.554439	...	1.0	1.0	1.0	1.0	1.074466	1.0	1.0	1.0	1.0	1.0
L5_IT	1.0	0.866910	1.095000	0.987629	1.034343	1.093693	1.091953	1.070210	1.022007	0.581350	...	1.0	1.0	1.0	1.0	1.068948	1.0	1.0	1.0	1.0	1.0
L6_IT	1.0	0.832396	1.101325	0.912712	1.015612	1.102152	1.099118	1.065947	0.907962	0.593821	...	1.0	1.0	1.0	1.0	1.079558	1.0	1.0	1.0	1.0	1.0
L6_IT_Car3	1.0	0.870886	1.103143	0.981731	1.015656	1.099449	1.027843	1.066831	0.961772	0.537093	...	1.0	1.0	1.0	1.0	1.054659	1.0	1.0	1.0	1.0	1.0
L56_NP	1.0	0.789294	1.065180	1.019229	1.034690	1.069006	1.045871	1.053416	1.031039	0.558070	...	1.0	1.0	1.0	1.0	1.073882	1.0	1.0	1.0	1.0	1.0
L6_CT	1.0	0.850089	1.095890	0.944332	1.006207	1.098835	1.085214	1.042651	1.021171	0.540605	...	1.0	1.0	1.0	1.0	1.096978	1.0	1.0	1.0	1.0	1.0
L6b	1.0	0.874737	1.082416	0.972364	1.032669	1.104728	1.072531	1.067300	1.029268	0.539889	...	1.0	1.0	1.0	1.0	1.102754	1.0	1.0	1.0	1.0	1.0
Amy	1.0	0.849339	1.121909	0.955589	1.021628	1.111045	1.108338	1.034604	0.975217	0.498814	...	1.0	1.0	1.0	1.0	1.098151	1.0	1.0	1.0	1.0	1.0
Lamp5	1.0	0.803441	1.099491	0.901131	0.996747	1.084012	1.084341	1.061091	1.065331	0.573551	...	1.0	1.0	1.0	1.0	1.029678	1.0	1.0	1.0	1.0	1.0
Lamp5_LHX6	1.0	0.788889	1.095696	0.948651	1.009161	1.064117	1.076356	1.044956	1.053349	0.559807	...	1.0	1.0	1.0	1.0	1.043798	1.0	1.0	1.0	1.0	1.0
Sncg	1.0	0.790187	1.092085	0.943028	1.013903	1.075181	1.074456	1.025663	1.041255	0.578329	...	1.0	1.0	1.0	1.0	1.041308	1.0	1.0	1.0	1.0	1.0
Vip	1.0	0.764841	1.079621	0.954340	0.988894	1.074535	1.076771	1.030072	1.030965	0.523459	...	1.0	1.0	1.0	1.0	1.077833	1.0	1.0	1.0	1.0	1.0
Pvalb	1.0	0.791781	1.078261	0.957770	0.996808	1.065400	1.062728	1.062498	1.035391	0.573805	...	1.0	1.0	1.0	1.0	1.066142	1.0	1.0	1.0	1.0	1.0
Pvalb_ChC	1.0	0.799670	1.079489	1.042405	1.039103	0.965079	1.065501	1.047256	0.947254	0.535809	...	1.0	1.0	1.0	1.0	1.072020	1.0	1.0	1.0	1.0	1.0
Sst	1.0	0.790550	1.009991	0.967094	1.003426	1.075078	1.063624	1.052106	1.037828	0.567196	...	1.0	1.0	1.0	1.0	1.068358	1.0	1.0	1.0	1.0	1.0
CHD7	1.0	0.783001	1.091096	0.973865	0.987348	1.059799	1.079257	1.049891	1.006733	0.535868	...	1.0	1.0	1.0	1.0	1.069605	1.0	1.0	1.0	1.0	1.0
MSN_D1	1.0	0.805578	1.081088	0.949242	0.987770	1.095029	1.077659	1.031337	1.028073	0.541634	...	1.0	1.0	1.0	1.0	1.070032	1.0	1.0	1.0	1.0	1.0
MSN_D2	1.0	0.807129	1.078072	0.925217	0.991441	1.091273	1.082277	1.035319	1.031940	0.531349	...	1.0	1.0	1.0	1.0	1.079601	1.0	1.0	1.0	1.0	1.0
Foxp2	1.0	0.779720	1.100584	0.911965	0.957330	1.087619	1.084313	1.057260	1.002953	0.547599	...	1.0	1.0	1.0	1.0	1.082638	1.0	1.0	1.0	1.0	1.0
SubCtx	1.0	0.778561	1.064634	0.979620	0.998362	1.063654	1.059708	1.040602	0.996573	0.525402	...	1.0	1.0	1.0	1.0	1.063075	1.0	1.0	1.0	1.0	1.0

21 rows × 287523 columns

idx1 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000 - 2).astype(str)
idx2 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000).astype(str)
selb = (binall['start']%10000!=0)#  & idx1.isin(ratio.columns)
boundcg.loc[selb] = (ratio.loc[:, idx1[selb]].values + ratio.loc[:, idx2[selb]].values).T / 2

mc = mcds['chrom5k_da'].sel(count_type='mc', mc_type='CGN').to_pandas().T
mc['chrom10k'] = mcds['chrom10k_even'].to_pandas()
mc = mc.groupby('chrom10k').sum().T
cov = mcds['chrom5k_da'].sel(count_type='cov', mc_type='CGN').to_pandas().T
cov['chrom10k'] = mcds['chrom10k_even'].to_pandas()
cov = cov.groupby('chrom10k').sum().T

binfilter = ['_'.join(xx.split('_')[:-1]) for xx in mc.columns]
binfilter = [(len(xx)<6) and (xx not in ['chrM','chrX','chrY']) for xx in binfilter]
print(np.sum(binfilter))
mc = mc.loc[leg, binfilter]
cov = cov.loc[leg, binfilter]
print(mc.shape, cov.shape)

287509
(21, 287509) (21, 287509)

ratio = calculate_posterior_mc_frac(mc.values, cov.values)
ratio = pd.DataFrame(ratio, index=leg, columns=mc.columns)
ratio

chrom10k	chr10_0	chr10_10	chr10_100	chr10_1000	chr10_10000	chr10_10002	chr10_10004	chr10_10006	chr10_10008	chr10_10010	...	chr9_9980	chr9_9982	chr9_9984	chr9_9986	chr9_9988	chr9_9990	chr9_9992	chr9_9994	chr9_9996	chr9_9998
L23_IT	1.0	1.030605	0.938468	1.079237	1.087932	1.036463	0.933259	1.085677	1.113516	1.092081	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L4_IT	1.0	1.008030	0.893732	1.061923	1.084499	1.066191	0.883820	1.083390	1.100948	1.046365	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L5_IT	1.0	1.035229	0.968205	1.064959	1.097828	1.078424	0.974695	1.077973	1.091472	1.080833	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6_IT	1.0	1.043728	1.006064	1.036545	1.108225	1.035341	0.927944	1.081138	1.100847	1.058464	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6_IT_Car3	1.0	1.031033	1.024065	0.972414	1.106033	1.069932	0.960716	1.080387	1.014772	1.059353	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L56_NP	1.0	1.024043	1.067959	1.031688	1.077790	1.064895	1.004588	1.056444	1.054735	1.060728	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6_CT	1.0	1.025605	1.000952	1.033709	1.105316	1.079336	0.916573	1.080173	1.081485	1.098547	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6b	1.0	1.051203	1.039012	1.055885	1.094319	1.048304	0.974152	1.092275	1.058137	1.089253	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Amy	1.0	1.037157	0.883397	0.945863	1.126014	1.067755	0.954871	1.078414	1.115085	1.086908	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Lamp5	1.0	0.945802	1.048737	1.020667	1.102167	0.995533	0.926336	1.069959	1.081162	1.089796	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Lamp5_LHX6	1.0	0.980062	1.057993	0.982959	1.096893	1.008295	0.969985	1.048721	1.079863	1.075894	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Sncg	1.0	0.925705	1.048671	0.978772	1.098617	1.045065	0.951006	1.055583	1.074915	1.075155	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Vip	1.0	0.962763	0.984013	0.947534	1.089664	1.054085	0.932628	1.041100	1.083444	1.070266	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Pvalb	1.0	0.998853	1.010985	1.007189	1.083609	1.071767	0.926349	1.047639	1.067686	1.051676	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Pvalb_ChC	1.0	1.014795	0.930437	1.039565	1.079252	1.072438	1.024667	0.993707	1.042559	0.973670	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Sst	1.0	0.975179	1.061164	0.932726	1.079956	1.027505	0.942059	1.056591	1.069491	1.061776	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
CHD7	1.0	0.970671	0.875364	0.964086	1.093977	1.072723	0.940073	1.037374	1.080247	1.056002	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
MSN_D1	1.0	1.039511	1.045804	1.058324	1.081883	1.071348	0.917046	1.072203	1.087172	1.079323	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
MSN_D2	1.0	1.044885	1.037520	1.043545	1.076334	1.076735	0.896778	1.073562	1.088832	1.081124	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Foxp2	1.0	0.991555	0.970998	1.050386	1.104645	1.080997	0.862161	1.056843	1.089961	1.080012	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
SubCtx	1.0	1.004029	1.038828	0.960336	1.073671	1.061120	0.947283	1.052452	1.068439	1.040170	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0

21 rows × 287509 columns

idx1 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000 - 2).astype(str)
idx2 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000).astype(str)
selb = (binall['start']%10000==0) & idx1.isin(ratio.columns)
boundcg.loc[selb] = (ratio.loc[:, idx1[selb]].values + ratio.loc[:, idx2[selb]].values).T / 2

mCH#

boundch = pd.DataFrame(index=binall.index, columns=leg)

mc = mcds['chrom5k_da'].sel(count_type='mc', mc_type='CHN').to_pandas().T
mc['chrom10k'] = mcds['chrom10k_odd'].to_pandas()
mc = mc.groupby('chrom10k').sum().T
cov = mcds['chrom5k_da'].sel(count_type='cov', mc_type='CHN').to_pandas().T
cov['chrom10k'] = mcds['chrom10k_odd'].to_pandas()
cov = cov.groupby('chrom10k').sum().T

binfilter = ['_'.join(xx.split('_')[:-1]) for xx in mc.columns]
binfilter = [(len(xx)<6) and (xx not in ['chrM','chrX','chrY']) for xx in binfilter]
print(np.sum(binfilter))
mc = mc.loc[leg, binfilter]
cov = cov.loc[leg, binfilter]
print(mc.shape, cov.shape)

287523
(21, 287523) (21, 287523)

ratio = calculate_posterior_mc_frac(mc.values, cov.values)
ratio = pd.DataFrame(ratio, index=leg, columns=mc.columns)
ratio

chrom10k	chr10_-1	chr10_1	chr10_10001	chr10_10003	chr10_10005	chr10_10007	chr10_10009	chr10_1001	chr10_10011	chr10_10013	...	chr9_9983	chr9_9985	chr9_9987	chr9_9989	chr9_999	chr9_9991	chr9_9993	chr9_9995	chr9_9997	chr9_9999
L23_IT	1.0	0.510464	0.441238	0.856999	1.229206	1.541424	1.589255	1.146245	1.419087	0.385546	...	1.0	1.0	1.0	1.0	0.911612	1.0	1.0	1.0	1.0	1.0
L4_IT	1.0	0.424107	0.397881	0.681891	0.994451	1.350999	1.335008	1.133499	1.108866	0.333833	...	1.0	1.0	1.0	1.0	0.764278	1.0	1.0	1.0	1.0	1.0
L5_IT	1.0	0.382084	0.450242	0.937998	1.271952	1.531300	1.513247	1.199643	1.342807	0.344674	...	1.0	1.0	1.0	1.0	0.742818	1.0	1.0	1.0	1.0	1.0
L6_IT	1.0	0.516017	0.407969	0.842039	1.121085	1.496506	1.548475	1.164626	1.119029	0.315417	...	1.0	1.0	1.0	1.0	0.696133	1.0	1.0	1.0	1.0	1.0
L6_IT_Car3	1.0	0.501950	0.430480	0.856359	1.082338	1.393629	1.143643	1.204057	1.161465	0.339972	...	1.0	1.0	1.0	1.0	0.426450	1.0	1.0	1.0	1.0	1.0
L56_NP	1.0	0.320368	0.407804	0.852860	1.271590	1.428042	1.314577	1.145670	1.175423	0.216557	...	1.0	1.0	1.0	1.0	0.719350	1.0	1.0	1.0	1.0	1.0
L6_CT	1.0	0.377183	0.380987	0.793518	1.104818	1.291011	1.302317	1.083256	1.146751	0.319666	...	1.0	1.0	1.0	1.0	0.708590	1.0	1.0	1.0	1.0	1.0
L6b	1.0	0.395342	0.428573	0.888223	1.155397	1.554271	1.391349	1.151900	1.298691	0.288390	...	1.0	1.0	1.0	1.0	0.760155	1.0	1.0	1.0	1.0	1.0
Amy	1.0	0.395040	0.504294	0.836117	1.225487	1.352221	1.581872	0.951827	1.300263	0.360894	...	1.0	1.0	1.0	1.0	0.557824	1.0	1.0	1.0	1.0	1.0
Lamp5	1.0	0.344861	1.379219	1.571598	1.727956	1.958670	2.215138	1.033501	1.897049	0.497977	...	1.0	1.0	1.0	1.0	0.426988	1.0	1.0	1.0	1.0	1.0
Lamp5_LHX6	1.0	0.359777	0.825219	1.361178	1.353167	1.376417	1.741107	1.057578	1.594113	0.422394	...	1.0	1.0	1.0	1.0	0.363146	1.0	1.0	1.0	1.0	1.0
Sncg	1.0	0.366004	0.774885	1.467858	1.327428	1.357707	1.649965	0.977765	1.407841	0.460822	...	1.0	1.0	1.0	1.0	0.447523	1.0	1.0	1.0	1.0	1.0
Vip	1.0	0.333840	0.717739	1.288908	1.187511	1.293638	1.458470	1.100303	1.258915	0.321260	...	1.0	1.0	1.0	1.0	0.472692	1.0	1.0	1.0	1.0	1.0
Pvalb	1.0	0.390031	0.494385	1.010168	1.122271	1.336544	1.427473	1.252491	1.124361	0.274971	...	1.0	1.0	1.0	1.0	0.431932	1.0	1.0	1.0	1.0	1.0
Pvalb_ChC	1.0	0.443974	0.441581	0.945737	1.075615	0.896663	1.206834	1.309925	1.030457	0.238352	...	1.0	1.0	1.0	1.0	0.433659	1.0	1.0	1.0	1.0	1.0
Sst	1.0	0.397211	0.458702	0.957316	1.135977	1.442911	1.558147	1.219512	1.375276	0.348379	...	1.0	1.0	1.0	1.0	0.430488	1.0	1.0	1.0	1.0	1.0
CHD7	1.0	0.523041	0.463368	1.046665	1.033602	1.223731	1.368707	1.172668	1.056110	0.315515	...	1.0	1.0	1.0	1.0	0.350804	1.0	1.0	1.0	1.0	1.0
MSN_D1	1.0	0.402915	0.426668	0.792652	1.103806	1.378452	1.359629	1.314098	1.248070	0.301343	...	1.0	1.0	1.0	1.0	0.638274	1.0	1.0	1.0	1.0	1.0
MSN_D2	1.0	0.366594	0.363376	0.736952	1.094653	1.371585	1.365391	1.173767	1.160560	0.292648	...	1.0	1.0	1.0	1.0	0.597558	1.0	1.0	1.0	1.0	1.0
Foxp2	1.0	0.416216	0.392466	0.756189	0.891100	1.083624	1.261196	1.087753	1.003878	0.346461	...	1.0	1.0	1.0	1.0	0.527072	1.0	1.0	1.0	1.0	1.0
SubCtx	1.0	0.418725	0.433070	0.996978	1.124348	1.283384	1.400723	1.181628	1.103102	0.315080	...	1.0	1.0	1.0	1.0	0.412262	1.0	1.0	1.0	1.0	1.0

21 rows × 287523 columns

idx1 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000 - 2).astype(str)
idx2 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000).astype(str)
selb = (binall['start']%10000!=0)# & idx1.isin(ratio.columns)
boundch.loc[selb] = (ratio.loc[:, idx1[selb]].values + ratio.loc[:, idx2[selb]].values).T / 2

mc = mcds['chrom5k_da'].sel(count_type='mc', mc_type='CHN').to_pandas().T
mc['chrom10k'] = mcds['chrom10k_even'].to_pandas()
mc = mc.groupby('chrom10k').sum().T
cov = mcds['chrom5k_da'].sel(count_type='cov', mc_type='CHN').to_pandas().T
cov['chrom10k'] = mcds['chrom10k_even'].to_pandas()
cov = cov.groupby('chrom10k').sum().T

binfilter = ['_'.join(xx.split('_')[:-1]) for xx in mc.columns]
binfilter = [(len(xx)<6) and (xx not in ['chrM','chrX','chrY']) for xx in binfilter]
print(np.sum(binfilter))
mc = mc.loc[leg, binfilter]
cov = cov.loc[leg, binfilter]
print(mc.shape, cov.shape)

287509
(21, 287509) (21, 287509)

ratio = calculate_posterior_mc_frac(mc.values, cov.values)
ratio = pd.DataFrame(ratio, index=leg, columns=mc.columns)
ratio

chrom10k	chr10_0	chr10_10	chr10_100	chr10_1000	chr10_10000	chr10_10002	chr10_10004	chr10_10006	chr10_10008	chr10_10010	...	chr9_9980	chr9_9982	chr9_9984	chr9_9986	chr9_9988	chr9_9990	chr9_9992	chr9_9994	chr9_9996	chr9_9998
L23_IT	1.0	1.004191	0.493460	1.104673	0.378159	0.577510	1.104026	1.432413	1.759355	1.492050	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L4_IT	1.0	0.869427	0.593259	1.191464	0.377823	0.468765	0.875656	1.254744	1.432864	1.190639	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L5_IT	1.0	0.901131	0.664278	1.144501	0.397166	0.651486	1.175162	1.473311	1.663474	1.392828	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6_IT	1.0	1.011073	0.535467	1.080717	0.416892	0.529509	1.052912	1.361006	1.674774	1.247187	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6_IT_Car3	1.0	0.990890	0.659824	0.899510	0.376149	0.579968	0.991147	1.359062	1.279046	1.186164	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L56_NP	1.0	0.890467	1.008560	1.115885	0.375578	0.576387	1.110340	1.440122	1.478495	1.246266	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6_CT	1.0	0.909796	0.825940	1.015736	0.392367	0.510480	1.034966	1.274706	1.381530	1.202989	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
L6b	1.0	0.965625	0.488996	1.060917	0.460871	0.616292	1.076651	1.473702	1.472755	1.357385	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Amy	1.0	1.032535	0.450361	0.911437	0.443585	0.582688	1.088852	1.387712	1.625620	1.352506	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Lamp5	1.0	0.762560	0.543802	0.997103	1.148783	1.356227	1.704878	2.037967	2.222906	1.870465	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Lamp5_LHX6	1.0	0.785061	0.497714	0.911998	0.693598	1.087251	1.364749	1.501270	1.703855	1.493310	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Sncg	1.0	0.658161	0.646019	0.839545	0.711791	1.164374	1.429084	1.381591	1.653368	1.397234	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Vip	1.0	0.719938	0.567544	0.780902	0.622367	1.032879	1.248919	1.291014	1.519195	1.297617	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Pvalb	1.0	0.910356	0.440534	1.016046	0.521019	0.761039	1.071845	1.290787	1.508756	1.246113	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Pvalb_ChC	1.0	0.952738	0.426413	1.269375	0.409346	0.731215	0.976371	1.081925	1.207088	0.965896	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Sst	1.0	0.778337	0.570330	0.841826	0.462577	0.692936	1.060044	1.373523	1.617611	1.445655	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
CHD7	1.0	0.796231	0.386411	0.874622	0.459980	0.814569	1.034546	1.179762	1.449169	1.097513	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
MSN_D1	1.0	1.071710	0.493949	1.378706	0.434703	0.542109	0.974037	1.322889	1.533243	1.281728	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
MSN_D2	1.0	1.101675	0.485418	1.271987	0.351280	0.503255	0.967607	1.286954	1.513391	1.265431	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
Foxp2	1.0	0.851022	0.559014	1.094504	0.414223	0.556357	0.857242	1.030724	1.313589	1.098945	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0
SubCtx	1.0	0.984603	0.638502	0.929416	0.436759	0.740556	1.077906	1.282124	1.497868	1.163109	...	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0	1.0

21 rows × 287509 columns

idx1 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000 - 2).astype(str)
idx2 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000).astype(str)
selb = (binall['start']%10000==0) & idx1.isin(ratio.columns)
boundch.loc[selb] = (ratio.loc[:, idx1[selb]].values + ratio.loc[:, idx2[selb]].values).T / 2

ATAC#

sig = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/atac_majortype/cluster_atac_signal.hdf')
cov = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/atac_majortype/cluster_atac_cov.hdf')

bins = pd.DataFrame(index=sig.columns)
bins['chrom'] = bins.index.str.split('_').str[0]
bins['start'] = (bins.index.str.split('_').str[1].astype(int) - 1) * 5000
bins['chrom10k_even'] = bins['chrom'] + '_' + (bins['start'] // 10000 * 2).astype(str)
bins['chrom10k_odd'] = bins['chrom'] + '_' + ((bins['start'] + 5000) // 10000 * 2 - 1).astype(str)

sig = sig.groupby(by=bins['chrom10k_odd'], axis=1).sum()
cov = cov.groupby(by=bins['chrom10k_odd']).sum()
atac = (sig/cov).fillna(0)

legatac = leg[leg.isin(atac.index)]
atac = atac.loc[legatac].T
atac = atac / atac.sum(axis=0)

boundatac = pd.DataFrame(index=binall.index, columns=legatac)
idx1 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000 - 2).astype(str)
idx2 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000).astype(str)
selb = (binall['start']%10000!=0)# & idx1.isin(atac.index)
boundatac.loc[selb] = (atac.loc[idx1[selb]].values + atac.loc[idx2[selb]].values) / 2

sig = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/atac_majortype/cluster_atac_signal.hdf')
cov = pd.read_hdf('/home/jzhou_salk_edu/sky_workdir/hba/atac_majortype/cluster_atac_cov.hdf')

sig = sig.groupby(by=bins['chrom10k_even'], axis=1).sum()
cov = cov.groupby(by=bins['chrom10k_even']).sum()
atac = (sig/cov).fillna(0)

legatac = leg[leg.isin(atac.index)]
atac = atac.loc[legatac].T
atac = atac / atac.sum(axis=0)

idx1 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000 - 2).astype(str)
idx2 = binall['chrom'].astype(str) + '_' + (binall['start'] // 5000).astype(str)
selb = (binall['start']%10000==0) & idx1.isin(atac.index)
boundatac.loc[selb] = (atac.loc[idx1[selb]].values + atac.loc[idx2[selb]].values) / 2

boundcg.to_hdf(f'{outdir}bound_mCG.hdf', key='data')
boundch.to_hdf(f'{outdir}bound_mCH.hdf', key='data')
boundatac.to_hdf(f'{outdir}bound_ATAC.hdf', key='data')

binfilter = (boundcg.isna().sum(axis=1)==0).values
binall.loc[~binfilter, ['mCG_corr', 'mCH_corr', 'ATAC_corr']] = 0
binall

	chrom	start	end	bklfilter	chi2filter	ins_lm	probdiff	chi2_sc	insfc	diff_sc	mCG_corr	mCH_corr	ATAC_corr
chr1_0	chr1	0	25000	False	False	0	0.001333	18.816754	1.020000	0	0.0	0.0	0.0
chr1_1	chr1	25000	50000	False	False	0	0.000000	0.000000	inf	0	NaN	NaN	NaN
chr1_2	chr1	50000	75000	False	False	0	0.000000	0.000000	inf	0	NaN	NaN	NaN
chr1_3	chr1	75000	100000	False	False	0	0.000000	0.000000	inf	0	NaN	NaN	NaN
chr1_4	chr1	100000	125000	False	False	0	0.000000	0.000000	inf	0	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...
chr22_2028	chr22	50700000	50725000	False	False	0	0.028816	21.417634	0.322651	0	NaN	NaN	NaN
chr22_2029	chr22	50725000	50750000	False	False	0	0.023924	21.869962	0.297184	0	NaN	NaN	NaN
chr22_2030	chr22	50750000	50775000	False	False	0	0.019718	20.685304	0.263324	0	NaN	NaN	NaN
chr22_2031	chr22	50775000	50800000	False	False	0	0.016411	11.303940	0.238471	0	NaN	NaN	NaN
chr22_2032	chr22	50800000	50818468	False	False	0	0.001982	22.931718	0.337864	0	NaN	NaN	NaN

115009 rows × 13 columns

bound_count_ct = pd.read_hdf(f'{indir}MajorType_boundcount.hdf', key='data').loc[leg]
cell_count_ct = pd.read_csv(f'{indir}MajorType_cellcount.csv.gz', index_col=0, header=0, squeeze=True).loc[leg]
bound_prob_ct = (bound_count_ct / cell_count_ct[:,None]).T

binall.loc[binfilter, 'mCG_corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(bound_prob_ct.values[binfilter], boundcg.values[binfilter])]
binall.loc[binfilter, 'mCH_corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(bound_prob_ct.values[binfilter], boundch.values[binfilter])]
binall.loc[binfilter, 'ATAC_corr'] = [pearsonr(xx, yy)[0] for xx,yy in zip(bound_prob_ct[legatac].values[binfilter], boundatac[legatac].values[binfilter])]

binall.to_hdf(f'{outdir}bin_stats.hdf', key='data')

fig, axes = plt.subplots(1, 2, figsize=(4,2), sharey='all', dpi=300)
ax = axes[0]
sns.histplot(binall, x='mCG_corr', y='chi2_sc', bins=100, ax=ax, log_scale=(False, 10))
ax = axes[1]
sns.histplot(binall, x='mCH_corr', y='chi2_sc', bins=100, ax=ax, log_scale=(False, 10))

<AxesSubplot:xlabel='mCH_corr', ylabel='chi2_sc'>

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../../_images/f8722f728f4418712aac85df476b5b3e0218c699f540257b998bc058938489e5.png

fig, axes = plt.subplots(1, 2, figsize=(4,2), sharey='all', dpi=300)
ax = axes[0]
sns.histplot(binall, y='chi2_sc', x='probdiff', bins=100, ax=ax, log_scale=(False, False))
ax = axes[1]
sns.histplot(binall, y='chi2_sc', x='insfc', bins=100, ax=ax, log_scale=(10, False))

<AxesSubplot:xlabel='insfc', ylabel='chi2_sc'>

../../_images/d87f0e98287fc460b92a87e7880ac0e56ce985951fb95e4d77a705cc09660dfd.png