Domain boundary prob

Domain boundary prob#

import cooler
import numpy as np
import pandas as pd
import anndata
from glob import glob
leg = pd.Index(['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy', 
                'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7', 
                'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx', 
                'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
               ])
legname = pd.Index(['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc', 
                    'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7', 
                    'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx', 
                    'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
                   ])
print(len(leg))
29
indir = '/data/hba/domain_majortype/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/domain_majortype/'
res = 25000
chrom_size_path = '/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
chrom_sizes
chr1     248956422
chr2     242193529
chr3     198295559
chr4     190214555
chr5     181538259
chr6     170805979
chr7     159345973
chr8     145138636
chr9     138394717
chr10    133797422
chr11    135086622
chr12    133275309
chr13    114364328
chr14    107043718
chr15    101991189
chr16     90338345
chr17     83257441
chr18     80373285
chr19     58617616
chr20     64444167
chr21     46709983
chr22     50818468
chrX     156040895
chrY      57227415
chrM         16569
Name: length, dtype: int64
metadata = pd.read_hdf(f'{outdir}cell_117540_meta.hdf')
metadata
PassFilter mCCCFrac mCGFrac mCGFracAdj mCHFrac mCHFracAdj FinalmCReads pool Plate plate_relative_read ... forloop_SubType forloop_MajorType forloop_Region CisLongContact TransContact Cis/Trans domain_coverage domain_count boundary_count domain_size
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A1 True 0.006068 0.773828 0.772447 0.012215 0.006185 2116936 hba_m3c16_h1930002_BNST HBA_220218_H1930002_CX47_BNST_3C_1_P1 0.971324 ... ASC_Bergemann_0 NaN NaN 278749.0 157264.0 1.772491 2801350000 4410.0 4857 635226.757370
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A13 True 0.004877 0.769910 0.768782 0.011441 0.006596 2250650 hba_m3c16_h1930002_BNST HBA_220218_H1930002_CX47_BNST_3C_1_P1 1.032676 ... NaN NaN NaN 335489.0 163603.0 2.050629 2807375000 4752.0 5265 590777.567340
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A14 True 0.005228 0.745918 0.744583 0.006356 0.001134 2655450 hba_m3c16_h1930002_BNST HBA_220218_H1930002_CX47_BNST_3C_1_P1 1.218413 ... OPC_0 OPC NaN 353846.0 234196.0 1.510897 2797275000 4800.0 5313 582765.625000
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A2 True 0.008819 0.750491 0.748271 0.013308 0.004529 2365072 hba_m3c16_h1930002_BNST HBA_220218_H1930002_CX47_BNST_3C_1_P1 1.085177 ... NaN NaN NaN 324190.0 189126.0 1.714148 2810575000 4469.0 4889 628904.676661
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-B1 True 0.005061 0.761122 0.759906 0.016676 0.011674 779910 hba_m3c16_h1930002_BNST HBA_220218_H1930002_CX47_BNST_3C_1_P1 0.357850 ... ASC_2 NaN NaN 93768.0 57444.0 1.632338 2787325000 3041.0 3297 916581.716541
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-O24 True 0.010842 0.798236 0.796024 0.054124 0.043756 1948656 hba_m3c11_h1930002_FI HBA_211117_H1930002_CX47_FI_3C_1_P8 0.984126 ... Sncg_1 Sncg NaN 246773.0 133468.0 1.848930 2823125000 4690.0 5181 601945.628998
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P11 True 0.012916 0.768051 0.765016 0.062617 0.050352 2450246 hba_m3c11_h1930002_FI HBA_211117_H1930002_CX47_FI_3C_1_P8 1.237444 ... L6_IT_Car3_1 L6_IT_Car3 NaN 356046.0 131529.0 2.706977 2849325000 5063.0 5683 562774.047008
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P12 True 0.006666 0.801151 0.799817 0.026621 0.020089 2906150 hba_m3c11_h1930002_FI HBA_211117_H1930002_CX47_FI_3C_1_P8 1.467688 ... Vip_4 Vip NaN 352862.0 219220.0 1.609625 2846425000 4978.0 5499 571800.924066
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P23 True 0.008189 0.792329 0.790614 0.036346 0.028389 2293534 hba_m3c11_h1930002_FI HBA_211117_H1930002_CX47_FI_3C_1_P8 1.158300 ... Sncg_0 Sncg NaN 268935.0 194847.0 1.380237 2854400000 4642.0 5151 614907.367514
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P24 True 0.010332 0.797810 0.795699 0.053267 0.043384 2110322 hba_m3c11_h1930002_FI HBA_211117_H1930002_CX47_FI_3C_1_P8 1.065772 ... Sst_5 Sst NaN 282997.0 145288.0 1.947835 2841475000 4801.0 5292 591850.656113

117540 rows × 26 columns

tad_path_list = glob(f'{indir}sc_domain/*.boundary.h5ad')
print(len(tad_path_list))
46
sc_border = []
for xx in tad_path_list:
    sc_border.append(anndata.read_h5ad(xx))
    
sc_border = anndata.AnnData.concatenate(*sc_border, index_unique=None)
sc_border
AnnData object with n_obs × n_vars = 131885 × 123544
    obs: 'batch'
    var: 'chrom', 'start', 'end'
sc_border.X = sc_border.X.astype(int)
sc_border = sc_border[sc_border.obs.index.isin(metadata.index)].copy()
sc_border = sc_border[:, ~sc_border.var['chrom'].isin(['chrX','chrY','chrM','chrL'])].copy()
sc_border.obs = metadata.loc[sc_border.obs.index]
sc_border.raw = sc_border.copy()
sc_border.write_h5ad(f'{outdir}cell_{sc_border.shape[0]}_boundary.h5ad')
bound_count_ct = []
for xx in leg:
    bound_count_ct.append(sc_border.raw.X[sc_border.obs['MajorType']==xx].getnnz(axis=0))

bound_count_ct = pd.DataFrame(bound_count_ct, index=leg, 
                              columns=sc_border.raw.var['chrom'].astype(str) + '_' + (sc_border.raw.var['start'] // res).astype(str))
cell_count_ct = sc_border.obs['MajorType'].value_counts().loc[leg]
# bound_prob_ct = (bound_count_ct / cell_count_ct[:,None]).T
bound_count_ct.to_hdf(f'{outdir}MajorType_boundcount.hdf', key='data')
cell_count_ct.to_csv(f'{outdir}MajorType_cellcount.csv.gz')