Domain boundary prob#
import cooler
import numpy as np
import pandas as pd
import anndata
from glob import glob
leg = pd.Index(['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'L5_ET', 'Amy',
'Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7',
'MSN_D1', 'MSN_D2', 'Foxp2', 'SubCtx',
'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
])
legname = pd.Index(['L2/3-IT', 'L4-IT', 'L5-IT', 'L6-IT', 'L6-IT-Car3', 'L5/6-NP', 'L6-CT', 'L6b', 'L5-ET', 'Amy-Exc',
'Lamp5', 'Lamp5-Lhx6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb-ChC', 'Sst', 'Chd7',
'MSN-D1', 'MSN-D2', 'Foxp2', 'SubCtx-Cplx',
'ASC', 'ODC', 'OPC', 'MGC', 'PC', 'EC', 'VLMC'
])
print(len(leg))
29
indir = '/data/hba/domain_majortype/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/domain_majortype/'
res = 25000
chrom_size_path = '/home/jzhou_salk_edu/sky_workdir/hba/ref/hg38.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
chrom_sizes
chr1 248956422
chr2 242193529
chr3 198295559
chr4 190214555
chr5 181538259
chr6 170805979
chr7 159345973
chr8 145138636
chr9 138394717
chr10 133797422
chr11 135086622
chr12 133275309
chr13 114364328
chr14 107043718
chr15 101991189
chr16 90338345
chr17 83257441
chr18 80373285
chr19 58617616
chr20 64444167
chr21 46709983
chr22 50818468
chrX 156040895
chrY 57227415
chrM 16569
Name: length, dtype: int64
metadata = pd.read_hdf(f'{outdir}cell_117540_meta.hdf')
metadata
PassFilter | mCCCFrac | mCGFrac | mCGFracAdj | mCHFrac | mCHFracAdj | FinalmCReads | pool | Plate | plate_relative_read | ... | forloop_SubType | forloop_MajorType | forloop_Region | CisLongContact | TransContact | Cis/Trans | domain_coverage | domain_count | boundary_count | domain_size | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A1 | True | 0.006068 | 0.773828 | 0.772447 | 0.012215 | 0.006185 | 2116936 | hba_m3c16_h1930002_BNST | HBA_220218_H1930002_CX47_BNST_3C_1_P1 | 0.971324 | ... | ASC_Bergemann_0 | NaN | NaN | 278749.0 | 157264.0 | 1.772491 | 2801350000 | 4410.0 | 4857 | 635226.757370 |
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A13 | True | 0.004877 | 0.769910 | 0.768782 | 0.011441 | 0.006596 | 2250650 | hba_m3c16_h1930002_BNST | HBA_220218_H1930002_CX47_BNST_3C_1_P1 | 1.032676 | ... | NaN | NaN | NaN | 335489.0 | 163603.0 | 2.050629 | 2807375000 | 4752.0 | 5265 | 590777.567340 |
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A14 | True | 0.005228 | 0.745918 | 0.744583 | 0.006356 | 0.001134 | 2655450 | hba_m3c16_h1930002_BNST | HBA_220218_H1930002_CX47_BNST_3C_1_P1 | 1.218413 | ... | OPC_0 | OPC | NaN | 353846.0 | 234196.0 | 1.510897 | 2797275000 | 4800.0 | 5313 | 582765.625000 |
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-A2 | True | 0.008819 | 0.750491 | 0.748271 | 0.013308 | 0.004529 | 2365072 | hba_m3c16_h1930002_BNST | HBA_220218_H1930002_CX47_BNST_3C_1_P1 | 1.085177 | ... | NaN | NaN | NaN | 324190.0 | 189126.0 | 1.714148 | 2810575000 | 4469.0 | 4889 | 628904.676661 |
HBA_220218_H1930002_CX47_BNST_3C_1_P1-1-M14-B1 | True | 0.005061 | 0.761122 | 0.759906 | 0.016676 | 0.011674 | 779910 | hba_m3c16_h1930002_BNST | HBA_220218_H1930002_CX47_BNST_3C_1_P1 | 0.357850 | ... | ASC_2 | NaN | NaN | 93768.0 | 57444.0 | 1.632338 | 2787325000 | 3041.0 | 3297 | 916581.716541 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-O24 | True | 0.010842 | 0.798236 | 0.796024 | 0.054124 | 0.043756 | 1948656 | hba_m3c11_h1930002_FI | HBA_211117_H1930002_CX47_FI_3C_1_P8 | 0.984126 | ... | Sncg_1 | Sncg | NaN | 246773.0 | 133468.0 | 1.848930 | 2823125000 | 4690.0 | 5181 | 601945.628998 |
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P11 | True | 0.012916 | 0.768051 | 0.765016 | 0.062617 | 0.050352 | 2450246 | hba_m3c11_h1930002_FI | HBA_211117_H1930002_CX47_FI_3C_1_P8 | 1.237444 | ... | L6_IT_Car3_1 | L6_IT_Car3 | NaN | 356046.0 | 131529.0 | 2.706977 | 2849325000 | 5063.0 | 5683 | 562774.047008 |
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P12 | True | 0.006666 | 0.801151 | 0.799817 | 0.026621 | 0.020089 | 2906150 | hba_m3c11_h1930002_FI | HBA_211117_H1930002_CX47_FI_3C_1_P8 | 1.467688 | ... | Vip_4 | Vip | NaN | 352862.0 | 219220.0 | 1.609625 | 2846425000 | 4978.0 | 5499 | 571800.924066 |
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P23 | True | 0.008189 | 0.792329 | 0.790614 | 0.036346 | 0.028389 | 2293534 | hba_m3c11_h1930002_FI | HBA_211117_H1930002_CX47_FI_3C_1_P8 | 1.158300 | ... | Sncg_0 | Sncg | NaN | 268935.0 | 194847.0 | 1.380237 | 2854400000 | 4642.0 | 5151 | 614907.367514 |
HBA_211117_H1930002_CX47_FI_3C_1_P8-6-J7-P24 | True | 0.010332 | 0.797810 | 0.795699 | 0.053267 | 0.043384 | 2110322 | hba_m3c11_h1930002_FI | HBA_211117_H1930002_CX47_FI_3C_1_P8 | 1.065772 | ... | Sst_5 | Sst | NaN | 282997.0 | 145288.0 | 1.947835 | 2841475000 | 4801.0 | 5292 | 591850.656113 |
117540 rows × 26 columns
tad_path_list = glob(f'{indir}sc_domain/*.boundary.h5ad')
print(len(tad_path_list))
46
sc_border = []
for xx in tad_path_list:
sc_border.append(anndata.read_h5ad(xx))
sc_border = anndata.AnnData.concatenate(*sc_border, index_unique=None)
sc_border
AnnData object with n_obs × n_vars = 131885 × 123544
obs: 'batch'
var: 'chrom', 'start', 'end'
sc_border.X = sc_border.X.astype(int)
sc_border = sc_border[sc_border.obs.index.isin(metadata.index)].copy()
sc_border = sc_border[:, ~sc_border.var['chrom'].isin(['chrX','chrY','chrM','chrL'])].copy()
sc_border.obs = metadata.loc[sc_border.obs.index]
sc_border.raw = sc_border.copy()
sc_border.write_h5ad(f'{outdir}cell_{sc_border.shape[0]}_boundary.h5ad')
bound_count_ct = []
for xx in leg:
bound_count_ct.append(sc_border.raw.X[sc_border.obs['MajorType']==xx].getnnz(axis=0))
bound_count_ct = pd.DataFrame(bound_count_ct, index=leg,
columns=sc_border.raw.var['chrom'].astype(str) + '_' + (sc_border.raw.var['start'] // res).astype(str))
cell_count_ct = sc_border.obs['MajorType'].value_counts().loc[leg]
# bound_prob_ct = (bound_count_ct / cell_count_ct[:,None]).T
bound_count_ct.to_hdf(f'{outdir}MajorType_boundcount.hdf', key='data')
cell_count_ct.to_csv(f'{outdir}MajorType_cellcount.csv.gz')