Assign DipC cells to RNA clusters

Assign DipC cells to RNA clusters#

import time
import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import scanpy.external as sce
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from ALLCools.integration.seurat_class import SeuratIntegration
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize, OneHotEncoder
import pynndescent

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'

gene_hdf = pd.read_hdf(f'/data/test_schicluster/Tan2021/scool/dataset/Tan2021.geneimputescore.hdf', key='data')

res = 10000
gene_meta_path = '/data/ref/mm10/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz'
gene_meta = pd.read_csv(gene_meta_path, index_col='gene_id', sep='\t')
gene_meta['bin_len'] = (gene_meta['end'] // res) - (gene_meta['start'] // res) + 1
#gene_meta['weight'] = gene_meta['bin_len'] * (gene_meta['bin_len'] + 1) / 2
gene_meta['weight'] = (gene_meta['bin_len']+2) * (gene_meta['bin_len'] + 1) / 2
#gene_meta['weight'] = gene_meta['bin_len'].copy()

indir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/'

chrom_size_path = '/data/ref/mm10/genome/mm10.main.chrom.sizes'
chromsize = pd.read_csv(chrom_size_path, sep='\t', header=None, index_col=0)
gene_meta = gene_meta[gene_meta['chrom'].isin(chromsize.index)]
gene_meta

	chrom	source	feature	start	end	score	strand	phase	transcript_id	gene_type	...	transcript_status	transcript_name	exon_number	exon_id	level	mgi_id	havana_gene	tag	bin_len	weight
gene_id
ENSMUSG00000102693.1	chr1	HAVANA	gene	3073253	3074322	.	+	.	NaN	TEC	...	NaN	NaN	NaN	NaN	2	MGI:1918292	OTTMUSG00000049935.1	NaN	1	3.0
ENSMUSG00000064842.1	chr1	ENSEMBL	gene	3102016	3102125	.	+	.	NaN	snRNA	...	NaN	NaN	NaN	NaN	3	MGI:5455983	NaN	NaN	1	3.0
ENSMUSG00000051951.5	chr1	HAVANA	gene	3205901	3671498	.	-	.	NaN	protein_coding	...	NaN	NaN	NaN	NaN	2	MGI:3528744	OTTMUSG00000026353.2	NaN	48	1225.0
ENSMUSG00000102851.1	chr1	HAVANA	gene	3252757	3253236	.	+	.	NaN	processed_pseudogene	...	NaN	NaN	NaN	NaN	1	MGI:5011141	OTTMUSG00000049958.1	pseudo_consens	1	3.0
ENSMUSG00000103377.1	chr1	HAVANA	gene	3365731	3368549	.	-	.	NaN	TEC	...	NaN	NaN	NaN	NaN	2	MGI:5610408	OTTMUSG00000049960.1	NaN	1	3.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
ENSMUSG00000064368.1	chrM	ENSEMBL	gene	13552	14070	.	-	.	NaN	protein_coding	...	NaN	NaN	NaN	NaN	3	MGI:102495	NaN	NaN	1	3.0
ENSMUSG00000064369.1	chrM	ENSEMBL	gene	14071	14139	.	-	.	NaN	Mt_tRNA	...	NaN	NaN	NaN	NaN	3	MGI:102488	NaN	NaN	1	3.0
ENSMUSG00000064370.1	chrM	ENSEMBL	gene	14145	15288	.	+	.	NaN	protein_coding	...	NaN	NaN	NaN	NaN	3	MGI:102501	NaN	NaN	1	3.0
ENSMUSG00000064371.1	chrM	ENSEMBL	gene	15289	15355	.	+	.	NaN	Mt_tRNA	...	NaN	NaN	NaN	NaN	3	MGI:102473	NaN	NaN	1	3.0
ENSMUSG00000064372.1	chrM	ENSEMBL	gene	15356	15422	.	-	.	NaN	Mt_tRNA	...	NaN	NaN	NaN	NaN	3	MGI:102478	NaN	NaN	1	3.0

55487 rows × 23 columns

#gene_hdf = gene_hdf * gene_meta.loc[gene_hdf.columns, 'weight']
(gene_hdf>0).sum(axis=1)

cortex-p001-cb_034         39820
cortex-p001-cb_025         44877
cortex-p001-cb_119         48588
cortex-p001-cb_009         49538
cortex-p001-cb_005         49383
                           ...  
hippocampus-p347-cb_166    50257
hippocampus-p347-cb_191    49946
hippocampus-p347-cb_178    50059
hippocampus-p347-cb_187    49801
hippocampus-p347-cb_188    50075
Length: 3646, dtype: int64

adata = anndata.read_h5ad(f'{indir}Tan2021_dipc.h5ad')
adata

AnnData object with n_obs × n_vars = 3646 × 1
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    uns: 'leiden', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'dipc_pca_all'
    obsp: 'connectivities', 'distances'

gene3c = anndata.AnnData(gene_hdf.loc[adata.obs.index], obs=adata.obs, var=gene_meta.loc[gene_hdf.columns])
gene3c

AnnData object with n_obs × n_vars = 3646 × 51259
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'

chrom_to_remove = ['chrL', 'chrM', 'chrX', 'chrY']
chrfilter = ~gene3c.var['chrom'].isin(chrom_to_remove)
gene3c = gene3c[:, chrfilter]

genefilter = ((gene3c.X>0).sum(axis=0)>10)
gene3c = gene3c[:, genefilter]
gene3c

View of AnnData object with n_obs × n_vars = 3646 × 51097
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'

gene3c.var.index.name = 'gene_id'

gene3c.raw = gene3c.copy()
#sc.pp.highly_variable_genes(gene3c, n_top_genes=2000, flavor='seurat_v3')
gene3c.write_h5ad(f'cell_{gene3c.shape[0]}_gene3C.h5ad')

# gene3c = anndata.read_h5ad(f'{indir}cell_8225_gene3C_impute.h5ad')
gene3c = anndata.read_h5ad(glob(f'cell_*_gene3C.h5ad')[0])
gene3c

AnnData object with n_obs × n_vars = 3646 × 51097
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'

gene_meta

	chrom	source	feature	start	end	score	strand	phase	transcript_id	gene_type	...	transcript_status	transcript_name	exon_number	exon_id	level	mgi_id	havana_gene	tag	bin_len	weight
gene_id
ENSMUSG00000102693.1	chr1	HAVANA	gene	3073253	3074322	.	+	.	NaN	TEC	...	NaN	NaN	NaN	NaN	2	MGI:1918292	OTTMUSG00000049935.1	NaN	1	3.0
ENSMUSG00000064842.1	chr1	ENSEMBL	gene	3102016	3102125	.	+	.	NaN	snRNA	...	NaN	NaN	NaN	NaN	3	MGI:5455983	NaN	NaN	1	3.0
ENSMUSG00000051951.5	chr1	HAVANA	gene	3205901	3671498	.	-	.	NaN	protein_coding	...	NaN	NaN	NaN	NaN	2	MGI:3528744	OTTMUSG00000026353.2	NaN	48	1225.0
ENSMUSG00000102851.1	chr1	HAVANA	gene	3252757	3253236	.	+	.	NaN	processed_pseudogene	...	NaN	NaN	NaN	NaN	1	MGI:5011141	OTTMUSG00000049958.1	pseudo_consens	1	3.0
ENSMUSG00000103377.1	chr1	HAVANA	gene	3365731	3368549	.	-	.	NaN	TEC	...	NaN	NaN	NaN	NaN	2	MGI:5610408	OTTMUSG00000049960.1	NaN	1	3.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
ENSMUSG00000064368.1	chrM	ENSEMBL	gene	13552	14070	.	-	.	NaN	protein_coding	...	NaN	NaN	NaN	NaN	3	MGI:102495	NaN	NaN	1	3.0
ENSMUSG00000064369.1	chrM	ENSEMBL	gene	14071	14139	.	-	.	NaN	Mt_tRNA	...	NaN	NaN	NaN	NaN	3	MGI:102488	NaN	NaN	1	3.0
ENSMUSG00000064370.1	chrM	ENSEMBL	gene	14145	15288	.	+	.	NaN	protein_coding	...	NaN	NaN	NaN	NaN	3	MGI:102501	NaN	NaN	1	3.0
ENSMUSG00000064371.1	chrM	ENSEMBL	gene	15289	15355	.	+	.	NaN	Mt_tRNA	...	NaN	NaN	NaN	NaN	3	MGI:102473	NaN	NaN	1	3.0
ENSMUSG00000064372.1	chrM	ENSEMBL	gene	15356	15422	.	-	.	NaN	Mt_tRNA	...	NaN	NaN	NaN	NaN	3	MGI:102478	NaN	NaN	1	3.0

55487 rows × 23 columns

gene3c.var.index = gene_meta.loc[gene3c.var.index, 'gene_name']
gene3c.var_names_make_unique()

sc.pp.normalize_total(gene3c, target_sum=np.median(gene3c.X.sum(axis=1)))

expr = anndata.read_h5ad(f'{indir}Tan2021_rna.h5ad')
expr

AnnData object with n_obs × n_vars = 3517 × 3000
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region', 'cluster'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'rna_pca_all'
    obsp: 'connectivities', 'distances'

expr = anndata.AnnData(X=expr.raw.X, obs=expr.obs, var=expr.raw.var, raw=expr.raw, obsm=expr.obsm, uns=expr.uns, obsp=expr.obsp)
expr = expr[:, expr.var.index.isin(gene3c.var.index)]
expr

View of AnnData object with n_obs × n_vars = 3517 × 24511
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region', 'cluster'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'rna_pca_all'
    obsp: 'connectivities', 'distances'

gene3c = gene3c[:, gene3c.var.index.isin(expr.var.index)]
gene3c

View of AnnData object with n_obs × n_vars = 3646 × 24511
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'

tmp = np.std(gene3c.X, axis=0)
from scipy.stats import rankdata
tmp = rankdata(-tmp)
gene3c.var['highly_variable'] = (tmp<2000)

gene3c = gene3c[:, gene3c.var['highly_variable']]

expr = expr[:, gene3c.var.index]

# sc.pp.highly_variable_genes(expr, n_top_genes=2000)
# expr = expr[:, expr.var.highly_variable]
sc.pp.regress_out(expr, ['nUMI'])
sc.pp.scale(expr, max_value=10)

dim = 50
model = TruncatedSVD(n_components=dim, algorithm='arpack')
matrix_reduce = model.fit_transform(expr.X)
matrix_reduce = matrix_reduce / model.singular_values_

expr.obsm['rna_pca_all'] = matrix_reduce.copy()
significant_pc_test(expr, p_cutoff=0.1, update=False, obsm='rna_pca_all')

15 components passed P cutoff of 0.1.

def dump_embedding(adata, name, n_dim=2):
    # put manifold coordinates into adata.obs
    for i in range(n_dim):
        adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
    return adata

knn = 25
expr.obsm['X_pca'] = normalize(expr.obsm['rna_pca_all'][:, :30], axis=1)
sc.pp.neighbors(expr, n_neighbors=knn, use_rep='X_pca')

sc.tl.umap(expr, maxiter=200, random_state=0)
expr = dump_embedding(expr, 'umap')
expr.obsm['rna_u30_umap'] = expr.obsm['X_umap'].copy()

fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=expr.obs,
                       hue='nUMI',
                       coord_base='umap',
                       max_points=None,
                       labelsize=10,
                       )
_ = categorical_scatter(data=expr.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=expr.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=expr.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../_images/dcd3fb6ea24a041dad64ef6b5e64be4a2403fbd6005088bec4bd597f8af570a6.png

gene3c

View of AnnData object with n_obs × n_vars = 3646 × 1999
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight', 'highly_variable'

sc.pp.regress_out(gene3c, ['contacts'])
sc.pp.scale(gene3c, max_value=10)

gene3c.obsm['X_pca'] = model.transform(gene3c.X) / model.singular_values_
gene3c.obsm['X_pca'] = normalize(gene3c.obsm['X_pca'][:, :30], axis=1)

expr.obs['Modality'] = 'RNA'
gene3c.obs['Modality'] = '3C'

adata_list = [expr, gene3c]

integrator = SeuratIntegration()

anchor = integrator.find_anchor(adata_list,
                                k_local=None,
                                key_local='X_pca',
                                k_anchor=5,
                                key_anchor='X',
                                dim_red='cca',
                                max_cc_cells=50000,
                                k_score=30,
                                k_filter=None,
                                scale1=False,
                                scale2=False,
                                #scale =[False, True]
                                n_components=30,
                                n_features=200,
                                alignments=[[[0], [1]]])

Find anchors across datasets.
Run CCA
non zero dims 30
Find Anchors using k=30
Score Anchors
Identified 6894 anchors between datasets 0 and 1.

start_time = time.time()
corrected = integrator.integrate(key_correct='X_pca',
                                 row_normalize=True,
                                 n_components=30,
                                 k_weight=100,
                                 sd=1,
                                 alignments=[[[0], [1]]])

print(time.time() - start_time)

Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data
6.461906433105469

ncell = np.sum([xx.shape[0] for xx in adata_list])
adata_merge = anndata.AnnData(
    X=np.ones((ncell, 1)), obs=pd.concat([xx.obs for xx in adata_list], axis=0)
)
adata_merge

AnnData object with n_obs × n_vars = 7163 × 1
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region', 'cluster', 'Modality', 'tissue', 'treatment', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'leiden'

adata_merge.obsm['u30_seuratcc30nofilter'] = np.concatenate(corrected, axis=0)
adata_merge.obsm['X_pca'] = normalize(adata_merge.obsm['u30_seuratcc30nofilter'][:, :30], axis=1)
# sce.pp.harmony_integrate(adata_merge, 'Modality', max_iter_harmony=30, random_state=0)
# adata_merge.obsm['log_reghvg_cca_u25hm'] = adata_merge.obsm['X_pca_harmony'].copy()

knn = 25
sc.pp.neighbors(adata_merge, n_neighbors=knn, use_rep='u30_seuratcc30nofilter')
sc.tl.umap(adata_merge, maxiter=200, random_state=0)
adata_merge = dump_embedding(adata_merge, 'umap')
adata_merge.obsm['u30_seuratcc30nofilter_u30_umap'] = adata_merge.obsm['X_umap'].copy()

legage = np.sort(adata_merge.obs['age'].unique())
palage = {xx:yy for xx,yy in zip(legage, sns.color_palette('hls', len(legage)))}

# adata_merge.obsm['X_umap'] = adata_merge.obsm['u50_seuratcc30nofilter_u30_umap'].copy()
# dump_embedding(adata_merge, 'umap')

fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='3C']
ax = axes[0,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[0,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='leiden',
                        text_anno='leiden',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )

tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='RNA']
ax = axes[1,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[1,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='cluster',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../_images/8a0bf8d4d66185260b784f2335b93abdfe86ddeda474979ca3e582046cb7ad04.png

adata_merge.obs.loc[:,adata_merge.obs.dtypes == 'object'] = adata_merge.obs.loc[:,adata_merge.obs.dtypes == 'object'].astype(str)
adata_merge.write_h5ad('Tan2021_rna3c.h5ad')

rna_cell = (adata_merge.obs['Modality']=='RNA')
hic_cell = (adata_merge.obs['Modality']=='3C')
print(rna_cell.sum(), hic_cell.sum())

start_time = time.time()
index = pynndescent.NNDescent(adata_merge.obsm['X_pca'][rna_cell], metric='euclidean', n_neighbors=50, random_state=0, n_jobs=-1)
print(time.time() - start_time)
G, D = index.query(adata_merge.obsm['X_pca'][hic_cell], k=15)
print(time.time() - start_time)

3646
7463104724884033
10052227973938

chunk_size = 50000
sd = 1
start_time = time.time()
cellfilter = D[:, -1] == 0
D = 1 - D / D[:, -1][:, None]
D[cellfilter] = 1
D = 1 - np.exp(-D * (sd**2) / 4)
D = D / (np.sum(D, axis=1) + 1e-6)[:, None]
print(time.time() - start_time)

0.0013263225555419922

rna_cell = rna_cell.index[rna_cell]
hic_cell = hic_cell.index[hic_cell]

enc = OneHotEncoder()
# enc.fit(adata_merge.obs.loc[aibs_cell, ['L3']].values.astype(str))
labelref = enc.fit_transform(adata_merge.obs.loc[rna_cell, 'cluster'].astype(str)[:, None])
cluster = pd.DataFrame(index=hic_cell, columns=['rnatype', 'score'], dtype=str)
for chunk_start in range(0, len(hic_cell), chunk_size):
    result = (
        D[chunk_start : (chunk_start + chunk_size), :, None]
        * labelref[G[chunk_start : (chunk_start + chunk_size)].flatten()]
        .toarray()
        .reshape((-1, 15, enc.categories_[0].shape[0]))
    ).sum(axis=1)
    result = pd.DataFrame(
        result,
        columns=enc.categories_[0],
        index=hic_cell[chunk_start : (chunk_start + chunk_size)],
    )
    result = result.loc[:, result.columns != "nan"]
    cluster.loc[result.index, "rnatype"] = result.idxmax(axis=1).values
    cluster.loc[result.index, "score"] = result.max(axis=1).values
    print(chunk_start)

print(time.time() - start_time)
cluster.to_hdf("DipC_rnacluster.hdf", key="data")

0
0.6793961524963379

cluster['dipctype'] = adata_merge.obs.loc[hic_cell, 'cell-type cluster']
adata_merge.obs.loc[hic_cell, 'cluster'] = cluster['rnatype']

# adata_merge.obsm['X_umap'] = adata_merge.obsm['u50_seuratcc30nofilter_u30_umap'].copy()
# dump_embedding(adata_merge, 'umap')

fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='3C']
ax = axes[0,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[0,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='cluster',
                        # text_anno='celltype',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )

tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='RNA']
ax = axes[1,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[1,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='cluster',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )

../_images/6501bcd97ca1616cf5f4d3eaeedb86bb69aeb409511b9332b0daf58c652f56e8.png

cf_matrix = cluster[['dipctype','rnatype']].value_counts().unstack().fillna(0)
cf_matrix = cf_matrix / np.sqrt(cf_matrix.sum(axis=1)[:,None]) / np.sqrt(cf_matrix.sum(axis=0))

idx = pd.Series(np.arange(cf_matrix.shape[1]), cf_matrix.columns)

fig, ax = plt.subplots(figsize=(8,4), dpi=300)
sns.heatmap(cf_matrix.iloc[np.argsort(idx[np.argmax(cf_matrix.values, axis=1)])], ax=ax)

<AxesSubplot:xlabel='rnatype', ylabel='dipctype'>

../_images/cba54f7e1ff880c7469f07c956be3e29848c166d487736ff3caeeba4c3d3c6e9.png

cluster['rnatype'].value_counts()

Microglia Etc.                                391
Cortical L6 Pyramidal Cell                    333
Hippocampal Granuale Cell                     309
Neonatal Astrocyte                            234
Cortical L2-5 Pyramidal Cell, Neonatal        211
Mature Oligodendrocyte                        210
Cortical L2/3 Pyramidal Cell                  204
Cortical L4 Pyramidal Cell                    195
Oligodendrocyte Progenitor                    189
Medium Spiny Neuron                           167
Adult Astrocyte                               144
Hippocampal CA1 Pyramidal Cell                134
VIP Interneuron                               112
Cortical L5 Pyramidal Cell                     98
Hippocampal Pyramidal Cell, Neonatal           93
Unknown Interneuron 1                          89
MEIS2 Interneuron                              88
PV/SST Interneuron, Neonatal                   85
Hippocampal CA3 Pyramidal Cell                 70
PV Interneuron                                 61
SST Interneuron                                55
Unknown Interneuron 2                          51
Cortical L2-4 Pyramidal Cell, Intermediate     37
NDNF Interneuron                               36
Newly Formed Oligodendrocyte                   27
Cajal-Retzius Cell                             23
Name: rnatype, dtype: int64

cluster.to_csv('Tan2021_dipc_cluster.csv.gz', )

	rnatype	score	dipctype	dipcleiden
cortex-p028-cb_116	Cortical L6 Pyramidal Cell	0.999996	Cortical L6 Pyramidal Cell	5
cortex-visual-control-p007-b6_182	Cortical L6 Pyramidal Cell	0.999992	Neuron	2
cortex-p028-cb_112	Cortical L6 Pyramidal Cell	0.999996	Cortical L2–5 Pyramidal Cell	5
cortex-visual-control-p001-b6_061	Unknown Interneuron 2	0.330496	Neuron	2
cortex-p056-cb_216	Microglia Etc.	0.999989	Microglia Etc.	14
...	...	...	...	...
cortex-visual-control-p021-b6_090	Mature Oligodendrocyte	0.999994	Mature Oligodendrocyte	16
cortex-visual-control-p021-b6_012	Cortical L6 Pyramidal Cell	0.999997	Neuron	5
hippocampus-p007-cb_046	Microglia Etc.	0.999993	Microglia Etc.	13
cortex-visual-dark-p014-b6_106	Microglia Etc.	0.999994	Microglia Etc.	13
cortex-visual-control-p021-b6_174	Microglia Etc.	0.999995	Microglia Etc.	13

3646 rows × 4 columns

cluster['dipcleiden'] = adata_merge.obs.loc[hic_cell, 'leiden']
cf_matrix = cluster[['dipcleiden','rnatype']].value_counts().unstack().fillna(0)
cf_matrix = cf_matrix / np.sqrt(cf_matrix.sum(axis=1)[:,None]) / np.sqrt(cf_matrix.sum(axis=0))
idx = pd.Series(np.arange(cf_matrix.shape[1]), cf_matrix.columns)

sns.clustermap(cf_matrix, figsize=(8,8))

<seaborn.matrix.ClusterGrid at 0x7f0f48f22f50>

../_images/19c59490a59a99bce9f97dda168cca4df9ed5c8d0d70eeb91f1407563a1ef4b3.png