Assign DipC cells to RNA clusters

Assign DipC cells to RNA clusters#

import time
import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import scanpy.external as sce
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from ALLCools.integration.seurat_class import SeuratIntegration
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize, OneHotEncoder
import pynndescent

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
gene_hdf = pd.read_hdf(f'/data/test_schicluster/Tan2021/scool/dataset/Tan2021.geneimputescore.hdf', key='data')
res = 10000
gene_meta_path = '/data/ref/mm10/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz'
gene_meta = pd.read_csv(gene_meta_path, index_col='gene_id', sep='\t')
gene_meta['bin_len'] = (gene_meta['end'] // res) - (gene_meta['start'] // res) + 1
#gene_meta['weight'] = gene_meta['bin_len'] * (gene_meta['bin_len'] + 1) / 2
gene_meta['weight'] = (gene_meta['bin_len']+2) * (gene_meta['bin_len'] + 1) / 2
#gene_meta['weight'] = gene_meta['bin_len'].copy()
indir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/'
chrom_size_path = '/data/ref/mm10/genome/mm10.main.chrom.sizes'
chromsize = pd.read_csv(chrom_size_path, sep='\t', header=None, index_col=0)
gene_meta = gene_meta[gene_meta['chrom'].isin(chromsize.index)]
gene_meta
chrom source feature start end score strand phase transcript_id gene_type ... transcript_status transcript_name exon_number exon_id level mgi_id havana_gene tag bin_len weight
gene_id
ENSMUSG00000102693.1 chr1 HAVANA gene 3073253 3074322 . + . NaN TEC ... NaN NaN NaN NaN 2 MGI:1918292 OTTMUSG00000049935.1 NaN 1 3.0
ENSMUSG00000064842.1 chr1 ENSEMBL gene 3102016 3102125 . + . NaN snRNA ... NaN NaN NaN NaN 3 MGI:5455983 NaN NaN 1 3.0
ENSMUSG00000051951.5 chr1 HAVANA gene 3205901 3671498 . - . NaN protein_coding ... NaN NaN NaN NaN 2 MGI:3528744 OTTMUSG00000026353.2 NaN 48 1225.0
ENSMUSG00000102851.1 chr1 HAVANA gene 3252757 3253236 . + . NaN processed_pseudogene ... NaN NaN NaN NaN 1 MGI:5011141 OTTMUSG00000049958.1 pseudo_consens 1 3.0
ENSMUSG00000103377.1 chr1 HAVANA gene 3365731 3368549 . - . NaN TEC ... NaN NaN NaN NaN 2 MGI:5610408 OTTMUSG00000049960.1 NaN 1 3.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ENSMUSG00000064368.1 chrM ENSEMBL gene 13552 14070 . - . NaN protein_coding ... NaN NaN NaN NaN 3 MGI:102495 NaN NaN 1 3.0
ENSMUSG00000064369.1 chrM ENSEMBL gene 14071 14139 . - . NaN Mt_tRNA ... NaN NaN NaN NaN 3 MGI:102488 NaN NaN 1 3.0
ENSMUSG00000064370.1 chrM ENSEMBL gene 14145 15288 . + . NaN protein_coding ... NaN NaN NaN NaN 3 MGI:102501 NaN NaN 1 3.0
ENSMUSG00000064371.1 chrM ENSEMBL gene 15289 15355 . + . NaN Mt_tRNA ... NaN NaN NaN NaN 3 MGI:102473 NaN NaN 1 3.0
ENSMUSG00000064372.1 chrM ENSEMBL gene 15356 15422 . - . NaN Mt_tRNA ... NaN NaN NaN NaN 3 MGI:102478 NaN NaN 1 3.0

55487 rows × 23 columns

#gene_hdf = gene_hdf * gene_meta.loc[gene_hdf.columns, 'weight']
(gene_hdf>0).sum(axis=1)
cortex-p001-cb_034         39820
cortex-p001-cb_025         44877
cortex-p001-cb_119         48588
cortex-p001-cb_009         49538
cortex-p001-cb_005         49383
                           ...  
hippocampus-p347-cb_166    50257
hippocampus-p347-cb_191    49946
hippocampus-p347-cb_178    50059
hippocampus-p347-cb_187    49801
hippocampus-p347-cb_188    50075
Length: 3646, dtype: int64
adata = anndata.read_h5ad(f'{indir}Tan2021_dipc.h5ad')
adata
AnnData object with n_obs × n_vars = 3646 × 1
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    uns: 'leiden', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'dipc_pca_all'
    obsp: 'connectivities', 'distances'
gene3c = anndata.AnnData(gene_hdf.loc[adata.obs.index], obs=adata.obs, var=gene_meta.loc[gene_hdf.columns])
gene3c
AnnData object with n_obs × n_vars = 3646 × 51259
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'
chrom_to_remove = ['chrL', 'chrM', 'chrX', 'chrY']
chrfilter = ~gene3c.var['chrom'].isin(chrom_to_remove)
gene3c = gene3c[:, chrfilter]
genefilter = ((gene3c.X>0).sum(axis=0)>10)
gene3c = gene3c[:, genefilter]
gene3c
View of AnnData object with n_obs × n_vars = 3646 × 51097
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'
gene3c.var.index.name = 'gene_id'
gene3c.raw = gene3c.copy()
#sc.pp.highly_variable_genes(gene3c, n_top_genes=2000, flavor='seurat_v3')
gene3c.write_h5ad(f'cell_{gene3c.shape[0]}_gene3C.h5ad')
# gene3c = anndata.read_h5ad(f'{indir}cell_8225_gene3C_impute.h5ad')
gene3c = anndata.read_h5ad(glob(f'cell_*_gene3C.h5ad')[0])
gene3c
AnnData object with n_obs × n_vars = 3646 × 51097
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'
gene_meta
chrom source feature start end score strand phase transcript_id gene_type ... transcript_status transcript_name exon_number exon_id level mgi_id havana_gene tag bin_len weight
gene_id
ENSMUSG00000102693.1 chr1 HAVANA gene 3073253 3074322 . + . NaN TEC ... NaN NaN NaN NaN 2 MGI:1918292 OTTMUSG00000049935.1 NaN 1 3.0
ENSMUSG00000064842.1 chr1 ENSEMBL gene 3102016 3102125 . + . NaN snRNA ... NaN NaN NaN NaN 3 MGI:5455983 NaN NaN 1 3.0
ENSMUSG00000051951.5 chr1 HAVANA gene 3205901 3671498 . - . NaN protein_coding ... NaN NaN NaN NaN 2 MGI:3528744 OTTMUSG00000026353.2 NaN 48 1225.0
ENSMUSG00000102851.1 chr1 HAVANA gene 3252757 3253236 . + . NaN processed_pseudogene ... NaN NaN NaN NaN 1 MGI:5011141 OTTMUSG00000049958.1 pseudo_consens 1 3.0
ENSMUSG00000103377.1 chr1 HAVANA gene 3365731 3368549 . - . NaN TEC ... NaN NaN NaN NaN 2 MGI:5610408 OTTMUSG00000049960.1 NaN 1 3.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ENSMUSG00000064368.1 chrM ENSEMBL gene 13552 14070 . - . NaN protein_coding ... NaN NaN NaN NaN 3 MGI:102495 NaN NaN 1 3.0
ENSMUSG00000064369.1 chrM ENSEMBL gene 14071 14139 . - . NaN Mt_tRNA ... NaN NaN NaN NaN 3 MGI:102488 NaN NaN 1 3.0
ENSMUSG00000064370.1 chrM ENSEMBL gene 14145 15288 . + . NaN protein_coding ... NaN NaN NaN NaN 3 MGI:102501 NaN NaN 1 3.0
ENSMUSG00000064371.1 chrM ENSEMBL gene 15289 15355 . + . NaN Mt_tRNA ... NaN NaN NaN NaN 3 MGI:102473 NaN NaN 1 3.0
ENSMUSG00000064372.1 chrM ENSEMBL gene 15356 15422 . - . NaN Mt_tRNA ... NaN NaN NaN NaN 3 MGI:102478 NaN NaN 1 3.0

55487 rows × 23 columns

gene3c.var.index = gene_meta.loc[gene3c.var.index, 'gene_name']
gene3c.var_names_make_unique()
sc.pp.normalize_total(gene3c, target_sum=np.median(gene3c.X.sum(axis=1)))
expr = anndata.read_h5ad(f'{indir}Tan2021_rna.h5ad')
expr
AnnData object with n_obs × n_vars = 3517 × 3000
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region', 'cluster'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'rna_pca_all'
    obsp: 'connectivities', 'distances'
expr = anndata.AnnData(X=expr.raw.X, obs=expr.obs, var=expr.raw.var, raw=expr.raw, obsm=expr.obsm, uns=expr.uns, obsp=expr.obsp)
expr = expr[:, expr.var.index.isin(gene3c.var.index)]
expr
View of AnnData object with n_obs × n_vars = 3517 × 24511
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region', 'cluster'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'rna_pca_all'
    obsp: 'connectivities', 'distances'
gene3c = gene3c[:, gene3c.var.index.isin(expr.var.index)]
gene3c
View of AnnData object with n_obs × n_vars = 3646 × 24511
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight'
tmp = np.std(gene3c.X, axis=0)
from scipy.stats import rankdata
tmp = rankdata(-tmp)
gene3c.var['highly_variable'] = (tmp<2000)
gene3c = gene3c[:, gene3c.var['highly_variable']]
expr = expr[:, gene3c.var.index]
# sc.pp.highly_variable_genes(expr, n_top_genes=2000)
# expr = expr[:, expr.var.highly_variable]
sc.pp.regress_out(expr, ['nUMI'])
sc.pp.scale(expr, max_value=10)
dim = 50
model = TruncatedSVD(n_components=dim, algorithm='arpack')
matrix_reduce = model.fit_transform(expr.X)
matrix_reduce = matrix_reduce / model.singular_values_
expr.obsm['rna_pca_all'] = matrix_reduce.copy()
significant_pc_test(expr, p_cutoff=0.1, update=False, obsm='rna_pca_all')
15 components passed P cutoff of 0.1.
15
def dump_embedding(adata, name, n_dim=2):
    # put manifold coordinates into adata.obs
    for i in range(n_dim):
        adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
    return adata
knn = 25
expr.obsm['X_pca'] = normalize(expr.obsm['rna_pca_all'][:, :30], axis=1)
sc.pp.neighbors(expr, n_neighbors=knn, use_rep='X_pca')
sc.tl.umap(expr, maxiter=200, random_state=0)
expr = dump_embedding(expr, 'umap')
expr.obsm['rna_u30_umap'] = expr.obsm['X_umap'].copy()
fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=expr.obs,
                       hue='nUMI',
                       coord_base='umap',
                       max_points=None,
                       labelsize=10,
                       )
_ = categorical_scatter(data=expr.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=expr.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=expr.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../_images/dcd3fb6ea24a041dad64ef6b5e64be4a2403fbd6005088bec4bd597f8af570a6.png
gene3c
View of AnnData object with n_obs × n_vars = 3646 × 1999
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'umap_0', 'umap_1', 'leiden'
    var: 'chrom', 'source', 'feature', 'start', 'end', 'score', 'strand', 'phase', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'exon_number', 'exon_id', 'level', 'mgi_id', 'havana_gene', 'tag', 'bin_len', 'weight', 'highly_variable'
sc.pp.regress_out(gene3c, ['contacts'])
sc.pp.scale(gene3c, max_value=10)
gene3c.obsm['X_pca'] = model.transform(gene3c.X) / model.singular_values_
gene3c.obsm['X_pca'] = normalize(gene3c.obsm['X_pca'][:, :30], axis=1)
expr.obs['Modality'] = 'RNA'
gene3c.obs['Modality'] = '3C'
adata_list = [expr, gene3c]
integrator = SeuratIntegration()
anchor = integrator.find_anchor(adata_list,
                                k_local=None,
                                key_local='X_pca',
                                k_anchor=5,
                                key_anchor='X',
                                dim_red='cca',
                                max_cc_cells=50000,
                                k_score=30,
                                k_filter=None,
                                scale1=False,
                                scale2=False,
                                #scale =[False, True]
                                n_components=30,
                                n_features=200,
                                alignments=[[[0], [1]]])
Find anchors across datasets.
Run CCA
non zero dims 30
Find Anchors using k=30
Score Anchors
Identified 6894 anchors between datasets 0 and 1.
start_time = time.time()
corrected = integrator.integrate(key_correct='X_pca',
                                 row_normalize=True,
                                 n_components=30,
                                 k_weight=100,
                                 sd=1,
                                 alignments=[[[0], [1]]])

print(time.time() - start_time)
Merge datasets
[[0], [1]]
Initialize
Find nearest anchors. k_weight:  100
Normalize graph
Transform data
6.461906433105469
ncell = np.sum([xx.shape[0] for xx in adata_list])
adata_merge = anndata.AnnData(
    X=np.ones((ncell, 1)), obs=pd.concat([xx.obs for xx in adata_list], axis=0)
)
adata_merge
AnnData object with n_obs × n_vars = 7163 × 1
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region', 'cluster', 'Modality', 'tissue', 'treatment', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'contacts', 'leiden'
adata_merge.obsm['u30_seuratcc30nofilter'] = np.concatenate(corrected, axis=0)
adata_merge.obsm['X_pca'] = normalize(adata_merge.obsm['u30_seuratcc30nofilter'][:, :30], axis=1)
# sce.pp.harmony_integrate(adata_merge, 'Modality', max_iter_harmony=30, random_state=0)
# adata_merge.obsm['log_reghvg_cca_u25hm'] = adata_merge.obsm['X_pca_harmony'].copy()
knn = 25
sc.pp.neighbors(adata_merge, n_neighbors=knn, use_rep='u30_seuratcc30nofilter')
sc.tl.umap(adata_merge, maxiter=200, random_state=0)
adata_merge = dump_embedding(adata_merge, 'umap')
adata_merge.obsm['u30_seuratcc30nofilter_u30_umap'] = adata_merge.obsm['X_umap'].copy()
legage = np.sort(adata_merge.obs['age'].unique())
palage = {xx:yy for xx,yy in zip(legage, sns.color_palette('hls', len(legage)))}
# adata_merge.obsm['X_umap'] = adata_merge.obsm['u50_seuratcc30nofilter_u30_umap'].copy()
# dump_embedding(adata_merge, 'umap')

fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='3C']
ax = axes[0,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[0,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='leiden',
                        text_anno='leiden',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )

tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='RNA']
ax = axes[1,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[1,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='cluster',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../_images/8a0bf8d4d66185260b784f2335b93abdfe86ddeda474979ca3e582046cb7ad04.png
adata_merge.obs.loc[:,adata_merge.obs.dtypes == 'object'] = adata_merge.obs.loc[:,adata_merge.obs.dtypes == 'object'].astype(str)
adata_merge.write_h5ad('Tan2021_rna3c.h5ad')
rna_cell = (adata_merge.obs['Modality']=='RNA')
hic_cell = (adata_merge.obs['Modality']=='3C')
print(rna_cell.sum(), hic_cell.sum())

start_time = time.time()
index = pynndescent.NNDescent(adata_merge.obsm['X_pca'][rna_cell], metric='euclidean', n_neighbors=50, random_state=0, n_jobs=-1)
print(time.time() - start_time)
G, D = index.query(adata_merge.obsm['X_pca'][hic_cell], k=15)
print(time.time() - start_time)
3517 3646
0.7463104724884033
2.10052227973938
chunk_size = 50000
sd = 1
start_time = time.time()
cellfilter = D[:, -1] == 0
D = 1 - D / D[:, -1][:, None]
D[cellfilter] = 1
D = 1 - np.exp(-D * (sd**2) / 4)
D = D / (np.sum(D, axis=1) + 1e-6)[:, None]
print(time.time() - start_time)
0.0013263225555419922
rna_cell = rna_cell.index[rna_cell]
hic_cell = hic_cell.index[hic_cell]
enc = OneHotEncoder()
# enc.fit(adata_merge.obs.loc[aibs_cell, ['L3']].values.astype(str))
labelref = enc.fit_transform(adata_merge.obs.loc[rna_cell, 'cluster'].astype(str)[:, None])
cluster = pd.DataFrame(index=hic_cell, columns=['rnatype', 'score'], dtype=str)
for chunk_start in range(0, len(hic_cell), chunk_size):
    result = (
        D[chunk_start : (chunk_start + chunk_size), :, None]
        * labelref[G[chunk_start : (chunk_start + chunk_size)].flatten()]
        .toarray()
        .reshape((-1, 15, enc.categories_[0].shape[0]))
    ).sum(axis=1)
    result = pd.DataFrame(
        result,
        columns=enc.categories_[0],
        index=hic_cell[chunk_start : (chunk_start + chunk_size)],
    )
    result = result.loc[:, result.columns != "nan"]
    cluster.loc[result.index, "rnatype"] = result.idxmax(axis=1).values
    cluster.loc[result.index, "score"] = result.max(axis=1).values
    print(chunk_start)

print(time.time() - start_time)
cluster.to_hdf("DipC_rnacluster.hdf", key="data")
0
0.6793961524963379
cluster['dipctype'] = adata_merge.obs.loc[hic_cell, 'cell-type cluster']
adata_merge.obs.loc[hic_cell, 'cluster'] = cluster['rnatype']
# adata_merge.obsm['X_umap'] = adata_merge.obsm['u50_seuratcc30nofilter_u30_umap'].copy()
# dump_embedding(adata_merge, 'umap')

fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='3C']
ax = axes[0,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[0,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='cluster',
                        # text_anno='celltype',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )

tmp = adata_merge.obs.loc[adata_merge.obs['Modality']=='RNA']
ax = axes[1,0]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='age',
                        palette=palage,
                        labelsize=10,
                        show_legend=True)

ax = axes[1,1]
ax.scatter(adata_merge.obs['umap_0'], adata_merge.obs['umap_1'], c='#e0e0e0', edgecolors='none', s=1, rasterized=True)
_ = categorical_scatter(data=tmp,
                        ax=ax,
                        coord_base='umap',
                        hue='cluster',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        #show_legend=True
                       )
../_images/6501bcd97ca1616cf5f4d3eaeedb86bb69aeb409511b9332b0daf58c652f56e8.png
cf_matrix = cluster[['dipctype','rnatype']].value_counts().unstack().fillna(0)
cf_matrix = cf_matrix / np.sqrt(cf_matrix.sum(axis=1)[:,None]) / np.sqrt(cf_matrix.sum(axis=0))
idx = pd.Series(np.arange(cf_matrix.shape[1]), cf_matrix.columns)
fig, ax = plt.subplots(figsize=(8,4), dpi=300)
sns.heatmap(cf_matrix.iloc[np.argsort(idx[np.argmax(cf_matrix.values, axis=1)])], ax=ax)
<AxesSubplot:xlabel='rnatype', ylabel='dipctype'>
../_images/cba54f7e1ff880c7469f07c956be3e29848c166d487736ff3caeeba4c3d3c6e9.png
cluster['rnatype'].value_counts()
Microglia Etc.                                391
Cortical L6 Pyramidal Cell                    333
Hippocampal Granuale Cell                     309
Neonatal Astrocyte                            234
Cortical L2-5 Pyramidal Cell, Neonatal        211
Mature Oligodendrocyte                        210
Cortical L2/3 Pyramidal Cell                  204
Cortical L4 Pyramidal Cell                    195
Oligodendrocyte Progenitor                    189
Medium Spiny Neuron                           167
Adult Astrocyte                               144
Hippocampal CA1 Pyramidal Cell                134
VIP Interneuron                               112
Cortical L5 Pyramidal Cell                     98
Hippocampal Pyramidal Cell, Neonatal           93
Unknown Interneuron 1                          89
MEIS2 Interneuron                              88
PV/SST Interneuron, Neonatal                   85
Hippocampal CA3 Pyramidal Cell                 70
PV Interneuron                                 61
SST Interneuron                                55
Unknown Interneuron 2                          51
Cortical L2-4 Pyramidal Cell, Intermediate     37
NDNF Interneuron                               36
Newly Formed Oligodendrocyte                   27
Cajal-Retzius Cell                             23
Name: rnatype, dtype: int64
cluster.to_csv('Tan2021_dipc_cluster.csv.gz', )
rnatype score dipctype dipcleiden
cortex-p028-cb_116 Cortical L6 Pyramidal Cell 0.999996 Cortical L6 Pyramidal Cell 5
cortex-visual-control-p007-b6_182 Cortical L6 Pyramidal Cell 0.999992 Neuron 2
cortex-p028-cb_112 Cortical L6 Pyramidal Cell 0.999996 Cortical L2–5 Pyramidal Cell 5
cortex-visual-control-p001-b6_061 Unknown Interneuron 2 0.330496 Neuron 2
cortex-p056-cb_216 Microglia Etc. 0.999989 Microglia Etc. 14
... ... ... ... ...
cortex-visual-control-p021-b6_090 Mature Oligodendrocyte 0.999994 Mature Oligodendrocyte 16
cortex-visual-control-p021-b6_012 Cortical L6 Pyramidal Cell 0.999997 Neuron 5
hippocampus-p007-cb_046 Microglia Etc. 0.999993 Microglia Etc. 13
cortex-visual-dark-p014-b6_106 Microglia Etc. 0.999994 Microglia Etc. 13
cortex-visual-control-p021-b6_174 Microglia Etc. 0.999995 Microglia Etc. 13

3646 rows × 4 columns

cluster['dipcleiden'] = adata_merge.obs.loc[hic_cell, 'leiden']
cf_matrix = cluster[['dipcleiden','rnatype']].value_counts().unstack().fillna(0)
cf_matrix = cf_matrix / np.sqrt(cf_matrix.sum(axis=1)[:,None]) / np.sqrt(cf_matrix.sum(axis=0))
idx = pd.Series(np.arange(cf_matrix.shape[1]), cf_matrix.columns)
sns.clustermap(cf_matrix, figsize=(8,8))
<seaborn.matrix.ClusterGrid at 0x7f0f48f22f50>
../_images/19c59490a59a99bce9f97dda168cca4df9ed5c8d0d70eeb91f1407563a1ef4b3.png