scRNA(MALBAC) clustering

scRNA(MALBAC) clustering#

import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
indir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/'
data = pd.read_csv(f'{indir}other_data/GSE162509_UMIcount_mouse_decon_filtered_genes.tsv.gz', index_col=0, header=0, sep='\t')
data
Gnai3 Cdc45 H19 Scml2 Apoh Narf Cav2 Klf6 Scmh1 Cox5a ... AC154500.1 AC114990.3 AC169675.1 AC109261.3 AC109261.4 AC154486.3 CAAA01189291.2 CT025659.3 AC136921.2 AC134446.1
PR10_HCA_P14HM-1_01 0 0 0 0 0 1 0 2 2 1 ... 0 0 0 0 0 1 0 0 0 0
PR10_HCA_P14HM-1_02 0 0 0 0 0 1 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P14HM-1_03 2 0 0 0 0 2 0 7 3 1 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P14HM-1_04 1 0 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P14HM-1_07 0 0 0 0 0 3 0 3 4 1 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PR10_HCA_P7PC-3_92 0 0 0 0 0 1 0 16 3 0 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P7PC-3_93 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P7PC-3_94 1 0 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P7PC-3_95 0 0 0 0 0 0 0 8 1 3 ... 0 0 0 0 0 0 0 0 0 0
PR10_HCA_P7PC-3_96 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3517 rows × 40760 columns

meta = pd.read_csv(f'{indir}other_data/GSE162509_mapStat_mouse_filter.txt.gz', sep='\t', index_col=0, header=0)
meta
Reads Uniquely Uniquely_ratio Multi Multi_ratio Many Many_ratio A_ratio Mapping_ratio Genes nUMI mito ERCC assigned_ratio avg.cor keep
PR10_HCA_P14HM-1_01 3815377 3558074 0.9326 65771 0.0172 8865 0.0023 0.988705 0.9521 6414 36609 0.002103 0.020966 0.211559 0.418141 True
PR10_HCA_P14HM-1_02 936833 867607 0.9261 19886 0.0212 2618 0.0028 0.985054 0.9501 3184 7940 0.011083 0.074592 0.215140 0.352350 True
PR10_HCA_P14HM-1_03 2196459 2059892 0.9378 35331 0.0161 3744 0.0017 0.988487 0.9556 4710 16613 0.004093 0.028139 0.197201 0.393179 True
PR10_HCA_P14HM-1_04 1334069 1229429 0.9216 28281 0.0212 4119 0.0031 0.986058 0.9459 3825 10736 0.002701 0.056673 0.233741 0.364440 True
PR10_HCA_P14HM-1_05 268103 187902 0.7009 6316 0.0236 950 0.0035 0.968316 0.7280 1223 2617 0.049675 0.216936 0.355089 0.218429 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PR10_HCA_P7PC-3_92 1287766 1177633 0.9145 28821 0.0224 5017 0.0039 0.978918 0.9408 3593 11865 0.005057 0.056011 0.199417 0.356637 True
PR10_HCA_P7PC-3_93 676788 607913 0.8982 13266 0.0196 1459 0.0022 0.978000 0.9200 1945 4458 0.026918 0.108400 0.251697 0.294853 True
PR10_HCA_P7PC-3_94 2375796 2074957 0.8734 39335 0.0166 5894 0.0025 0.970208 0.8925 4040 18380 0.004570 0.050914 0.195646 0.369667 True
PR10_HCA_P7PC-3_95 1585829 1425909 0.8992 24091 0.0152 2937 0.0019 0.962138 0.9163 3752 16077 0.005411 0.059164 0.267316 0.368889 True
PR10_HCA_P7PC-3_96 1821256 1614804 0.8866 40218 0.0221 6150 0.0034 0.973619 0.9121 4087 27057 0.020993 0.052792 0.279251 0.373820 True

4032 rows × 16 columns

gene_meta = pd.read_csv('/data/ref/mm10/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz', header=0, index_col=None, sep='\t')
gene_meta
chrom source feature start end score strand phase gene_id transcript_id ... gene_name transcript_type transcript_status transcript_name exon_number exon_id level mgi_id havana_gene tag
0 chr1 HAVANA gene 3073253 3074322 . + . ENSMUSG00000102693.1 NaN ... 4933401J01Rik NaN NaN NaN NaN NaN 2 MGI:1918292 OTTMUSG00000049935.1 NaN
1 chr1 ENSEMBL gene 3102016 3102125 . + . ENSMUSG00000064842.1 NaN ... Gm26206 NaN NaN NaN NaN NaN 3 MGI:5455983 NaN NaN
2 chr1 HAVANA gene 3205901 3671498 . - . ENSMUSG00000051951.5 NaN ... Xkr4 NaN NaN NaN NaN NaN 2 MGI:3528744 OTTMUSG00000026353.2 NaN
3 chr1 HAVANA gene 3252757 3253236 . + . ENSMUSG00000102851.1 NaN ... Gm18956 NaN NaN NaN NaN NaN 1 MGI:5011141 OTTMUSG00000049958.1 pseudo_consens
4 chr1 HAVANA gene 3365731 3368549 . - . ENSMUSG00000103377.1 NaN ... Gm37180 NaN NaN NaN NaN NaN 2 MGI:5610408 OTTMUSG00000049960.1 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
55482 chrM ENSEMBL gene 13552 14070 . - . ENSMUSG00000064368.1 NaN ... mt-Nd6 NaN NaN NaN NaN NaN 3 MGI:102495 NaN NaN
55483 chrM ENSEMBL gene 14071 14139 . - . ENSMUSG00000064369.1 NaN ... mt-Te NaN NaN NaN NaN NaN 3 MGI:102488 NaN NaN
55484 chrM ENSEMBL gene 14145 15288 . + . ENSMUSG00000064370.1 NaN ... mt-Cytb NaN NaN NaN NaN NaN 3 MGI:102501 NaN NaN
55485 chrM ENSEMBL gene 15289 15355 . + . ENSMUSG00000064371.1 NaN ... mt-Tt NaN NaN NaN NaN NaN 3 MGI:102473 NaN NaN
55486 chrM ENSEMBL gene 15356 15422 . - . ENSMUSG00000064372.1 NaN ... mt-Tp NaN NaN NaN NaN NaN 3 MGI:102478 NaN NaN

55487 rows × 22 columns

gene_meta = gene_meta.loc[~gene_meta['gene_name'].duplicated()]
gene_meta = gene_meta.set_index('gene_name')[['chrom', 'start', 'end', 'gene_id', 'strand']]
data = data.loc[:, data.columns.isin(gene_meta.index)]
adata = anndata.AnnData(csr_matrix(data.values), 
                        obs=meta.loc[data.index, ['Genes', 'nUMI']],
                        var=gene_meta.loc[data.columns]
                      )
adata
AnnData object with n_obs × n_vars = 3517 × 39990
    obs: 'Genes', 'nUMI'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand'
adata.write_h5ad('Tan2021_rna.h5ad')
adata = anndata.read_h5ad('Tan2021_rna.h5ad')
adata
AnnData object with n_obs × n_vars = 3517 × 3000
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'rna_pca_all'
    obsp: 'connectivities', 'distances'
cluster = pd.read_csv(f'{indir}metadata_major_cell_types.txt', sep='\t', header=None, index_col=0)[1]
cluster_neu = pd.read_csv(f'{indir}metadata_neuron_cell_types.txt', sep='\t', header=None, index_col=0)[1]
cluster.loc[cluster_neu.index] = cluster_neu.copy()
cluster.value_counts()
Hippocampal Granuale Cell                     398
Microglia Etc.                                364
Mature Oligodendrocyte                        314
Cortical L6 Pyramidal Cell                    297
Neonatal Astrocyte                            230
Oligodendrocyte Progenitor                    177
Hippocampal CA1 Pyramidal Cell                175
Cortical L2-5 Pyramidal Cell, Neonatal        156
Cortical L4 Pyramidal Cell                    151
Hippocampal Pyramidal Cell, Neonatal          136
Cortical L2/3 Pyramidal Cell                  134
Adult Astrocyte                               116
Cortical L5 Pyramidal Cell                     96
PV/SST Interneuron, Neonatal                   96
Unknown Interneuron 1                          94
VIP Interneuron                                85
Hippocampal CA3 Pyramidal Cell                 73
Medium Spiny Neuron                            73
Unknown Interneuron 2                          73
MEIS2 Interneuron                              53
SST Interneuron                                49
PV Interneuron                                 45
Newly Formed Oligodendrocyte                   39
NDNF Interneuron                               38
Cortical L2-4 Pyramidal Cell, Intermediate     30
Cajal-Retzius Cell                             25
Name: 1, dtype: int64
adata.obs['region'] = [xx.split('-')[0].split('_')[-1][-2:] for xx in adata.obs.index]
adata.obs['age'] = [xx.split('-')[0].split('_')[-1][1:-2] for xx in adata.obs.index]
adata.obs['age'] = adata.obs['age'].astype(int)
adata.obs['cluster'] = cluster.copy()
sc.pp.filter_genes(adata, min_cells=10)
sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor='seurat_v3')
adata.raw = adata.copy()
adata = adata[:, adata.var['highly_variable']].copy()
adata.X.data = adata.X.data / np.repeat(adata.obs['nUMI'].values, adata.X.getnnz(axis=1)) * adata.obs['nUMI'].median()
sc.pp.log1p(adata)
model = TruncatedSVD(n_components=50, algorithm='arpack', random_state=0) 
model.fit(adata.X)
sel_dim = (model.singular_values_ != 0)
print(sel_dim.sum())
50
data = model.transform(adata.X)
adata.obsm['rna_pca_all'] = data / model.singular_values_
significant_pc_test(adata, p_cutoff=0.1, update=False, obsm='rna_pca_all')
17 components passed P cutoff of 0.1.
17
adata.obsm['X_pca'] = normalize(adata.obsm['rna_pca_all'][:, :30], axis=1)
def dump_embedding(adata, name, n_dim=2):
    # put manifold coordinates into adata.obs
    for i in range(n_dim):
        adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
    return adata
knn = 25
sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X_pca')
sc.tl.umap(adata, maxiter=200, random_state=0)
adata = dump_embedding(adata, 'umap')
fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='Genes',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )
../_images/e0de4d91b09a3e7732d833432eb7ef9cec6841a0482fea01e7c7942879bd1384.png
fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='Genes',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )
(0.0, 1.0, 0.0, 1.0)
../_images/f6b53f59bfe1648c9a4b0dcb4a05d07b9eb0a05b09ff90826d159b541fc41cb9.png
fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='Genes',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )
../_images/bdfecda4b90c7b6a6bdcaa17f06273beb5eed8ecde6a3acd21c18531870e4fc3.png
adata.write_h5ad('Tan2021_rna.h5ad')