scRNA(MALBAC) clustering

scRNA(MALBAC) clustering#

import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'

indir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/'

data = pd.read_csv(f'{indir}other_data/GSE162509_UMIcount_mouse_decon_filtered_genes.tsv.gz', index_col=0, header=0, sep='\t')
data

	Gnai3	Cdc45	H19	Scml2	Apoh	Narf	Cav2	Klf6	Scmh1	Cox5a	...	AC154500.1	AC114990.3	AC169675.1	AC109261.3	AC109261.4	AC154486.3	CAAA01189291.2	CT025659.3	AC136921.2	AC134446.1
PR10_HCA_P14HM-1_01	0	0	0	0	0	1	0	2	2	1	...	0	0	0	0	0	1	0	0	0	0
PR10_HCA_P14HM-1_02	0	0	0	0	0	1	0	0	2	0	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P14HM-1_03	2	0	0	0	0	2	0	7	3	1	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P14HM-1_04	1	0	0	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P14HM-1_07	0	0	0	0	0	3	0	3	4	1	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
PR10_HCA_P7PC-3_92	0	0	0	0	0	1	0	16	3	0	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P7PC-3_93	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P7PC-3_94	1	0	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P7PC-3_95	0	0	0	0	0	0	0	8	1	3	...	0	0	0	0	0	0	0	0	0	0
PR10_HCA_P7PC-3_96	0	0	0	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

3517 rows × 40760 columns

meta = pd.read_csv(f'{indir}other_data/GSE162509_mapStat_mouse_filter.txt.gz', sep='\t', index_col=0, header=0)
meta

	Reads	Uniquely	Uniquely_ratio	Multi	Multi_ratio	Many	Many_ratio	A_ratio	Mapping_ratio	Genes	nUMI	mito	ERCC	assigned_ratio	avg.cor	keep
PR10_HCA_P14HM-1_01	3815377	3558074	0.9326	65771	0.0172	8865	0.0023	0.988705	0.9521	6414	36609	0.002103	0.020966	0.211559	0.418141	True
PR10_HCA_P14HM-1_02	936833	867607	0.9261	19886	0.0212	2618	0.0028	0.985054	0.9501	3184	7940	0.011083	0.074592	0.215140	0.352350	True
PR10_HCA_P14HM-1_03	2196459	2059892	0.9378	35331	0.0161	3744	0.0017	0.988487	0.9556	4710	16613	0.004093	0.028139	0.197201	0.393179	True
PR10_HCA_P14HM-1_04	1334069	1229429	0.9216	28281	0.0212	4119	0.0031	0.986058	0.9459	3825	10736	0.002701	0.056673	0.233741	0.364440	True
PR10_HCA_P14HM-1_05	268103	187902	0.7009	6316	0.0236	950	0.0035	0.968316	0.7280	1223	2617	0.049675	0.216936	0.355089	0.218429	False
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
PR10_HCA_P7PC-3_92	1287766	1177633	0.9145	28821	0.0224	5017	0.0039	0.978918	0.9408	3593	11865	0.005057	0.056011	0.199417	0.356637	True
PR10_HCA_P7PC-3_93	676788	607913	0.8982	13266	0.0196	1459	0.0022	0.978000	0.9200	1945	4458	0.026918	0.108400	0.251697	0.294853	True
PR10_HCA_P7PC-3_94	2375796	2074957	0.8734	39335	0.0166	5894	0.0025	0.970208	0.8925	4040	18380	0.004570	0.050914	0.195646	0.369667	True
PR10_HCA_P7PC-3_95	1585829	1425909	0.8992	24091	0.0152	2937	0.0019	0.962138	0.9163	3752	16077	0.005411	0.059164	0.267316	0.368889	True
PR10_HCA_P7PC-3_96	1821256	1614804	0.8866	40218	0.0221	6150	0.0034	0.973619	0.9121	4087	27057	0.020993	0.052792	0.279251	0.373820	True

4032 rows × 16 columns

gene_meta = pd.read_csv('/data/ref/mm10/gencode/vm22/gencode.vM22.annotation.gene.flat.tsv.gz', header=0, index_col=None, sep='\t')
gene_meta

	chrom	source	feature	start	end	score	strand	phase	gene_id	transcript_id	...	gene_name	transcript_type	transcript_status	transcript_name	exon_number	exon_id	level	mgi_id	havana_gene	tag
0	chr1	HAVANA	gene	3073253	3074322	.	+	.	ENSMUSG00000102693.1	NaN	...	4933401J01Rik	NaN	NaN	NaN	NaN	NaN	2	MGI:1918292	OTTMUSG00000049935.1	NaN
1	chr1	ENSEMBL	gene	3102016	3102125	.	+	.	ENSMUSG00000064842.1	NaN	...	Gm26206	NaN	NaN	NaN	NaN	NaN	3	MGI:5455983	NaN	NaN
2	chr1	HAVANA	gene	3205901	3671498	.	-	.	ENSMUSG00000051951.5	NaN	...	Xkr4	NaN	NaN	NaN	NaN	NaN	2	MGI:3528744	OTTMUSG00000026353.2	NaN
3	chr1	HAVANA	gene	3252757	3253236	.	+	.	ENSMUSG00000102851.1	NaN	...	Gm18956	NaN	NaN	NaN	NaN	NaN	1	MGI:5011141	OTTMUSG00000049958.1	pseudo_consens
4	chr1	HAVANA	gene	3365731	3368549	.	-	.	ENSMUSG00000103377.1	NaN	...	Gm37180	NaN	NaN	NaN	NaN	NaN	2	MGI:5610408	OTTMUSG00000049960.1	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
55482	chrM	ENSEMBL	gene	13552	14070	.	-	.	ENSMUSG00000064368.1	NaN	...	mt-Nd6	NaN	NaN	NaN	NaN	NaN	3	MGI:102495	NaN	NaN
55483	chrM	ENSEMBL	gene	14071	14139	.	-	.	ENSMUSG00000064369.1	NaN	...	mt-Te	NaN	NaN	NaN	NaN	NaN	3	MGI:102488	NaN	NaN
55484	chrM	ENSEMBL	gene	14145	15288	.	+	.	ENSMUSG00000064370.1	NaN	...	mt-Cytb	NaN	NaN	NaN	NaN	NaN	3	MGI:102501	NaN	NaN
55485	chrM	ENSEMBL	gene	15289	15355	.	+	.	ENSMUSG00000064371.1	NaN	...	mt-Tt	NaN	NaN	NaN	NaN	NaN	3	MGI:102473	NaN	NaN
55486	chrM	ENSEMBL	gene	15356	15422	.	-	.	ENSMUSG00000064372.1	NaN	...	mt-Tp	NaN	NaN	NaN	NaN	NaN	3	MGI:102478	NaN	NaN

55487 rows × 22 columns

gene_meta = gene_meta.loc[~gene_meta['gene_name'].duplicated()]
gene_meta = gene_meta.set_index('gene_name')[['chrom', 'start', 'end', 'gene_id', 'strand']]

data = data.loc[:, data.columns.isin(gene_meta.index)]

adata = anndata.AnnData(csr_matrix(data.values), 
                        obs=meta.loc[data.index, ['Genes', 'nUMI']],
                        var=gene_meta.loc[data.columns]
                      )
adata

AnnData object with n_obs × n_vars = 3517 × 39990
    obs: 'Genes', 'nUMI'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand'

adata.write_h5ad('Tan2021_rna.h5ad')

adata = anndata.read_h5ad('Tan2021_rna.h5ad')
adata

AnnData object with n_obs × n_vars = 3517 × 3000
    obs: 'Genes', 'nUMI', 'umap_0', 'umap_1', 'age', 'region'
    var: 'chrom', 'start', 'end', 'gene_id', 'strand', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'hvg', 'log1p', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap', 'rna_pca_all'
    obsp: 'connectivities', 'distances'

cluster = pd.read_csv(f'{indir}metadata_major_cell_types.txt', sep='\t', header=None, index_col=0)[1]
cluster_neu = pd.read_csv(f'{indir}metadata_neuron_cell_types.txt', sep='\t', header=None, index_col=0)[1]
cluster.loc[cluster_neu.index] = cluster_neu.copy()
cluster.value_counts()

Hippocampal Granuale Cell                     398
Microglia Etc.                                364
Mature Oligodendrocyte                        314
Cortical L6 Pyramidal Cell                    297
Neonatal Astrocyte                            230
Oligodendrocyte Progenitor                    177
Hippocampal CA1 Pyramidal Cell                175
Cortical L2-5 Pyramidal Cell, Neonatal        156
Cortical L4 Pyramidal Cell                    151
Hippocampal Pyramidal Cell, Neonatal          136
Cortical L2/3 Pyramidal Cell                  134
Adult Astrocyte                               116
Cortical L5 Pyramidal Cell                     96
PV/SST Interneuron, Neonatal                   96
Unknown Interneuron 1                          94
VIP Interneuron                                85
Hippocampal CA3 Pyramidal Cell                 73
Medium Spiny Neuron                            73
Unknown Interneuron 2                          73
MEIS2 Interneuron                              53
SST Interneuron                                49
PV Interneuron                                 45
Newly Formed Oligodendrocyte                   39
NDNF Interneuron                               38
Cortical L2-4 Pyramidal Cell, Intermediate     30
Cajal-Retzius Cell                             25
Name: 1, dtype: int64

adata.obs['region'] = [xx.split('-')[0].split('_')[-1][-2:] for xx in adata.obs.index]
adata.obs['age'] = [xx.split('-')[0].split('_')[-1][1:-2] for xx in adata.obs.index]
adata.obs['age'] = adata.obs['age'].astype(int)
adata.obs['cluster'] = cluster.copy()

sc.pp.filter_genes(adata, min_cells=10)

sc.pp.highly_variable_genes(adata, n_top_genes=3000, flavor='seurat_v3')
adata.raw = adata.copy()
adata = adata[:, adata.var['highly_variable']].copy()
adata.X.data = adata.X.data / np.repeat(adata.obs['nUMI'].values, adata.X.getnnz(axis=1)) * adata.obs['nUMI'].median()
sc.pp.log1p(adata)

model = TruncatedSVD(n_components=50, algorithm='arpack', random_state=0) 
model.fit(adata.X)
sel_dim = (model.singular_values_ != 0)
print(sel_dim.sum())

data = model.transform(adata.X)
adata.obsm['rna_pca_all'] = data / model.singular_values_

significant_pc_test(adata, p_cutoff=0.1, update=False, obsm='rna_pca_all')

17 components passed P cutoff of 0.1.

adata.obsm['X_pca'] = normalize(adata.obsm['rna_pca_all'][:, :30], axis=1)

def dump_embedding(adata, name, n_dim=2):
    # put manifold coordinates into adata.obs
    for i in range(n_dim):
        adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
    return adata

knn = 25
sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X_pca')
sc.tl.umap(adata, maxiter=200, random_state=0)
adata = dump_embedding(adata, 'umap')

fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='Genes',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )

../_images/e0de4d91b09a3e7732d833432eb7ef9cec6841a0482fea01e7c7942879bd1384.png

fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='Genes',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )

(0.0, 1.0, 0.0, 1.0)

../_images/f6b53f59bfe1648c9a4b0dcb4a05d07b9eb0a05b09ff90826d159b541fc41cb9.png

fig, axes = plt.subplots(2, 2, figsize=(10, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='Genes',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='rainbow',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='region',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cluster',
                        coord_base='umap',
                        # text_anno='cluster',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )

../_images/bdfecda4b90c7b6a6bdcaa17f06273beb5eed8ecde6a3acd21c18531870e4fc3.png

adata.write_h5ad('Tan2021_rna.h5ad')