DipC clustering at 100kb resolution

DipC clustering at 100kb resolution#

import time
import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'

indir = '/data/test_schicluster/Tan2021/scool/dataset/'

chrom_sizes = pd.read_csv('/data/ref/mm10/genome/mm10.autosome.chrom.sizes', index_col=0, header=None, sep='\t')[1]
chrom_sizes

0
chr1     195471971
chr2     182113224
chr3     160039680
chr4     156508116
chr5     151834684
chr6     149736546
chr7     145441459
chr8     129401213
chr9     124595110
chr10    130694993
chr11    122082543
chr12    120129022
chr13    120421639
chr14    124902244
chr15    104043685
chr16     98207768
chr17     94987271
chr18     90702639
chr19     61431566
Name: 1, dtype: int64

celllist = pd.read_csv(f'{indir}../impute/100K/cell_table.tsv', sep='\t', index_col=0, header=None)
celllist

	1
0
cortex-p028-cb_116	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p007-b6_182	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-p028-cb_112	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p001-b6_061	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-p056-cb_216	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
...	...
cortex-visual-control-p021-b6_090	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p021-b6_012	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
hippocampus-p007-cb_046	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-dark-p014-b6_106	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p021-b6_174	/anvil/scratch/x-zhou/Tan2021/scool/impute/100...

3646 rows × 1 columns

meta = pd.read_csv('meta.txt', sep='\t', header=0, index_col=0)
meta

	tissue	treatment	age	sex	father	mother	restriction enzyme	cell-type cluster	reads	read length (bp)	raw throughput (Gb)	raw contacts	raw intra (%)	dup rate (%)	contacts	intra (%)	phased legs (%)	raw contacts per read (%)	20kb RMS RMSD
cell
cortex-p001-cb_001	cortex	control	P1	female	C57BL/6J	CAST/EiJ	MboI	Neonatal Astrocyte	13,675,687	150	4.1	1,703,101	70.8	70.4	503,696	68.1	47.0	12.5	0.50
cortex-p001-cb_002	cortex	control	P1	female	C57BL/6J	CAST/EiJ	MboI	Neonatal Neuron 1	10,057,130	150	3.0	1,202,070	76.5	68.6	377,718	73.6	47.0	12.0	10.07
cortex-p001-cb_003	cortex	control	P1	female	C57BL/6J	CAST/EiJ	MboI	Neonatal Neuron 2	12,673,668	150	3.8	1,163,722	84.2	68.6	365,317	80.2	46.5	9.2	11.35
cortex-p001-cb_004	cortex	control	P1	female	C57BL/6J	CAST/EiJ	MboI	Unknown	12,936,354	150	3.9	1,289,148	84.6	69.2	396,869	81.0	46.5	10.0	14.27
cortex-p001-cb_005	cortex	control	P1	female	C57BL/6J	CAST/EiJ	MboI	Neonatal Neuron 2	14,086,574	150	4.2	1,510,084	76.9	71.2	434,395	73.4	46.9	10.7	0.97
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
cortex-visual-dark-p028-b6_188	visual cortex	dark rearing	P28	male	C57BL/6N	C57BL/6N	NlaIII	Neuron	5,796,965	150	1.7	603,448	92.2	57.7	255,561	90.4	0.0	10.4	NaN
cortex-visual-dark-p028-b6_189	visual cortex	dark rearing	P28	male	C57BL/6N	C57BL/6N	NlaIII	Neuron	6,520,182	150	2.0	736,632	90.6	59.8	296,289	88.7	0.0	11.3	NaN
cortex-visual-dark-p028-b6_190	visual cortex	dark rearing	P28	male	C57BL/6N	C57BL/6N	NlaIII	Neuron	10,724,783	150	3.2	1,296,262	90.6	70.6	381,381	87.9	0.0	12.1	NaN
cortex-visual-dark-p028-b6_191	visual cortex	dark rearing	P28	male	C57BL/6N	C57BL/6N	NlaIII	Unknown	6,720,196	150	2.0	473,515	85.4	58.8	195,243	83.0	0.0	7.0	NaN
cortex-visual-dark-p028-b6_192	visual cortex	dark rearing	P28	male	C57BL/6N	C57BL/6N	NlaIII	Neuron	6,859,779	150	2.1	909,985	91.5	62.6	340,432	89.8	0.0	13.3	NaN

4272 rows × 19 columns

start_time = time.time()
matrix = []
dim = 50
for chrom in chrom_sizes.index:
    tmp = np.load(f'{indir}embedding/raw/{chrom}.npz')['arr_0']
    dim = min(dim, tmp.shape[0] - 1, tmp.shape[1] - 1)
    model = TruncatedSVD(n_components=dim, algorithm='arpack')
    tmp_reduce = model.fit_transform(tmp)
    matrix.append(tmp_reduce / model.singular_values_)
    print(chrom, time.time() - start_time)

chr1 6.581930160522461
chr2 11.939553499221802
chr3 17.48383092880249
chr4 22.65337896347046
chr5 27.87999391555786
chr6 33.39244103431702
chr7 38.8983359336853
chr8 43.60885500907898
chr9 49.0135338306427
chr10 54.309446811676025
chr11 58.871928453445435
chr12 64.58947443962097
chr13 67.8880524635315
chr14 73.15141296386719
chr15 76.84170579910278
chr16 79.75830459594727
chr17 82.38596296310425
chr18 85.09349274635315
chr19 86.87178158760071

model = TruncatedSVD(n_components=dim, algorithm='arpack')
matrix_reduce = model.fit_transform(np.concatenate(matrix, axis=1))
matrix_reduce = matrix_reduce / model.singular_values_

adata = anndata.AnnData(X=np.ones((celllist.shape[0],1)), 
                        obs=meta.loc[celllist.index])
adata

AnnData object with n_obs × n_vars = 3646 × 1
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'reads', 'read length (bp)', 'raw throughput (Gb)', 'raw contacts', 'raw intra (%)', 'dup rate (%)', 'contacts', 'intra (%)', 'phased legs (%)', 'raw contacts per read (%)', '20kb RMS RMSD'

adata.obs.index.name = 'cell_name'
adata.obs['age'] = adata.obs['age'].str[1:].astype(int)
adata.obs['contacts'] = adata.obs['contacts'].str.replace(',','').astype(int)

adata.obs = adata.obs[['tissue', 'treatment', 'age', 'sex', 'father', 'mother',
                       'restriction enzyme', 'cell-type cluster', 'contacts']]

adata.obsm['dipc_pca_all'] = matrix_reduce.copy()

significant_pc_test(adata, p_cutoff=0.1, update=False, obsm='dipc_pca_all')

7 components passed P cutoff of 0.1.

adata.obsm['X_pca'] = normalize(adata.obsm['dipc_pca_all'][:, :20], axis=1)

def dump_embedding(adata, name, n_dim=2):
    # put manifold coordinates into adata.obs
    for i in range(n_dim):
        adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
    return adata

knn = 25
sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X_pca')
sc.tl.umap(adata, maxiter=200, random_state=0)
adata = dump_embedding(adata, 'umap')

sc.tl.leiden(adata, resolution=1.0, random_state=0)

fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='contacts',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='hls',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='tissue',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cell-type cluster',
                        coord_base='umap',
                        # text_anno='cell-type cluster',
                        palette='tab20',
                        labelsize=10,
                        show_legend=True
                       )

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../_images/8125aae0dce5e14dfe50f50d463e5c5a998c0dff313d7b8b5b35f1891bacd4e3.png

fig, ax = plt.subplots(figsize=(6, 4), dpi=300, constrained_layout=True)
_ = categorical_scatter(data=adata.obs,
                        ax=ax,
                        coord_base='umap',
                        hue='leiden',
                        text_anno='leiden',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )

findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica

../_images/bc0e35afda603b996d3a11489cf5b56b21952ddf1139006dfc9a6e35e89493f1.png

adata.write_h5ad('Tan2021_dipc.h5ad')