DipC clustering at 100kb resolution

DipC clustering at 100kb resolution#

import time
import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
indir = '/data/test_schicluster/Tan2021/scool/dataset/'
chrom_sizes = pd.read_csv('/data/ref/mm10/genome/mm10.autosome.chrom.sizes', index_col=0, header=None, sep='\t')[1]
chrom_sizes
0
chr1     195471971
chr2     182113224
chr3     160039680
chr4     156508116
chr5     151834684
chr6     149736546
chr7     145441459
chr8     129401213
chr9     124595110
chr10    130694993
chr11    122082543
chr12    120129022
chr13    120421639
chr14    124902244
chr15    104043685
chr16     98207768
chr17     94987271
chr18     90702639
chr19     61431566
Name: 1, dtype: int64
celllist = pd.read_csv(f'{indir}../impute/100K/cell_table.tsv', sep='\t', index_col=0, header=None)
celllist
1
0
cortex-p028-cb_116 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p007-b6_182 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-p028-cb_112 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p001-b6_061 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-p056-cb_216 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
... ...
cortex-visual-control-p021-b6_090 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p021-b6_012 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
hippocampus-p007-cb_046 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-dark-p014-b6_106 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...
cortex-visual-control-p021-b6_174 /anvil/scratch/x-zhou/Tan2021/scool/impute/100...

3646 rows × 1 columns

meta = pd.read_csv('meta.txt', sep='\t', header=0, index_col=0)
meta
tissue treatment age sex father mother restriction enzyme cell-type cluster reads read length (bp) raw throughput (Gb) raw contacts raw intra (%) dup rate (%) contacts intra (%) phased legs (%) raw contacts per read (%) 20kb RMS RMSD
cell
cortex-p001-cb_001 cortex control P1 female C57BL/6J CAST/EiJ MboI Neonatal Astrocyte 13,675,687 150 4.1 1,703,101 70.8 70.4 503,696 68.1 47.0 12.5 0.50
cortex-p001-cb_002 cortex control P1 female C57BL/6J CAST/EiJ MboI Neonatal Neuron 1 10,057,130 150 3.0 1,202,070 76.5 68.6 377,718 73.6 47.0 12.0 10.07
cortex-p001-cb_003 cortex control P1 female C57BL/6J CAST/EiJ MboI Neonatal Neuron 2 12,673,668 150 3.8 1,163,722 84.2 68.6 365,317 80.2 46.5 9.2 11.35
cortex-p001-cb_004 cortex control P1 female C57BL/6J CAST/EiJ MboI Unknown 12,936,354 150 3.9 1,289,148 84.6 69.2 396,869 81.0 46.5 10.0 14.27
cortex-p001-cb_005 cortex control P1 female C57BL/6J CAST/EiJ MboI Neonatal Neuron 2 14,086,574 150 4.2 1,510,084 76.9 71.2 434,395 73.4 46.9 10.7 0.97
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
cortex-visual-dark-p028-b6_188 visual cortex dark rearing P28 male C57BL/6N C57BL/6N NlaIII Neuron 5,796,965 150 1.7 603,448 92.2 57.7 255,561 90.4 0.0 10.4 NaN
cortex-visual-dark-p028-b6_189 visual cortex dark rearing P28 male C57BL/6N C57BL/6N NlaIII Neuron 6,520,182 150 2.0 736,632 90.6 59.8 296,289 88.7 0.0 11.3 NaN
cortex-visual-dark-p028-b6_190 visual cortex dark rearing P28 male C57BL/6N C57BL/6N NlaIII Neuron 10,724,783 150 3.2 1,296,262 90.6 70.6 381,381 87.9 0.0 12.1 NaN
cortex-visual-dark-p028-b6_191 visual cortex dark rearing P28 male C57BL/6N C57BL/6N NlaIII Unknown 6,720,196 150 2.0 473,515 85.4 58.8 195,243 83.0 0.0 7.0 NaN
cortex-visual-dark-p028-b6_192 visual cortex dark rearing P28 male C57BL/6N C57BL/6N NlaIII Neuron 6,859,779 150 2.1 909,985 91.5 62.6 340,432 89.8 0.0 13.3 NaN

4272 rows × 19 columns

start_time = time.time()
matrix = []
dim = 50
for chrom in chrom_sizes.index:
    tmp = np.load(f'{indir}embedding/raw/{chrom}.npz')['arr_0']
    dim = min(dim, tmp.shape[0] - 1, tmp.shape[1] - 1)
    model = TruncatedSVD(n_components=dim, algorithm='arpack')
    tmp_reduce = model.fit_transform(tmp)
    matrix.append(tmp_reduce / model.singular_values_)
    print(chrom, time.time() - start_time)
chr1 6.581930160522461
chr2 11.939553499221802
chr3 17.48383092880249
chr4 22.65337896347046
chr5 27.87999391555786
chr6 33.39244103431702
chr7 38.8983359336853
chr8 43.60885500907898
chr9 49.0135338306427
chr10 54.309446811676025
chr11 58.871928453445435
chr12 64.58947443962097
chr13 67.8880524635315
chr14 73.15141296386719
chr15 76.84170579910278
chr16 79.75830459594727
chr17 82.38596296310425
chr18 85.09349274635315
chr19 86.87178158760071
model = TruncatedSVD(n_components=dim, algorithm='arpack')
matrix_reduce = model.fit_transform(np.concatenate(matrix, axis=1))
matrix_reduce = matrix_reduce / model.singular_values_
adata = anndata.AnnData(X=np.ones((celllist.shape[0],1)), 
                        obs=meta.loc[celllist.index])
adata
AnnData object with n_obs × n_vars = 3646 × 1
    obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'reads', 'read length (bp)', 'raw throughput (Gb)', 'raw contacts', 'raw intra (%)', 'dup rate (%)', 'contacts', 'intra (%)', 'phased legs (%)', 'raw contacts per read (%)', '20kb RMS RMSD'
adata.obs.index.name = 'cell_name'
adata.obs['age'] = adata.obs['age'].str[1:].astype(int)
adata.obs['contacts'] = adata.obs['contacts'].str.replace(',','').astype(int)
adata.obs = adata.obs[['tissue', 'treatment', 'age', 'sex', 'father', 'mother',
                       'restriction enzyme', 'cell-type cluster', 'contacts']]
adata.obsm['dipc_pca_all'] = matrix_reduce.copy()
significant_pc_test(adata, p_cutoff=0.1, update=False, obsm='dipc_pca_all')
7 components passed P cutoff of 0.1.
7
adata.obsm['X_pca'] = normalize(adata.obsm['dipc_pca_all'][:, :20], axis=1)
def dump_embedding(adata, name, n_dim=2):
    # put manifold coordinates into adata.obs
    for i in range(n_dim):
        adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
    return adata
knn = 25
sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X_pca')
sc.tl.umap(adata, maxiter=200, random_state=0)
adata = dump_embedding(adata, 'umap')
sc.tl.leiden(adata, resolution=1.0, random_state=0)
fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
                       data=adata.obs,
                       hue='contacts',
                       coord_base='umap',
                       #max_points=None,
                       labelsize=10,
                       s=4)
_ = categorical_scatter(data=adata.obs,
                        ax=axes[0,1],
                        coord_base='umap',
                        hue='age',
                        palette='hls',
                        labelsize=10,
                        show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
                        data=adata.obs,
                        hue='tissue',
                        coord_base='umap',
                        # text_anno='region',
                        # palette='tab10',
                        labelsize=10,
                        show_legend=True
                       )
_ = categorical_scatter(ax=axes[1,1],
                        data=adata.obs,
                        hue='cell-type cluster',
                        coord_base='umap',
                        # text_anno='cell-type cluster',
                        palette='tab20',
                        labelsize=10,
                        show_legend=True
                       )
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../_images/8125aae0dce5e14dfe50f50d463e5c5a998c0dff313d7b8b5b35f1891bacd4e3.png
fig, ax = plt.subplots(figsize=(6, 4), dpi=300, constrained_layout=True)
_ = categorical_scatter(data=adata.obs,
                        ax=ax,
                        coord_base='umap',
                        hue='leiden',
                        text_anno='leiden',
                        palette='tab20',
                        labelsize=10,
                        # show_legend=True
                       )
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
../_images/bc0e35afda603b996d3a11489cf5b56b21952ddf1139006dfc9a6e35e89493f1.png
adata.write_h5ad('Tan2021_dipc.h5ad')