DipC clustering at 100kb resolution#
import time
import numpy as np
import pandas as pd
from glob import glob
import anndata
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import cm as cm
import seaborn as sns
from scipy.sparse import csr_matrix
from ALLCools.plot import *
from ALLCools.clustering import *
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
mpl.style.use('default')
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.family'] = 'sans-serif'
mpl.rcParams['font.sans-serif'] = 'Helvetica'
indir = '/data/test_schicluster/Tan2021/scool/dataset/'
chrom_sizes = pd.read_csv('/data/ref/mm10/genome/mm10.autosome.chrom.sizes', index_col=0, header=None, sep='\t')[1]
chrom_sizes
0
chr1 195471971
chr2 182113224
chr3 160039680
chr4 156508116
chr5 151834684
chr6 149736546
chr7 145441459
chr8 129401213
chr9 124595110
chr10 130694993
chr11 122082543
chr12 120129022
chr13 120421639
chr14 124902244
chr15 104043685
chr16 98207768
chr17 94987271
chr18 90702639
chr19 61431566
Name: 1, dtype: int64
celllist = pd.read_csv(f'{indir}../impute/100K/cell_table.tsv', sep='\t', index_col=0, header=None)
celllist
1 | |
---|---|
0 | |
cortex-p028-cb_116 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-visual-control-p007-b6_182 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-p028-cb_112 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-visual-control-p001-b6_061 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-p056-cb_216 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
... | ... |
cortex-visual-control-p021-b6_090 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-visual-control-p021-b6_012 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
hippocampus-p007-cb_046 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-visual-dark-p014-b6_106 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
cortex-visual-control-p021-b6_174 | /anvil/scratch/x-zhou/Tan2021/scool/impute/100... |
3646 rows × 1 columns
meta = pd.read_csv('meta.txt', sep='\t', header=0, index_col=0)
meta
tissue | treatment | age | sex | father | mother | restriction enzyme | cell-type cluster | reads | read length (bp) | raw throughput (Gb) | raw contacts | raw intra (%) | dup rate (%) | contacts | intra (%) | phased legs (%) | raw contacts per read (%) | 20kb RMS RMSD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
cell | |||||||||||||||||||
cortex-p001-cb_001 | cortex | control | P1 | female | C57BL/6J | CAST/EiJ | MboI | Neonatal Astrocyte | 13,675,687 | 150 | 4.1 | 1,703,101 | 70.8 | 70.4 | 503,696 | 68.1 | 47.0 | 12.5 | 0.50 |
cortex-p001-cb_002 | cortex | control | P1 | female | C57BL/6J | CAST/EiJ | MboI | Neonatal Neuron 1 | 10,057,130 | 150 | 3.0 | 1,202,070 | 76.5 | 68.6 | 377,718 | 73.6 | 47.0 | 12.0 | 10.07 |
cortex-p001-cb_003 | cortex | control | P1 | female | C57BL/6J | CAST/EiJ | MboI | Neonatal Neuron 2 | 12,673,668 | 150 | 3.8 | 1,163,722 | 84.2 | 68.6 | 365,317 | 80.2 | 46.5 | 9.2 | 11.35 |
cortex-p001-cb_004 | cortex | control | P1 | female | C57BL/6J | CAST/EiJ | MboI | Unknown | 12,936,354 | 150 | 3.9 | 1,289,148 | 84.6 | 69.2 | 396,869 | 81.0 | 46.5 | 10.0 | 14.27 |
cortex-p001-cb_005 | cortex | control | P1 | female | C57BL/6J | CAST/EiJ | MboI | Neonatal Neuron 2 | 14,086,574 | 150 | 4.2 | 1,510,084 | 76.9 | 71.2 | 434,395 | 73.4 | 46.9 | 10.7 | 0.97 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
cortex-visual-dark-p028-b6_188 | visual cortex | dark rearing | P28 | male | C57BL/6N | C57BL/6N | NlaIII | Neuron | 5,796,965 | 150 | 1.7 | 603,448 | 92.2 | 57.7 | 255,561 | 90.4 | 0.0 | 10.4 | NaN |
cortex-visual-dark-p028-b6_189 | visual cortex | dark rearing | P28 | male | C57BL/6N | C57BL/6N | NlaIII | Neuron | 6,520,182 | 150 | 2.0 | 736,632 | 90.6 | 59.8 | 296,289 | 88.7 | 0.0 | 11.3 | NaN |
cortex-visual-dark-p028-b6_190 | visual cortex | dark rearing | P28 | male | C57BL/6N | C57BL/6N | NlaIII | Neuron | 10,724,783 | 150 | 3.2 | 1,296,262 | 90.6 | 70.6 | 381,381 | 87.9 | 0.0 | 12.1 | NaN |
cortex-visual-dark-p028-b6_191 | visual cortex | dark rearing | P28 | male | C57BL/6N | C57BL/6N | NlaIII | Unknown | 6,720,196 | 150 | 2.0 | 473,515 | 85.4 | 58.8 | 195,243 | 83.0 | 0.0 | 7.0 | NaN |
cortex-visual-dark-p028-b6_192 | visual cortex | dark rearing | P28 | male | C57BL/6N | C57BL/6N | NlaIII | Neuron | 6,859,779 | 150 | 2.1 | 909,985 | 91.5 | 62.6 | 340,432 | 89.8 | 0.0 | 13.3 | NaN |
4272 rows × 19 columns
start_time = time.time()
matrix = []
dim = 50
for chrom in chrom_sizes.index:
tmp = np.load(f'{indir}embedding/raw/{chrom}.npz')['arr_0']
dim = min(dim, tmp.shape[0] - 1, tmp.shape[1] - 1)
model = TruncatedSVD(n_components=dim, algorithm='arpack')
tmp_reduce = model.fit_transform(tmp)
matrix.append(tmp_reduce / model.singular_values_)
print(chrom, time.time() - start_time)
chr1 6.581930160522461
chr2 11.939553499221802
chr3 17.48383092880249
chr4 22.65337896347046
chr5 27.87999391555786
chr6 33.39244103431702
chr7 38.8983359336853
chr8 43.60885500907898
chr9 49.0135338306427
chr10 54.309446811676025
chr11 58.871928453445435
chr12 64.58947443962097
chr13 67.8880524635315
chr14 73.15141296386719
chr15 76.84170579910278
chr16 79.75830459594727
chr17 82.38596296310425
chr18 85.09349274635315
chr19 86.87178158760071
model = TruncatedSVD(n_components=dim, algorithm='arpack')
matrix_reduce = model.fit_transform(np.concatenate(matrix, axis=1))
matrix_reduce = matrix_reduce / model.singular_values_
adata = anndata.AnnData(X=np.ones((celllist.shape[0],1)),
obs=meta.loc[celllist.index])
adata
AnnData object with n_obs × n_vars = 3646 × 1
obs: 'tissue', 'treatment', 'age', 'sex', 'father', 'mother', 'restriction enzyme', 'cell-type cluster', 'reads', 'read length (bp)', 'raw throughput (Gb)', 'raw contacts', 'raw intra (%)', 'dup rate (%)', 'contacts', 'intra (%)', 'phased legs (%)', 'raw contacts per read (%)', '20kb RMS RMSD'
adata.obs.index.name = 'cell_name'
adata.obs['age'] = adata.obs['age'].str[1:].astype(int)
adata.obs['contacts'] = adata.obs['contacts'].str.replace(',','').astype(int)
adata.obs = adata.obs[['tissue', 'treatment', 'age', 'sex', 'father', 'mother',
'restriction enzyme', 'cell-type cluster', 'contacts']]
adata.obsm['dipc_pca_all'] = matrix_reduce.copy()
significant_pc_test(adata, p_cutoff=0.1, update=False, obsm='dipc_pca_all')
7 components passed P cutoff of 0.1.
7
adata.obsm['X_pca'] = normalize(adata.obsm['dipc_pca_all'][:, :20], axis=1)
def dump_embedding(adata, name, n_dim=2):
# put manifold coordinates into adata.obs
for i in range(n_dim):
adata.obs[f'{name}_{i}'] = adata.obsm[f'X_{name}'][:, i]
return adata
knn = 25
sc.pp.neighbors(adata, n_neighbors=knn, use_rep='X_pca')
sc.tl.umap(adata, maxiter=200, random_state=0)
adata = dump_embedding(adata, 'umap')
sc.tl.leiden(adata, resolution=1.0, random_state=0)
fig, axes = plt.subplots(2, 2, figsize=(12, 8), dpi=300, constrained_layout=True)
_ = continuous_scatter(ax=axes[0,0],
data=adata.obs,
hue='contacts',
coord_base='umap',
#max_points=None,
labelsize=10,
s=4)
_ = categorical_scatter(data=adata.obs,
ax=axes[0,1],
coord_base='umap',
hue='age',
palette='hls',
labelsize=10,
show_legend=True)
_ = categorical_scatter(ax=axes[1,0],
data=adata.obs,
hue='tissue',
coord_base='umap',
# text_anno='region',
# palette='tab10',
labelsize=10,
show_legend=True
)
_ = categorical_scatter(ax=axes[1,1],
data=adata.obs,
hue='cell-type cluster',
coord_base='umap',
# text_anno='cell-type cluster',
palette='tab20',
labelsize=10,
show_legend=True
)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
fig, ax = plt.subplots(figsize=(6, 4), dpi=300, constrained_layout=True)
_ = categorical_scatter(data=adata.obs,
ax=ax,
coord_base='umap',
hue='leiden',
text_anno='leiden',
palette='tab20',
labelsize=10,
# show_legend=True
)
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Helvetica
adata.write_h5ad('Tan2021_dipc.h5ad')