import time
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, diags, eye, save_npz
from scipy.sparse.linalg import norm
from scipy.ndimage import gaussian_filter
import cooler
import logging
# from ..cool import write_coo
# [docs]  (Sphinx HTML source-link artifact; not Python code)
def calc_sparsity(matrix):
    """Return the fraction of stored (non-zero) entries in a sparse matrix."""
    n_rows, n_cols = matrix.shape
    return matrix.nnz / n_rows / n_cols
# [docs]  (Sphinx HTML source-link artifact; not Python code)
def random_walk_cpu(P, rp, tol):
    """Random walk with restart (RWR) on a sparse transition matrix.

    Iterates ``Q <- P @ ((1 - rp) * Q + rp * I)`` starting from ``Q = P``
    until the Frobenius norm of the update drops below ``tol`` or 30
    iterations are reached.

    Parameters
    ----------
    P
        Square sparse transition matrix (bins x bins).
    rp
        Restart probability. ``rp == 1`` short-circuits and returns
        ``P`` unchanged.
    tol
        Convergence tolerance on ``norm(Q - Q_new)``.

    Returns
    -------
    Sparse matrix holding the (approximately) converged RWR result.
    """
    if rp == 1:
        # With restart probability 1 the walker never moves; P is the answer.
        return P
    _start_time = time.time()
    n_genes = P.shape[0]
    I = eye(n_genes, dtype=np.float32)
    Q = P.copy()
    for i in range(30):
        Q_new = P.dot(Q * (1 - rp) + rp * I)
        delta = norm(Q - Q_new)
        Q = Q_new.copy()
        # Sparsity = nnz fraction (same formula as calc_sparsity); inlined
        # so this function has no sibling dependency.
        sparsity = Q.nnz / (Q.shape[0] * Q.shape[1])
        _end_time = time.time()
        # BUG FIX: the original passed `P.dtype, Q.dtype` as extra positional
        # args to logging.debug; with no %-placeholders in the message,
        # LogRecord formatting raises a TypeError whenever DEBUG is enabled.
        logging.debug(
            f'Iter {i + 1} takes {(_end_time - _start_time):.3f} seconds. '
            f'Loss: {delta:.3f}; Sparsity: {sparsity:.3f}')
        if delta < tol:
            break
    return Q
# [docs]  (Sphinx HTML source-link artifact; not Python code)
def impute_chromosome(chrom,
                      resolution,
                      output_path,
                      scool_url=None,
                      contact_path=None,
                      chrom_size_path=None,
                      logscale=False,
                      pad=1,
                      std=1,
                      rp=0.5,
                      tol=0.01,
                      window_size=500000000,
                      step_size=10000000,
                      output_dist=500000000,
                      min_cutoff=0,
                      chrom1=1,
                      pos1=2,
                      chrom2=5,
                      pos2=6):
    """Impute a single-cell Hi-C contact map for one chromosome.

    Pipeline: load the raw contact matrix (from an scool cooler or a
    text contact file), optional log2(x + 1) transform, Gaussian
    smoothing, random walk with restart (chunked by sliding windows for
    large chromosomes), SQRTVC normalization, upper-triangle/distance
    filtering, optional small-value masking, then save the sparse
    result to ``output_path`` via ``scipy.sparse.save_npz``.

    Parameters
    ----------
    chrom
        Chromosome name to impute. With contact-file input, 'all'
        concatenates every chromosome into one genome-wide matrix.
    resolution
        Bin size in base pairs.
    output_path
        Destination path for the imputed sparse matrix (.npz).
    scool_url
        Cooler URI for a single cell; takes precedence over
        ``contact_path`` when both are given.
    contact_path
        Tab-separated contact file; requires ``chrom_size_path``.
        Positions appear to be 1-based (the ``- 1`` below) — confirm
        against the upstream file format.
    chrom_size_path
        Two-column (name, length) chrom-size file, no header.
    logscale
        If True, log2(count + 1) transform the raw counts.
    pad, std
        Gaussian filter truncation (in units of std) and standard
        deviation; ``pad <= 0`` skips the convolution step.
    rp, tol
        Restart probability and convergence tolerance passed to
        ``random_walk_cpu``.
    window_size, step_size
        Sliding-window size and step (bp) for chunked RWR when the
        chromosome has more bins than one window.
    output_dist
        Maximum genomic distance (bp) of pixels kept in the output.
    min_cutoff
        Values <= this threshold are zeroed before saving (0 disables).
    chrom1, pos1, chrom2, pos2
        Column indices of the two anchors in the contact file
        (defaults match juicer "short" format — TODO confirm).
    """
    if scool_url is not None:
        # --- Input path 1: fetch the raw (unbalanced) matrix from an scool cell.
        cell_cool = cooler.Cooler(scool_url)
        A = cell_cool.matrix(balance=False, sparse=True).fetch(chrom)
        # A = A + diags(A.diagonal())
        n_bins = A.shape[0]
    elif contact_path is not None:
        # --- Input path 2: build the matrix from a text contact list.
        if chrom_size_path is not None:
            chrom_sizes = pd.read_csv(chrom_size_path, sep='\t', index_col=0, header=None).squeeze(axis=1)
            if chrom=='all':
                # Genome-wide mode: map each (chrom, pos) to a global bin
                # offset so all chromosomes share one bin axis.
                from schicluster.cool.utilities import get_chrom_offsets
                bins_df = cooler.binnify(chrom_sizes, resolution)
                chrom_offset = get_chrom_offsets(bins_df)
                n_bins = bins_df.shape[0]
            else:
                n_bins = (chrom_sizes.loc[chrom] // resolution) + 1
        else:
            print("ERROR : Must provide chrom_size_path if using contact file as input")
            return
        A = pd.read_csv(contact_path, sep='\t', header=None, index_col=None, comment='#')[[chrom1, pos1, chrom2, pos2]]
        if chrom=='all':
            # Keep only contacts whose both anchors are on known chromosomes.
            A = A[A[chrom1].isin(chrom_offset) & A[chrom2].isin(chrom_offset)]
            A[pos1] = A[chrom1].map(chrom_offset) + (A[pos1] - 1) // resolution
            A[pos2] = A[chrom2].map(chrom_offset) + (A[pos2] - 1) // resolution
        else:
            # Intra-chromosomal contacts of the requested chromosome only.
            A = A.loc[(A[chrom1]==chrom) & (A[chrom2]==chrom)]
            A[[pos1, pos2]] = (A[[pos1, pos2]] - 1) // resolution
        # Count contacts per bin pair and build a symmetric sparse matrix.
        A = A.groupby(by=[pos1, pos2])[chrom1].count().reset_index()
        A = csr_matrix((A[chrom1].astype(np.int32), (A[pos1], A[pos2])), (n_bins, n_bins))
        A = A + A.T
    else:
        print("ERROR : Must provide either scool_url or contact_file_path")
        return
    # Window / step sizes in bins rather than base pairs.
    ws = int(window_size // resolution)
    ss = int(step_size // resolution)
    # log transform
    if logscale:
        A.data = np.log2(A.data + 1)
    # Remove diagonal before convolution
    A = A - diags(A.diagonal())
    # Gaussian convolution (densifies the matrix; skipped when pad <= 0)
    start_time = time.time()
    if pad > 0:
        # full matrix step
        A = gaussian_filter(A.astype(np.float32).toarray(),
                            std, order=0, mode='mirror', truncate=pad)
        A = csr_matrix(A)
    # else:
    #     A = A + A.T
    end_time = time.time()
    logging.debug(f'Convolution takes {end_time - start_time:.3f} seconds')
    # Remove diagonal before RWR
    A = A - diags(A.diagonal())
    # Random Walk with Restart
    start_time = time.time()
    if ws >= n_bins or rp == 1:
        # Whole-matrix RWR. Self-loops are added on zero-coverage bins so
        # column sums are never zero before the 1/colsum normalization.
        B = A + diags((A.sum(axis=0).A.ravel() == 0).astype(int))
        d = diags(1 / B.sum(axis=0).A.ravel())
        P = d.dot(B).astype(np.float32)
        E = random_walk_cpu(P, rp, tol)
    else:
        # if the chromosome is too large, compute by chunks
        # All (row, col) index pairs of a ws x ws window, pre-filtered to
        # the band within output_dist of the diagonal.
        idx = (np.repeat(np.arange(ws), ws), np.tile(np.arange(ws), ws))
        idxfilter = (np.abs(idx[1] - idx[0]) < (output_dist // resolution + 1))
        idx = (idx[0][idxfilter], idx[1][idxfilter])
        # first filter: mask for the first window (keeps the leading anti-diagonal half)
        idxfilter = ((idx[0] + idx[1]) < (ws + ss))
        idx1 = (idx[0][idxfilter], idx[1][idxfilter])
        mask1 = csr_matrix((np.ones(len(idx1[0])), (idx1[0], idx1[1])),
                           (ws, ws))
        # last filter: mask for the final (clipped) window
        idxfilter = ((idx[0] + idx[1]) >= (
                (n_bins - ws) // ss * 2 + 1) * ss + 3 * ws - 2 * n_bins)
        idx2 = (idx[0][idxfilter], idx[1][idxfilter])
        mask2 = csr_matrix((np.ones(len(idx2[0])), (idx2[0], idx2[1])),
                           (ws, ws))
        # center filter: interior windows keep a band of width 2*ss so the
        # overlapping windows tile the matrix without double counting
        idxfilter = np.logical_and((idx[0] + idx[1]) < (ws + ss),
                                   (idx[0] + idx[1]) >= (ws - ss))
        idx0 = (idx[0][idxfilter], idx[1][idxfilter])
        mask0 = csr_matrix((np.ones(len(idx0[0])), (idx0[0], idx0[1])),
                           (ws, ws))
        start_time = time.time()
        E = csr_matrix(A.shape, dtype=np.float32)
        # Slide a ws-bin window in ss-bin steps; the last window is anchored
        # at n_bins - ws so the matrix tail is always covered.
        for ll in [x for x in range(0, n_bins - ws, ss)] + [n_bins - ws]:
            B = A[ll:(ll + ws), ll:(ll + ws)]
            # Same zero-coverage self-loop trick as the whole-matrix branch.
            B = B + diags((B.sum(axis=0).A.ravel() == 0).astype(int))
            d = diags(1 / B.sum(axis=0).A.ravel())
            P = d.dot(B).astype(np.float32)
            Etmp = random_walk_cpu(P, rp, tol)
            if ll == 0:
                E[ll:(ll + ws), ll:(ll + ws)] += Etmp.multiply(mask1)
            elif ll == (n_bins - ws):
                E[ll:(ll + ws), ll:(ll + ws)] += Etmp.multiply(mask2)
            else:
                E[ll:(ll + ws), ll:(ll + ws)] += Etmp.multiply(mask0)
    logging.debug(f'RWR takes {time.time() - start_time:.3f} seconds')
    # Normalize: symmetrize, then SQRTVC (divide by sqrt of row/col sums).
    start_time = time.time()
    E += E.T
    d = E.sum(axis=0).A.ravel()
    d[d == 0] = 1  # avoid division by zero for empty bins
    b = diags(1 / np.sqrt(d))
    E = b.dot(E).dot(b)
    logging.debug(f'SQRTVC takes {time.time() - start_time:.3f} seconds')
    start_time = time.time()
    # mask the lower triangle of E
    # TODO This part is MEM intensive, the mask below can be combined with the chunk mask above
    idx = np.triu_indices(E.shape[0], 0)
    if (output_dist // resolution + 1) < n_bins:
        # longest distance filter mask
        idxfilter = ((idx[1] - idx[0]) < (output_dist // resolution + 1))
        idx = (idx[0][idxfilter], idx[1][idxfilter])
    mask = csr_matrix((np.ones(len(idx[0])), (idx[0], idx[1])),
                      E.shape,
                      dtype=np.float32)
    E = E.tocsr().multiply(mask)
    logging.debug(f'Filter takes {time.time() - start_time:.3f} seconds')
    # TODO put this part inside RWR, before normalize
    # min_cutoff = tol/
    # Make values < min_cutoff to 0
    if min_cutoff > 0:
        s_before = calc_sparsity(E)
        E = E.multiply(E > min_cutoff)
        s_after = calc_sparsity(E)
        logging.debug(f'Mask values smaller than {min_cutoff}. Sparsity before {s_before:.3f}, after {s_after:.3f}')
    # save to file
    # write_coo(output_path, E)
    save_npz(output_path, E)
    return