Source code for schicluster.dev.imputecell

import time
import h5py
import cv2
import numpy as np
from scipy.sparse import csr_matrix, save_npz, diags, eye
from scipy.sparse.linalg import norm

cv2.useOptimized()
[docs] hg38dim = [248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468]
[docs] hg19dim = [249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, 146364022, 141213431, 135534747, 135006516, 133851895, 115169878, 107349540, 102531392, 90354753, 81195210, 78077248, 59128983, 63025520, 48129895, 51304566]
[docs] mm10dim = [195471971, 182113224, 160039680, 156508116, 151834684, 149736546, 145441459, 129401213, 124595110, 130694993, 122082543, 120129022, 120421639, 124902244, 104043685, 98207768, 94987271, 90702639, 61431566]
[docs] def random_walk_cpu(P, rp, tol, dist, spr): global ngene if rp == 1: return P I = eye(ngene) Q = rp * I + (1 - rp) * P if dist < ngene: idx = np.triu_indices(ngene, 0) idxfilter = ((idx[1] - idx[0]) < dist) idx = ( np.concatenate((idx[0][idxfilter], idx[1][idxfilter])), np.concatenate((idx[1][idxfilter], idx[0][idxfilter]))) mask = csr_matrix((np.ones(len(idx[0])), (idx[0], idx[1])), P.shape) start_time = time.time() for i in range(30): Q_new = rp * I + (1 - rp) * P.dot(Q) delta = norm(Q - Q_new) Q = Q_new.copy() sparsity = Q.nnz / ngene / ngene if (dist < ngene) and (sparsity > spr): Q = Q.multiply(mask) end_time = time.time() print('Iter', i + 1, 'takes', end_time - start_time, 'seconds, loss', delta, 'sparsity', sparsity) if delta < tol: break return Q
[docs] def impute_cell(indir, outdir, cell, chrom, res, genome, mode, logscale=False, pad=1, std=1, rp=0.5, tol=0.01, rwr_dist=500000000, rwr_sparsity=1, output_dist=500000000, output_format='hdf'): if chrom[:3] == 'chr': c = chrom[3:] else: c = chrom if not mode: mode = 'pad' + str(pad) + '_std' + str(std) + '_rp' + str(rp) + '_sqrtvc' if genome == 'hg38': chrom = [str(i + 1) for i in range(22)] chromsize = {x: y for x, y in zip(chrom, hg38dim)} elif genome == 'hg19': chrom = [str(i + 1) for i in range(22)] chromsize = {x: y for x, y in zip(chrom, hg19dim)} elif genome == 'mm10': chrom = [str(i + 1) for i in range(19)] chromsize = {x: y for x, y in zip(chrom, mm10dim)} else: # TODO instead of using fixed chrom info, use chrom size file raise NotImplementedError start_time = time.time() ngene = int(chromsize[c] // res) + 1 D = np.loadtxt(indir + cell + '_chr' + c + '.txt') if len(D) == 0: D = np.array([[0, 0, 0]]) elif len(D.shape) == 1: D = D.reshape(1, -1) A = csr_matrix((D[:, 2], (D[:, 0], D[:, 1])), shape=(ngene, ngene)) if logscale: A.data = np.log2(A.data + 1) end_time = time.time() print('Loading chr', c, 'takes', end_time - start_time, 'seconds') start_time = time.time() B = cv2.GaussianBlur((A + A.T).toarray().astype(np.float32), (pad * 2 + 1, pad * 2 + 1), std) end_time = time.time() print('Convolution chr', c, 'take', end_time - start_time, 'seconds') start_time = time.time() B = csr_matrix(B) B = B - diags(B.diagonal()) B = B + diags((B.sum(axis=0).A.ravel() == 0).astype(int)) d = diags(1 / B.sum(axis=0).A.ravel()) P = d.dot(B) Q = random_walk_cpu(P, rp, tol, int(rwr_dist // res) + 1, rwr_sparsity) print('RWR', time.time() - start_time) start_time = time.time() E = Q + Q.T d = E.sum(axis=0).A.ravel() d[d == 0] = 1 b = diags(1 / np.sqrt(d)) E = b.dot(E).dot(b) print('SQRTVC', time.time() - start_time) start_time = time.time() idx = np.triu_indices(E.shape[0], 0) idxfilter = ((idx[1] - idx[0]) < (output_dist // res + 1)) idx = (idx[0][idxfilter], idx[1][idxfilter]) mask = csr_matrix((np.ones(len(idx[0])), (idx[0], idx[1])), E.shape) E = E.multiply(mask) E = E.tocsr() print('Filter', time.time() - start_time) if output_format == 'npz': save_npz(outdir + cell + '_chr' + c + '_' + mode + '.npz', E) else: f = h5py.File(outdir + cell + '_chr' + c + '_' + mode + '.hdf5', 'w') g = f.create_group('Matrix') g.create_dataset('data', data=E.data, dtype='float32', compression='gzip') g.create_dataset('indices', data=E.indices, dtype=int, compression='gzip') g.create_dataset('indptr', data=E.indptr, dtype=int, compression='gzip') g.attrs['shape'] = E.shape f.close() return