Source code for loop_bkg_cell

# command time python /gale/ddn/snm3C/humanPFC/code/loop_bkg_cell.py --indir /gale/ddn/snm3C/humanPFC/smoothed_matrix/${res0}b_resolution/ --cell ${sample} --chrom ${c} --res ${res} --impute_mode pad2_std1_rp0.5_ws40

import sys
import h5py
import time
import cv2
cv2.useOptimized()
import argparse
import numpy as np
from scipy.sparse import save_npz, csr_matrix
from sklearn.preprocessing import RobustScaler

[docs] def loop_bkg_cell(indir, cell, chrom, impute_mode, res, dist=10050000, cap=5, pad=5, gap=2, norm_mode='dist_trim'): if chrom[:3]=='chr': c = chrom else: c = 'chr' + chrom start_time = time.time() with h5py.File(f'{indir}{c}/{cell}_{c}_{impute_mode}.hdf5', 'r') as f: g = f['Matrix'] E = csr_matrix((g['data'][()], g['indices'][()], g['indptr'][()]), g.attrs['shape']).tocoo() print('Load', time.time() - start_time) # TODO numba optimize start_time = time.time() ave, std, top, count = np.zeros((4, dist // res + pad + 1)) for i in range(dist // res + pad + 1): tmp = E.diagonal(i) top[i] = np.percentile(tmp,99) tmp[tmp>top[i]] = top[i] ave[i] = np.mean(tmp) std[i] = np.std(tmp) count[i] = np.sum(tmp>0) print('Curve', time.time() - start_time, '#Nonzero', np.sum(count)) idx = np.triu_indices(E.shape[0], 0) idxfilter = ((idx[1] - idx[0]) < (dist // res + 1)) idx = (idx[0][idxfilter], idx[1][idxfilter]) mask = csr_matrix((np.ones(len(idx[0])), (idx[0], idx[1])), E.shape) start_time = time.time() E.data = np.min([E.data, top[E.col - E.row]], axis=0) E = E.astype(np.float32).toarray() tmp = E[idx] tmp = (tmp - ave[idx[1] - idx[0]]) / std[idx[1] - idx[0]] tmp[count[idx[1] - idx[0]] < 100] = 0 tmp[std[idx[1] - idx[0]]==0] = 0 tmp[tmp > cap] = cap tmp[tmp < -cap] = -cap E[idx] = tmp.copy() print('Norm', time.time() - start_time) start_time = time.time() w = pad * 2 + 1 kernel = np.ones((w, w), np.float32) kernel[(pad - gap):(pad + gap + 1), (pad - gap):(pad + gap + 1)] = 0 kernel = kernel / np.sum(kernel) N = cv2.filter2D(E, -1, kernel=kernel) E = csr_matrix(np.around(E, decimals=6)) N = csr_matrix(np.around(N, decimals=6)).multiply(mask) N = E - N print('Bkg', time.time() - start_time) save_npz(f'{indir}{c}/{cell}_{c}_{impute_mode}_{norm_mode}.E.npz', E) save_npz(f'{indir}{c}/{cell}_{c}_{impute_mode}_{norm_mode}.T.npz', N) return
''' parser = argparse.ArgumentParser() parser.add_argument('--indir', type=str, default=None, help='Directory of imputed matrix') parser.add_argument('--cell', type=str, default=None, help='Specific identifier of a cell') parser.add_argument('--chrom', type=str, default=None, help='Chromosome imputed') parser.add_argument('--impute_mode', type=str, default=None, help='Suffix of imputed matrix file names') parser.add_argument('--res', type=int, default=None, help='Bin size as integer to generate contact matrix') parser.add_argument('--dist', type=int, default=10050000, help='Maximum distance threshold of contacts to use') parser.add_argument('--cap', type=int, default=5, help='Trim Z-scores over the threshold') parser.add_argument('--pad', type=int, default=5, help='One direction size of larger square for donut background') parser.add_argument('--gap', type=int, default=2, help='One direction size of smaller square for donut background') parser.add_argument('--norm_mode', type=str, default='dist_trim', help='Suffix of normalized file names') opt = parser.parse_args() loop_bkg_cell(opt.indir, opt.cell, opt.chrom, opt.impute_mode, opt.res, opt.dist, opt.cap, opt.pad, opt.gap, opt.norm_mode) '''