Source code for loop_sumcell_chr

# mode=pad2_std1_rp0.5_sqrtvc
# command time python /gale/ddn/snm3C/humanPFC/code/loop_sumcell_chr.py --cell_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/filelist/L23_covgroup${i}_${mode}_chr${c}_looplist.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_covgroup${i}_${mode}_dist_trim/L23_covgroup${i}_${mode}_dist_trim_chr${c} --res 10000
# command time python /gale/ddn/snm3C/humanPFC/code/loop_sumcell_chr.py --cell_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/filelist/L23_${mode}_chr${c}_looplist.txt --group_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/filelist/L23_${mode}_chr${c}_grouplist.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_${mode}_dist_trim/L23_${mode}_dist_trim_chr${c} --res 10000

import time
import cv2
import h5py
import argparse
import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import save_npz, load_npz, csr_matrix, coo_matrix

[docs] def loop_sumcell_chr(cell_list, outprefix, res, group_list=None, matrix='QEO', sum_only=False, test_only=False, norm_mode='dist_trim', min_dist=50000, max_dist=10000000, pad=5, gap=2): def load_group_hdf(file): with h5py.File(file, 'r') as f: g = f['Matrix'] A = csr_matrix((g['data'][()], g['indices'][()], g['indptr'][()]), g.attrs['shape']) tmp = g.attrs['count'] A.data = A.data * tmp return [A, tmp] def output(file, A): f = h5py.File(f'{file}', 'w') g = f.create_group('Matrix') g.create_dataset('data', data=A.data, dtype='float32', compression='gzip') g.create_dataset('indices', data=A.indices, dtype=int, compression='gzip') g.create_dataset('indptr', data=A.indptr, dtype=int, compression='gzip') g.attrs['shape'] = A.shape g.attrs['count'] = tot f.close() celllist = np.loadtxt(cell_list, dtype=np.str) tot = len(celllist) if not test_only: thres = stats.norm(0, 1).isf(0.025) with h5py.File(f'{celllist[0]}.hdf5', 'r') as f: dim1, dim2 = f['Matrix'].attrs['shape'] Qsum, Esum, Osum = [csr_matrix((dim1, dim2)) for i in range(3)] start_time = time.time() if group_list: grouplist = np.loadtxt(group_list, dtype=np.str) for i, cell in enumerate(grouplist): if 'Q' in matrix: A, tmp = load_group_hdf(f'{cell}.hdf5') Qsum += A if 'E' in matrix: A, tmp = load_group_hdf(f'{cell}.E.hdf5') Esum += A if 'O' in matrix: A, tmp = load_group_hdf(f'{cell}.O.hdf5') Osum += A print('Merge', i, 'groups takes', time.time() - start_time, 'seconds') else: for i, cell in enumerate(celllist): if 'Q' in matrix: with h5py.File(f'{cell}.hdf5', 'r') as f: g = f['Matrix'] Q = csr_matrix((g['data'][()], g['indices'][()], g['indptr'][()]), g.attrs['shape']) Qsum += Q if ('E' in matrix) or ('O' in matrix): E = load_npz(f'{cell}_{norm_mode}.E.npz') if 'E' in matrix: Esum += E if 'O' in matrix: O = E.copy() O.data = (O.data > thres).astype(int) Osum += O print('Merge', i, 'cells takes', time.time() - start_time, 'seconds') if 'Q' in matrix: Qsum.data = Qsum.data / tot output(f'{outprefix}.hdf5', Qsum) if 'E' in matrix: Esum.data = Esum.data / tot output(f'{outprefix}.E.hdf5', Esum) if 'O' in matrix: Osum.data = Osum.data / tot output(f'{outprefix}.O.hdf5', Osum) print('Merge cell', time.time() - start_time) if (not 'E' in matrix) or (not 'O' in matrix) or sum_only: return E = Esum.toarray() O = Osum.toarray() del Qsum, Esum, Osum else: E, _ = load_group_hdf(f'{outprefix}.E.hdf5') O, _ = load_group_hdf(f'{outprefix}.O.hdf5') start_time = time.time() oefilter = np.logical_and(E > 0, O > 0.1) loop = np.where(oefilter) distfilter = np.logical_and((loop[1] - loop[0]) > (min_dist / res), (loop[1] - loop[0]) < (max_dist / res)) loop = (loop[0][distfilter], loop[1][distfilter]) start_time = time.time() eloop = np.zeros((len(celllist), len(loop[0]))) for i, cell in enumerate(celllist): eloop[i] = load_npz(f'{cell}_{norm_mode}.T.npz')[loop].A.ravel() print('Load', len(loop[0]), 'loop', time.time() - start_time) start_time = time.time() pvr = np.array([stats.wilcoxon(xx, alternative='greater')[1] for xx in eloop.T]) pvt = stats.ttest_1samp(eloop, 0, axis=0) pvt[1][pvt[0] > 0] *= 2 pvt[1][pvt[0] <= 0] = 1 pvt = pvt[1] print('Test loop', time.time() - start_time) del eloop w = pad * 2 + 1 start_time = time.time() kernel_bl = np.zeros((w, w), np.float32) kernel_bl[-pad:, :(pad - gap)] = 1 kernel_bl[-(pad - gap):, :pad] = 1 kernel_donut = np.ones((w, w), np.float32) kernel_donut[pad, :] = 0 kernel_donut[:, pad] = 0 kernel_donut[(pad - gap):(pad + gap + 1), (pad - gap):(pad + gap + 1)] = 0 kernel_lr = np.ones((3, w), np.float32) kernel_lr[:, (pad - gap):(pad + gap + 1)] = 0 kernel_bu = np.ones((w, 3), np.float32) kernel_bu[(pad - gap):(pad + gap + 1), :] = 0 kernel_bl = kernel_bl / np.sum(kernel_bl) kernel_donut = kernel_donut / np.sum(kernel_donut) kernel_lr = kernel_lr / np.sum(kernel_lr) kernel_bu = kernel_bu / np.sum(kernel_bu) Ebl = cv2.filter2D(E, -1, kernel=kernel_bl) * (E > 0) Edonut = cv2.filter2D(E, -1, kernel=kernel_donut) * (E > 0) Elr = cv2.filter2D(E, -1, kernel=kernel_lr) * (E > 0) Ebu = cv2.filter2D(E, -1, kernel=kernel_bu) * (E > 0) data = pd.DataFrame(np.array([loop[0], loop[1], pvr, pvt, E[loop], Ebl[loop], Edonut[loop], Elr[loop], Ebu[loop]]).T, columns = ['x1', 'y1', 'rpv', 'tpv', 'E', 'E_bl', 'E_donut', 'E_h', 'E_v']) data.to_hdf(f'{outprefix}.loop.hdf5', key='loop') print('Bulk bkg', time.time() - start_time) return
''' parser = argparse.ArgumentParser() parser.add_argument('--cell_list', type=str, default=None, help='Full path of a file containing the full path of all imputed files to be merged without .hdf5 suffix') parser.add_argument('--outprefix', type=str, default=None, help='Prefix of sumed matrix including directory') parser.add_argument('--res', type=int, default=None, help='Bin size as integer to generate contact matrix') parser.add_argument('--group_list', type=str, default=None, help='Full path of a file containing the full path of all summed imputed files to be merged without .hdf5 suffix') parser.add_argument('--matrix', type=str, default='QEO', help='Types of matrices to sum') #Q E O parser.add_argument('--sum_only', dest='sum_only', action='store_true', help='Sum cells only and do not test for loops') parser.set_defaults(sum_only=False) parser.add_argument('--test_only', dest='test_only', action='store_true', help='Sum of cells already exist and do loop test only') parser.set_defaults(test_only=False) parser.add_argument('--norm_mode', type=str, default='dist_trim', help='Suffix of normalized file names') parser.add_argument('--min_dist', type=int, default=50000, help='Minimum distance threshold of loop') parser.add_argument('--max_dist', type=int, default=10000000, help='Maximum distance threshold of loop') parser.add_argument('--pad', type=int, default=5, help='One direction size of larger square for donut background') parser.add_argument('--gap', type=int, default=2, help='One direction size of smaller square for donut background') opt = parser.parse_args() loop_sumcell_chr(opt.cell_list, opt.outprefix, opt.res, opt.group_list, opt.matrix, opt.sum_only, opt.test_only, opt.norm_mode, opt.min_dist, opt.max_dist, opt.pad, opt.gap) '''