Source code for loop_sumcell_chr

# mode=pad2_std1_rp0.5_sqrtvc
# command time python /gale/ddn/snm3C/humanPFC/code/loop_sumcell_chr.py --cell_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/filelist/L23_covgroup${i}_${mode}_chr${c}_looplist.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_covgroup${i}_${mode}_dist_trim/L23_covgroup${i}_${mode}_dist_trim_chr${c} --res 10000
# command time python /gale/ddn/snm3C/humanPFC/code/loop_sumcell_chr.py --cell_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/filelist/L23_${mode}_chr${c}_looplist.txt --group_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/filelist/L23_${mode}_chr${c}_grouplist.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_${mode}_dist_trim/L23_${mode}_dist_trim_chr${c} --res 10000

import time
import cv2
import h5py
import argparse
import numpy as np
import pandas as pd
from scipy import stats
from scipy.sparse import save_npz, load_npz, csr_matrix, coo_matrix


[docs]
def loop_sumcell_chr(cell_list, outprefix, res, 
    group_list=None, matrix='QEO', sum_only=False, test_only=False, 
    norm_mode='dist_trim', min_dist=50000, max_dist=10000000, pad=5, gap=2):

    def load_group_hdf(file):
        with h5py.File(file, 'r') as f:
            g = f['Matrix']
            A = csr_matrix((g['data'][()], g['indices'][()], g['indptr'][()]), g.attrs['shape'])
            tmp = g.attrs['count']
            A.data = A.data * tmp
        return [A, tmp]

    def output(file, A):
        f = h5py.File(f'{file}', 'w')
        g = f.create_group('Matrix')
        g.create_dataset('data', data=A.data, dtype='float32', compression='gzip')
        g.create_dataset('indices', data=A.indices, dtype=int, compression='gzip') 
        g.create_dataset('indptr', data=A.indptr, dtype=int, compression='gzip')
        g.attrs['shape'] = A.shape
        g.attrs['count'] = tot
        f.close()

    celllist = np.loadtxt(cell_list, dtype=np.str)
    tot = len(celllist)

    if not test_only:
        thres = stats.norm(0, 1).isf(0.025) 
        with h5py.File(f'{celllist[0]}.hdf5', 'r') as f:
            dim1, dim2 = f['Matrix'].attrs['shape']
        Qsum, Esum, Osum = [csr_matrix((dim1, dim2)) for i in range(3)]
        start_time = time.time()
        if group_list:
            grouplist = np.loadtxt(group_list, dtype=np.str)
            for i, cell in enumerate(grouplist):
                if 'Q' in matrix:
                    A, tmp = load_group_hdf(f'{cell}.hdf5')
                    Qsum += A
                if 'E' in matrix:
                    A, tmp = load_group_hdf(f'{cell}.E.hdf5')
                    Esum += A
                if 'O' in matrix:
                    A, tmp = load_group_hdf(f'{cell}.O.hdf5')
                    Osum += A
                print('Merge', i, 'groups takes', time.time() - start_time, 'seconds')
        else:
            for i, cell in enumerate(celllist):
                if 'Q' in matrix:
                    with h5py.File(f'{cell}.hdf5', 'r') as f:
                        g = f['Matrix']
                        Q = csr_matrix((g['data'][()], g['indices'][()], g['indptr'][()]), g.attrs['shape'])
                    Qsum += Q
                if ('E' in matrix) or ('O' in matrix):
                    E = load_npz(f'{cell}_{norm_mode}.E.npz')
                if 'E' in matrix:
                    Esum += E
                if 'O' in matrix:
                    O = E.copy()
                    O.data = (O.data > thres).astype(int)
                    Osum += O
                print('Merge', i, 'cells takes', time.time() - start_time, 'seconds')
        if 'Q' in matrix:
            Qsum.data = Qsum.data / tot
            output(f'{outprefix}.hdf5', Qsum)
        if 'E' in matrix:
            Esum.data = Esum.data / tot
            output(f'{outprefix}.E.hdf5', Esum)
        if 'O' in matrix:
            Osum.data = Osum.data / tot
            output(f'{outprefix}.O.hdf5', Osum)
        print('Merge cell', time.time() - start_time)

        if (not 'E' in matrix) or (not 'O' in matrix) or sum_only:
            return

        E = Esum.toarray()
        O = Osum.toarray()
        del Qsum, Esum, Osum

    else:
        E, _ = load_group_hdf(f'{outprefix}.E.hdf5')
        O, _ = load_group_hdf(f'{outprefix}.O.hdf5')

    start_time = time.time()
    oefilter = np.logical_and(E > 0, O > 0.1)
    loop = np.where(oefilter)
    distfilter = np.logical_and((loop[1] - loop[0]) > (min_dist / res), (loop[1] - loop[0]) < (max_dist / res))
    loop = (loop[0][distfilter], loop[1][distfilter])

    start_time = time.time()
    eloop = np.zeros((len(celllist), len(loop[0])))
    for i, cell in enumerate(celllist):
        eloop[i] = load_npz(f'{cell}_{norm_mode}.T.npz')[loop].A.ravel()
    print('Load', len(loop[0]), 'loop', time.time() - start_time)

    start_time = time.time()
    pvr = np.array([stats.wilcoxon(xx, alternative='greater')[1] for xx in eloop.T])
    pvt = stats.ttest_1samp(eloop, 0, axis=0)
    pvt[1][pvt[0] > 0] *= 2
    pvt[1][pvt[0] <= 0] = 1
    pvt = pvt[1]
    print('Test loop', time.time() - start_time)
    del eloop

    w = pad * 2 + 1
    start_time = time.time()

    kernel_bl = np.zeros((w, w), np.float32)
    kernel_bl[-pad:, :(pad - gap)] = 1
    kernel_bl[-(pad - gap):, :pad] = 1

    kernel_donut = np.ones((w, w), np.float32)
    kernel_donut[pad, :] = 0
    kernel_donut[:, pad] = 0
    kernel_donut[(pad - gap):(pad + gap + 1), (pad - gap):(pad + gap + 1)] = 0

    kernel_lr = np.ones((3, w), np.float32)
    kernel_lr[:, (pad - gap):(pad + gap + 1)] = 0

    kernel_bu = np.ones((w, 3), np.float32)
    kernel_bu[(pad - gap):(pad + gap + 1), :] = 0

    kernel_bl = kernel_bl / np.sum(kernel_bl)
    kernel_donut = kernel_donut / np.sum(kernel_donut)
    kernel_lr = kernel_lr / np.sum(kernel_lr)
    kernel_bu = kernel_bu / np.sum(kernel_bu)

    Ebl = cv2.filter2D(E, -1, kernel=kernel_bl) * (E > 0)
    Edonut = cv2.filter2D(E, -1, kernel=kernel_donut) * (E > 0)
    Elr = cv2.filter2D(E, -1, kernel=kernel_lr) * (E > 0)
    Ebu = cv2.filter2D(E, -1, kernel=kernel_bu) * (E > 0)

    data = pd.DataFrame(np.array([loop[0], loop[1], pvr, pvt, E[loop], Ebl[loop], Edonut[loop], Elr[loop], Ebu[loop]]).T, columns = ['x1', 'y1', 'rpv', 'tpv', 'E', 'E_bl', 'E_donut', 'E_h', 'E_v'])
    data.to_hdf(f'{outprefix}.loop.hdf5', key='loop')
    print('Bulk bkg', time.time() - start_time)

    return


'''
parser = argparse.ArgumentParser()
parser.add_argument('--cell_list', type=str, default=None, help='Full path of a file containing the full path of all imputed files to be merged without .hdf5 suffix')
parser.add_argument('--outprefix', type=str, default=None, help='Prefix of sumed matrix including directory')
parser.add_argument('--res', type=int, default=None, help='Bin size as integer to generate contact matrix')
parser.add_argument('--group_list', type=str, default=None, help='Full path of a file containing the full path of all summed imputed files to be merged without .hdf5 suffix')
parser.add_argument('--matrix', type=str, default='QEO', help='Types of matrices to sum') #Q E O
parser.add_argument('--sum_only', dest='sum_only', action='store_true', help='Sum cells only and do not test for loops')
parser.set_defaults(sum_only=False)
parser.add_argument('--test_only', dest='test_only', action='store_true', help='Sum of cells already exist and do loop test only')
parser.set_defaults(test_only=False)
parser.add_argument('--norm_mode', type=str, default='dist_trim', help='Suffix of normalized file names')
parser.add_argument('--min_dist', type=int, default=50000, help='Minimum distance threshold of loop')
parser.add_argument('--max_dist', type=int, default=10000000, help='Maximum distance threshold of loop')
parser.add_argument('--pad', type=int, default=5, help='One direction size of larger square for donut background')
parser.add_argument('--gap', type=int, default=2, help='One direction size of smaller square for donut background')
opt = parser.parse_args()

loop_sumcell_chr(opt.cell_list, opt.outprefix, opt.res, 
        opt.group_list, opt.matrix, opt.sum_only, opt.test_only,
        opt.norm_mode, opt.min_dist, opt.max_dist, opt.pad, opt.gap)
'''