Source code for loop_mergechr

# command time python /gale/ddn/snm3C/humanPFC/code/loop_mergechr.py --inprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_${mode}_dist_trim/L23_${mode}_dist_trim --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_${mode}_dist_trim/L23_${mode}_dist_trim --chrom_file /gale/netapp/home/zhoujt/genome/hg19/hg19.autosomal.chrom.sizes
# command time python /gale/ddn/snm3C/humanPFC/code/loop_mergechr.py --inprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_${mode}_dist_trim/L23_${mode}_dist_trim --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/10kb_resolution/merged/L23_${mode}_dist_trim/L23_${mode}_dist_trim.split --chrom_file /gale/netapp/home/zhoujt/genome/hg19/hg19.autosomal.chrom.sizes --split_file /gale/netapp/home/zhoujt/genome/hg19/hg19.chrsplit.bed

import time
import argparse
import numpy as np
import pandas as pd
from heapq import *
from statsmodels.sandbox.stats.multicomp import multipletests as FDR

[docs] def loop_mergechr(inprefix, outprefix, chrom_file, split_file=None, res=10000, thres_bl=1.33, thres_d=1.33, thres_h=1.2, thres_v=1.2, fdr_thres=0.1, dist_thres=20000, size_thres=1): def find_summit(loop, dist_thres): start_time = time.time() cord = loop[['x1', 'y1']].values // res idx = np.argsort(cord[:, 0]) neighbor = {i:[] for i in range(len(idx))} for i in range(len(idx)-1): tmp = cord[idx[i]] for j in range(i+1,len(idx)): if cord[idx[j], 0] - tmp[0] > dist_thres: break if np.abs(tmp[1] - cord[idx[j], 1]) <= dist_thres: neighbor[idx[i]].append(idx[j]) neighbor[idx[j]].append(idx[i]) # dist = np.max([pairwise_distances(loop['x1'].values.reshape(-1,1)//res), # pairwise_distances(loop['y1'].values.reshape(-1,1)//res)], axis=0) # print('Compute distances takes', time.time() - start_time, 'seconds') # start_time = time.time() # G = nx.from_numpy_array((dist <= dist_thres)) print('Build graph takes', time.time() - start_time, 'seconds') start_time = time.time() nodescore = loop['E'].values flag = np.zeros(len(nodescore)) tot = len(nodescore) summit = [] nodeheap = (loop['E'] * -1).reset_index().reset_index()[['E', 'level_0']].values.tolist() heapify(nodeheap) while tot>0: t = int(heappop(nodeheap)[1]) while flag[t]: t = int(heappop(nodeheap)[1]) q = [t] flag[t] = 1 tot -= 1 head = 0 flagtmp = np.zeros(len(nodescore)) while (head < len(q)): for t in neighbor[q[head]]: if not flagtmp[t] and nodescore[t]<nodescore[q[head]]: if not flag[t]: flag[t] = 1 tot -= 1 flagtmp[t] = 1 q.append(t) head += 1 summit.append([q[0], len(q)]) summit = np.array(summit) loop = loop.iloc[summit[:,0]] loop['size'] = summit[:,1] print('BFS takes', time.time() - start_time, 'seconds') return loop chrom = np.loadtxt(chrom_file, dtype=np.str)[:,0] if not split_file: chrom_split = chrom.copy() else: splitbed = pd.read_csv(split_file, sep='\t', header=None, index_col=0) chrom_split = np.concatenate([[c+'p', c+'q'] if c in splitbed.index else [c] for c in chrom]) chrom_split = np.array([c if c[:3]=='chr' else 'chr'+c for c in chrom_split]) start_time = time.time() loopall = [] for c in chrom_split: data = pd.read_hdf(f'{inprefix}_{c}.loop.hdf5', key='loop') data['bkfilter'] = (((data['E']/data['E_bl'] > thres_bl) | (data['E_bl']<0)) & ((data['E']/data['E_donut'] > thres_d) | (data['E_donut']<0)) & ((data['E']/data['E_h'] > thres_h) | (data['E_h']<0)) & ((data['E']/data['E_v'] > thres_v) | (data['E_v']<0))) data['x1'] = data['x1'].astype(int) * res data['y1'] = data['y1'].astype(int) * res data['x2'] = data['x1'] + res data['y2'] = data['y1'] + res if c[-1]=='p' or c[-1]=='q': data['chr'] = c[:-1] if c[-1]=='q': data[['x1','x2','y1','y2']] += splitbed.loc[c[:-1],2] // res * res else: data['chr'] = c loopall.append(data) loopall = pd.concat(loopall, axis=0) loopall['rFDR'] = FDR(loopall['rpv'], 0.1, 'fdr_bh')[1] loopall['tFDR'] = FDR(loopall['tpv'], 0.1, 'fdr_bh')[1] print('Merge chromosomes takes', time.time() - start_time, 'seconds') loop = loopall.loc[(loopall['bkfilter']==1) & (loopall['tFDR']<fdr_thres)] loop.sort_values(by=['chr', 'x1', 'y1'])[['chr', 'x1', 'x2', 'chr', 'y1', 'y2', 'E']].to_csv(f'{outprefix}.loop.bedpe', sep='\t', index=False, header=None) scloop = loopall.loc[loopall['tFDR']<fdr_thres] scloop.sort_values(by=['chr', 'x1', 'y1'])[['chr', 'x1', 'x2', 'chr', 'y1', 'y2', 'E']].to_csv(f'{outprefix}.scloop.bedpe', sep='\t', index=False, header=None) bkloop = loopall.loc[loopall['bkfilter']==1] bkloop.sort_values(by=['chr', 'x1', 'y1'])[['chr', 'x1', 'x2', 'chr', 'y1', 'y2', 'E']].to_csv(f'{outprefix}.bkloop.bedpe', sep='\t', index=False, header=None) summit = pd.concat([find_summit(loop[loop['chr']==c], dist_thres//res) for c in chrom], axis=0) suumit = summit[summit['size'] >= size_thres] summit.sort_values(by=['chr', 'x1', 'y1'])[['chr', 'x1', 'x2', 'chr', 'y1', 'y2', 'E', 'size']].to_csv(f'{outprefix}.loopsummit.bedpe', sep='\t', index=False, header=None) return
''' parser = argparse.ArgumentParser() parser.add_argument('--inprefix', type=str, default=None, help='Full path of a file containing the full path of all imputed files to be merged without .hdf5 suffix') parser.add_argument('--outprefix', type=str, default=None, help='Prefix of merged matrix including directory') parser.add_argument('--chrom_file', type=str, default=None, help='Path to the chromosome size files containing all chromosome to be analyzed as the first column') ## mm10 or hg38 parser.add_argument('--split_file', type=str, default=None, help='Path to the bed file containing all chromosomes need to split and one region per chromosome as splitting point') parser.add_argument('--res', type=int, default=10000, help='Bin size as integer to generate contact matrix') parser.add_argument('--thres_bl', type=int, default=1.33, help='Lowest fold change threshold against bottom left background') parser.add_argument('--thres_d', type=int, default=1.33, help='Lowest fold change threshold against donut background') parser.add_argument('--thres_h', type=int, default=1.2, help='Lowest fold change threshold against horizontal background') parser.add_argument('--thres_v', type=int, default=1.2, help='Lowest fold change threshold against vertical background') parser.add_argument('--fdr_thres', type=int, default=0.1, help='Highest t-test FDR threshold of loops') parser.add_argument('--dist_thres', type=int, default=20000, help='Highest distance threshold to merge loops into summit') parser.add_argument('--size_thres', type=int, default=1, help='Lowest loop number threshold of summit') opt = parser.parse_args() loop_mergechr(opt.inprefix, opt.outprefix, opt.chrom_file, opt.split_file, opt.res, opt.thres_bl, opt.thres_d, opt.thres_h, opt.thres_v, opt.fdr_thres, opt.dist_thres, opt.size_thres) '''