# Source code for domain_concatcell_chr

# for c in `seq 1 22`; do awk -v c=$c '{printf("/gale/ddn/snm3C/humanPFC/smoothed_matrix/25kb_resolution/chr%s/%s_chr%s_pad2_std1_rp0.5_sqrtvc.w10.domain.bed\n",c,$1,c)}' celllist_long.txt > 25kb_resolution/filelist/domainlist_pad2_std1_rp0.5_sqrtvc_chr${c}.txt; echo $c; done
# for c in `seq 1 22`; do awk -v c=$c '{printf("/gale/ddn/snm3C/humanPFC/smoothed_matrix/25kb_resolution/chr%s/%s_chr%s_pad2_std1_rp0.5_sqrtvc.w10.ins.npy\n",c,$1,c)}' celllist_long.txt > 25kb_resolution/filelist/inslist_pad2_std1_rp0.5_sqrtvc_chr${c}.txt; echo $c; done

# command time python /gale/ddn/snm3C/humanPFC/code/domain_concatcell_chr.py --cell_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/25kb_resolution/filelist/inslist_pad2_std1_rp0.5_sqrtvc_chr${c}.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/${res0}b_resolution/merged/pad2_std1_rp0.5_sqrtvc_chr${c}.w10 --res 25000 --input_type insulation --ncpus 10
# command time python /gale/ddn/snm3C/humanPFC/code/domain_concatcell_chr.py --cell_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/25kb_resolution/filelist/domainlist_pad2_std1_rp0.5_sqrtvc_chr${c}.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/${res0}b_resolution/merged/pad2_std1_rp0.5_sqrtvc_chr${c}.w10 --res 25000 --input_type boundary --ncpus 10

import argparse
import numpy as np
import pandas as pd
from multiprocessing import Pool
from scipy.sparse import csr_matrix, save_npz

def load_insulation(i):
    """Load the insulation array of the ``i``-th cell.

    The file path is read from the module-level global ``celllist``, which
    ``domain_concatcell_chr`` fills in before starting the worker pool.

    Parameters
    ----------
    i : int
        Index into the global ``celllist``.

    Returns
    -------
    list
        ``[i, data]`` where ``data`` is whatever ``np.load`` returned, or the
        integer ``0`` as a failure sentinel when the file could not be loaded.
    """
    path = celllist[i]
    try:
        data = np.load(path)
    except Exception:
        # Best effort: report the offending path and let the caller skip this
        # cell. Catching ``Exception`` instead of using a bare ``except``
        # keeps KeyboardInterrupt/SystemExit able to stop a worker.
        print(path)
        data = 0
    return [i, data]
def load_boundary(i):
    """Build a per-bin boundary count vector for the ``i``-th cell.

    Reads the headerless, tab-separated file at ``celllist[i]`` and keeps the
    rows whose column 3 equals ``'domain'``. Columns 1 and 2 of those rows
    (presumably domain start and end coordinates -- confirm against the
    domain caller's output format) are floor-divided by the global bin size
    ``rs`` and the corresponding bins are incremented. ``celllist`` and
    ``rs`` are module-level globals set by ``domain_concatcell_chr``.

    Parameters
    ----------
    i : int
        Index into the global ``celllist``.

    Returns
    -------
    list
        ``[i, data]`` where ``data`` is a 1-D float array, or the integer
        ``0`` as a failure sentinel when the file could not be read.
    """
    path = celllist[i]
    try:
        table = pd.read_csv(path, sep='\t', header=None)
    except Exception:
        # Best effort: report the offending path and let the caller skip this
        # cell. Catching ``Exception`` instead of using a bare ``except``
        # keeps KeyboardInterrupt/SystemExit able to stop a worker.
        print(path)
        return [i, 0]

    # The vector length is derived from column 2 of the last row of the file.
    n_bins = int(table.iloc[-1, 2] // rs) + 1
    data = np.zeros(n_bins)
    edges = table[table[3] == 'domain'][[1, 2]].values // rs
    # Two separate fancy-index updates: a bin that is both the end of one
    # domain and the start of another is counted twice, while repeated
    # indices within a single statement are incremented only once.
    data[edges[:, 0]] += 1
    data[edges[:, 1]] += 1
    return [i, data]
def domain_concatcell_chr(cell_list, outprefix, res, input_type='insulation', ncpus=10):
    """Stack per-cell insulation or boundary vectors into one cell-by-bin matrix.

    Parameters
    ----------
    cell_list : str
        Path of a text file listing one per-cell input file per line.
    outprefix : str
        Output prefix; the result is written to
        ``{outprefix}.insulation.npy`` (dense) or
        ``{outprefix}.boundary.npz`` (scipy CSR).
    res : int
        Bin size, published to the workers through the global ``rs``.
    input_type : {'insulation', 'boundary'}
        Selects ``load_insulation`` or ``load_boundary`` as the per-cell loader.
    ncpus : int
        Number of worker processes in the pool.

    Raises
    ------
    ValueError
        If ``input_type`` is not supported, or if no input file could be
        loaded at all.

    Notes
    -----
    Cells whose file failed to load keep an all-zero row in the output.
    """
    # The loaders read these module-level globals, so they must be assigned
    # before the pool is created.
    # NOTE(review): this presumably relies on fork-style workers inheriting
    # module state -- confirm before running under the "spawn" start method.
    global celllist, rs

    loaders = {'insulation': load_insulation, 'boundary': load_boundary}
    if input_type not in loaders:
        raise ValueError(
            f"input_type must be 'insulation' or 'boundary', got {input_type!r}"
        )

    # ``np.str`` was removed in NumPy 1.24; the builtin ``str`` is equivalent.
    # ``atleast_1d`` keeps a single-line list from collapsing to a 0-d array.
    celllist = np.atleast_1d(np.loadtxt(cell_list, dtype=str))
    rs = res

    # The context manager releases the workers even if a loader raises.
    with Pool(ncpus) as pool:
        result = pool.map(loaders[input_type], np.arange(len(celllist)))

    # Failed cells are reported as the int sentinel 0, so take the row width
    # from the first cell that actually loaded rather than blindly from cell 0.
    n_bins = next((len(x) for _, x in result if not isinstance(x, int)), None)
    if n_bins is None:
        raise ValueError(f'None of the files listed in {cell_list} could be loaded')

    ins = np.zeros((len(celllist), n_bins))
    for i, x in result:
        if not isinstance(x, int):
            ins[i] = x

    if input_type == 'insulation':
        np.save(f'{outprefix}.{input_type}.npy', ins)
    else:
        save_npz(f'{outprefix}.{input_type}.npz', csr_matrix(ins))
    return
if __name__ == '__main__':
    # Command-line entry point. It only runs when the file is executed as a
    # script, so importing the module stays free of side effects.
    parser = argparse.ArgumentParser()
    parser.add_argument('--cell_list', type=str, required=True,
                        help='Full path of a file containing the full path of all '
                             'insulation npy or domain txt files to be concatenated')
    parser.add_argument('--outprefix', type=str, required=True,
                        help='Prefix of concatenated matrix including directory')
    parser.add_argument('--res', type=int, required=True,
                        help='Bin size as integer')
    parser.add_argument('--input_type', type=str, default='insulation',
                        choices=['insulation', 'boundary'],
                        help='Whether input files are insulation.npy or domain.txt')
    parser.add_argument('--ncpus', type=int, default=10,
                        help='# threads for parallelization')
    opt = parser.parse_args()
    domain_concatcell_chr(opt.cell_list, opt.outprefix, opt.res, opt.input_type, opt.ncpus)