Source code for generatematrix_cell

# python /gale/ddn/snm3C/humanPFC/code/generatematrix_cell.py --infile /gale/netapp/seq4/illumina_runs/ENTEx/Pool_BX_BY/mapping/output/TrCo_A9HOW_Plate1-1-H22/hic/TrCo_A9HOW_Plate1-1-H22-A13.3C.sorted_contacts_old.txt --outdir /gale/ddn/snm3C/humanPFC/cell_matrix/test/10kb_resolution/ --chrom_file /gale/netapp/home/zhoujt/genome/hg19/hg19.autosomal.chrom.sizes --split_file /gale/netapp/home/zhoujt/genome/hg19/hg19.chrsplit.bed --res 10000 --cell TrCo_A9HOW_Plate1-1-H22-A13
# python /gale/ddn/snm3C/humanPFC/code/generatematrix_cell.py --infile /gale/netapp/seq4/illumina_runs/ENTEx/Pool_BX_BY/mapping/output/TrCo_A9HOW_Plate1-1-H22/hic/TrCo_A9HOW_Plate1-1-H22-A13.3C.sorted_contacts_old.txt --outdir /gale/ddn/snm3C/humanPFC/cell_matrix/test/100kb_resolution/ --chrom_file /gale/netapp/home/zhoujt/genome/hg19/hg19.autosomal.chrom.sizes --res 100000 --cell TrCo_A9HOW_Plate1-1-H22-A13

import os
import argparse
import numpy as np
import pandas as pd
#from collections import Counter

[docs] def generatematrix_cell(infile, outdir, cell, res, chrom_file, chr1=1, pos1=2, chr2=5, pos2=6, split_file=None, dist=2500): chrom = np.loadtxt(chrom_file, dtype=np.str)[:,0] # add p/q arm to split chromosomes if not split_file: chrom_split = chrom.copy() else: splitbed = pd.read_csv(split_file, sep='\t', header=None, index_col=0) chrom_split = np.concatenate([[c+'p', c+'q'] if c in splitbed.index else [c] for c in chrom]) # create a folder per chromosome, containing all cells for c in chrom_split: if c[:3]=='chr': os.makedirs(f'{outdir}{c}/', exist_ok=True) else: os.makedirs(f'{outdir}chr{c}/', exist_ok=True) data = pd.read_csv(infile, sep='\t', header=None) # make pos1 < pos2 data.loc[data[pos1]>data[pos2], [pos1, pos2]] = data.loc[data[pos1]>data[pos2], [pos2, pos1]] # filter intra autosomal contacts with insertion size threshold data = data[(data[chr1]==data[chr2]) & (data[chr1].isin(chrom)) & (data[pos2] - data[pos1] > dist)] if split_file: splitfilter = data[chr1].isin(splitbed.index) datasplit = data[splitfilter] # end position of p and start position of q datasplit['p'] = splitbed.loc[datasplit[chr1], 1].values datasplit['q'] = splitbed.loc[datasplit[chr1], 2].values datasplit.loc[datasplit[pos1]<datasplit['p'], chr1] += 'p' datasplit.loc[datasplit[pos2]<datasplit['p'], chr2] += 'p' datasplit.loc[datasplit[pos1]>datasplit['q'], chr1] += 'q' datasplit.loc[datasplit[pos2]>datasplit['q'], chr2] += 'q' # fully divide to ensure bins consistency between p and q datasplit.loc[datasplit[pos1]>datasplit['q'], pos1] -= datasplit['q'] // res * res datasplit.loc[datasplit[pos2]>datasplit['q'], pos2] -= datasplit['q'] // res * res data = pd.concat([datasplit[data.columns], data[~splitfilter]], axis=0) data[[pos1, pos2]] = data[[pos1,pos2]] // res data = data[(data[chr1]==data[chr2]) & (data[chr1].isin(chrom_split)) & (data[pos1]!=data[pos2])] # count contacts at each pixel data = data.groupby(by=[chr1, pos1, pos2])[chr2].count().reset_index() for c, tmp in data.groupby(by=chr1): if c[:3]=='chr': tmp[[pos1, pos2, chr2]].astype(int).to_csv(f'{outdir}{c}/{cell}_{c}.txt', sep='\t', header=False, index=False) else: tmp[[pos1, pos2, chr2]].astype(int).to_csv(f'{outdir}{c}/{cell}_chr{c}.txt', sep='\t', header=False, index=False) ''' count = Counter() with open(infile, 'r') as fin: for line in fin: tmp = line.strip().split('\t') c1, c2, p1, p2 = tmp[chr1], tmp[chr2], int(tmp[pos1]), int(tmp[pos2]) if p1>p2: p1, p2 = p2, p1 if split_file: if c1 in splitbed.index: if p1 < splitbed.loc[c1,0]: c1 = c1 + 'p' elif p1 > splitbed.loc[c1,1]: c1 = c1 + 'q' p1 = p1 - splitbed.loc[c1,1] if c2 in splitbed.index: if p2 < splitbed.loc[c2,0]: c2 = c2 + 'p' elif p2 > splitbed.loc[c2,1]: c2 = c2 + 'q' p2 = p2 - splitbed.loc[c2,1] if (c1==c2) and (c1 in chrom_split) and (p2 - p1 >= dist): p1, p2 = p1 // res, p2 // res if p1!=p2: pos = '-'.join([c1, str(p1), str(p2)]) count[pos] += 1 fout = {c:open(outdir + 'chr' + c + '/' + cell + '_chr' + c + '.txt', 'w') for c in chrom_split} for key in count: tmp = key.split('-') fout[tmp[0]].write('{0}\t{1}\t{2}.0\n'.format(tmp[1], tmp[2], count[key])) for c in chrom: fout[c].close() ''' return
''' parser = argparse.ArgumentParser() parser.add_argument('--infile', type=str, default=None, help='Path to the short format contact file') parser.add_argument('--chr1', type=int, default=2, help='Column of fragment1 chromosome') parser.add_argument('--pos1', type=int, default=3, help='Column of fragment1 position') parser.add_argument('--chr2', type=int, default=6, help='Column of fragment2 chromosome') parser.add_argument('--pos2', type=int, default=7, help='Column of fragment2 position') parser.add_argument('--outdir', type=str, default=None, help='Output directory end with /') parser.add_argument('--cell', type=str, default=None, help='Specific identifier of a cell') parser.add_argument('--res', type=int, default=None, help='Bin size as integer to generate contact matrix') parser.add_argument('--chrom_file', type=str, default=None, help='Path to the chromosome size files containing all chromosome to be analyzed as the first column') ## mm10 or hg38 parser.add_argument('--split_file', type=str, default=None, help='Path to the bed file containing all chromosomes need to split and one region per chromosome as splitting point') parser.add_argument('--dist', type=int, default=2500, help='Minimum distance threshold of contacts to use') opt = parser.parse_args() generatematrix_cell(opt.infile, opt.outdir, opt.cell, opt.res, opt.chrom_file, opt.chr1-1, opt.pos1-1, opt.chr2-1, opt.pos2-1, opt.split_file, opt.dist) '''