Source code for comp_cpg_cell

# command time python /gale/ddn/snm3C/humanPFC/code/comp_cpg_cell.py --indir /gale/raidix/rdx-5/zhoujt/projects/methylHiC/PFC_batch_merged/smoothed_matrix/1cell/${res0}b_resolution/ --outdir /gale/ddn/snm3C/humanPFC/smoothed_matrix/${res0}b_resolution/ --cell $(cat /gale/ddn/snm3C/humanPFC/smoothed_matrix/celllist_long.txt | head -${SGE_TASK_ID} | tail -1) --chrom ${c} --mode raw --cpg_file /gale/ddn/snm3C/humanPFC/smoothed_matrix/${res0}b_resolution/bins/hg19.${res0}bin.CpG.txt

import h5py
import argparse
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, diags

[docs] def comp_cpg_cell(indir, outdir, cell, chrom, mode, cpg_file): if chrom[:3]=='chr': c = chrom[3:] else: c = chrom cpg = pd.read_csv(cpg_file, sep='\t', header=0) cpg['ratio'] = (cpg['13_user_patt_count'] / (cpg['12_seq_len'] - cpg['10_num_N'])).fillna(0) cpg = cpg[(cpg['#1_usercol']==('chr'+c))]['ratio'].values[:,None] binfilter = (cpg[:,0]>0) if mode=='raw': D = np.loadtxt(f'{indir}chr{c}/{cell}_chr{c}.txt') if len(D)==0: D = np.array([[0,0,0]]) elif len(D.shape)==1: D = D.reshape(1,-1) A = csr_matrix((D[:, 2], (D[:, 0], D[:, 1])), shape = (len(cpg), len(cpg))) else: with h5py.File(f'{indir}chr{c}/{cell}_chr{c}_{mode}.hdf5', 'r') as f: g = f['Matrix'] A = csr_matrix((g['data'][()], g['indices'][()], g['indptr'][()]), g.attrs['shape']) A = A - diags(A.diagonal()) A = A + A.T A = A + diags((A.sum(axis=0).A.ravel()==0).astype(int)) # normalize row sum to 1 d = diags(1 / A.sum(axis=0).A.ravel()) P = d.dot(A) # weighted average of cpg density comp = A.dot(cpg)[:,0] tmp = comp[binfilter] Apos = (tmp > np.percentile(tmp, 80)) Bpos = (tmp < np.percentile(tmp, 20)) E = A.tocoo() decay = np.array([E.diagonal(i).mean() for i in range(E.shape[0])]) E.data = E.data / decay[np.abs(E.col - E.row)] E = E.tocsr()[np.ix_(binfilter, binfilter)] score = [E[np.ix_(Apos, Apos)].sum(), E[np.ix_(Bpos, Bpos)].sum(), E[np.ix_(Apos, Bpos)].sum()] np.save(f'{outdir}chr{c}/{cell}_chr{c}_{mode}.cpgcomp.npy', comp.tolist() + score) return
''' parser = argparse.ArgumentParser() parser.add_argument('--indir', type=str, default=None, help='Directory of raw/imputed matrices end with /') parser.add_argument('--outdir', type=str, default=None, help='Directory of compartment matrix end with /') parser.add_argument('--cell', type=str, default=None, help='Full path of a file containing a list of cell identifiers to be concatenate') parser.add_argument('--chrom', type=str, default=None, help='Chromosome to impute') parser.add_argument('--mode', type=str, default=None, help='Suffix of imputed matrix file names') parser.add_argument('--cpg_file', type=str, default=None, help='CpG counts generated by bedtools nuc') opt = parser.parse_args() comp_cpg_cell(opt.indir, opt.outdir, opt.cell, opt.chrom, opt.mode, opt.cpg_file) '''