Source code for gene_score

import cooler
import numpy as np
import pandas as pd
from scipy.sparse import triu, csr_matrix
from concurrent.futures import ProcessPoolExecutor, as_completed

[docs] def gene_score_raw(cell_path, chrom_sizes, gene_meta, resolution, chrom1, pos1, chrom2, pos2): data = pd.read_csv(cell_path, sep='\t', index_col=None, header=None, comment='#') data = data.loc[(data[chrom1]==data[chrom2]) & data[chrom1].isin(chrom_sizes.index)] result = [] for chrom in chrom_sizes.index: n_bins = (chrom_sizes.loc[chrom] // resolution) + 1 chrfilter = (data[chrom1]==chrom) if chrfilter.sum()==0: D = csr_matrix((n_bins, n_bins)) else: D = data.loc[data[chrom1]==chrfilter] D[[pos1, pos2]] = (D[[pos1, pos2]] - 1) // resolution D = D.groupby(by=[pos1, pos2])[chrom1].count().reset_index() D = csr_matrix((D[chrom1].astype(np.int32), (D[pos1], D[pos2])), (n_bins, n_bins)) gene = gene_meta.loc[gene_meta[0]==chrom, [1,2]].values for xx,yy in gene: result.append(D[xx:(yy+1), xx:(yy+1)].sum()) return result
[docs] def gene_score_impute(cell_path, chrom_sizes, gene_meta): cool = cooler.Cooler(cell_path) result = [] for chrom in chrom_sizes.index: D = triu(cool.matrix(balance=False, sparse=True).fetch(chrom), k=1).tocsr() gene = gene_meta.loc[gene_meta[0]==chrom, [1,2]].values for xx,yy in gene: result.append(D[(xx-1):(yy+1), xx:(yy+2)].sum()) return result
[docs] def gene_score(cell_table_path, gene_meta_path, resolution, output_hdf_path, chrom_size_path, slop=0, cpu=10, mode='impute', chrom1=1, pos1=2, chrom2=5, pos2=6): chrom_sizes = pd.read_csv(chrom_size_path, sep='\t', header=None, index_col=0) gene_meta = pd.read_csv(gene_meta_path, sep='\t', header=None, index_col=3) gene_meta = gene_meta[gene_meta[0].isin(chrom_sizes.index)] gene_meta[1] = (gene_meta[1] - slop) // resolution gene_meta[2] = (gene_meta[2] + slop) // resolution cell_table = pd.read_csv(cell_table_path, sep='\t', header=None, index_col=None).values with ProcessPoolExecutor(cpu) as exe: future_dict = {} for cell, cell_path in cell_table: if mode=='impute': future = exe.submit(gene_score_impute, cell_path=cell_path, chrom_sizes=chrom_sizes, gene_meta=gene_meta) elif mode=='raw': future = exe.submit(gene_score_raw, cell_path=cell_path, chrom_sizes=chrom_sizes, gene_meta=gene_meta, resolution=resolution, chrom1=chrom1, pos1=pos1, chrom2=chrom2, pos2=pos2) else: print("ERROR : Mode must be one of impute/raw/diag") return 0 future_dict[future] = cell result, cell_list = [], [] for future in as_completed(future_dict): cell = future_dict[future] print(f'{cell} finished.') result.append(future.result()) cell_list.append(cell) result = pd.DataFrame(result, index=cell_list, columns=gene_meta.index) result.to_hdf(output_hdf_path, key='data', complevel=9)