Source code for schicluster.cool.contact_distance

import sys
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor, as_completed

[docs] def compute_decay(cell_name, contact_path, bins, chrom_sizes, resolution, chrom1=1, chrom2=5, pos1=2, pos2=6): # decay data = pd.read_csv(contact_path, sep='\t', header=None, index_col=None) data = data.loc[(data[chrom1]==data[chrom2]) & data[chrom1].isin(chrom_sizes.index)] # select cis-contact hist = np.histogram(np.abs(data[pos2] - data[pos1]), bins)[0] # sparsity data[[pos1, pos2]] = data[[pos1, pos2]] // resolution data = data.groupby(by=[chrom1, pos1, pos2])[chrom2].count().reset_index() data = data.loc[data[pos1]!=data[pos2], chrom1].value_counts() # total number of contacts on each chromosoe return [pd.DataFrame(data).set_axis([cell_name], axis=1), pd.DataFrame(hist, columns=[cell_name])]
[docs] def contact_distance(contact_table, chrom_size_path, resolution, output_prefix, chrom1, chrom2, pos1, pos2, cpu): chrom_sizes = pd.read_csv(chrom_size_path, sep='\t', header=None, index_col=0) nbins = np.floor(np.log2(chrom_sizes[1].values.max() / 2500) / 0.125) bins = 2500 * np.exp2(0.125 * np.arange(nbins+1)) #dist = int(chromsize[1].min() // res + 1) contact_table = pd.read_csv(contact_table, sep='\t', index_col=None, header=None) with ProcessPoolExecutor(cpu) as executor: futures = {} for cell_name, contact_path in contact_table.values: future = executor.submit( compute_decay, cell_name=cell_name, contact_path=contact_path, bins=bins, chrom_sizes=chrom_sizes, resolution=resolution, chrom1=chrom1, pos1=pos1, chrom2=chrom2, pos2=pos2, ) futures[future] = cell_name sparsity, decay = [], [] for future in as_completed(futures): cell_name = futures[future] xx, yy = future.result() sparsity.append(xx) decay.append(yy) print(f'{cell_name} finished') sparsity = pd.concat(sparsity, axis = 1)[contact_table[0]].T decay = pd.concat(decay, axis = 1)[contact_table[0]].T sparsity.to_hdf(f'{output_prefix}_chromsparsity.hdf5', key='data') decay.to_hdf(f'{output_prefix}_decay.hdf5', key='data')