# Source code for schicluster.cool.merge

import sys
import cooler
import numpy as np
import pandas as pd
from glob import glob
from scipy.sparse import csr_matrix, diags
from schicluster.cool.utilities import get_chrom_offsets

def load_cell_csv_to_csr(cell_path, chrom_offset, bins_df, resolution, chrom1, pos1, chrom2, pos2, min_pos_dist):
    """
    Load one cell's contact table and bin it into an upper-triangular count matrix.

    Parameters
    ----------
    cell_path
        Path of a header-less, tab-separated contact file. Lines starting
        with ``#`` are treated as comments and skipped.
    chrom_offset
        Mapping from chromosome name to the index of that chromosome's first
        bin. Contacts with either end on a chromosome that is not a key of
        this mapping are discarded.
    bins_df
        Bin table; only its number of rows is used, as the side length of the
        returned square matrix.
    resolution
        Bin size, in the same unit as the position columns.
    chrom1, pos1, chrom2, pos2
        Column indices (0-based) of the chromosome and position of the two
        contact ends in the file.
    min_pos_dist
        Intra-chromosomal contacts whose ends are ``min_pos_dist`` or less
        apart are discarded. Inter-chromosomal contacts are always kept.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape ``(n_bins, n_bins)`` whose entry ``(i, j)``, with
        ``i <= j``, is the number of contacts between bins ``i`` and ``j``.
    """
    contacts = pd.read_csv(
        cell_path,
        header=None,
        index_col=None,
        sep='\t',
        comment='#',
        usecols=[chrom1, pos1, chrom2, pos2],
    )

    # Both ends must sit on a chromosome that has a bin offset.
    on_known_chroms = contacts[chrom1].isin(chrom_offset) & contacts[chrom2].isin(chrom_offset)
    contacts = contacts[on_known_chroms]

    # Short-range filter applies to intra-chromosomal contacts only.
    pos_dist = (contacts[pos1] - contacts[pos2]).abs()
    contacts = contacts[(pos_dist > min_pos_dist) | (contacts[chrom1] != contacts[chrom2])]

    # A contact with a missing coordinate cannot be assigned to a bin.
    contacts = contacts.dropna(subset=[pos1, pos2])

    # Positions are treated as 1-based: position ``resolution`` still falls in
    # the first bin of its chromosome. Bin ids are computed as plain arrays
    # rather than written back onto the filtered frame, which avoids pandas'
    # chained-assignment (SettingWithCopy) warning.
    bin1 = (contacts[chrom1].map(chrom_offset) + (contacts[pos1] - 1) // resolution).to_numpy().astype(np.int64)
    bin2 = (contacts[chrom2].map(chrom_offset) + (contacts[pos2] - 1) // resolution).to_numpy().astype(np.int64)

    # Order every pair so the smaller bin id is the row (upper triangle).
    row_ids = np.minimum(bin1, bin2)
    col_ids = np.maximum(bin1, bin2)

    # The CSR constructor sums duplicate (row, col) coordinates, so giving each
    # contact a weight of one yields the per-bin-pair contact counts.
    n_bins = bins_df.shape[0]
    weights = np.ones(row_ids.size, dtype=np.int64)
    return csr_matrix((weights, (row_ids, col_ids)), shape=(n_bins, n_bins))
def merge_cell_raw(cell_table, chrom_size_path, output_file, resolution=5000, chrom1=1, pos1=2, chrom2=5, pos2=6, min_pos_dist=2500):
    """
    Sum the raw contact matrices of all listed cells and save the total as a cooler file.

    Parameters
    ----------
    cell_table
        Path of a header-less, tab-separated table. The first column is used
        as the index (printed after each cell is added); the remaining column
        is passed to ``load_cell_csv_to_csr`` as the cell's contact file path.
    chrom_size_path
        Path of a header-less, tab-separated table whose first column is used
        as the index; the resulting series is handed to ``cooler.binnify``.
    output_file
        Cooler URI the merged matrix is written to.
    resolution
        Bin size used for ``cooler.binnify`` and for binning each cell.
    chrom1, pos1, chrom2, pos2, min_pos_dist
        Forwarded unchanged to ``load_cell_csv_to_csr`` for every cell.

    Returns
    -------
    None
    """
    chrom_sizes = pd.read_csv(chrom_size_path, sep='\t', index_col=0, header=None).squeeze(axis=1)
    bins_df = cooler.binnify(chrom_sizes, resolution)
    chrom_offset = get_chrom_offsets(bins_df)
    n_bins = bins_df.shape[0]

    cell_list = pd.read_csv(cell_table, sep='\t', index_col=0, header=None).squeeze(axis=1)

    # Start from an empty matrix and accumulate one cell at a time.
    merged = csr_matrix((n_bins, n_bins))
    for cell_path, cell_id in zip(cell_list.values, cell_list.index):
        cell_matrix = load_cell_csv_to_csr(
            cell_path=cell_path,
            chrom_offset=chrom_offset,
            bins_df=bins_df,
            resolution=resolution,
            chrom1=chrom1,
            pos1=pos1,
            chrom2=chrom2,
            pos2=pos2,
            min_pos_dist=min_pos_dist,
        )
        merged += cell_matrix
        # Progress output: the index label of the cell just merged.
        print(cell_id)

    # Adding the diagonal onto the matrix doubles every diagonal entry.
    # NOTE(review): the reason for the doubling is not visible in this file.
    merged = merged + diags(merged.diagonal())

    # Flatten to (row, col, value) triplets, cast to int, for the pixel table.
    coo = merged.tocoo()
    triplets = np.array([coo.row, coo.col, coo.data], dtype=int).T
    pixels = pd.DataFrame(triplets, columns=['bin1_id', 'bin2_id', 'count'])

    cooler.create_cooler(cool_uri=output_file, bins=bins_df, pixels=pixels, ordered=True)
    return