Source code for schicluster.cool.scool

import re
import subprocess
import warnings
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

import cooler
import pandas as pd
import pandas.errors
from cooler import create_scool

from .remove_blacklist import filter_contacts
from .utilities import get_chrom_offsets


[docs] def generate_scool_batch_data(cell_path_dict, resolution, chrom_offset, chrom_size_path, blacklist_1d_path, blacklist_2d_path, remove_duplicates, blacklist_resolution, output_path, chr1=1, chr2=5, pos1=2, pos2=6, min_pos_dist=2500): def single_cell_pixel(_path): try: contacts = filter_contacts(_path, chrom_size_path=chrom_size_path, blacklist_1d_path=blacklist_1d_path, blacklist_2d_path=blacklist_2d_path, remove_duplicates=remove_duplicates, resolution_2d=blacklist_resolution, chrom1=chr1, pos1=pos1, chrom2=chr2, pos2=pos2) except pandas.errors.EmptyDataError: # empty contacts file return pd.DataFrame([], columns=['bin1_id', 'bin2_id', 'count']) pos_dist = (contacts[pos1] - contacts[pos2]).abs() # filter contacts = contacts[contacts[chr1].isin(chrom_offset) & contacts[chr2].isin(chrom_offset) & (contacts[pos1] > 0) & (contacts[pos2] > 0)] # apply min_pos_dist filter on cis contact only contacts = contacts[(pos_dist > min_pos_dist) | # filter all close contact (contacts[chr1] != contacts[chr2])] # but keep all trans # calculate pixel bin1_id = contacts[chr1].map(chrom_offset) + (contacts[pos1] - 1) // resolution bin2_id = contacts[chr2].map(chrom_offset) + (contacts[pos2] - 1) // resolution counter = Counter() for x, y in zip(bin1_id, bin2_id): if x > y: x, y = y, x counter[(x, y)] += 1 pixel_df = pd.Series(counter).reset_index() pixel_df.columns = pd.Index(['bin1_id', 'bin2_id', 'count']) pixel_df = pixel_df.sort_values(['bin1_id', 'bin2_id']).reset_index(drop=True) return pixel_df with warnings.catch_warnings(): # ignore the warnings.simplefilter("ignore") with pd.HDFStore(output_path) as hdf: for cell_id, path in cell_path_dict.items(): hdf[cell_id] = single_cell_pixel(path) return
[docs] def generate_scool_single_resolution(cell_path_dict, chrom_size_path, resolution, output_path, blacklist_1d_path, blacklist_2d_path, remove_duplicates, blacklist_resolution, chr1=1, chr2=5, pos1=2, pos2=6, min_pos_dist=2500, batch_n=20, cpu=1): # parse chromosome sizes, prepare bin_df chrom_sizes = pd.read_csv( chrom_size_path, sep='\t', index_col=0, header=None).squeeze(axis=1) bins_df = cooler.binnify(chrom_sizes, resolution) chrom_offset = get_chrom_offsets(bins_df) chunk_dicts = defaultdict(dict) for i, (cell, path) in enumerate(cell_path_dict.items()): chunk_dicts[i // batch_n][cell] = path with ProcessPoolExecutor(cpu) as exe: futures = {} for batch, cell_path_dict in chunk_dicts.items(): batch_output = output_path + f'_{batch}' f = exe.submit(generate_scool_batch_data, cell_path_dict=cell_path_dict, resolution=resolution, chrom_offset=chrom_offset, output_path=batch_output, chr1=chr1, chr2=chr2, pos1=pos1, pos2=pos2, min_pos_dist=min_pos_dist, chrom_size_path=chrom_size_path, blacklist_1d_path=blacklist_1d_path, blacklist_2d_path=blacklist_2d_path, remove_duplicates=remove_duplicates, blacklist_resolution=blacklist_resolution) futures[f] = batch_output for future in as_completed(futures): # batch finished batch_output = futures[future] future.result() # dump batch result into cool cell_pixel_dict = {} with pd.HDFStore(batch_output, mode='r') as hdf: for cell_id in hdf.keys(): cell_id = cell_id[1:] # remove '/' cell_pixel_dict[cell_id] = hdf[cell_id] create_scool(output_path, bins=bins_df, cell_name_pixels_dict=cell_pixel_dict, ordered=True, mode='a') subprocess.run(['rm', '-f', batch_output], check=True) return
[docs] def generate_scool(contacts_table, output_prefix, chrom_size_path, resolutions, blacklist_1d_path=None, blacklist_2d_path=None, blacklist_resolution=10000, remove_duplicates=True, chr1=1, chr2=5, pos1=2, pos2=6, min_pos_dist=2500, cpu=1, batch_n=50): """ Generate single-resolution cool files from single-cell contact files recorded in contacts_table Parameters ---------- contacts_table tab-separated table containing tow columns, 1) cell id, 2) cell contact file path (juicer-pre format) No header output_prefix Output prefix of the cool files. Output path will be {output_prefix}.{resolution_str}.cool chrom_size_path Path to the chromosome size file, this file should be in UCSC chrom.sizes format. We will use this file as the reference to create matrix. It is recommended to remove small contigs or chrM from this file. resolutions Resolutions to generate the matrix. Each resolution will be stored in a separate file. blacklist_1d_path Path to blacklist region BED file, such as ENCODE blacklist. Either side of the contact overlapping with a blacklist region will be removed. blacklist_2d_path Path to blacklist region pair BEDPE file. Both side of the contact overlapping with the same blacklist region pair will be removed. blacklist_resolution Resolution in bps when consider the 2D blacklist region pairs. remove_duplicates If true, will remove duplicated contacts based on [chr1, pos1, chr2, pos2] values chr1 0 based index of chr1 column. chr2 0 based index of chr2 column. pos1 0 based index of pos1 column. pos2 0 based index of pos2 column. min_pos_dist Minimum distance for a fragment to be considered. cpu number of cpus to parallel. batch_n number of cells to deal with in each cpu process. Returns ------- """ # read cell paths cell_path_dict = pd.read_csv(contacts_table, header=None, index_col=0, sep='\t').squeeze(axis=1).to_dict() print(f'{len(cell_path_dict)} cells to process.') for resolution in resolutions: resolution_str = str(resolution) resolution_str = re.sub(r'000000$', 'M', resolution_str) resolution_str = re.sub(r'000$', 'K', resolution_str) output_path = f'{output_prefix}.{resolution_str}.scool' print('Generating', output_path) generate_scool_single_resolution(cell_path_dict=cell_path_dict, chrom_size_path=chrom_size_path, resolution=resolution, output_path=output_path, batch_n=batch_n, cpu=cpu, chr1=chr1, chr2=chr2, pos1=pos1, pos2=pos2, min_pos_dist=min_pos_dist, blacklist_1d_path=blacklist_1d_path, blacklist_2d_path=blacklist_2d_path, remove_duplicates=remove_duplicates, blacklist_resolution=blacklist_resolution) print('Finished', output_path) return