Source code for schicluster.cool.scool

import re
import subprocess
import warnings
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

import cooler
import pandas as pd
import pandas.errors
from cooler import create_scool

from .remove_blacklist import filter_contacts
from .utilities import get_chrom_offsets



[docs]
def generate_scool_batch_data(cell_path_dict,
                              resolution,
                              chrom_offset,
                              chrom_size_path,
                              blacklist_1d_path,
                              blacklist_2d_path,
                              remove_duplicates,
                              blacklist_resolution,
                              output_path,
                              chr1=1,
                              chr2=5,
                              pos1=2,
                              pos2=6,
                              min_pos_dist=2500):
    def single_cell_pixel(_path):
        try:
            contacts = filter_contacts(_path,
                                       chrom_size_path=chrom_size_path,
                                       blacklist_1d_path=blacklist_1d_path,
                                       blacklist_2d_path=blacklist_2d_path,
                                       remove_duplicates=remove_duplicates,
                                       resolution_2d=blacklist_resolution,
                                       chrom1=chr1,
                                       pos1=pos1,
                                       chrom2=chr2,
                                       pos2=pos2)
        except pandas.errors.EmptyDataError:
            # empty contacts file
            return pd.DataFrame([], columns=['bin1_id', 'bin2_id', 'count'])
        pos_dist = (contacts[pos1] - contacts[pos2]).abs()

        # filter
        contacts = contacts[contacts[chr1].isin(chrom_offset)
                            & contacts[chr2].isin(chrom_offset)
                            & (contacts[pos1] > 0)
                            & (contacts[pos2] > 0)]
        # apply min_pos_dist filter on cis contact only
        contacts = contacts[(pos_dist > min_pos_dist) |  # filter all close contact
                            (contacts[chr1] != contacts[chr2])]  # but keep all trans

        # calculate pixel
        bin1_id = contacts[chr1].map(chrom_offset) + (contacts[pos1] - 1) // resolution
        bin2_id = contacts[chr2].map(chrom_offset) + (contacts[pos2] - 1) // resolution
        counter = Counter()
        for x, y in zip(bin1_id, bin2_id):
            if x > y:
                x, y = y, x
            counter[(x, y)] += 1
        pixel_df = pd.Series(counter).reset_index()
        pixel_df.columns = pd.Index(['bin1_id', 'bin2_id', 'count'])
        pixel_df = pixel_df.sort_values(['bin1_id',
                                         'bin2_id']).reset_index(drop=True)
        return pixel_df

    with warnings.catch_warnings():
        # ignore the
        warnings.simplefilter("ignore")
        with pd.HDFStore(output_path) as hdf:
            for cell_id, path in cell_path_dict.items():
                hdf[cell_id] = single_cell_pixel(path)
    return




[docs]
def generate_scool_single_resolution(cell_path_dict,
                                     chrom_size_path,
                                     resolution,
                                     output_path,
                                     blacklist_1d_path,
                                     blacklist_2d_path,
                                     remove_duplicates,
                                     blacklist_resolution,
                                     chr1=1,
                                     chr2=5,
                                     pos1=2,
                                     pos2=6,
                                     min_pos_dist=2500,
                                     batch_n=20,
                                     cpu=1):
    # parse chromosome sizes, prepare bin_df
    chrom_sizes = pd.read_csv(
        chrom_size_path,
        sep='\t',
        index_col=0,
        header=None).squeeze(axis=1)
    bins_df = cooler.binnify(chrom_sizes, resolution)
    chrom_offset = get_chrom_offsets(bins_df)

    chunk_dicts = defaultdict(dict)
    for i, (cell, path) in enumerate(cell_path_dict.items()):
        chunk_dicts[i // batch_n][cell] = path

    with ProcessPoolExecutor(cpu) as exe:
        futures = {}
        for batch, cell_path_dict in chunk_dicts.items():
            batch_output = output_path + f'_{batch}'
            f = exe.submit(generate_scool_batch_data,
                           cell_path_dict=cell_path_dict,
                           resolution=resolution,
                           chrom_offset=chrom_offset,
                           output_path=batch_output,
                           chr1=chr1, chr2=chr2,
                           pos1=pos1, pos2=pos2,
                           min_pos_dist=min_pos_dist,
                           chrom_size_path=chrom_size_path,
                           blacklist_1d_path=blacklist_1d_path,
                           blacklist_2d_path=blacklist_2d_path,
                           remove_duplicates=remove_duplicates,
                           blacklist_resolution=blacklist_resolution)
            futures[f] = batch_output

        for future in as_completed(futures):
            # batch finished
            batch_output = futures[future]
            future.result()

            # dump batch result into cool
            cell_pixel_dict = {}
            with pd.HDFStore(batch_output, mode='r') as hdf:
                for cell_id in hdf.keys():
                    cell_id = cell_id[1:]  # remove '/'
                    cell_pixel_dict[cell_id] = hdf[cell_id]
            create_scool(output_path,
                         bins=bins_df,
                         cell_name_pixels_dict=cell_pixel_dict,
                         ordered=True,
                         mode='a')
            subprocess.run(['rm', '-f', batch_output], check=True)
    return




[docs]
def generate_scool(contacts_table,
                   output_prefix,
                   chrom_size_path,
                   resolutions,
                   blacklist_1d_path=None,
                   blacklist_2d_path=None,
                   blacklist_resolution=10000,
                   remove_duplicates=True,
                   chr1=1,
                   chr2=5,
                   pos1=2,
                   pos2=6,
                   min_pos_dist=2500,
                   cpu=1,
                   batch_n=50):
    """
    Generate single-resolution cool files from single-cell contact files recorded in contacts_table

    Parameters
    ----------
    contacts_table
        tab-separated table containing tow columns, 1) cell id, 2) cell contact file path (juicer-pre format)
        No header
    output_prefix
        Output prefix of the cool files. Output path will be {output_prefix}.{resolution_str}.cool
    chrom_size_path
        Path to the chromosome size file, this file should be in UCSC chrom.sizes format. We will use this file as
        the reference to create matrix. It is recommended to remove small contigs or chrM from this file.
    resolutions
        Resolutions to generate the matrix. Each resolution will be stored in a separate file.
    blacklist_1d_path
        Path to blacklist region BED file, such as ENCODE blacklist.
        Either side of the contact overlapping with a blacklist region will be removed.
    blacklist_2d_path
        Path to blacklist region pair BEDPE file.
        Both side of the contact overlapping with the same blacklist region pair will be removed.
    blacklist_resolution
        Resolution in bps when consider the 2D blacklist region pairs.
    remove_duplicates
        If true, will remove duplicated contacts based on [chr1, pos1, chr2, pos2] values
    chr1
        0 based index of chr1 column.
    chr2
        0 based index of chr2 column.
    pos1
        0 based index of pos1 column.
    pos2
        0 based index of pos2 column.
    min_pos_dist
        Minimum distance for a fragment to be considered.
    cpu
        number of cpus to parallel.
    batch_n
        number of cells to deal with in each cpu process.

    Returns
    -------

    """
    # read cell paths
    cell_path_dict = pd.read_csv(contacts_table,
                                 header=None,
                                 index_col=0,
                                 sep='\t').squeeze(axis=1).to_dict()
    print(f'{len(cell_path_dict)} cells to process.')

    for resolution in resolutions:
        resolution_str = str(resolution)
        resolution_str = re.sub(r'000000$', 'M', resolution_str)
        resolution_str = re.sub(r'000$', 'K', resolution_str)
        output_path = f'{output_prefix}.{resolution_str}.scool'

        print('Generating', output_path)
        generate_scool_single_resolution(cell_path_dict=cell_path_dict,
                                         chrom_size_path=chrom_size_path,
                                         resolution=resolution,
                                         output_path=output_path,
                                         batch_n=batch_n,
                                         cpu=cpu,
                                         chr1=chr1, chr2=chr2,
                                         pos1=pos1, pos2=pos2,
                                         min_pos_dist=min_pos_dist,
                                         blacklist_1d_path=blacklist_1d_path,
                                         blacklist_2d_path=blacklist_2d_path,
                                         remove_duplicates=remove_duplicates,
                                         blacklist_resolution=blacklist_resolution)
        print('Finished', output_path)
    return