Source code for schicluster.cool.remove_blacklist

import pandas as pd
from collections import defaultdict
import pybedtools
from functools import lru_cache
from concurrent.futures import ProcessPoolExecutor, as_completed
import pathlib

@lru_cache()
[docs] def prepare_2d_blacklist_dict(blacklist_bedpe, resolution=10000): # read blacklist bed file, turn the region into bad pixel idx dict with resolution blacklist_bedpe_df = pd.read_csv(blacklist_bedpe, sep='\t', header=None) # turn region into region pixel idx blacklist_bedpe_df[1] //= resolution blacklist_bedpe_df[2] //= resolution # in case the region is smaller than resolution, add one so at least one pixel is bad blacklist_bedpe_df.loc[2, (blacklist_bedpe_df[2] - blacklist_bedpe_df[1]) < 1] += 1 blacklist_bedpe_df[4] //= resolution blacklist_bedpe_df[5] //= resolution # in case the region is smaller than resolution, add one so at least one pixel is bad blacklist_bedpe_df.loc[5, (blacklist_bedpe_df[5] - blacklist_bedpe_df[4]) < 1] += 1 chrom_pair_bad_points = defaultdict(set) for idx, row in blacklist_bedpe_df.iterrows(): for i in range(row[1], row[2]): for j in range(row[4], row[5]): chrom_pair_bad_points[row[0], row[3]].add((i, j)) # in case contact is not ordered as blacklist chrom_pair_bad_points[row[3], row[0]].add((j, i)) # return a dict, key is chrom pair return chrom_pair_bad_points
[docs] def _is_2d_blacklist(row, blacklist_2d): chrom1, chrom2, pos1, pos2 = row judge = (pos1, pos2) in blacklist_2d[(chrom1, chrom2)] return judge
[docs] def filter_contacts(contact_path, chrom_size_path=None, blacklist_1d_path=None, blacklist_2d_path=None, output_path=None, remove_duplicates=True, resolution_2d=10000, min_pos_dist=0, chrom1=1, pos1=2, chrom2=5, pos2=6): try: contacts = pd.read_csv(contact_path, header=None, sep='\t', dtype={ chrom1: str, pos1: int, chrom2: str, pos2: int }, index_col=None, comment='#', ) except Exception as e: print(f'Got error when opening {contact_path}') raise e print(f"{contact_path.split('/')[-1]}: {contacts.shape[0]} input contacts.") if remove_duplicates: # remove duplicates contacts.drop_duplicates(subset=[chrom1, pos1, chrom2, pos2], inplace=True) if min_pos_dist>0: contacts = contacts[((contacts[pos2] - contacts[pos1]).abs() > min_pos_dist) | (contacts[chrom1] != contacts[chrom2])] # remove additional chroms not exist in chrom_size_path chroms = pd.read_csv(chrom_size_path, sep='\t', index_col=0, header=None).index contacts = contacts[contacts[chrom1].isin(chroms) & contacts[chrom2].isin(chroms)].copy() if blacklist_1d_path is not None: blacklist_bed_df = pd.read_csv(blacklist_1d_path, sep='\t', index_col=None, header=None) blacklist_bed_df = blacklist_bed_df[blacklist_bed_df.iloc[:, 0].isin(chroms)].copy() blacklist_bed = pybedtools.BedTool.from_dataframe(blacklist_bed_df).sort(g=chrom_size_path) # determine blacklist 1d (either side overlap with 1D blacklist) left_bed_df = contacts[[chrom1, pos1, pos1]].reset_index().iloc[:, [1, 2, 3, 0]] right_bed_df = contacts[[chrom2, pos2, pos2]].reset_index().iloc[:, [1, 2, 3, 0]] left_bed_df.columns = ['chrom', 'start', 'end', 'id'] right_bed_df.columns = ['chrom', 'start', 'end', 'id'] contact_bed_df = pd.concat([left_bed_df, right_bed_df]) contact_bed = pybedtools.BedTool.from_dataframe(contact_bed_df).sort(g=chrom_size_path) # collect contact ids with either side overlap with blacklist bad_contacts = contact_bed.intersect(blacklist_bed, wa=True, u=True).to_dataframe() if bad_contacts.shape[0] > 0: bad_contacts = bad_contacts['name'].unique() else: # no bad contacts bad_contacts = set() # remove bad contacts contacts = contacts[~contacts.index.isin(bad_contacts)].copy() pybedtools.cleanup() if blacklist_2d_path is not None: chrom_2d_blacklist = prepare_2d_blacklist_dict(blacklist_2d_path, resolution=resolution_2d) contacts_idx = contacts[[chrom1, chrom2, pos1, pos2]].copy() # turn contact location into bin idx with resolution contacts_idx[pos1] //= resolution_2d contacts_idx[pos2] //= resolution_2d # determine blacklist 2d (both side overlap with 2D blacklist) is_blacklist_2d = contacts_idx.apply(_is_2d_blacklist, blacklist_2d=chrom_2d_blacklist, axis=1) contacts = contacts[~is_blacklist_2d].copy() print(f"{contact_path.split('/')[-1]}: {contacts.shape[0]} filtered contacts in scool.") if output_path is not None: contacts.to_csv(output_path, sep='\t', index=False, header=False) else: return contacts
[docs] def filter_contacts_wrapper(contact_table=None, output_dir=None, chrom_size_path=None, blacklist_1d_path=None, blacklist_2d_path=None, remove_duplicates=True, resolution_2d=10000, chrom1=1, pos1=2, chrom2=5, pos2=6, min_pos_dist=0, cpu=20): contact_table = pd.read_csv(contact_table, sep='\t', header=None, index_col=None) output_dir = pathlib.Path(output_dir).absolute() output_dir.mkdir(parents=True, exist_ok=True) with ProcessPoolExecutor(cpu) as executor: futures = {} for xx,yy in contact_table.values: future = executor.submit( filter_contacts, contact_path=yy, chrom_size_path=chrom_size_path, blacklist_1d_path=blacklist_1d_path, blacklist_2d_path=blacklist_2d_path, output_path=f'{output_dir}/{xx}.contact.rmbkl.tsv.gz', remove_duplicates=remove_duplicates, resolution_2d=resolution_2d, min_pos_dist=min_pos_dist, chrom1=chrom1, pos1=pos1, chrom2=chrom2, pos2=pos2, ) futures[future] = xx for future in as_completed(futures): print(f'{futures[future]} finished') return