Source code for schicluster.loop.loop_calling

import cooler
import numpy as np
from scipy import stats
from scipy.ndimage import convolve
import pandas as pd
import time
from statsmodels.stats.multitest import multipletests
from heapq import heappop, heapify


def fetch_chrom(cool, chrom) -> np.ndarray:
    """Load the raw (unbalanced) contact matrix of one chromosome as a dense array."""
    return cool.matrix(balance=False, sparse=True).fetch(chrom).toarray()


def select_loop_candidates(cool_e, min_dist, max_dist, resolution, chrom):
    """Select loop candidate pixels on which to perform the t tests."""
    E = fetch_chrom(cool_e, chrom)
    loop = np.where(E > 0)  # loop is ([xs], [ys]) of E
    # Only keep the upper triangle, excluding pixels too close to the diagonal
    dist_filter = np.logical_and((loop[1] - loop[0]) > (min_dist / resolution),
                                 (loop[1] - loop[0]) < (max_dist / resolution))
    loop = (loop[0][dist_filter], loop[1][dist_filter])
    print(f'{chrom}\t{dist_filter.sum()} loop candidate pixels')
    return E, loop


def paired_t_test(cool_t, cool_t2, chrom, loop, n_cells):
    """Paired t test per pixel."""
    T = fetch_chrom(cool_t, chrom)
    T2 = fetch_chrom(cool_t2, chrom)
    loop_delta = T[loop]           # per-pixel mean across cells
    loop_t = loop_delta * n_cells  # per-pixel sum across cells
    loop_t2 = T2[loop] * n_cells   # per-pixel sum of squares across cells
    # Standard error of the mean, computed from the sum and sum of squares
    sed = np.sqrt((loop_t2 - loop_t ** 2 / n_cells) / (n_cells - 1) / n_cells)
    t_score = loop_delta / sed
    p_value = stats.t.sf(t_score, n_cells - 1)  # one-sided p value
    # Cohen d effect size
    d = 2 * t_score / np.sqrt(2 * n_cells)
    return p_value, loop_delta, d


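A minimal sanity check (standalone, not part of the module): for a hypothetical vector `x` of per-cell values at one pixel, the mean and mean of squares (what the T/T2 and E/E2 cools store, before the `n_cells` rescaling above) reproduce scipy's one-sided one-sample t test.

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(0.1, 1.0, size=50)  # hypothetical per-cell values for one pixel
n = x.size
loop_delta = x.mean()              # what the T (or E) cool stores per pixel
loop_t = loop_delta * n            # sum of x
loop_t2 = (x ** 2).mean() * n      # sum of x ** 2, from the T2 (or E2) cool
sed = np.sqrt((loop_t2 - loop_t ** 2 / n) / (n - 1) / n)  # standard error of the mean
t_score = loop_delta / sed
p_value = stats.t.sf(t_score, n - 1)

# alternative='greater' requires scipy >= 1.6
t_ref, p_ref = stats.ttest_1samp(x, 0.0, alternative='greater')
assert np.allclose([t_score, p_value], [t_ref, p_ref])

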
def scan_kernel(E, kernel, loop):
    """Scan the loop-surrounding background with the given kernel."""
    E_kernel = convolve(E, kernel, mode='mirror') * (E > 0)
    return E_kernel[loop]


def loop_background(E, pad, gap, loop):
    """Calculate the loop-surrounding background level with four masks."""
    w = pad * 2 + 1
    # Bottom-left corner mask
    kernel_bl = np.zeros((w, w), np.float32)
    kernel_bl[-pad:, :(pad - gap)] = 1
    kernel_bl[-(pad - gap):, :pad] = 1
    kernel_bl = kernel_bl / np.sum(kernel_bl)
    loop_bl = scan_kernel(E, kernel_bl, loop)

    # Donut mask around the center pixel
    kernel_donut = np.ones((w, w), np.float32)
    kernel_donut[pad, :] = 0
    kernel_donut[:, pad] = 0
    kernel_donut[(pad - gap):(pad + gap + 1), (pad - gap):(pad + gap + 1)] = 0
    kernel_donut = kernel_donut / np.sum(kernel_donut)
    loop_donut = scan_kernel(E, kernel_donut, loop)

    # Horizontal stripe mask
    kernel_h = np.ones((3, w), np.float32)
    kernel_h[:, (pad - gap):(pad + gap + 1)] = 0
    kernel_h = kernel_h / np.sum(kernel_h)
    loop_h = scan_kernel(E, kernel_h, loop)

    # Vertical stripe mask
    kernel_v = np.ones((w, 3), np.float32)
    kernel_v[(pad - gap):(pad + gap + 1), :] = 0
    kernel_v = kernel_v / np.sum(kernel_v)
    loop_v = scan_kernel(E, kernel_v, loop)

    return loop_bl, loop_donut, loop_h, loop_v


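As a standalone illustration (not part of the module), the donut mask for the default pad=5, gap=2 can be built and printed before normalization; the bottom-left, horizontal, and vertical masks follow the same pattern.

import numpy as np

pad, gap = 5, 2
w = pad * 2 + 1                    # 11 x 11 window around the candidate pixel
kernel_donut = np.ones((w, w), np.int8)
kernel_donut[pad, :] = 0           # clear the center row
kernel_donut[:, pad] = 0           # clear the center column
kernel_donut[(pad - gap):(pad + gap + 1),
             (pad - gap):(pad + gap + 1)] = 0  # clear the center square
print(kernel_donut)                # a ring of ones surrounding the masked center

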
def call_loop_single_chrom(group_prefix, chrom,
                           resolution=10000, min_dist=50000, max_dist=10000000,
                           pad=5, gap=2):
    """Calculate the t tests and loop background for one chromosome."""
    # Matrix cool objects
    cool_e = cooler.Cooler(f'{group_prefix}.E.cool')
    cool_e2 = cooler.Cooler(f'{group_prefix}.E2.cool')
    cool_t = cooler.Cooler(f'{group_prefix}.T.cool')
    cool_t2 = cooler.Cooler(f'{group_prefix}.T2.cool')
    n_cells = cool_e.info['group_n_cells']

    # Call loops
    print(f'{chrom}\tSelecting loop candidates.')
    E, loop = select_loop_candidates(cool_e=cool_e,
                                     min_dist=min_dist,
                                     max_dist=max_dist,
                                     resolution=resolution,
                                     chrom=chrom)
    print(f'{chrom}\tDoing paired T test with the local background.')
    local_p_value, loop_t, local_d = paired_t_test(cool_t=cool_t,
                                                   cool_t2=cool_t2,
                                                   chrom=chrom,
                                                   loop=loop,
                                                   n_cells=n_cells)
    print(f'{chrom}\tDoing paired T test with the global background.')
    global_p_value, loop_e, global_d = paired_t_test(cool_t=cool_e,
                                                     cool_t2=cool_e2,
                                                     chrom=chrom,
                                                     loop=loop,
                                                     n_cells=n_cells)
    print(f'{chrom}\tCalculating loop local background with different masks.')
    loop_bl, loop_donut, loop_h, loop_v = loop_background(E=E,
                                                          pad=pad,
                                                          gap=gap,
                                                          loop=loop)

    # Put together the loop table
    data = pd.DataFrame({
        'x': loop[0],
        'y': loop[1],
        'distance': (loop[1] - loop[0]) * resolution,
        'local_pval': local_p_value,
        'local_cohen_d': local_d,
        'global_pval': global_p_value,
        'global_cohen_d': global_d,
        'E': loop_e,
        'T': loop_t,
        'E_bl': loop_bl,
        'E_donut': loop_donut,
        'E_h': loop_h,
        'E_v': loop_v
    })
    data['chrom'] = chrom
    return data


def filter_by_background(data, thres_bl, thres_donut, thres_h, thres_v, resolution):
    """Mark pixels that are enriched over every local background mask."""
    data['bkfilter'] = (((data['E'] / data['E_bl'] > thres_bl) | (data['E_bl'] < 0))
                        & ((data['E'] / data['E_donut'] > thres_donut) | (data['E_donut'] < 0))
                        & ((data['E'] / data['E_h'] > thres_h) | (data['E_h'] < 0))
                        & ((data['E'] / data['E_v'] > thres_v) | (data['E_v'] < 0)))
    # Convert bin indices to bp coordinates
    data['x1'] = data['x'].astype(int) * resolution
    data['y1'] = data['y'].astype(int) * resolution
    data['x2'] = data['x1'] + resolution
    data['y2'] = data['y1'] + resolution
    del data['x']
    del data['y']
    return data


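A toy single-pixel illustration with made-up numbers (standalone): a pixel passes when its enrichment over each mask clears that mask's threshold, or when that mask's background is negative.

import pandas as pd

row = pd.DataFrame({'E': [2.0], 'E_bl': [1.0], 'E_donut': [1.2],
                    'E_h': [1.5], 'E_v': [-0.1],
                    'x': [10.0], 'y': [40.0]})
out = filter_by_background(row, thres_bl=1.33, thres_donut=1.33,
                           thres_h=1.2, thres_v=1.2, resolution=10000)
print(out[['bkfilter', 'x1', 'y1', 'x2', 'y2']])
# E/E_bl = 2.00 > 1.33, E/E_donut ~= 1.67 > 1.33, E/E_h ~= 1.33 > 1.2,
# and E_v < 0, so bkfilter is True; bin 10 becomes the interval [100000, 110000).

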
def find_summit(loop, res, dist_thres):
    """Merge nearby loop pixels and keep the highest-scoring pixel of each cluster."""
    loop = loop.copy()
    start_time = time.time()
    cord = loop[['x1', 'y1']].values // res
    idx = np.argsort(cord[:, 0])
    # Build the neighbor graph of pixels within dist_thres bins of each other
    neighbor = {i: [] for i in range(len(idx))}
    for i in range(len(idx) - 1):
        tmp = cord[idx[i]]
        for j in range(i + 1, len(idx)):
            if cord[idx[j], 0] - tmp[0] > dist_thres:
                break
            if np.abs(tmp[1] - cord[idx[j], 1]) <= dist_thres:
                neighbor[idx[i]].append(idx[j])
                neighbor[idx[j]].append(idx[i])
    nodescore = loop['E'].values
    flag = np.zeros(len(nodescore))
    tot = len(nodescore)
    summit = []
    # Max-heap of scores (negated for heapq) paired with positional indices
    nodeheap = (loop['E'] * -1).reset_index().reset_index()[['E', 'level_0']].values.tolist()
    heapify(nodeheap)
    while tot > 0:
        # Pop the highest-scoring unvisited pixel as a new summit seed
        t = int(heappop(nodeheap)[1])
        while flag[t]:
            t = int(heappop(nodeheap)[1])
        q = [t]
        flag[t] = 1
        tot -= 1
        head = 0
        flagtmp = np.zeros(len(nodescore))
        # BFS from the seed, absorbing neighbors with strictly lower scores
        while head < len(q):
            for t in neighbor[q[head]]:
                if not flagtmp[t] and nodescore[t] < nodescore[q[head]]:
                    if not flag[t]:
                        flag[t] = 1
                        tot -= 1
                    flagtmp[t] = 1
                    q.append(t)
            head += 1
        summit.append([q[0], len(q)])
    summit = np.array(summit)
    loop = loop.iloc[summit[:, 0]]
    loop['size'] = summit[:, 1]
    return loop


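A toy run with made-up coordinates (standalone) shows the clustering behavior:

import pandas as pd

demo = pd.DataFrame({'x1': [100000, 110000, 500000],
                     'y1': [300000, 310000, 800000],
                     'E': [0.5, 0.9, 0.3]})
print(find_summit(demo, res=10000, dist_thres=2))
# The first two pixels are within 2 bins of each other, so the E=0.9 pixel
# absorbs its weaker neighbor (size 2); the far pixel stands alone (size 1).

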
def call_loops(group_prefix, resolution, output_prefix,
               thres_bl=1.33, thres_donut=1.33, thres_h=1.2, thres_v=1.2,
               fdr_thres=0.1, dist_thres=20000, size_thres=1):
    """Call loops genome-wide, filter by background and FDR, and save the results."""
    try:
        group_q = f'{group_prefix}.Q.cool'
        chroms = cooler.Cooler(group_q).chromnames
    except OSError:
        group_q = f'{group_prefix}.Q.mcool::/resolutions/10000'
        chroms = cooler.Cooler(group_q).chromnames

    total_loops = []
    for chrom in chroms:
        print(f'Calling loops of chromosome {chrom}')
        # Note: per-chromosome calling uses fixed 10 kb parameters here
        data = call_loop_single_chrom(group_prefix,
                                      chrom,
                                      resolution=10000,
                                      min_dist=50000,
                                      max_dist=10000000,
                                      pad=5,
                                      gap=2)
        total_loops.append(data)
    total_loops = pd.concat(total_loops).reset_index(drop=True)

    # Add background judge info
    print('Filtering loop by background.')
    total_loops = filter_by_background(data=total_loops,
                                       thres_bl=thres_bl,
                                       thres_donut=thres_donut,
                                       thres_h=thres_h,
                                       thres_v=thres_v,
                                       resolution=resolution)

    # Group the loops by distance, then calculate FDR separately
    print('Filtering loop by FDR.')
    total_loops.dropna(subset=['local_pval', 'global_pval'], how='any', inplace=True)
    local_qs = []
    global_qs = []
    for dist in total_loops['distance'].unique():
        p_values = total_loops.loc[total_loops['distance'] == dist,
                                   ['local_pval', 'global_pval']]
        _, local_q, *_ = multipletests(p_values['local_pval'])
        local_qs.append(pd.Series(local_q, index=p_values.index))
        _, global_q, *_ = multipletests(p_values['global_pval'])
        global_qs.append(pd.Series(global_q, index=p_values.index))
    local_qs = pd.concat(local_qs).sort_index()
    global_qs = pd.concat(global_qs).sort_index()
    total_loops['local_qval'] = local_qs
    total_loops['global_qval'] = global_qs

    # Save the full loop table before applying any filter
    total_loops.to_hdf(f'{output_prefix}.totalloop_info.hdf', key='data')

    # Filter loops, save bedpe, and get summits
    filter_loops(total_loops,
                 output_prefix,
                 fdr_thres=fdr_thres,
                 resolution=resolution,
                 dist_thres=dist_thres,
                 size_thres=size_thres)
    return


def filter_loops(total_loops, output_prefix, fdr_thres, resolution, dist_thres, size_thres):
    """Apply the background and FDR filters, save bedpe files, and find loop summits."""
    loop = total_loops.loc[total_loops['bkfilter']
                           & (total_loops['local_qval'] < fdr_thres)
                           & (total_loops['global_qval'] < fdr_thres)].copy()
    loop.to_hdf(f'{output_prefix}.loop_info.hdf', key='data')

    # Filter and save bedpe files
    bedpe_cols = ['chrom', 'x1', 'x2', 'chrom', 'y1', 'y2', 'E']
    loop.sort_values(by=['chrom', 'x1', 'y1'])[bedpe_cols].to_csv(
        f'{output_prefix}.loop.bedpe', sep='\t', index=False, header=None)

    scloop = total_loops.loc[total_loops['local_qval'] < fdr_thres]
    scloop.sort_values(by=['chrom', 'x1', 'y1'])[bedpe_cols].to_csv(
        f'{output_prefix}.localloop.bedpe', sep='\t', index=False, header=None)

    scloop = total_loops.loc[total_loops['global_qval'] < fdr_thres]
    scloop.sort_values(by=['chrom', 'x1', 'y1'])[bedpe_cols].to_csv(
        f'{output_prefix}.globalloop.bedpe', sep='\t', index=False, header=None)

    bkloop = total_loops.loc[total_loops['bkfilter']]
    bkloop.sort_values(by=['chrom', 'x1', 'y1'])[bedpe_cols].to_csv(
        f'{output_prefix}.bkloop.bedpe', sep='\t', index=False, header=None)

    # Find summits per chromosome
    print('Finding loop summit.')
    if loop.shape[0] > 0:
        summit = pd.concat([
            find_summit(loop=sub_df,
                        res=resolution,
                        dist_thres=dist_thres // resolution)
            for chrom, sub_df in loop.groupby('chrom')
        ], axis=0)
        summit = summit[summit['size'] >= size_thres]
        summit.sort_values(by=['chrom', 'x1', 'y1'])[bedpe_cols + ['size']].to_csv(
            f'{output_prefix}.loop_summit.bedpe', sep='\t', index=False, header=None)
    else:
        # No loops passed the filters; write an empty file
        with open(f'{output_prefix}.loop_summit.bedpe', 'w') as f:
            f.write(pd.DataFrame([]).to_csv())
    return


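Finally, a sketch of a typical invocation. The paths are hypothetical; call_loops expects {group_prefix}.E/.E2/.T/.T2 cool files plus a {group_prefix}.Q.cool (or .Q.mcool) produced by the upstream merging step.

call_loops(group_prefix='merged/neuron',   # hypothetical prefix of the merged group
           resolution=10000,
           output_prefix='loops/neuron',   # hypothetical output prefix
           fdr_thres=0.1,
           dist_thres=20000,
           size_thres=1)
# Writes loops/neuron.totalloop_info.hdf and .loop_info.hdf, four bedpe files
# (.loop, .localloop, .globalloop, .bkloop), and loops/neuron.loop_summit.bedpe.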