# Source code for schicluster.impute.snakemake
import pathlib
import schicluster
import cooler
import pandas as pd
# [docs]
# First entry of the installed schicluster package's search path; the bundled
# Snakefile templates (e.g. 'impute/impute_new.Snakefile') are opened relative to it.
PACKAGE_DIR = pathlib.Path(schicluster.__path__[0])
# [docs]
def prepare_impute(output_dir,
                   chrom_size_path,
                   output_dist,
                   window_size,
                   step_size,
                   resolution,
                   input_scool=None,
                   cell_table=None,
                   batch_size=100,
                   logscale=False,
                   pad=1,
                   std=1,
                   rp=0.5,
                   tol=0.01,
                   min_cutoff=1e-5,
                   chrom1=1,
                   pos1=2,
                   chrom2=5,
                   pos2=6,
                   cpu_per_job=10):
    """
    Prepare snakemake files and directory structure for cell contacts imputation.

    The cells are split into batches of ``batch_size``. For every batch a
    ``chunk{i}`` sub-directory is created under ``output_dir`` holding a
    ``Snakefile`` (a header of ``name = value`` assignments followed by the
    packaged ``impute/impute_new.Snakefile`` template). One snakemake command
    per chunk is written to ``output_dir/snakemake_cmd.txt``.

    Parameters
    ----------
    output_dir
        Directory that receives the chunk sub-directories and
        ``snakemake_cmd.txt``; created (with parents) if missing.
    chrom_size_path
        Path to the chromosome sizes file; written to the Snakefile as an
        absolute path.
    output_dist, window_size, step_size, resolution
        Cast to ``int`` and written to the Snakefile header. Their meaning is
        defined by the Snakefile template, which is not part of this module.
    input_scool
        Path to a scool file. When given, the cell ids are taken from the
        last path component of every cooler listed in the file, and
        ``cell_table`` is ignored.
    cell_table
        Path to a tab-separated table without header whose first column is
        used as index. Only used when ``input_scool`` is None. Each chunk
        gets its slice of the table saved as ``cell_table.csv``.
    batch_size
        Maximum number of cells per chunk; must be at least 1.
    logscale
        If true, ``--logscale`` is written as ``logscale_str``; otherwise an
        empty string is written.
    pad, std, rp, tol, min_cutoff
        Written unchanged to the Snakefile header.
    chrom1, pos1, chrom2, pos2
        Written to the Snakefile header only in ``cell_table`` mode
        (``pos1``/``pos2`` are cast to ``int``). Presumably column indices of
        the per-cell contact files -- confirm against the Snakefile template.
    cpu_per_job
        Value passed to ``snakemake -j`` in every generated command.

    Raises
    ------
    ValueError
        If neither ``input_scool`` nor ``cell_table`` is provided, or if
        ``batch_size`` is smaller than 1.
    """
    import shlex

    # Validate before touching the filesystem so a bad call leaves no
    # half-created output directory behind.
    if input_scool is None and cell_table is None:
        raise ValueError('Either input_scool or cell_table must be provided.')
    if batch_size < 1:
        raise ValueError(f'batch_size must be at least 1, got {batch_size}.')

    output_dir = pathlib.Path(output_dir).absolute()
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(PACKAGE_DIR / 'impute/impute_new.Snakefile') as f:
        snake_template = f.read()

    logscale_str = '--logscale' if logscale else ''

    use_scool = input_scool is not None
    if use_scool:
        input_scool = str(pathlib.Path(input_scool).absolute())
        cell_list = cooler.fileops.list_coolers(input_scool)
        # Cooler paths inside the scool look like '.../<cell_id>'.
        scool_cell_ids = [i.split('/')[-1] for i in cell_list]
    else:
        cell_list = pd.read_csv(cell_table, sep='\t', index_col=0, header=None)

    # Settings shared by every chunk; string values carry their own quotes
    # because they are pasted verbatim into the Snakefile as Python literals.
    shared_parameters = dict(
        chrom_size_path=f"'{pathlib.Path(chrom_size_path).absolute()}'",
        logscale_str=f'"{logscale_str}"',
        pad=pad,
        std=std,
        window_size=int(window_size),
        step_size=int(step_size),
        resolution=int(resolution),
        output_dist=int(output_dist),
        rp=rp,
        tol=tol,
        min_cutoff=min_cutoff,
    )

    chunk_dirs = []
    for i, chunk_start in enumerate(range(0, len(cell_list), batch_size)):
        chunk_end = chunk_start + batch_size
        chunk_dir = output_dir / f'chunk{i}'
        chunk_dir.mkdir(parents=True, exist_ok=True)

        parameters = dict(shared_parameters)
        if use_scool:
            parameters['input_scool'] = f"'{input_scool}'"
            parameters['cell_ids'] = str(scool_cell_ids[chunk_start:chunk_end])
        else:
            parameters['chrom1'] = chrom1
            parameters['chrom2'] = chrom2
            parameters['pos1'] = int(pos1)
            parameters['pos2'] = int(pos2)
            cell_list.iloc[chunk_start:chunk_end].to_csv(
                chunk_dir / 'cell_table.csv', index=True, header=False)

        parameters_str = '\n'.join(f'{k} = {v}' for k, v in parameters.items())
        # Explicit newline so the last assignment can never be fused with the
        # first line of the template, whatever the template starts with.
        this_snakefile = parameters_str + '\n' + snake_template
        with open(chunk_dir / 'Snakefile', 'w') as f:
            f.write(this_snakefile)
        chunk_dirs.append(chunk_dir)

    with open(output_dir / 'snakemake_cmd.txt', 'w') as f:
        for chunk_dir in chunk_dirs:
            # Quote paths so directories containing spaces or shell
            # metacharacters still yield a runnable command line; safe
            # paths are emitted unchanged.
            workdir = shlex.quote(str(chunk_dir))
            snakefile = shlex.quote(f'{chunk_dir}/Snakefile')
            cmd = f'snakemake -d {workdir} --snakefile {snakefile} -j {cpu_per_job}'
            f.write(cmd + '\n')