Prepare pseudobulk analyses#
import os
import numpy as np
import pandas as pd
from glob import glob
import schicluster
PACKAGE_DIR = schicluster.__path__[0]
metadata = pd.read_csv('Tan2021_dipc_cluster.csv.gz', header=0, index_col=0)
metadata['rnatype'] = [xx.replace(' ', '_').replace('/', '').replace(',', '').replace('.', '') for xx in metadata['rnatype'].values]
metadata
|  | rnatype | score | dipctype | dipcleiden |
| --- | --- | --- | --- | --- |
| cortex-p028-cb_116 | Cortical_L6_Pyramidal_Cell | 0.999996 | Cortical L6 Pyramidal Cell | 5 |
| cortex-visual-control-p007-b6_182 | Cortical_L6_Pyramidal_Cell | 0.999992 | Neuron | 2 |
| cortex-p028-cb_112 | Cortical_L6_Pyramidal_Cell | 0.999996 | Cortical L2-5 Pyramidal Cell | 5 |
| cortex-visual-control-p001-b6_061 | Unknown_Interneuron_2 | 0.330496 | Neuron | 2 |
| cortex-p056-cb_216 | Microglia_Etc | 0.999989 | Microglia Etc. | 14 |
| ... | ... | ... | ... | ... |
| cortex-visual-control-p021-b6_090 | Mature_Oligodendrocyte | 0.999994 | Mature Oligodendrocyte | 16 |
| cortex-visual-control-p021-b6_012 | Cortical_L6_Pyramidal_Cell | 0.999997 | Neuron | 5 |
| hippocampus-p007-cb_046 | Microglia_Etc | 0.999993 | Microglia Etc. | 13 |
| cortex-visual-dark-p014-b6_106 | Microglia_Etc | 0.999994 | Microglia Etc. | 13 |
| cortex-visual-control-p021-b6_174 | Microglia_Etc | 0.999995 | Microglia Etc. | 13 |

3646 rows × 4 columns
def prepare_dir(output_dir, chunk_df, template, params):
    """Write a cell table and a parameterized Snakefile into output_dir."""
    os.makedirs(output_dir, exist_ok=True)
    cell_table_path = f'{output_dir}cell_table.csv'
    chunk_df.to_csv(cell_table_path, header=False, index=True)
    # prepend the parameters to the Snakefile template, one "key = value" per line
    params_str = '\n'.join(f'{k} = {v}' for k, v in params.items())
    with open(f'{output_dir}Snakefile_master', 'w') as f:
        f.write(params_str + '\n' + template)
    return
Raw#
Generate pseudobulk cool files of raw contacts (before imputation) at 5kb resolution; the derived mcool files then contain 10kb, 25kb, and 100kb resolutions. The files contain both cis and trans contacts.
The code below divides large cell groups into chunks of 200 cells. “snakemake_cmd_step1.txt” contains commands to generate pseudobulk matrices for each chunk, and “snakemake_cmd_step2.txt” contains commands to merge the chunks into cell groups. Both can be distributed across an HPC cluster.
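For reference, “contact_table.tsv” is a two-column, tab-separated file without a header: a cell ID followed by the path to that cell’s contact file, e.g. (hypothetical path):

cortex-p028-cb_116	/path/to/contacts/cortex-p028-cb_116.contacts.csv.gz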
cell_table = pd.read_csv('contact_table.tsv', sep='\t', header=None, index_col=0, names=['cell_id','csv_path'])
cell_table = cell_table.loc[metadata.index]
cell_table
|  | csv_path |
| --- | --- |
| cortex-p028-cb_116 | /data/test_schicluster/Tan2021/raw/CTX_HIP/con... |
| cortex-visual-control-p007-b6_182 | /data/test_schicluster/Tan2021/raw/VIS/contact... |
| cortex-p028-cb_112 | /data/test_schicluster/Tan2021/raw/CTX_HIP/con... |
| cortex-visual-control-p001-b6_061 | /data/test_schicluster/Tan2021/raw/VIS/contact... |
| cortex-p056-cb_216 | /data/test_schicluster/Tan2021/raw/CTX_HIP/con... |
| ... | ... |
| cortex-visual-control-p021-b6_090 | /data/test_schicluster/Tan2021/raw/VIS/contact... |
| cortex-visual-control-p021-b6_012 | /data/test_schicluster/Tan2021/raw/VIS/contact... |
| hippocampus-p007-cb_046 | /data/test_schicluster/Tan2021/raw/CTX_HIP/con... |
| cortex-visual-dark-p014-b6_106 | /data/test_schicluster/Tan2021/raw/VIS/contact... |
| cortex-visual-control-p021-b6_174 | /data/test_schicluster/Tan2021/raw/VIS/contact... |

3646 rows × 1 columns
cell_table['cluster'] = metadata['rnatype'].copy()
leg = {}
chunk_size = 200
outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/merged_raw/'
for cluster, sub_df in cell_table.groupby('cluster'):
    legtmp = []
    os.makedirs(f'{outdir}{cluster}', exist_ok=True)
    # downsample very large clusters to at most 1500 cells
    if sub_df.shape[0] > 1500:
        tmp = sub_df.loc[np.random.choice(sub_df.index, 1500, False)]
    else:
        tmp = sub_df.copy()
    # write one cell table per chunk of `chunk_size` cells
    for i, chunk_start in enumerate(np.arange(0, tmp.shape[0], chunk_size)):
        os.makedirs(f'{outdir}{cluster}_chunk{i}', exist_ok=True)
        tmp['csv_path'].iloc[chunk_start:(chunk_start + chunk_size)].to_csv(
            f'{outdir}{cluster}_chunk{i}/cell_table.tsv', sep='\t', header=False, index=True)
        legtmp.append(f'{cluster}_chunk{i}')
    # also write the full (possibly downsampled) cell table of the cluster
    tmp['csv_path'].to_csv(f'{outdir}{cluster}/cell_table.tsv', sep='\t', header=False, index=True)
    leg[cluster] = legtmp
    print(cluster, tmp.shape[0])
Adult_Astrocyte 144
Cajal-Retzius_Cell 23
Cortical_L2-4_Pyramidal_Cell_Intermediate 37
Cortical_L2-5_Pyramidal_Cell_Neonatal 211
Cortical_L23_Pyramidal_Cell 204
Cortical_L4_Pyramidal_Cell 195
Cortical_L5_Pyramidal_Cell 98
Cortical_L6_Pyramidal_Cell 333
Hippocampal_CA1_Pyramidal_Cell 134
Hippocampal_CA3_Pyramidal_Cell 70
Hippocampal_Granuale_Cell 309
Hippocampal_Pyramidal_Cell_Neonatal 93
MEIS2_Interneuron 88
Mature_Oligodendrocyte 210
Medium_Spiny_Neuron 167
Microglia_Etc 391
NDNF_Interneuron 36
Neonatal_Astrocyte 234
Newly_Formed_Oligodendrocyte 27
Oligodendrocyte_Progenitor 189
PVSST_Interneuron_Neonatal 85
PV_Interneuron 61
SST_Interneuron 55
Unknown_Interneuron_1 89
Unknown_Interneuron_2 51
VIP_Interneuron 112
Note#
Run the merge commands using batch job submission.
The following code generates command files for step1 and step2.
Each line of a file is a command that can be appended to the job submission template of the user’s system.
f1 = open(f'{outdir}snakemake_cmd_step1.txt', 'w')
f2 = open(f'{outdir}snakemake_cmd_step2.txt', 'w')
for ct in leg:
    # step1: merge the raw contacts of each chunk into a cool file
    for group in leg[ct]:
        cmd = f'hicluster merge-cell-raw --cell_table {outdir}{group}/cell_table.tsv --chrom_size_path /data/ref/mm10/genome/mm10.main.chrom.sizes --output_file {outdir}{group}/raw.cool --chr1 1 --pos1 2 --chr2 3 --pos2 4'
        f1.write(cmd + '\n')
    # step2: merge the chunk cools of each cluster into a single cool file
    if len(leg[ct]) < 2:
        # single chunk: just copy it to the cluster-level file
        group = leg[ct][0]
        cmd = f'rsync -arv {outdir}{group}/raw.cool {outdir}{ct}/{ct}.raw.cool'
        f2.write(cmd + '\n')
    else:
        cmd = f'cooler merge {outdir}{ct}/{ct}.raw.cool'
        for group in leg[ct]:
            cmd += f' {outdir}{group}/raw.cool'
        f2.write(cmd + '\n')
f1.close()
f2.close()
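For illustration, a step2 line for a single-chunk cluster is an rsync copy, while a multi-chunk cluster (e.g. Cortical_L6_Pyramidal_Cell, 333 cells, hence two chunks) gets a cooler merge command like the following, with {outdir} as defined above:

cooler merge {outdir}Cortical_L6_Pyramidal_Cell/Cortical_L6_Pyramidal_Cell.raw.cool {outdir}Cortical_L6_Pyramidal_Cell_chunk0/raw.cool {outdir}Cortical_L6_Pyramidal_Cell_chunk1/raw.cool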
Note#
Run the merge commands on a single node, parallelized with Snakemake.
The following code generates Snakemake files for step1 and step2.
The template notebooks for step1 and step2 can be found in the zhoujt1994/scHiCluster repository.
Each cell type is assigned to a separate folder and processed separately.
from gliderport.preset import notebook_snakemake
notebook_snakemake(
    work_dir="merged_raw/",
    notebook_dir="merged_raw/template_step1/",
    groups=np.concatenate([leg[xx] for xx in leg]).tolist(),
    default_cpu=1,
    default_mem_gb=5,
    redo_prepare=True,
)
!snakemake --snakefile Snakefile -j 8 --keep-going
notebook_snakemake(
    work_dir="merged_raw/",
    notebook_dir="merged_raw/template_step2/",
    groups=list(leg.keys()),
    default_cpu=1,
    default_mem_gb=5,
    redo_prepare=True,
)
!snakemake --snakefile Snakefile -j 8 --keep-going
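After the merges finish, the 5kb cool files can be aggregated into the multi-resolution mcool files mentioned above. A minimal sketch with cooler zoomify, assuming the cluster-level naming produced by step2:

# aggregate each merged 5kb cool to 10kb/25kb/100kb, writing an mcool next to it
for ct in leg:
    !cooler zoomify -r 10000,25000,100000 -p 4 -o {outdir}{ct}/{ct}.raw.mcool {outdir}{ct}/{ct}.raw.cool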
Loop#
Identify pseudobulk loop pixels and summits at 10kb resolution using single-cell information.
The code below generates a loop folder with each cell group as a subfolder, plus a file “snakemake_cmds.txt” in which each line is a command to run loop calling for one cell group.
Each command takes the imputed matrices of cells belonging to the same group as input and generates, for each cell group, a bedpe file of loops along with the Q, E, and T matrices used for differential loop calling. The files only contain interactions < 5Mb.
Each command can be appended to the job submission template of the user’s system. We suggest distributing the cell groups across different HPC computing nodes as separate jobs.
On an Anvil standard node (128 CPUs), loop calling for a cell group with 2000 cells takes ~12 hours.
coollist = glob('/data/test_schicluster/Tan2021/scool/impute/10K/*/*.cool')
cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])
cell_table = cell_table.loc[metadata.index]
cell_table['cluster'] = metadata['rnatype'].copy()
# loop calling runs on Anvil, so point to the scratch copy of the imputed cools
cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')
cell_table
|  | cool_path | cluster |
| --- | --- | --- |
| cortex-p028-cb_116 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Cortical_L6_Pyramidal_Cell |
| cortex-visual-control-p007-b6_182 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Cortical_L6_Pyramidal_Cell |
| cortex-p028-cb_112 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Cortical_L6_Pyramidal_Cell |
| cortex-visual-control-p001-b6_061 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Unknown_Interneuron_2 |
| cortex-p056-cb_216 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Microglia_Etc |
| ... | ... | ... |
| cortex-visual-control-p021-b6_090 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Mature_Oligodendrocyte |
| cortex-visual-control-p021-b6_012 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Cortical_L6_Pyramidal_Cell |
| hippocampus-p007-cb_046 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Microglia_Etc |
| cortex-visual-dark-p014-b6_106 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Microglia_Etc |
| cortex-visual-control-p021-b6_174 | /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... | Microglia_Etc |

3646 rows × 2 columns
outdir = 'Tan2021_loop/'
loop_dir = f'/anvil/scratch/x-zhou/{outdir}'
params = {
    'cpu': 96,
    'resolution': 10000,
    'chrom_size_path': f'"{loop_dir}mm10.main20.chrom.sizes"',
    'black_list_path': f'"{loop_dir}mm10.dipc.rowsum1000.blf50.merged.bed"',
}
with open(f'{PACKAGE_DIR}/loop/snakemake_template_loop.txt') as tmp:
    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()
for cluster, sub_df in cell_table.groupby('cluster'):
    # downsample very large clusters to at most 1500 cells
    if sub_df.shape[0] > 1500:
        tmp = sub_df.loc[np.random.choice(sub_df.index, 1500, False)]
    else:
        tmp = sub_df.copy()
    prepare_dir(f'{outdir}{cluster}/', tmp, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
# one snakemake command per cell group, pointing at the on-cluster (loop_dir) paths
with open(f'{outdir}snakemake_cmds.txt', 'w') as f:
    for cluster, sub_df in cell_table.groupby('cluster'):
        cluster_dir = f'{loop_dir}{cluster}'
        f.write(f'snakemake -d {cluster_dir} -s {cluster_dir}/Snakefile_master -j {params["cpu"]}\n')
!cp /data/ref/mm10/genome/mm10.main20.chrom.sizes Tan2021_loop/
!cp /home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/mm10.dipc.rowsum1000.blf50.merged.bed Tan2021_loop/
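Each line of “snakemake_cmds.txt” then looks like the following (shown for the Adult_Astrocyte group) and can be dropped into a job script:

snakemake -d /anvil/scratch/x-zhou/Tan2021_loop/Adult_Astrocyte -s /anvil/scratch/x-zhou/Tan2021_loop/Adult_Astrocyte/Snakefile_master -j 96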
Domain#
Generate pseudobulk cool files of imputed contacts at 25kb resolution by summing up the imputed matrices of single cells.
The code below divides large cell groups into chunks of 200 cells. “snakemake_cmd_step1.txt” contains commands to generate pseudobulk matrices for each chunk and can be distributed across an HPC cluster.
“snakemake_cmd_step2.txt” contains a command to merge the chunks into cell groups and can be run directly on a single node.
coollist = glob('/data/test_schicluster/Tan2021/scool/impute/25K/*/*.cool')
cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])
cell_table = cell_table.loc[metadata.index]
cell_table['cluster'] = metadata['rnatype'].copy()
# cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')
cell_table
|  | cool_path | cluster |
| --- | --- | --- |
| cortex-p028-cb_116 | /data/test_schicluster/Tan2021/scool/impute/25... | Cortical_L6_Pyramidal_Cell |
| cortex-visual-control-p007-b6_182 | /data/test_schicluster/Tan2021/scool/impute/25... | Cortical_L6_Pyramidal_Cell |
| cortex-p028-cb_112 | /data/test_schicluster/Tan2021/scool/impute/25... | Cortical_L6_Pyramidal_Cell |
| cortex-visual-control-p001-b6_061 | /data/test_schicluster/Tan2021/scool/impute/25... | Unknown_Interneuron_2 |
| cortex-p056-cb_216 | /data/test_schicluster/Tan2021/scool/impute/25... | Microglia_Etc |
| ... | ... | ... |
| cortex-visual-control-p021-b6_090 | /data/test_schicluster/Tan2021/scool/impute/25... | Mature_Oligodendrocyte |
| cortex-visual-control-p021-b6_012 | /data/test_schicluster/Tan2021/scool/impute/25... | Cortical_L6_Pyramidal_Cell |
| hippocampus-p007-cb_046 | /data/test_schicluster/Tan2021/scool/impute/25... | Microglia_Etc |
| cortex-visual-dark-p014-b6_106 | /data/test_schicluster/Tan2021/scool/impute/25... | Microglia_Etc |
| cortex-visual-control-p021-b6_174 | /data/test_schicluster/Tan2021/scool/impute/25... | Microglia_Etc |

3646 rows × 2 columns
outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/domain/'
for cluster, sub_df in cell_table.groupby('cluster'):
    os.makedirs(f'{outdir}{cluster}', exist_ok=True)
    sub_df.to_csv(f'{outdir}{cluster}/cell_table.csv', header=False, index=True)
    print(cluster, sub_df.shape[0])
Adult_Astrocyte 144
Cajal-Retzius_Cell 23
Cortical_L2-4_Pyramidal_Cell_Intermediate 37
Cortical_L2-5_Pyramidal_Cell_Neonatal 211
Cortical_L23_Pyramidal_Cell 204
Cortical_L4_Pyramidal_Cell 195
Cortical_L5_Pyramidal_Cell 98
Cortical_L6_Pyramidal_Cell 333
Hippocampal_CA1_Pyramidal_Cell 134
Hippocampal_CA3_Pyramidal_Cell 70
Hippocampal_Granuale_Cell 309
Hippocampal_Pyramidal_Cell_Neonatal 93
MEIS2_Interneuron 88
Mature_Oligodendrocyte 210
Medium_Spiny_Neuron 167
Microglia_Etc 391
NDNF_Interneuron 36
Neonatal_Astrocyte 234
Newly_Formed_Oligodendrocyte 27
Oligodendrocyte_Progenitor 189
PVSST_Interneuron_Neonatal 85
PV_Interneuron 61
SST_Interneuron 55
Unknown_Interneuron_1 89
Unknown_Interneuron_2 51
VIP_Interneuron 112
params = {
    'resolution': 25000,
    'chrom_size_path': '"/data/ref/mm10/genome/mm10.main20.chrom.sizes"',
}
chunk_size = 200
total_chunk_dirs = []
group_chunks = {}
with open(f'{PACKAGE_DIR}/cool/Snakefile_chunk_template') as tmp:
    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()
for group, group_df in cell_table.groupby('cluster'):
    group_chunks[group] = []
    if group_df.shape[0] <= chunk_size:
        # small cluster: a single chunk holds all cells
        this_dir = f'{outdir}{group}_chunk0/'
        params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
        prepare_dir(this_dir, group_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(this_dir)
        group_chunks[group].append(this_dir)
    else:
        # large cluster: split into chunks of `chunk_size` cells
        group_df['chunk'] = [i // chunk_size for i in range(group_df.shape[0])]
        for chunk, chunk_df in group_df.groupby('chunk'):
            this_dir = f'{outdir}{group}_chunk{chunk}/'
            params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
            prepare_dir(this_dir, chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
            total_chunk_dirs.append(this_dir)
            group_chunks[group].append(this_dir)
with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f:
    for chunk_dir in total_chunk_dirs:
        cmd = f'snakemake -d {chunk_dir} --snakefile {chunk_dir}Snakefile_master -j 5 --rerun-incomplete'
        f.write(cmd + '\n')
# the group-level Snakefile does not need a per-chunk cell table
params.pop('cell_table_path')
params_str = '\n'.join(f'{k} = {v}' for k, v in params.items())
with open(f'{PACKAGE_DIR}/cool/Snakefile_group_template') as tmp:
    GENERATE_MATRIX_GROUP_TEMPLATE = tmp.read()
with open(f'{outdir}Snakefile', 'w') as f:
    f.write(params_str + '\n' + GENERATE_MATRIX_GROUP_TEMPLATE)
with open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f:
    cmd = f'snakemake -d {outdir} --snakefile {outdir}Snakefile -j 10 --rerun-incomplete'
    f.write(cmd + '\n')
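For illustration, each line of “snakemake_cmd_step1.txt” generated above runs one chunk, e.g.:

snakemake -d /home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/domain/Adult_Astrocyte_chunk0/ --snakefile /home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/domain/Adult_Astrocyte_chunk0/Snakefile_master -j 5 --rerun-incomplete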
Note#
Merging a chunk of 200 cells using 5 CPUs on a node with 32 CPUs and 128 GB of memory takes ~10 minutes.
Compartment#
Generate pseudobulk cool files of imputed contacts at 100kb resolution by summing up the imputed matrices of single cells.
The code below divides large cell groups into chunks of 200 cells. “snakemake_cmd_step1.txt” contains commands to generate pseudobulk matrices for each chunk and can be distributed across an HPC cluster.
“snakemake_cmd_step2.txt” contains a command to merge the chunks into cell groups and can be run directly on a single node.
coollist = glob('/data/test_schicluster/Tan2021/scool/impute/100K/*/*.cool')
cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])
cell_table = cell_table.loc[metadata.index]
cell_table['cluster'] = metadata['rnatype'].copy()
# cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')
cell_table
|  | cool_path | cluster |
| --- | --- | --- |
| cortex-p028-cb_116 | /data/test_schicluster/Tan2021/scool/impute/10... | Cortical_L6_Pyramidal_Cell |
| cortex-visual-control-p007-b6_182 | /data/test_schicluster/Tan2021/scool/impute/10... | Cortical_L6_Pyramidal_Cell |
| cortex-p028-cb_112 | /data/test_schicluster/Tan2021/scool/impute/10... | Cortical_L6_Pyramidal_Cell |
| cortex-visual-control-p001-b6_061 | /data/test_schicluster/Tan2021/scool/impute/10... | Unknown_Interneuron_2 |
| cortex-p056-cb_216 | /data/test_schicluster/Tan2021/scool/impute/10... | Microglia_Etc |
| ... | ... | ... |
| cortex-visual-control-p021-b6_090 | /data/test_schicluster/Tan2021/scool/impute/10... | Mature_Oligodendrocyte |
| cortex-visual-control-p021-b6_012 | /data/test_schicluster/Tan2021/scool/impute/10... | Cortical_L6_Pyramidal_Cell |
| hippocampus-p007-cb_046 | /data/test_schicluster/Tan2021/scool/impute/10... | Microglia_Etc |
| cortex-visual-dark-p014-b6_106 | /data/test_schicluster/Tan2021/scool/impute/10... | Microglia_Etc |
| cortex-visual-control-p021-b6_174 | /data/test_schicluster/Tan2021/scool/impute/10... | Microglia_Etc |

3646 rows × 2 columns
outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/compartment/'
for cluster, sub_df in cell_table.groupby('cluster'):
    os.makedirs(f'{outdir}{cluster}', exist_ok=True)
    sub_df.to_csv(f'{outdir}{cluster}/cell_table.csv', header=False, index=True)
    print(cluster, sub_df.shape[0])
Adult_Astrocyte 144
Cajal-Retzius_Cell 23
Cortical_L2-4_Pyramidal_Cell_Intermediate 37
Cortical_L2-5_Pyramidal_Cell_Neonatal 211
Cortical_L23_Pyramidal_Cell 204
Cortical_L4_Pyramidal_Cell 195
Cortical_L5_Pyramidal_Cell 98
Cortical_L6_Pyramidal_Cell 333
Hippocampal_CA1_Pyramidal_Cell 134
Hippocampal_CA3_Pyramidal_Cell 70
Hippocampal_Granuale_Cell 309
Hippocampal_Pyramidal_Cell_Neonatal 93
MEIS2_Interneuron 88
Mature_Oligodendrocyte 210
Medium_Spiny_Neuron 167
Microglia_Etc 391
NDNF_Interneuron 36
Neonatal_Astrocyte 234
Newly_Formed_Oligodendrocyte 27
Oligodendrocyte_Progenitor 189
PVSST_Interneuron_Neonatal 85
PV_Interneuron 61
SST_Interneuron 55
Unknown_Interneuron_1 89
Unknown_Interneuron_2 51
VIP_Interneuron 112
params = {
    'resolution': 100000,
    'chrom_size_path': '"/data/ref/mm10/genome/mm10.main20.chrom.sizes"',
}
chunk_size = 200
total_chunk_dirs = []
group_chunks = {}
with open(f'{PACKAGE_DIR}/cool/Snakefile_chunk_template') as tmp:
    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()
for group, group_df in cell_table.groupby('cluster'):
    group_chunks[group] = []
    if group_df.shape[0] <= chunk_size:
        # small cluster: a single chunk holds all cells
        this_dir = f'{outdir}{group}_chunk0/'
        params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
        prepare_dir(this_dir, group_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(this_dir)
        group_chunks[group].append(this_dir)
    else:
        # large cluster: split into chunks of `chunk_size` cells
        group_df['chunk'] = [i // chunk_size for i in range(group_df.shape[0])]
        for chunk, chunk_df in group_df.groupby('chunk'):
            this_dir = f'{outdir}{group}_chunk{chunk}/'
            params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
            prepare_dir(this_dir, chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
            total_chunk_dirs.append(this_dir)
            group_chunks[group].append(this_dir)
with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f:
    for chunk_dir in total_chunk_dirs:
        cmd = f'snakemake -d {chunk_dir} --snakefile {chunk_dir}Snakefile_master -j 5 --rerun-incomplete'
        f.write(cmd + '\n')
# the group-level Snakefile does not need a per-chunk cell table
params.pop('cell_table_path')
params_str = '\n'.join(f'{k} = {v}' for k, v in params.items())
with open(f'{PACKAGE_DIR}/cool/Snakefile_group_template') as tmp:
    GENERATE_MATRIX_GROUP_TEMPLATE = tmp.read()
with open(f'{outdir}Snakefile', 'w') as f:
    f.write(params_str + '\n' + GENERATE_MATRIX_GROUP_TEMPLATE)
with open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f:
    cmd = f'snakemake -d {outdir} --snakefile {outdir}Snakefile -j 10 --rerun-incomplete'
    f.write(cmd + '\n')
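If no scheduler is available, the step1 chunk commands can also be run a few at a time on a single node; a sketch assuming GNU xargs (4 concurrent chunk jobs, 5 CPUs each):

# run the chunk commands 4 at a time; each line of the file is one snakemake invocation
!xargs -a {outdir}snakemake_cmd_step1.txt -I CMD -P 4 bash -c CMD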
Note#
Merging a chunk of 200 cells using 5 CPUs on a node with 32 CPUs and 128 GB of memory takes ~5 minutes.