Prepare pseudobulk analyses#

import os
import numpy as np
import pandas as pd
from glob import glob
import schicluster
PACKAGE_DIR = schicluster.__path__[0]
metadata = pd.read_csv('Tan2021_dipc_cluster.csv.gz', header=0, index_col=0)
metadata['rnatype'] = [xx.replace(' ', '_').replace('/', '').replace(',', '').replace('.', '') for xx in metadata['rnatype'].values]
metadata
                                                      rnatype     score                      dipctype  dipcleiden
cortex-p028-cb_116                 Cortical_L6_Pyramidal_Cell  0.999996    Cortical L6 Pyramidal Cell           5
cortex-visual-control-p007-b6_182  Cortical_L6_Pyramidal_Cell  0.999992                        Neuron           2
cortex-p028-cb_112                 Cortical_L6_Pyramidal_Cell  0.999996  Cortical L2–5 Pyramidal Cell           5
cortex-visual-control-p001-b6_061       Unknown_Interneuron_2  0.330496                        Neuron           2
cortex-p056-cb_216                              Microglia_Etc  0.999989                Microglia Etc.          14
...                                                       ...       ...                           ...         ...
cortex-visual-control-p021-b6_090      Mature_Oligodendrocyte  0.999994        Mature Oligodendrocyte          16
cortex-visual-control-p021-b6_012  Cortical_L6_Pyramidal_Cell  0.999997                        Neuron           5
hippocampus-p007-cb_046                         Microglia_Etc  0.999993                Microglia Etc.          13
cortex-visual-dark-p014-b6_106                  Microglia_Etc  0.999994                Microglia Etc.          13
cortex-visual-control-p021-b6_174               Microglia_Etc  0.999995                Microglia Etc.          13

3646 rows × 4 columns

def prepare_dir(output_dir, chunk_df, template, params):
    # Write the chunk's cell table and a parameterized Snakefile into output_dir.
    os.makedirs(output_dir, exist_ok=True)
    cell_table_path = f'{output_dir}cell_table.csv'
    chunk_df.to_csv(cell_table_path, header=False, index=True)
    # Serialize params as "key = value" lines prepended to the Snakefile template;
    # string values must already carry their own quotes (e.g. '"/path/file"').
    params_str = '\n'.join(f'{k} = {v}' for k, v in params.items())

    with open(f'{output_dir}Snakefile_master', 'w') as f:
        f.write(params_str + '\n' + template)
    return

Raw#

Generate pseudobulk cool files of raw contacts (before imputation) at 5kb resolution, so that the derived mcool files contain 10kb, 25kb, and 100kb resolutions. The files contain both cis and trans contacts.

The code below divides large cell groups into chunks of 200 cells; groups with more than 1500 cells are first downsampled to 1500. “snakemake_cmd_step1.txt” contains the commands that generate a pseudobulk matrix for each chunk, and “snakemake_cmd_step2.txt” contains the commands that merge the chunks into cell groups. Both files can be distributed across an HPC cluster.

cell_table = pd.read_csv('contact_table.tsv', sep='\t', header=None, index_col=0, names=['cell_id','csv_path'])
cell_table = cell_table.loc[metadata.index]
cell_table
csv_path
cortex-p028-cb_116 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...
cortex-visual-control-p007-b6_182 /data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-p028-cb_112 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...
cortex-visual-control-p001-b6_061 /data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-p056-cb_216 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...
... ...
cortex-visual-control-p021-b6_090 /data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-visual-control-p021-b6_012 /data/test_schicluster/Tan2021/raw/VIS/contact...
hippocampus-p007-cb_046 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...
cortex-visual-dark-p014-b6_106 /data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-visual-control-p021-b6_174 /data/test_schicluster/Tan2021/raw/VIS/contact...

3646 rows × 1 columns

cell_table['cluster'] = metadata['rnatype'].copy()
leg = {}
chunk_size = 200
outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/merged_raw/'
for cluster, sub_df in cell_table.groupby('cluster'):
    legtmp = []
    os.makedirs(f'{outdir}{cluster}', exist_ok=True)
    # downsample very large clusters to at most 1500 randomly selected cells
    if sub_df.shape[0] > 1500:
        tmp = sub_df.loc[np.random.choice(sub_df.index, 1500, False)]
    else:
        tmp = sub_df.copy()
    # write one cell table per 200-cell chunk, plus one for the whole cluster
    for i, chunk_start in enumerate(np.arange(0, tmp.shape[0], chunk_size)):
        os.makedirs(f'{outdir}{cluster}_chunk{i}', exist_ok=True)
        tmp['csv_path'].iloc[chunk_start:(chunk_start + chunk_size)].to_csv(f'{outdir}{cluster}_chunk{i}/cell_table.tsv', sep='\t', header=False, index=True)
        legtmp.append(f'{cluster}_chunk{i}')
    tmp['csv_path'].to_csv(f'{outdir}{cluster}/cell_table.tsv', sep='\t', header=False, index=True)
    leg[cluster] = legtmp
    print(cluster, tmp.shape[0])
    
Adult_Astrocyte 144
Cajal-Retzius_Cell 23
Cortical_L2-4_Pyramidal_Cell_Intermediate 37
Cortical_L2-5_Pyramidal_Cell_Neonatal 211
Cortical_L23_Pyramidal_Cell 204
Cortical_L4_Pyramidal_Cell 195
Cortical_L5_Pyramidal_Cell 98
Cortical_L6_Pyramidal_Cell 333
Hippocampal_CA1_Pyramidal_Cell 134
Hippocampal_CA3_Pyramidal_Cell 70
Hippocampal_Granuale_Cell 309
Hippocampal_Pyramidal_Cell_Neonatal 93
MEIS2_Interneuron 88
Mature_Oligodendrocyte 210
Medium_Spiny_Neuron 167
Microglia_Etc 391
NDNF_Interneuron 36
Neonatal_Astrocyte 234
Newly_Formed_Oligodendrocyte 27
Oligodendrocyte_Progenitor 189
PVSST_Interneuron_Neonatal 85
PV_Interneuron 61
SST_Interneuron 55
Unknown_Interneuron_1 89
Unknown_Interneuron_2 51
VIP_Interneuron 112

Note#

Run the merge commands using batch job submission.

The following code generates the command files for step 1 and step 2.

Each line of each file is a standalone command that can be appended to the job submission template of the user's system.

f1 = open(f'{outdir}snakemake_cmd_step1.txt', 'w')
f2 = open(f'{outdir}snakemake_cmd_step2.txt', 'w')
for ct in leg:
    # step 1: one merge-cell-raw command per 200-cell chunk
    for group in leg[ct]:
        cmd = f'hicluster merge-cell-raw --cell_table {outdir}{group}/cell_table.tsv --chrom_size_path /data/ref/mm10/genome/mm10.main.chrom.sizes --output_file {outdir}{group}/raw.cool --chr1 1 --pos1 2 --chr2 3 --pos2 4'
        f1.write(cmd + '\n')
    # step 2: a single chunk is copied as-is; multiple chunks are merged with cooler
    if len(leg[ct]) < 2:
        group = leg[ct][0]
        cmd = f'rsync -arv {outdir}{group}/raw.cool {outdir}{ct}/{ct}.raw.cool'
        f2.write(cmd + '\n')
    else:
        cmd = f'cooler merge {outdir}{ct}/{ct}.raw.cool'
        for group in leg[ct]:
            cmd += f' {outdir}{group}/raw.cool'
        f2.write(cmd + '\n')
        
f1.close()
f2.close()
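
These command files can be turned into batch jobs in the scheduler of choice. Below is a minimal sketch that wraps each line of “snakemake_cmd_step1.txt” into its own SLURM script; the #SBATCH directives are placeholders to adapt to the user's system, not values prescribed by this tutorial.

# A sketch assuming a SLURM scheduler; all #SBATCH values are placeholders.
with open(f'{outdir}snakemake_cmd_step1.txt') as f:
    cmds = [line.strip() for line in f if line.strip()]
for i, cmd in enumerate(cmds):
    with open(f'{outdir}job_step1_{i}.sh', 'w') as fout:
        fout.write('#!/bin/bash\n')
        fout.write('#SBATCH --time=4:00:00\n')
        fout.write('#SBATCH --cpus-per-task=1\n')
        fout.write('#SBATCH --mem=8G\n')
        fout.write(cmd + '\n')
# submit each script with e.g. `sbatch job_step1_0.sh`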

Note#

Run the merge commands on a single node, parallelized with Snakemake.

The following code generates Snakemake files for step 1 and step 2.

The template notebook for step1 can be found at zhoujt1994/scHiCluster.

The template notebook for step2 can be found at zhoujt1994/scHiCluster.

Each cell type is assigned to a separate folder and processed separately.

from gliderport.preset import notebook_snakemake

notebook_snakemake(
    work_dir=f"merged_raw/",
    notebook_dir="merged_raw/template_step1/",
    groups=np.concatenate([leg[xx] for xx in leg]).tolist(),
    default_cpu=1,
    default_mem_gb=5,
    redo_prepare=True,
)
!snakemake --snakefile Snakefile -j 8 --keep-going
notebook_snakemake(
    work_dir=f"merged_raw/",
    notebook_dir="merged_raw/template_step2/",
    groups=list(leg.keys()),
    default_cpu=1,
    default_mem_gb=5,
    redo_prepare=True,
)
!snakemake --snakefile Snakefile -j 8 --keep-going
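
After both steps finish, each cluster folder contains a merged “{cluster}.raw.cool” file at 5kb resolution. A hedged sketch of zoomifying one of them into an mcool with the coarser resolutions and sanity-checking the result with cooler (the mcool file name below is our choice, not prescribed by the pipeline):

import cooler

ct = 'Cortical_L6_Pyramidal_Cell'  # any cluster name in `leg`
raw_cool = f'{outdir}{ct}/{ct}.raw.cool'
# build an mcool containing 10kb, 25kb, and 100kb resolutions (cooler CLI)
!cooler zoomify --resolutions 10000,25000,100000 -o {outdir}{ct}/{ct}.raw.mcool {raw_cool}
# quick sanity check of the merged 5kb matrix
clr = cooler.Cooler(raw_cool)
print(clr.binsize, clr.info['nnz'], clr.info['sum'])
print(clr.chromnames)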

Loop#

Identify pseudobulk loop pixels and summits at 10kb resolution with single-cell information.

The code below generates a loop folder with one subfolder per cell group, and a file “snakemake_cmds.txt” in which each line is a command running loop calling for one cell group.

Each command takes the imputed matrices of the cells in one group as input and generates, for that group, a bedpe file of loops together with the Q, E, and T matrices used for differential loop calling. The files only contain interactions < 5Mb.

The commands can be appended to the job submission template of the user's system. We suggest distributing the cell groups across different computing nodes of an HPC cluster as separate jobs.

On an Anvil standard node (128 CPUs), loop calling for a cell group with 2000 cells takes ~12 hours.

coollist = glob('/data/test_schicluster/Tan2021/scool/impute/10K/*/*.cool')
cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])
cell_table = cell_table.loc[metadata.index]
cell_table['cluster'] = metadata['rnatype'].copy()
cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')
cell_table
cool_path cluster
cortex-p028-cb_116 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Cortical_L6_Pyramidal_Cell
cortex-visual-control-p007-b6_182 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Cortical_L6_Pyramidal_Cell
cortex-p028-cb_112 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Cortical_L6_Pyramidal_Cell
cortex-visual-control-p001-b6_061 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Unknown_Interneuron_2
cortex-p056-cb_216 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Microglia_Etc
... ... ...
cortex-visual-control-p021-b6_090 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Mature_Oligodendrocyte
cortex-visual-control-p021-b6_012 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Cortical_L6_Pyramidal_Cell
hippocampus-p007-cb_046 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Microglia_Etc
cortex-visual-dark-p014-b6_106 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Microglia_Etc
cortex-visual-control-p021-b6_174 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... Microglia_Etc

3646 rows × 2 columns

outdir = 'Tan2021_loop/'
loop_dir = f'/anvil/scratch/x-zhou/{outdir}'
params = {
    'cpu': 96,
    'resolution': 10000,
    'chrom_size_path': f'"{loop_dir}mm10.main20.chrom.sizes"',
    'black_list_path': f'"{loop_dir}mm10.dipc.rowsum1000.blf50.merged.bed"',
}
with open(f'{PACKAGE_DIR}/loop/snakemake_template_loop.txt') as tmp:
    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()
for cluster, sub_df in cell_table.groupby('cluster'):
    if sub_df.shape[0]>1500:
        tmp = sub_df.loc[np.random.choice(sub_df.index, 1500, False)]
    else:
        tmp = sub_df.copy()
    prepare_dir(f'{outdir}{cluster}/', tmp, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
    
with open(f'{outdir}snakemake_cmds.txt', 'w') as f:
    for cluster, sub_df in cell_table.groupby('cluster'):
        cluster_dir = f'{loop_dir}{cluster}'
        f.write(f'snakemake -d {cluster_dir} -s {cluster_dir}/Snakefile_master -j {params["cpu"]}\n')
        
!cp /data/ref/mm10/genome/mm10.main20.chrom.sizes Tan2021_loop/
!cp /home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/mm10.dipc.rowsum1000.blf50.merged.bed Tan2021_loop/
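When a run finishes, the loop calls of a cell group can be loaded for a quick look. A sketch assuming the bedpe output inside each cluster folder is named “{cluster}.loop.bedpe” (the actual file name may differ between scHiCluster versions):

# hypothetical output file name; adjust to what the loop pipeline actually writes
cluster = 'Cortical_L6_Pyramidal_Cell'
loops = pd.read_csv(f'{loop_dir}{cluster}/{cluster}.loop.bedpe', sep='\t', header=None,
                    names=['chrom1', 'start1', 'end1', 'chrom2', 'start2', 'end2'])
print(loops.shape[0], 'loops called')
# anchor distances should stay below the 5Mb cutoff used above
print((loops['start2'] - loops['start1']).max())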

Domain#

Generate pseudobulk cool files of imputed contacts at 25kb resolution by summing up imputed matrices of single cells.

The code below divides large cell groups into chunks of 200 cells. “snakemake_cmd_step1.txt” contains the commands that generate pseudobulk matrices for each chunk and can be distributed across an HPC cluster.

“snakemake_cmd_step2.txt” contains a command that merges the chunks into cell groups and can be run directly on a single node.

coollist = glob('/data/test_schicluster/Tan2021/scool/impute/25K/*/*.cool')
cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])
cell_table = cell_table.loc[metadata.index]
cell_table['cluster'] = metadata['rnatype'].copy()
# cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')
cell_table
cool_path cluster
cortex-p028-cb_116 /data/test_schicluster/Tan2021/scool/impute/25... Cortical_L6_Pyramidal_Cell
cortex-visual-control-p007-b6_182 /data/test_schicluster/Tan2021/scool/impute/25... Cortical_L6_Pyramidal_Cell
cortex-p028-cb_112 /data/test_schicluster/Tan2021/scool/impute/25... Cortical_L6_Pyramidal_Cell
cortex-visual-control-p001-b6_061 /data/test_schicluster/Tan2021/scool/impute/25... Unknown_Interneuron_2
cortex-p056-cb_216 /data/test_schicluster/Tan2021/scool/impute/25... Microglia_Etc
... ... ...
cortex-visual-control-p021-b6_090 /data/test_schicluster/Tan2021/scool/impute/25... Mature_Oligodendrocyte
cortex-visual-control-p021-b6_012 /data/test_schicluster/Tan2021/scool/impute/25... Cortical_L6_Pyramidal_Cell
hippocampus-p007-cb_046 /data/test_schicluster/Tan2021/scool/impute/25... Microglia_Etc
cortex-visual-dark-p014-b6_106 /data/test_schicluster/Tan2021/scool/impute/25... Microglia_Etc
cortex-visual-control-p021-b6_174 /data/test_schicluster/Tan2021/scool/impute/25... Microglia_Etc

3646 rows × 2 columns

outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/domain/'
for cluster, sub_df in cell_table.groupby('cluster'):
    os.makedirs(f'{outdir}{cluster}', exist_ok=True)
    sub_df.to_csv(f'{outdir}{cluster}/cell_table.csv', header=False, index=True)
    print(cluster, sub_df.shape[0])
Adult_Astrocyte 144
Cajal-Retzius_Cell 23
Cortical_L2-4_Pyramidal_Cell_Intermediate 37
Cortical_L2-5_Pyramidal_Cell_Neonatal 211
Cortical_L23_Pyramidal_Cell 204
Cortical_L4_Pyramidal_Cell 195
Cortical_L5_Pyramidal_Cell 98
Cortical_L6_Pyramidal_Cell 333
Hippocampal_CA1_Pyramidal_Cell 134
Hippocampal_CA3_Pyramidal_Cell 70
Hippocampal_Granuale_Cell 309
Hippocampal_Pyramidal_Cell_Neonatal 93
MEIS2_Interneuron 88
Mature_Oligodendrocyte 210
Medium_Spiny_Neuron 167
Microglia_Etc 391
NDNF_Interneuron 36
Neonatal_Astrocyte 234
Newly_Formed_Oligodendrocyte 27
Oligodendrocyte_Progenitor 189
PVSST_Interneuron_Neonatal 85
PV_Interneuron 61
SST_Interneuron 55
Unknown_Interneuron_1 89
Unknown_Interneuron_2 51
VIP_Interneuron 112
params = {
    'resolution': 25000,
    'chrom_size_path': '"/data/ref/mm10/genome/mm10.main20.chrom.sizes"',
}
chunk_size = 200
total_chunk_dirs = []
group_chunks = {}

with open(f'{PACKAGE_DIR}/cool/Snakefile_chunk_template') as tmp:
    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()

for group, group_df in cell_table.groupby('cluster'):
    group_chunks[group] = []
    if group_df.shape[0] <= chunk_size:
        this_dir = f'{outdir}{group}_chunk0/'
        params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
        prepare_dir(this_dir, group_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(this_dir)
        group_chunks[group].append(this_dir)
    else:
        group_df['chunk'] = [i // chunk_size for i in range(group_df.shape[0])]
        for chunk, chunk_df in group_df.groupby('chunk'):
            this_dir = f'{outdir}{group}_chunk{chunk}/'
            params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
            prepare_dir(this_dir, chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
            total_chunk_dirs.append(this_dir)
            group_chunks[group].append(this_dir)

            
with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f:
    for chunk_dir in total_chunk_dirs:
        cmd = f'snakemake -d {chunk_dir} --snakefile {chunk_dir}Snakefile_master -j 5 --rerun-incomplete'
        f.write(cmd + '\n')
params.pop('cell_table_path')
params_str = '\n'.join(f'{k} = {v}' for k, v in params.items())

with open(f'{PACKAGE_DIR}/cool/Snakefile_group_template') as tmp:
    GENERATE_MATRIX_GROUP_TEMPLATE = tmp.read()

with open(f'{outdir}Snakefile', 'w') as f:
    f.write(params_str + '\n' + GENERATE_MATRIX_GROUP_TEMPLATE)
    
with open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f:
    cmd = f'snakemake -d {outdir} --snakefile {outdir}Snakefile -j 10 --rerun-incomplete'
    f.write(cmd + '\n')

Note#

Merging a chunk of 200 cells with 5 CPUs on a node with 32 CPUs and 128 GB of memory takes ~10 minutes.
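
Once the merged 25kb matrices exist, domain boundaries can be screened with an insulation-style score. A minimal sketch of the standard sliding-window insulation score (the merged cool path is a hypothetical placeholder, and this is not necessarily the exact domain caller used downstream):

import cooler

clr = cooler.Cooler(f'{outdir}Adult_Astrocyte/Adult_Astrocyte.cool')  # hypothetical name
mat = clr.matrix(balance=False).fetch('chr1:0-50,000,000')  # 2000 bins at 25kb
w = 10  # 10 bins = 250kb flank on each side
n = mat.shape[0]
score = np.full(n, np.nan)
for i in range(w, n - w):
    # mean contact frequency between the two flanks of bin i
    score[i] = np.nanmean(mat[i - w:i, i + 1:i + w + 1])
# local minima of `score` mark candidate domain boundaries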

Compartment#

Generate pseudobulk cool files of imputed contacts at 100kb resolution by summing up imputed matrices of single cells.

The code below divides large cell groups into chunks of 200 cells. “snakemake_cmd_step1.txt” contains the commands that generate pseudobulk matrices for each chunk and can be distributed across an HPC cluster.

“snakemake_cmd_step2.txt” contains a command that merges the chunks into cell groups and can be run directly on a single node.

coollist = glob('/data/test_schicluster/Tan2021/scool/impute/100K/*/*.cool')
cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])
cell_table = cell_table.loc[metadata.index]
cell_table['cluster'] = metadata['rnatype'].copy()
# cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')
cell_table
cool_path cluster
cortex-p028-cb_116 /data/test_schicluster/Tan2021/scool/impute/10... Cortical_L6_Pyramidal_Cell
cortex-visual-control-p007-b6_182 /data/test_schicluster/Tan2021/scool/impute/10... Cortical_L6_Pyramidal_Cell
cortex-p028-cb_112 /data/test_schicluster/Tan2021/scool/impute/10... Cortical_L6_Pyramidal_Cell
cortex-visual-control-p001-b6_061 /data/test_schicluster/Tan2021/scool/impute/10... Unknown_Interneuron_2
cortex-p056-cb_216 /data/test_schicluster/Tan2021/scool/impute/10... Microglia_Etc
... ... ...
cortex-visual-control-p021-b6_090 /data/test_schicluster/Tan2021/scool/impute/10... Mature_Oligodendrocyte
cortex-visual-control-p021-b6_012 /data/test_schicluster/Tan2021/scool/impute/10... Cortical_L6_Pyramidal_Cell
hippocampus-p007-cb_046 /data/test_schicluster/Tan2021/scool/impute/10... Microglia_Etc
cortex-visual-dark-p014-b6_106 /data/test_schicluster/Tan2021/scool/impute/10... Microglia_Etc
cortex-visual-control-p021-b6_174 /data/test_schicluster/Tan2021/scool/impute/10... Microglia_Etc

3646 rows × 2 columns

outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/compartment/'
for cluster, sub_df in cell_table.groupby('cluster'):
    os.makedirs(f'{outdir}{cluster}', exist_ok=True)
    sub_df.to_csv(f'{outdir}{cluster}/cell_table.csv', header=False, index=True)
    print(cluster, sub_df.shape[0])
Adult_Astrocyte 144
Cajal-Retzius_Cell 23
Cortical_L2-4_Pyramidal_Cell_Intermediate 37
Cortical_L2-5_Pyramidal_Cell_Neonatal 211
Cortical_L23_Pyramidal_Cell 204
Cortical_L4_Pyramidal_Cell 195
Cortical_L5_Pyramidal_Cell 98
Cortical_L6_Pyramidal_Cell 333
Hippocampal_CA1_Pyramidal_Cell 134
Hippocampal_CA3_Pyramidal_Cell 70
Hippocampal_Granuale_Cell 309
Hippocampal_Pyramidal_Cell_Neonatal 93
MEIS2_Interneuron 88
Mature_Oligodendrocyte 210
Medium_Spiny_Neuron 167
Microglia_Etc 391
NDNF_Interneuron 36
Neonatal_Astrocyte 234
Newly_Formed_Oligodendrocyte 27
Oligodendrocyte_Progenitor 189
PVSST_Interneuron_Neonatal 85
PV_Interneuron 61
SST_Interneuron 55
Unknown_Interneuron_1 89
Unknown_Interneuron_2 51
VIP_Interneuron 112
params = {
    'resolution': 100000,
    'chrom_size_path': '"/data/ref/mm10/genome/mm10.main20.chrom.sizes"',
}
chunk_size = 200
total_chunk_dirs = []
group_chunks = {}

with open(f'{PACKAGE_DIR}/cool/Snakefile_chunk_template') as tmp:
    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()

for group, group_df in cell_table.groupby('cluster'):
    group_chunks[group] = []
    if group_df.shape[0] <= chunk_size:
        this_dir = f'{outdir}{group}_chunk0/'
        params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
        prepare_dir(this_dir, group_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
        total_chunk_dirs.append(this_dir)
        group_chunks[group].append(this_dir)
    else:
        group_df['chunk'] = [i // chunk_size for i in range(group_df.shape[0])]
        for chunk, chunk_df in group_df.groupby('chunk'):
            this_dir = f'{outdir}{group}_chunk{chunk}/'
            params['cell_table_path'] = f'"{this_dir}cell_table.csv"'
            prepare_dir(this_dir, chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, params)
            total_chunk_dirs.append(this_dir)
            group_chunks[group].append(this_dir)

            
with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f:
    for chunk_dir in total_chunk_dirs:
        cmd = f'snakemake -d {chunk_dir} --snakefile {chunk_dir}Snakefile_master -j 5 --rerun-incomplete'
        f.write(cmd + '\n')
params.pop('cell_table_path')
params_str = '\n'.join(f'{k} = {v}' for k, v in params.items())

with open(f'{PACKAGE_DIR}/cool/Snakefile_group_template') as tmp:
    GENERATE_MATRIX_GROUP_TEMPLATE = tmp.read()

with open(f'{outdir}Snakefile', 'w') as f:
    f.write(params_str + '\n' + GENERATE_MATRIX_GROUP_TEMPLATE)
    
with open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f:
    cmd = f'snakemake -d {outdir} --snakefile {outdir}Snakefile -j 10 --rerun-incomplete'
    f.write(cmd + '\n')

Note#

Merging a chunk of 200 cells with 5 CPUs on a node with 32 CPUs and 128 GB of memory takes ~5 minutes.
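
With the merged 100kb matrices, A/B compartments can be estimated from the first eigenvector of the distance-normalized correlation matrix. A minimal sketch of this standard approach (the merged cool path is a hypothetical placeholder; schicluster's own compartment tooling may differ in details):

import cooler

clr = cooler.Cooler(f'{outdir}Adult_Astrocyte/Adult_Astrocyte.cool')  # hypothetical name
mat = clr.matrix(balance=False).fetch('chr1').astype(float)
mask = mat.sum(axis=0) > 0  # drop empty bins
mat = mat[np.ix_(mask, mask)]
n = mat.shape[0]
# observed / expected: normalize each diagonal by its mean
for k in range(n):
    d = np.diagonal(mat, k).mean()
    if d > 0:
        idx = np.arange(n - k)
        mat[idx, idx + k] /= d
        mat[idx + k, idx] = mat[idx, idx + k]
corr = np.nan_to_num(np.corrcoef(mat))
evals, evecs = np.linalg.eigh(corr)
pc1 = np.full(mask.size, np.nan)
pc1[mask] = evecs[:, -1]  # eigenvector with the largest eigenvalue
# the sign of pc1 is arbitrary; orient it with e.g. gene density or CpG content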