Source code for schicluster.__main__

"""
CLI defined here

When adding new function:
1. add a func_register_subparser function to register the subparser
2. add a condition in main func about this new func name, import the real func as func in main
"""
import schicluster
from schicluster import __version__

import argparse
import inspect
import subprocess
import sys
import logging
import os

os.environ["NUMEXPR_MAX_THREADS"] = "8"

[docs] log = logging.getLogger()
[docs] DESCRIPTION = """ scHiCluster is a toolkit for single-cell HiC data preprocessing, imputation, and clustering analysis. Current Tool List in scHiCluster: """
[docs] EPILOG = ''
[docs] class NiceFormatter(logging.Formatter): """ From Cutadapt https://github.com/marcelm/cutadapt Do not prefix "INFO:" to info-level log messages (but do it for all other levels). Based on http://stackoverflow.com/a/9218261/715090 . """
[docs] def format(self, record): if record.levelno != logging.INFO: record.msg = '{}: {}'.format(record.levelname, record.msg) return super().format(record)
[docs] def validate_environment(): try: # TODO add env validation here subprocess.run(['bedtools', '--version'], stderr=subprocess.PIPE, stdout=subprocess.PIPE, encoding='utf8', check=True) except subprocess.CalledProcessError as e: print(e.stderr) raise e return True
[docs] def setup_logging(stdout=False, quiet=False, debug=False): """ From Cutadapt https://github.com/marcelm/cutadapt Attach handler to the global logger object """ # Due to backwards compatibility, logging output is sent to standard output # instead of standard error if the -o option is used. stream_handler = logging.StreamHandler(sys.stdout if stdout else sys.stderr) stream_handler.setFormatter(NiceFormatter()) # debug overrides quiet if debug: level = logging.DEBUG elif quiet: level = logging.ERROR else: level = logging.INFO stream_handler.setLevel(level) log.setLevel(level) log.addHandler(stream_handler)
[docs] def _str_to_bool(v: str) -> bool: if v.lower() in {'true', '1', 't', 'y', 'yes', 'yeah', 'yup', 'certainly', 'uh-huh'}: return True else: return False
[docs] def comp_cpg_cell_register_subparser(subparser): parser = subparser.add_parser('comp-cpg-cell', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--indir', type=str, default=None, required=True, help='Directory of raw/imputed matrices end with /') parser_req.add_argument('--outdir', type=str, default=None, required=True, help='Directory of compartment matrix end with /') parser_req.add_argument('--cell', type=str, default=None, required=True, help='Specific identifier of a cell') parser_req.add_argument('--chrom', type=str, default=None, required=True, help='Chromosome to compute compartment') parser.add_argument('--mode', type=str, default=None, help='Suffix of imputed matrix file names or raw') parser.add_argument('--cpg_file', type=str, default=None, help='CpG counts generated by bedtools nuc')
[docs] def comp_concatcell_chr_register_subparser(subparser): parser = subparser.add_parser('comp-concatcell-chr', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_list', type=str, default=None, required=True, help='Full path of a file containing the full path of all CpG npy files to be concatenate') parser_req.add_argument('--outprefix', type=str, default=None, required=True, help='Prefix of concatenated matrix including directory') parser.add_argument('--ncpus', type=int, default=10, help='# threads for parallelization')
[docs] def domain_insulation_cell_register_subparser(subparser): parser = subparser.add_parser('domain-insulation-cell', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--indir', type=str, default=None, required=True, help='Directory of imputed matrices end with /') parser_req.add_argument('--cell', type=str, default=None, required=True, help='Specific identifier of a cell') parser_req.add_argument('--chrom', type=str, default=None, required=True, help='Chromosome to compute insulation score') parser_req.add_argument('--mode', type=str, default=None, required=True, help='Suffix of imputed matrix file names') parser.add_argument('--w', type=int, default=10, help='Window size for insulation score')
[docs] def domain_concatcell_chr_register_subparser(subparser): parser = subparser.add_parser('domain-concatcell-chr', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_list', type=str, default=None, required=True, help='Full path of a file containing the full path of all insulation npy or domain txt files to be concatenate') parser_req.add_argument('--outprefix', type=str, default=None, required=True, help='Prefix of concatenated matrix including directory') parser_req.add_argument('--res', type=int, default=None, required=True, help='Bin size as integer') parser.add_argument('--input_type', type=str, default='insulation', help='Whether input files are insulation.npy or domain.txt') # insulation or boundary parser.add_argument('--ncpus', type=int, default=10, help='# threads for parallelization')
[docs] def embed_concatcell_chr_register_subparser(subparser): parser = subparser.add_parser('embed-concatcell-chr', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_list', type=str, default=None, required=True, help='Full path of a file containing the full path to all imputed matrices to be concatenate') parser_req.add_argument('--outprefix', type=str, default=None, required=True, help='Prefix of concatenated matrix including directory') parser_req.add_argument('--res', type=int, default=None, required=True, help='Bin size as integer of imputed matrices') parser.add_argument('--dist', type=int, default=10000000, help='Maximum distance threshold of contacts to use') parser.add_argument('--skip_raw', dest='save_raw', action='store_false', help='Not to save cell-by-feature matrix before SVD') parser.set_defaults(save_raw=True) parser.add_argument('--dim', type=int, default=50, help='Number of dimensions to return from SVD')
[docs] def embed_mergechr_register_subparser(subparser): parser = subparser.add_parser('embed-mergechr', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--embed_list', type=str, default=None, required=True, help='Full path of a file containing the full path to dimension reduction files of all chromosomes') parser_req.add_argument('--outprefix', type=str, default=None, required=True, help='Prefix of final dimension reduction file including directory') parser.add_argument('--dim', type=int, default=20, help='Number of dimensions to return from SVD') parser.add_argument('--use_pc', dest='norm_sig', action='store_false', help='Not to normalize PCs by singular values') parser.set_defaults(norm_sig=True)
[docs] def generatematrix_cell_register_subparser(subparser): parser = subparser.add_parser('generatematrix-cell', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help='') parser_req = parser.add_argument_group('required arguments') parser_req.add_argument('--infile', type=str, default=None, required=True, help='Path to the short format contact file') parser_req.add_argument('--outdir', type=str, default=None, required=True, help='Output directory end with /') parser_req.add_argument('--cell', type=str, default=None, required=True, help='Specific identifier of a cell') parser_req.add_argument('--res', type=int, default=None, required=True, help='Bin size as integer to generate contact matrix') parser_req.add_argument('--chrom_file', type=str, default=None, required=True, help='Path to the chromosome size files containing all chromosome to be analyzed as the first column') parser.add_argument('--split_file', type=str, default=None, help='Path to the bed file containing all chromosomes need to split and one region per chromosome as splitting point') parser.add_argument('--chr1', type=int, default=1, help='Column of fragment1 chromosome, counting starts from 0') parser.add_argument('--pos1', type=int, default=2, help='Column of fragment1 position, counting starts from 0') parser.add_argument('--chr2', type=int, default=5, help='Column of fragment2 chromosome, counting starts from 0') parser.add_argument('--pos2', type=int, default=6, help='Column of fragment2 position, counting starts from 0') parser.add_argument('--dist', type=int, default=2500, help='Minimum distance threshold of contacts to use')
[docs] def impute_cell_register_subparser(subparser): parser = subparser.add_parser('impute-cell', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--indir', type=str, default=None, help='Directory of the contact matrix') parser_req.add_argument('--outdir', type=str, default=None, help='Output directory end with /') parser_req.add_argument('--cell', type=str, default=None, help='Specific identifier of a cell') parser_req.add_argument('--chrom', type=str, default=None, help='Chromosome to impute') parser_req.add_argument('--res', type=int, default=None, help='Bin size as integer to generate contact matrix') parser_req.add_argument('--chrom_file', type=str, default=None, help='Path to the chromosome size files containing all chromosomes to be analyzed') parser.add_argument('--logscale', dest='logscale', action='store_true', help='Log transform raw count') parser.set_defaults(logscale=False) parser.add_argument('--pad', type=int, default=1, help='Gaussian kernal size') parser.add_argument('--std', type=float, default=1, help='Gaussian kernal standard deviation') parser.add_argument('--rp', type=float, default=0.5, help='Restart probability of RWR') parser.add_argument('--tol', type=float, default=0.01, help='Convergence tolerance of RWR') parser.add_argument('--window_size', type=int, default=500000000, help='Size of RWR sliding window') parser.add_argument('--step_size', type=int, default=10000000, help='Step length of RWR sliding window') parser.add_argument('--output_dist', type=int, default=500000000, help='Maximum distance threshold of contacts when writing output file') parser.add_argument('--output_format', type=str, default='hdf5', help='Output file format (hdf5 or npz)') parser.add_argument('--mode', type=str, default=None, help='Suffix of output file name')
[docs] def loop_bkg_cell_register_subparser(subparser): parser = subparser.add_parser('loop-bkg-cell', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--indir', type=str, default=None, required=True, help='Directory of imputed matrix') parser_req.add_argument('--cell', type=str, default=None, required=True, help='Specific identifier of a cell') parser_req.add_argument('--chrom', type=str, default=None, required=True, help='Chromosome to compute background') parser_req.add_argument('--impute_mode', type=str, default=None, required=True, help='Suffix of imputed matrix file names') parser_req.add_argument('--res', type=int, default=None, required=True, help='Bin size as integer to generate contact matrix') parser.add_argument('--dist', type=int, default=10050000, help='Maximum distance threshold of contacts to use') parser.add_argument('--cap', type=int, default=5, help='Trim Z-scores over the threshold') parser.add_argument('--pad', type=int, default=5, help='One direction size of larger square for donut background') parser.add_argument('--gap', type=int, default=2, help='One direction size of smaller square for donut background') parser.add_argument('--norm_mode', type=str, default='dist_trim', help='Suffix of normalized file names')
[docs] def loop_sumcell_chr_register_subparser(subparser): parser = subparser.add_parser('loop-sumcell-chr', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_list', type=str, default=None, required=True, help='Full path of a file containing the full path of all imputed files to be summed without .hdf5 suffix') parser_req.add_argument('--outprefix', type=str, default=None, required=True, help='Prefix of summed matrix including directory') parser_req.add_argument('--res', type=int, default=None, required=True, help='Bin size as integer of imputed matrices') parser.add_argument('--group_list', type=str, default=None, help='Full path of a file containing the full path of all summed imputed files to be merged without .hdf5 suffix') parser.add_argument('--matrix', type=str, default='QEO', help='Types of matrices to merge (Q: imputed matrix; E: OVE matrix; O: Outlier matrix)') parser.add_argument('--sum_only', dest='sum_only', action='store_true', help='Sum cells only and do not test for loops') parser.set_defaults(sum_only=False) parser.add_argument('--test_only', dest='test_only', action='store_true', help='Sum of cells already exist and do loop test only') parser.set_defaults(test_only=False) parser.add_argument('--norm_mode', type=str, default='dist_trim', help='Suffix of normalized file names') parser.add_argument('--min_dist', type=int, default=50000, help='Minimum distance threshold of loop') parser.add_argument('--max_dist', type=int, default=10000000, help='Maximum distance threshold of loop') parser.add_argument('--pad', type=int, default=5, help='One direction size of larger square for donut background') parser.add_argument('--gap', type=int, default=2, help='One direction size of smaller square for donut background')
[docs] def loop_mergechr_register_subparser(subparser): parser = subparser.add_parser('loop-mergechr', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--inprefix', type=str, default=None, required=True, help='Prefix of summed matrix including directory') parser_req.add_argument('--outprefix', type=str, default=None, required=True, help='Prefix of loop files including directory') parser_req.add_argument('--chrom_file', type=str, default=None, required=True, help='Path to the chromosome size files containing all chromosome to be analyzed as the first column') parser.add_argument('--split_file', type=str, default=None, help='Path to the bed file containing all chromosomes need to split and one region per chromosome as splitting point') parser.add_argument('--res', type=int, default=10000, help='Bin size as integer of loop calling') parser.add_argument('--thres_bl', type=int, default=1.33, help='Lowest fold change threshold against bottom left background') parser.add_argument('--thres_d', type=int, default=1.33, help='Lowest fold change threshold against donut background') parser.add_argument('--thres_h', type=int, default=1.2, help='Lowest fold change threshold against horizontal background') parser.add_argument('--thres_v', type=int, default=1.2, help='Lowest fold change threshold against vertical background') parser.add_argument('--fdr_thres', type=int, default=0.1, help='Highest t-test FDR threshold of loops') parser.add_argument('--dist_thres', type=int, default=20000, help='Highest distance threshold to merge loops into summit') parser.add_argument('--size_thres', type=int, default=1, help='Lowest loop number threshold of summit')
[docs] def generate_scool_register_subparser(subparser): parser = subparser.add_parser('generate-scool', aliases=['scool'], formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--contacts_table', type=str, default=None, required=True, help='tab-separated table containing tow columns, 1) cell id, ' '2) cell contact file path (juicer-pre format). No header') parser_req.add_argument('--output_prefix', type=str, default=None, required=True, help='Output prefix of the scool files. ' 'Output path will be {output_prefix}.{resolution_str}.scool') parser_req.add_argument('--chrom_size_path', type=str, default=None, required=True, help='Path to the chromosome size file, this file should be in UCSC chrom.sizes format. ' 'We will use this file as the reference to create matrix. ' 'It is recommended to remove small contigs or chrM from this file.') parser_req.add_argument('--resolutions', type=int, nargs='+', default=[], required=True, help='Resolutions to generate the matrix. ' 'You can provide multiple resolutions separated by space, ' '(e.g., "--resolutions 10000 100000"). ' 'Each resolution will be stored in a separate file.') parser.add_argument('--blacklist_1d_path', type=str, default=None, required=False, help='Path to blacklist region BED file, such as ENCODE blacklist. ' 'Either side of the contact overlapping with a blacklist region will be removed.') parser.add_argument('--blacklist_2d_path', type=str, default=None, required=False, help='Path to blacklist region pair BEDPE file. ' 'Both side of the contact overlapping with the same ' 'blacklist region pair will be removed.') parser.add_argument('--blacklist_resolution', type=int, default=10000, required=False, help='Resolution in bps when consider the 2D blacklist region pairs.') parser.add_argument('--not_remove_duplicates', dest='remove_duplicates', action='store_false', help='If set, will NOT remove duplicated contacts based on ' '[chr1, pos1, chr2, pos2] values') parser.set_defaults(remove_duplicates=True) parser.add_argument('--chr1', type=int, default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--chr2', type=int, default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.') parser.add_argument('--min_pos_dist', type=int, default=2500, required=False, help='Minimum distance for a fragment to be considered.') parser.add_argument('--cpu', type=int, default=1, required=False, help='number of cpus to parallel.') parser.add_argument('--batch_n', type=int, default=50, required=False, help='number of cells to deal with in each cpu process.')
[docs] def prepare_imputation_register_subparser(subparser): parser = subparser.add_parser('prepare-impute', aliases=['imputation'], formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--output_dir', type=str, required=True, default=None, help='Path to output directory containing the single-cell cool files after imputation') parser_req.add_argument('--chrom_size_path', type=str, required=True, default=None, help='Path to UCSC chrom size file' 'Contain all the chromosome information in two tab-separated columns: 1. chromosome name, 2. chromosome length. No header') parser_req.add_argument('--output_dist', type=int, required=True, default=500000000, help='Maximum distance for a contact to be written to the output') parser_req.add_argument('--window_size', type=int, required=True, default=500000000, help='The size in base pairs of sliding window for imputation ') parser_req.add_argument('--step_size', type=int, required=True, default=10000000, help='The step size in base pairs that the sliding window moves') parser_req.add_argument('--resolution', type=int, required=True, default=None, help='Resolution for imputation') parser.add_argument('--input_scool', type=str, required=False, default=None, help='Path to input scool file') parser.add_argument('--cell_table', type=str, required=False, default=None, help='Contain all the cell contact file after blacklist removal ' 'in two tab-separated columns: 1. cell_uid, 2. file_path. ' 'No header') parser.add_argument('--batch_size', type=int, required=False, default=100, help='Number of cells to include in each batch run') parser.add_argument('--logscale', dest='logscale', action='store_true', required=False, help='') parser.set_defaults(logscale=False) parser.add_argument('--pad', type=int, required=False, default=1, help='') parser.add_argument('--std', type=int, required=False, default=1, help='') parser.add_argument('--rp', type=float, required=False, default=0.5, help='') parser.add_argument('--tol', type=float, required=False, default=0.01, help='') parser.add_argument('--min_cutoff', type=float, required=False, default=1e-5, help='') parser.add_argument('--cpu_per_job', type=int, required=False, default=10, help='Number of cpus to parallel.') parser.add_argument('--chr1', type=int, dest='chrom1', default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--chr2', type=int, dest='chrom2', default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.')
[docs] def call_domain_register_subparser(subparser): parser = subparser.add_parser('domain', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_table_path', type=str, required=True, default=None, help='Path to cell table file') parser_req.add_argument('--output_prefix', type=str, required=True, default=None, help='Output files prefix. ' 'The domain boundary matrix will be saved as ' '{output_prefix}.boundary.h5ad (anndata.AnnData); ' 'The insulation score matrix will be saved as ' '{output_prefix}.insulation.nc (xarray.DataSet)') parser.add_argument('--resolution', type=int, required=False, default=25000, help='Matrix resolution') parser.add_argument('--window_size', type=int, required=False, default=10, help='The size in base pairs of sliding window for calculating insulation score') parser.add_argument('--save_count', dest='save_count', action='store_true', required=False, help='to save intra and inter separately') parser.add_argument('--cpu', type=int, required=False, default=10, help='Number of CPUs to use')
[docs] def call_compartment_register_subparser(subparser): parser = subparser.add_parser('compartment', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_table_path', type=str, required=True, default=None, help='Contain all the cell contact file information in two ' 'tab-separated columns: 1. cell_uid, 2. file_path. No header') parser_req.add_argument('--output_prefix', type=str, required=True, default=None, help='Output files prefix. ' 'The compartment score matrix will be saved as ' '{output_prefix}.compartment.nc (use xr.open_dataset() to load).') parser_req.add_argument('--cpg_profile_path', type=str, required=True, default=None, help='Genome bins CpG ratio. Use "schicluster cpg-ratio" to calculate') parser.add_argument('--cpu', type=int, required=False, default=10, help='Number of CPUs to use') parser.add_argument('--calc_strength', dest='calc_strength', action='store_true', help='Calculate compartment strength summary') parser.set_defaults(calc_strength=False) parser.add_argument('--mode', type=str, default='cool', required=False, choices=['tsv', 'cool'], help='cool or tsv') parser.add_argument('--chrom_size_path', type=str, required=False, default=None, help='Path to UCSC chrom size fileContain all the chromosome ' 'information in two tab-separated columns: ' '1. chromosome name, 2. chromosome length. No header') parser.add_argument('--resolution', type=int, required=False, default=100000, help='') parser.add_argument('--chr1', type=int, dest='chrom1', default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--chr2', type=int, dest='chrom2', default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.')
[docs] def cpg_ratio_register_subparser(subparser): parser = subparser.add_parser('cpg-ratio', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--fasta_path', type=str, required=True, default=None, help='Path to genome FASTA file') parser_req.add_argument('--hdf_output_path', type=str, required=True, default=None, help='Output path of the CpG ratio hdf') parser.add_argument('--cell_url', type=str, required=False, default=None, help='Path to a cell Cooler URL') parser.add_argument('--chrom_size_path', type=str, required=False, default=None, help='Path to UCSC chrom size fileContain all the chromosome ' 'information in two tab-separated columns: ' '1. chromosome name, 2. chromosome length. No header') parser.add_argument('--resolution', type=int, required=False, default=100000, help='Resolution of the bin size in output CpG file')
[docs] def embedding_register_subparser(subparser): parser = subparser.add_parser('embedding', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_table_path', type=str, required=True, default=None, help='Contain all the imputed contact files information in two' 'tab-separated columns: 1. cell_uid, 2. file_path. No header') parser_req.add_argument('--output_dir', type=str, required=True, default=None, help='Path to the output directory of the embedding output') parser.add_argument('--chrom_size_path', type=str, required=False, default=None, help='Path to UCSC chrom size fileContain all the chromosome' 'information in two tab-separated columns:' '1. chromosome name, 2. chromosome length. No header') parser.add_argument('--dim', type=int, required=False, default=50, help='') parser.add_argument('--dist', type=int, required=False, default=1000000, help='') parser.add_argument('--resolution', type=int, required=False, default=100000, help='Resolution for embedding.' 'Consistent with resolution of imputed contact files') parser.add_argument('--scale_factor', type=int, required=False, default=100000, help='') parser.add_argument('--cpu', type=int, required=False, default=1, help='') parser.add_argument('--norm_sig', dest='norm_sig', action='store_true', help='') parser.set_defaults(norm_sig=False) parser.add_argument('--save_model', dest='save_model', action='store_true', help='') parser.set_defaults(save_model=False) parser.add_argument('--save_raw', dest='save_raw', action='store_true', help='') parser.set_defaults(save_raw=False)
[docs] def gene_score_register_subparser(subparser): parser = subparser.add_parser('gene-score', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_table_path', type=str, default=None, required=True, help='Contain all the cool file information in two' 'tab-separated columns: ' '1. cell_uid, 2. file_path. No header') parser_req.add_argument('--gene_meta_path', type=str, default=None, required=True, help='Contain all gene information in four tab-seperated columns: ' '1. chromosome, 2. start, 3. end, 4. gene_id. No header') parser_req.add_argument('--resolution', type=int, default=10000, required=True, help='Resolution of cool file; normally use resolution at 10k') parser_req.add_argument('--output_hdf_path', type=str, default=None, required=True, help='Full path to output file') parser_req.add_argument('--chrom_size_path', type=str, default=None, required=True, help='Path to UCSC chrom size file. Contain all the chromosome ' 'information in two tab-separated columns: ' '1. chromosome name, 2. chromosome length. No header') parser.add_argument('--cpu', type=int, default=10, required=False, help='CPUs to use') parser.add_argument('--slop', type=int, default=0, required=False, help='gene slop distance on both sides') parser.add_argument('--mode', type=str, default='impute', required=False, help='raw or impute') parser.add_argument('--chr1', type=int, dest='chrom1', default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--chr2', type=int, dest='chrom2', default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.')
[docs] def merge_cell_raw_register_subparser(subparser): parser = subparser.add_parser('merge-cell-raw', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--cell_table', type=str, default=None, required=True, help='Contain all the cell contact file after blacklist' 'removal in two tab-separated columns: ' '1. cell_uid, 2.file_path. No header') parser_req.add_argument('--chrom_size_path', type=str, default=None, required=True, help='Path to UCSC chrom size fileContain all the chromosome' 'information in two tab-separated columns: ' '1. chromosome name, 2. chromosome length. No header') parser_req.add_argument('--output_file', type=str, default=None, required=True, help='Full path to output file') parser.add_argument('--resolution', type=int, default=5000, required=False, help='Resolution of cool file') parser.add_argument('--chr1', dest='chrom1', type=int, default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--chr2', dest='chrom2', type=int, default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.') parser.add_argument('--min_pos_dist', type=int, default=2500, required=False, help='Minimum distance for a fragment to be considered.')
[docs] def merge_cool_register_subparser(subparser): parser = subparser.add_parser('merge-cool', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--input_cool_tsv_file', type=str, default=None, required=True, help='Contain all the imputed cool file information in a tsv file: ' '1. file_path. No header') parser_req.add_argument('--output_cool', type=str, default=None, required=True, help='Full path to output merged cool file')
[docs] def filter_contacts_register_subparser(subparser): parser = subparser.add_parser('filter-contact', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--contact_table', type=str, default=None, required=True, help='Contain all the cell contact file information in two tab-separated columns: 1. cell_uid, 2. file_path. No header') parser_req.add_argument('--chrom_size_path', type=str, default=None, required=True, help='Path to UCSC chrom size file' 'Contain all the chromosome information in two tab-separated columns: 1. chromosome name, 2. chromosome length. No header') parser_req.add_argument('--output_dir', type=str, default=None, required=True, help='Path to the output directory of the contact files' 'after blacklist filtering') parser.add_argument('--blacklist_1d_path', type=str, default=None, required=False, help='Path to blacklist region BED file, such as ENCODE blacklist. ' 'Either side of the contact overlapping with a blacklist region will be removed.') parser.add_argument('--blacklist_2d_path', type=str, default=None, required=False, help='Path to blacklist region pair BEDPE file. ' 'Both side of the contact overlapping with the same ' 'blacklist region pair will be removed.') parser.add_argument('--blacklist_resolution', dest='resolution_2d', type=int, default=10000, required=False, help='Resolution in bps when consider the 2D blacklist region pairs.') parser.add_argument('--not_remove_duplicates', dest='remove_duplicates', action='store_false', required=False, help='If set, will NOT remove duplicated contacts based on ' '[chr1, pos1, chr2, pos2] values') parser.set_defaults(remove_duplicates=True) parser.add_argument('--chr1', dest='chrom1', type=int, default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--chr2', dest='chrom2', type=int, default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.') parser.add_argument('--min_pos_dist', type=int, default=0, required=False, help='Minimum distance for a contact to be kept.') parser.add_argument('--cpu', type=int, default=20, required=False, help='Number of cpus to parallel.')
[docs] def contact_distance_register_subparser(subparser): parser = subparser.add_parser('contact-distance', formatter_class=argparse.ArgumentDefaultsHelpFormatter, help="") parser_req = parser.add_argument_group("required arguments") parser_req.add_argument('--contact_table', type=str, default=None, required=True, help='Contain all the cell contact file after blacklist region removwl; ' 'information in two tab-separated columns: 1. cell_uid, 2. file_path. No header') parser_req.add_argument('--chrom_size_path', type=str, default=None, required=True, help='Path to UCSC chrom size fileContain all the chromosome' 'information in two tab-separated columns: ' '1.chromosome name, 2. chromosome length. No header') parser_req.add_argument('--output_prefix', type=str, default=None, required=True, help='Output hdf file prefix including the directory') parser.add_argument('--resolution', type=int, default=10000, required=False, help='Resolution of contact length') parser.add_argument('--chr1', dest='chrom1', type=int, default=1, required=False, help='0 based index of chr1 column.') parser.add_argument('--pos1', type=int, default=2, required=False, help='0 based index of pos1 column.') parser.add_argument('--chr2', dest='chrom2', type=int, default=5, required=False, help='0 based index of chr2 column.') parser.add_argument('--pos2', type=int, default=6, required=False, help='0 based index of pos2 column.') parser.add_argument('--cpu', type=int, default=20, required=False, help='number of cpus to parallel.')
[docs] def main(): parser = argparse.ArgumentParser(description=DESCRIPTION, epilog=EPILOG, formatter_class=argparse.RawDescriptionHelpFormatter, ) subparsers = parser.add_subparsers( title="functions", dest="command", metavar="" ) # add subparsers current_module = sys.modules[__name__] # get all functions in parser for name, register_subparser_func in inspect.getmembers(current_module, inspect.isfunction): if 'register_subparser' in name: register_subparser_func(subparsers) # initiate args = None if len(sys.argv) > 1: # print out version if sys.argv[1] in ['-v', '--version']: print(schicluster.__version__) exit() else: args = parser.parse_args() else: # print out help parser.parse_args(["-h"]) exit() # set up logging if not logging.root.handlers: setup_logging(stdout=True, quiet=False) # execute command args_vars = vars(args) for k, v in args_vars.items(): log.info(f'{k}\t{v}') cur_command = args_vars.pop('command').lower().replace('_', '-') # Do real import here: if cur_command in ['generatematrix-cell']: from .draft.generatematrix_cell import generatematrix_cell as func elif cur_command in ['impute-cell']: from .draft.impute_cell import impute_cell as func elif cur_command in ['embed-concatcell-chr']: from .draft.embed_concatcell_chr import embed_concatcell_chr as func elif cur_command in ['embed-mergechr']: from .draft.embed_mergechr import embed_mergechr as func elif cur_command in ['loop-bkg-cell']: from .draft.loop_bkg_cell import loop_bkg_cell as func elif cur_command in ['loop-sumcell-chr']: from .draft.loop_sumcell_chr import loop_sumcell_chr as func elif cur_command in ['loop-mergechr']: from .draft.loop_mergechr import loop_mergechr as func elif cur_command in ['domain-insulation-cell']: from .draft.domain_insulation_cell import domain_insulation_cell as func elif cur_command in ['domain-concatcell-chr']: from .draft.domain_concatcell_chr import domain_concatcell_chr as func elif cur_command in ['comp-cpg-cell']: from .draft.comp_cpg_cell import comp_cpg_cell as func elif cur_command in ['comp-concatcell-chr']: from .draft.comp_concatcell_chr import comp_concatcell_chr as func elif cur_command in ['generate-scool', 'scool']: from .cool import generate_scool as func elif cur_command in ['prepare-impute', 'imputation']: from .impute import prepare_impute as func elif cur_command in ['domain']: from .domain import multiple_call_domain_and_insulation as func elif cur_command in ['compartment']: from .compartment import multiple_cell_compartment as func elif cur_command in ['cpg-ratio']: from .compartment import get_cpg_profile as func elif cur_command in ['embedding']: from .embedding import embedding as func elif cur_command in ['gene-score']: from .draft.gene_score import gene_score as func elif cur_command in ['merge-cell-raw']: from .cool.merge import merge_cell_raw as func elif cur_command in ['merge-cool']: from .loop.merge_cell_to_group import merge_cool as func elif cur_command in ['filter-contact']: from .cool.remove_blacklist import filter_contacts_wrapper as func elif cur_command in ['contact-distance']: from .cool.contact_distance import contact_distance as func else: log.debug(f'{cur_command} is not an valid sub-command') parser.parse_args(["-h"]) return # run the command log.info(f"hicluster: Executing {cur_command}...") func(**args_vars) log.info(f"{cur_command} finished.") return
if __name__ == '__main__': main()