{ "cells": [ { "cell_type": "markdown", "id": "53b25811-670c-4799-9202-433491f375b5", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T06:26:41.399996Z", "iopub.status.busy": "2023-06-28T06:26:41.399147Z", "iopub.status.idle": "2023-06-28T06:26:41.403087Z", "shell.execute_reply": "2023-06-28T06:26:41.402344Z", "shell.execute_reply.started": "2023-06-28T06:26:41.399975Z" } }, "source": [ "# Prepare pseudobulk analyses" ] }, { "cell_type": "code", "execution_count": 1, "id": "def14806", "metadata": { "ExecuteTime": { "end_time": "2022-09-24T22:43:29.784664Z", "start_time": "2022-09-24T22:43:29.782188Z" }, "execution": { "iopub.execute_input": "2023-06-28T04:46:25.455079Z", "iopub.status.busy": "2023-06-28T04:46:25.454828Z", "iopub.status.idle": "2023-06-28T04:46:25.712807Z", "shell.execute_reply": "2023-06-28T04:46:25.712050Z", "shell.execute_reply.started": "2023-06-28T04:46:25.455062Z" }, "tags": [] }, "outputs": [], "source": [ "import os\n", "import numpy as np\n", "import pandas as pd\n", "from glob import glob\n", "import schicluster\n", "PACKAGE_DIR = schicluster.__path__[0]\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "60ad5ef5", "metadata": { "ExecuteTime": { "end_time": "2022-09-24T22:43:34.833972Z", "start_time": "2022-09-24T22:43:34.419484Z" }, "execution": { "iopub.execute_input": "2023-06-28T04:46:27.832196Z", "iopub.status.busy": "2023-06-28T04:46:27.831924Z", "iopub.status.idle": "2023-06-28T04:46:27.882284Z", "shell.execute_reply": "2023-06-28T04:46:27.881754Z", "shell.execute_reply.started": "2023-06-28T04:46:27.832177Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
rnatypescoredipctypedipcleiden
cortex-p028-cb_116Cortical_L6_Pyramidal_Cell0.999996Cortical L6 Pyramidal Cell5
cortex-visual-control-p007-b6_182Cortical_L6_Pyramidal_Cell0.999992Neuron2
cortex-p028-cb_112Cortical_L6_Pyramidal_Cell0.999996Cortical L2–5 Pyramidal Cell5
cortex-visual-control-p001-b6_061Unknown_Interneuron_20.330496Neuron2
cortex-p056-cb_216Microglia_Etc0.999989Microglia Etc.14
...............
cortex-visual-control-p021-b6_090Mature_Oligodendrocyte0.999994Mature Oligodendrocyte16
cortex-visual-control-p021-b6_012Cortical_L6_Pyramidal_Cell0.999997Neuron5
hippocampus-p007-cb_046Microglia_Etc0.999993Microglia Etc.13
cortex-visual-dark-p014-b6_106Microglia_Etc0.999994Microglia Etc.13
cortex-visual-control-p021-b6_174Microglia_Etc0.999995Microglia Etc.13
\n", "

3646 rows × 4 columns

\n", "
" ], "text/plain": [ " rnatype score \\\n", "cortex-p028-cb_116 Cortical_L6_Pyramidal_Cell 0.999996 \n", "cortex-visual-control-p007-b6_182 Cortical_L6_Pyramidal_Cell 0.999992 \n", "cortex-p028-cb_112 Cortical_L6_Pyramidal_Cell 0.999996 \n", "cortex-visual-control-p001-b6_061 Unknown_Interneuron_2 0.330496 \n", "cortex-p056-cb_216 Microglia_Etc 0.999989 \n", "... ... ... \n", "cortex-visual-control-p021-b6_090 Mature_Oligodendrocyte 0.999994 \n", "cortex-visual-control-p021-b6_012 Cortical_L6_Pyramidal_Cell 0.999997 \n", "hippocampus-p007-cb_046 Microglia_Etc 0.999993 \n", "cortex-visual-dark-p014-b6_106 Microglia_Etc 0.999994 \n", "cortex-visual-control-p021-b6_174 Microglia_Etc 0.999995 \n", "\n", " dipctype dipcleiden \n", "cortex-p028-cb_116 Cortical L6 Pyramidal Cell 5 \n", "cortex-visual-control-p007-b6_182 Neuron 2 \n", "cortex-p028-cb_112 Cortical L2–5 Pyramidal Cell 5 \n", "cortex-visual-control-p001-b6_061 Neuron 2 \n", "cortex-p056-cb_216 Microglia Etc. 14 \n", "... ... ... \n", "cortex-visual-control-p021-b6_090 Mature Oligodendrocyte 16 \n", "cortex-visual-control-p021-b6_012 Neuron 5 \n", "hippocampus-p007-cb_046 Microglia Etc. 13 \n", "cortex-visual-dark-p014-b6_106 Microglia Etc. 13 \n", "cortex-visual-control-p021-b6_174 Microglia Etc. 
13 \n", "\n", "[3646 rows x 4 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "metadata = pd.read_csv('Tan2021_dipc_cluster.csv.gz', header=0, index_col=0)\n", "metadata['rnatype'] = [xx.replace(' ', '_').replace('/', '').replace(',', '').replace('.', '') for xx in metadata['rnatype'].values]\n", "metadata\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "e1896f2d-f90e-4360-b1cd-2465bf428c13", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T04:46:28.324947Z", "iopub.status.busy": "2023-06-28T04:46:28.324704Z", "iopub.status.idle": "2023-06-28T04:46:28.329240Z", "shell.execute_reply": "2023-06-28T04:46:28.328657Z", "shell.execute_reply.started": "2023-06-28T04:46:28.324929Z" }, "tags": [] }, "outputs": [], "source": [ "def prepare_dir(output_dir, chunk_df, template, params):\n", "    \"\"\"Create one Snakemake working directory.\n", "\n", "    Writes two files into ``output_dir`` (created if missing):\n", "    ``cell_table.csv`` holds ``chunk_df`` with its index and no header row;\n", "    ``Snakefile_master`` holds one ``key = value`` line per item of ``params``\n", "    followed by ``template``.\n", "\n", "    Parameters\n", "    ----------\n", "    output_dir : str\n", "        Target directory, with or without a trailing slash.\n", "    chunk_df : pandas.DataFrame\n", "        Table written to ``cell_table.csv``.\n", "    template : str\n", "        Snakefile text placed after the parameter lines.\n", "    params : dict\n", "        Maps each parameter name to the literal text written after ``=``.\n", "    \"\"\"\n", "    os.makedirs(output_dir, exist_ok=True)\n", "    chunk_df.to_csv(os.path.join(output_dir, 'cell_table.csv'), header=False, index=True)\n", "    params_str = '\\n'.join(f'{k} = {v}' for k, v in params.items())\n", "\n", "    with open(os.path.join(output_dir, 'Snakefile_master'), 'w') as f:\n", "        # Separate the last parameter from the template explicitly so the two\n", "        # cannot fuse into one line when the template has no leading newline.\n", "        f.write(params_str + '\\n' + template)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "b526c612-902b-4fd5-b78d-3000de33bacb", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "2c537f34-03ae-4911-b836-078d1bebfed8", "metadata": { "tags": [] }, "source": [ "## Raw\n", "\n", "Generate pseudobulk cool files of raw contacts (before imputation) at 5kb resolution, \n", "so the mcool files contain 10kb, 25kb, and 100kb resolution.\n", "The files contain both cis and trans contacts.\n", "\n", "The code below divides large cell groups into chunks of 200 cells. \"snakemake_cmd_step1.txt\" contains commands to generate pseudobulk matrices for each chunk, \n", "and \"snakemake_cmd_step2.txt\" contains commands to merge chunks into cell groups. 
\n", "Both of them could be distributed across HPC. \n" ] }, { "cell_type": "code", "execution_count": 5, "id": "c0357481-66e7-438e-8164-170a2830f556", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T20:59:02.475872Z", "iopub.status.busy": "2023-06-27T20:59:02.475490Z", "iopub.status.idle": "2023-06-27T20:59:02.499950Z", "shell.execute_reply": "2023-06-27T20:59:02.498725Z", "shell.execute_reply.started": "2023-06-27T20:59:02.475846Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
csv_path
cortex-p028-cb_116/data/test_schicluster/Tan2021/raw/CTX_HIP/con...
cortex-visual-control-p007-b6_182/data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-p028-cb_112/data/test_schicluster/Tan2021/raw/CTX_HIP/con...
cortex-visual-control-p001-b6_061/data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-p056-cb_216/data/test_schicluster/Tan2021/raw/CTX_HIP/con...
......
cortex-visual-control-p021-b6_090/data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-visual-control-p021-b6_012/data/test_schicluster/Tan2021/raw/VIS/contact...
hippocampus-p007-cb_046/data/test_schicluster/Tan2021/raw/CTX_HIP/con...
cortex-visual-dark-p014-b6_106/data/test_schicluster/Tan2021/raw/VIS/contact...
cortex-visual-control-p021-b6_174/data/test_schicluster/Tan2021/raw/VIS/contact...
\n", "

3646 rows × 1 columns

\n", "
" ], "text/plain": [ " csv_path\n", "cortex-p028-cb_116 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...\n", "cortex-visual-control-p007-b6_182 /data/test_schicluster/Tan2021/raw/VIS/contact...\n", "cortex-p028-cb_112 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...\n", "cortex-visual-control-p001-b6_061 /data/test_schicluster/Tan2021/raw/VIS/contact...\n", "cortex-p056-cb_216 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...\n", "... ...\n", "cortex-visual-control-p021-b6_090 /data/test_schicluster/Tan2021/raw/VIS/contact...\n", "cortex-visual-control-p021-b6_012 /data/test_schicluster/Tan2021/raw/VIS/contact...\n", "hippocampus-p007-cb_046 /data/test_schicluster/Tan2021/raw/CTX_HIP/con...\n", "cortex-visual-dark-p014-b6_106 /data/test_schicluster/Tan2021/raw/VIS/contact...\n", "cortex-visual-control-p021-b6_174 /data/test_schicluster/Tan2021/raw/VIS/contact...\n", "\n", "[3646 rows x 1 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cell_table = pd.read_csv('contact_table.tsv', sep='\\t', header=None, index_col=0, names=['cell_id','csv_path'])\n", "cell_table = cell_table.loc[metadata.index]\n", "cell_table\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "7e097eee-b1c0-4539-8cff-69b0ca884a3b", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T20:59:06.817167Z", "iopub.status.busy": "2023-06-27T20:59:06.816523Z", "iopub.status.idle": "2023-06-27T20:59:06.826206Z", "shell.execute_reply": "2023-06-27T20:59:06.825488Z", "shell.execute_reply.started": "2023-06-27T20:59:06.817136Z" }, "tags": [] }, "outputs": [], "source": [ "cell_table['cluster'] = metadata['rnatype'].copy()\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "c997d073-e56f-4df0-95e5-16b2b7c6559d", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T20:59:51.725398Z", "iopub.status.busy": "2023-06-27T20:59:51.725138Z", "iopub.status.idle": "2023-06-27T20:59:51.937053Z", "shell.execute_reply": 
"2023-06-27T20:59:51.936380Z", "shell.execute_reply.started": "2023-06-27T20:59:51.725379Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Adult_Astrocyte 144\n", "Cajal-Retzius_Cell 23\n", "Cortical_L2-4_Pyramidal_Cell_Intermediate 37\n", "Cortical_L2-5_Pyramidal_Cell_Neonatal 211\n", "Cortical_L23_Pyramidal_Cell 204\n", "Cortical_L4_Pyramidal_Cell 195\n", "Cortical_L5_Pyramidal_Cell 98\n", "Cortical_L6_Pyramidal_Cell 333\n", "Hippocampal_CA1_Pyramidal_Cell 134\n", "Hippocampal_CA3_Pyramidal_Cell 70\n", "Hippocampal_Granuale_Cell 309\n", "Hippocampal_Pyramidal_Cell_Neonatal 93\n", "MEIS2_Interneuron 88\n", "Mature_Oligodendrocyte 210\n", "Medium_Spiny_Neuron 167\n", "Microglia_Etc 391\n", "NDNF_Interneuron 36\n", "Neonatal_Astrocyte 234\n", "Newly_Formed_Oligodendrocyte 27\n", "Oligodendrocyte_Progenitor 189\n", "PVSST_Interneuron_Neonatal 85\n", "PV_Interneuron 61\n", "SST_Interneuron 55\n", "Unknown_Interneuron_1 89\n", "Unknown_Interneuron_2 51\n", "VIP_Interneuron 112\n" ] } ], "source": [ "leg = {}\n", "chunk_size = 200\n", "outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/merged_raw/'\n", "for cluster, sub_df in cell_table.groupby('cluster'):\n", " legtmp = []\n", " # group = cluster.replace(' ', '_').replace('/', '').replace(',', '').replace('.', '')\n", " os.makedirs(f'{outdir}{cluster}', exist_ok=True)\n", " if sub_df.shape[0]>1500:\n", " tmp = sub_df.loc[np.random.choice(sub_df.index, 1500, False)]\n", " else:\n", " tmp = sub_df.copy()\n", " for i,chunk_start in enumerate(np.arange(0, tmp.shape[0], chunk_size)):\n", " os.makedirs(f'{outdir}{cluster}_chunk{i}', exist_ok=True)\n", " tmp['csv_path'].iloc[chunk_start:(chunk_start+chunk_size)].to_csv(f'{outdir}{cluster}_chunk{i}/cell_table.tsv', sep='\\t', header=False, index=True)\n", " legtmp.append(f'{cluster}_chunk{i}')\n", " tmp['csv_path'].to_csv(f'{outdir}{cluster}/cell_table.tsv', sep='\\t', header=False, index=True)\n", " leg[cluster] = 
legtmp\n", " print(cluster, tmp.shape[0])\n", " " ] }, { "cell_type": "markdown", "id": "5db791c9-891e-4a72-b18f-fda77da78568", "metadata": {}, "source": [ "### Note\n", "\n", "Run the merge command using batch job submission.\n", "\n", "The following code generates command files for step1 and step2. \n", "\n", "Each line of a file is a command and can be appended into the job submission template of users system." ] }, { "cell_type": "code", "execution_count": 16, "id": "b9871192-7a16-4c1d-b074-7e01f8cc172b", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T07:53:00.283674Z", "iopub.status.busy": "2023-06-27T07:53:00.283383Z", "iopub.status.idle": "2023-06-27T07:53:00.289368Z", "shell.execute_reply": "2023-06-27T07:53:00.288771Z", "shell.execute_reply.started": "2023-06-27T07:53:00.283654Z" }, "tags": [] }, "outputs": [], "source": [ "# step1: one `hicluster merge-cell-raw` command per chunk.\n", "# step2: one command per cell type that copies (single chunk) or merges\n", "# (several chunks) the chunk-level cool files into the cell-type cool file.\n", "# The context manager closes both files even if a write fails.\n", "with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f1, open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f2:\n", "    for ct in leg:\n", "        for group in leg[ct]:\n", "            cmd = f'hicluster merge-cell-raw --cell_table {outdir}{group}/cell_table.tsv --chrom_size_path /data/ref/mm10/genome/mm10.main.chrom.sizes --output_file {outdir}{group}/raw.cool --chr1 1 --pos1 2 --chr2 3 --pos2 4'\n", "            f1.write(cmd + '\\n')\n", "        if len(leg[ct])<2:\n", "            # A single chunk already is the cell-type matrix; copy it.\n", "            group = leg[ct][0]\n", "            cmd = f'rsync -arv {outdir}{group}/raw.cool {outdir}{ct}/{ct}.raw.cool'\n", "            f2.write(cmd + '\\n')\n", "        else:\n", "            cmd = f'cooler merge {outdir}{ct}/{ct}.raw.cool'\n", "            for group in leg[ct]:\n", "                cmd += f' {outdir}{group}/raw.cool'\n", "            f2.write(cmd + '\\n')\n" ] }, { "cell_type": "markdown", "id": "b3d7662e-8a5d-425e-9b5f-69854cc8a064", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T08:14:11.672102Z", "iopub.status.busy": "2023-06-27T08:14:11.671819Z", "iopub.status.idle": "2023-06-27T08:14:11.677132Z", "shell.execute_reply": "2023-06-27T08:14:11.676232Z", "shell.execute_reply.started": 
"2023-06-27T08:14:11.672082Z" } }, "source": [ "### Note\n", "\n", "Run the merge command on a single node, parallelized with snakemake.\n", "\n", "The following code generates Snakemake files for step1 and step2. \n", "\n", "The template notebook for step1 can be found at https://github.com/zhoujt1994/scHiCluster/blob/master/schicluster/cool/01.merge_cell_to_chunk_raw.ipynb.\n", "\n", "The template notebook for step2 can be found at https://github.com/zhoujt1994/scHiCluster/blob/master/schicluster/cool/01.merge_chunk_to_group_raw.ipynb.\n", "\n", "Each cell type is assigned to a separate folder and processed separately." ] }, { "cell_type": "code", "execution_count": 14, "id": "0244b996-7aac-4193-83a0-662751ddd5f3", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T07:41:14.587193Z", "iopub.status.busy": "2023-06-27T07:41:14.586885Z", "iopub.status.idle": "2023-06-27T07:41:14.610282Z", "shell.execute_reply": "2023-06-27T07:41:14.609654Z", "shell.execute_reply.started": "2023-06-27T07:41:14.587173Z" }, "tags": [] }, "outputs": [], "source": [ "from gliderport.preset import notebook_snakemake\n", "\n", "notebook_snakemake(\n", " work_dir=f\"merged_raw/\",\n", " notebook_dir=\"merged_raw/template_step1/\",\n", " groups=np.concatenate([leg[xx] for xx in leg]).tolist(),\n", " default_cpu=1,\n", " default_mem_gb=5,\n", " redo_prepare=True,\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "dabe2526-aa80-4720-8444-12f6b8e08cdd", "metadata": {}, "outputs": [], "source": [ "!snakemake --snakefile Snakefile -j 8 --keep-going" ] }, { "cell_type": "code", "execution_count": 17, "id": "6793be2c-f85b-48e5-9901-92df68314f22", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T07:55:53.277356Z", "iopub.status.busy": "2023-06-27T07:55:53.277052Z", "iopub.status.idle": "2023-06-27T07:55:53.297709Z", "shell.execute_reply": "2023-06-27T07:55:53.297089Z", "shell.execute_reply.started": "2023-06-27T07:55:53.277333Z" }, "tags": [] }, 
"outputs": [], "source": [ "notebook_snakemake(\n", " work_dir=f\"merged_raw/\",\n", " notebook_dir=\"merged_raw/template_step2/\",\n", " groups=list(leg.keys()),\n", " default_cpu=1,\n", " default_mem_gb=5,\n", " redo_prepare=True,\n", ")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "6bdf0620-c33c-4967-9aff-ac864ae9efb1", "metadata": {}, "outputs": [], "source": [ "!snakemake --snakefile Snakefile -j 8 --keep-going" ] }, { "cell_type": "code", "execution_count": null, "id": "63e4070e-bfb2-4048-b912-b2ba0891c2a5", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "8cd5f3e3", "metadata": {}, "source": [ "## Loop\n", "\n", "Identify pseudobulk loop pixels and summits at 10kb resolution with single-cell information.\n", "\n", "The code below generates a loop folder, with each cell group as a subfolder, \n", "and a file \"snakemake_cmds.txt\" with each line as a command to run loop calling for one cell group.\n", "\n", "Each command takes the imputed matrices of cells belonging to the same group as input,\n", "to generate for each cell group, a bedpe file of loops, and Q, E, T matrices for differential loop calling.\n", "The files only contain interations < 5Mb.\n", "\n", "The command can be appended into the job submission template of users system.\n", "We suggest to distribute the cell groups across different computing nodes in HPC as separate job.\n", "\n", "On anvil standard node (128 CPUs), loop calling of a cell group with 2000 cells will take ~12 hours.\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "24485989", "metadata": { "ExecuteTime": { "end_time": "2022-09-24T22:43:44.793403Z", "start_time": "2022-09-24T22:43:44.774118Z" }, "execution": { "iopub.execute_input": "2023-06-28T06:11:45.004143Z", "iopub.status.busy": "2023-06-28T06:11:45.003881Z", "iopub.status.idle": "2023-06-28T06:11:45.315650Z", "shell.execute_reply": "2023-06-28T06:11:45.314854Z", "shell.execute_reply.started": 
"2023-06-28T06:11:45.004124Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cool_pathcluster
cortex-p028-cb_116/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Cortical_L6_Pyramidal_Cell
cortex-visual-control-p007-b6_182/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Cortical_L6_Pyramidal_Cell
cortex-p028-cb_112/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Cortical_L6_Pyramidal_Cell
cortex-visual-control-p001-b6_061/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Unknown_Interneuron_2
cortex-p056-cb_216/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Microglia_Etc
.........
cortex-visual-control-p021-b6_090/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Mature_Oligodendrocyte
cortex-visual-control-p021-b6_012/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Cortical_L6_Pyramidal_Cell
hippocampus-p007-cb_046/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Microglia_Etc
cortex-visual-dark-p014-b6_106/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Microglia_Etc
cortex-visual-control-p021-b6_174/anvil/scratch/x-zhou/Tan2021/scool/impute/10K...Microglia_Etc
\n", "

3646 rows × 2 columns

\n", "
" ], "text/plain": [ " cool_path \\\n", "cortex-p028-cb_116 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-visual-control-p007-b6_182 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-p028-cb_112 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-visual-control-p001-b6_061 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-p056-cb_216 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "... ... \n", "cortex-visual-control-p021-b6_090 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-visual-control-p021-b6_012 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "hippocampus-p007-cb_046 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-visual-dark-p014-b6_106 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "cortex-visual-control-p021-b6_174 /anvil/scratch/x-zhou/Tan2021/scool/impute/10K... \n", "\n", " cluster \n", "cortex-p028-cb_116 Cortical_L6_Pyramidal_Cell \n", "cortex-visual-control-p007-b6_182 Cortical_L6_Pyramidal_Cell \n", "cortex-p028-cb_112 Cortical_L6_Pyramidal_Cell \n", "cortex-visual-control-p001-b6_061 Unknown_Interneuron_2 \n", "cortex-p056-cb_216 Microglia_Etc \n", "... ... 
\n", "cortex-visual-control-p021-b6_090 Mature_Oligodendrocyte \n", "cortex-visual-control-p021-b6_012 Cortical_L6_Pyramidal_Cell \n", "hippocampus-p007-cb_046 Microglia_Etc \n", "cortex-visual-dark-p014-b6_106 Microglia_Etc \n", "cortex-visual-control-p021-b6_174 Microglia_Etc \n", "\n", "[3646 rows x 2 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coollist = glob('/data/test_schicluster/Tan2021/scool/impute/10K/*/*.cool')\n", "cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])\n", "cell_table = cell_table.loc[metadata.index]\n", "cell_table['cluster'] = metadata['rnatype'].copy()\n", "cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')\n", "cell_table\n" ] }, { "cell_type": "code", "execution_count": 39, "id": "2e398a56", "metadata": { "ExecuteTime": { "end_time": "2022-09-24T22:45:01.306187Z", "start_time": "2022-09-24T22:45:00.927385Z" }, "execution": { "iopub.execute_input": "2023-06-28T06:33:18.008932Z", "iopub.status.busy": "2023-06-28T06:33:18.008584Z", "iopub.status.idle": "2023-06-28T06:33:18.012945Z", "shell.execute_reply": "2023-06-28T06:33:18.012300Z", "shell.execute_reply.started": "2023-06-28T06:33:18.008910Z" }, "tags": [] }, "outputs": [], "source": [ "outdir = 'Tan2021_loop/'\n", "loop_dir = f'/anvil/scratch/x-zhou/{outdir}'\n" ] }, { "cell_type": "code", "execution_count": 37, "id": "d11b65e8-7fa8-4174-aab5-ae636b96a654", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T06:30:59.218726Z", "iopub.status.busy": "2023-06-28T06:30:59.218400Z", "iopub.status.idle": "2023-06-28T06:30:59.223254Z", "shell.execute_reply": "2023-06-28T06:30:59.222463Z", "shell.execute_reply.started": "2023-06-28T06:30:59.218705Z" }, "tags": [] }, "outputs": [], "source": [ "params = {\n", " 'cpu': 96,\n", " 'resolution': 10000,\n", " 'chrom_size_path': 
f'\"{loop_dir}mm10.main20.chrom.sizes\"',\n", " 'black_list_path': f'\"{loop_dir}mm10.dipc.rowsum1000.blf50.merged.bed\"',\n", "}" ] }, { "cell_type": "code", "execution_count": 29, "id": "62918641", "metadata": { "ExecuteTime": { "end_time": "2022-09-24T22:45:01.928558Z", "start_time": "2022-09-24T22:45:01.926038Z" }, "execution": { "iopub.execute_input": "2023-06-28T06:12:31.362863Z", "iopub.status.busy": "2023-06-28T06:12:31.362598Z", "iopub.status.idle": "2023-06-28T06:12:31.366546Z", "shell.execute_reply": "2023-06-28T06:12:31.365996Z", "shell.execute_reply.started": "2023-06-28T06:12:31.362843Z" }, "tags": [] }, "outputs": [], "source": [ "with open(f'{PACKAGE_DIR}/loop/snakemake_template_loop.txt') as tmp:\n", " GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()\n" ] }, { "cell_type": "code", "execution_count": 33, "id": "110c8d29", "metadata": { "ExecuteTime": { "end_time": "2022-09-24T22:45:05.733069Z", "start_time": "2022-09-24T22:45:02.419558Z" }, "execution": { "iopub.execute_input": "2023-06-28T06:13:07.004268Z", "iopub.status.busy": "2023-06-28T06:13:07.003971Z", "iopub.status.idle": "2023-06-28T06:13:07.054050Z", "shell.execute_reply": "2023-06-28T06:13:07.052968Z", "shell.execute_reply.started": "2023-06-28T06:13:07.004244Z" }, "tags": [] }, "outputs": [], "source": [ "for cluster, sub_df in cell_table.groupby('cluster'):\n", " if sub_df.shape[0]>1500:\n", " tmp = sub_df.loc[np.random.choice(sub_df.index, 1500, False)]\n", " else:\n", " tmp = sub_df.copy()\n", " prepare_dir(f'{outdir}{cluster}/', tmp, GENERATE_MATRIX_CHUNK_TEMPLATE, params)\n", " " ] }, { "cell_type": "code", "execution_count": 34, "id": "64619969", "metadata": { "ExecuteTime": { "end_time": "2022-09-23T19:09:22.135654Z", "start_time": "2022-09-23T19:09:22.116743Z" }, "execution": { "iopub.execute_input": "2023-06-28T06:13:07.352221Z", "iopub.status.busy": "2023-06-28T06:13:07.351881Z", "iopub.status.idle": "2023-06-28T06:13:07.358979Z", "shell.execute_reply": 
"2023-06-28T06:13:07.358419Z", "shell.execute_reply.started": "2023-06-28T06:13:07.352193Z" }, "tags": [] }, "outputs": [], "source": [ "with open(f'{outdir}snakemake_cmds.txt', 'w') as f:\n", " for cluster, sub_df in cell_table.groupby('cluster'):\n", " cluster_dir = f'{loop_dir}{cluster}'\n", " f.write(f'snakemake -d {cluster_dir} -s {cluster_dir}/Snakefile_master -j {params[\"cpu\"]}\\n')\n", " " ] }, { "cell_type": "code", "execution_count": 14, "id": "2cc64d8e-5a81-4724-bb66-9e94d68b0ebd", "metadata": { "execution": { "iopub.execute_input": "2023-06-28T00:12:49.719527Z", "iopub.status.busy": "2023-06-28T00:12:49.719213Z", "iopub.status.idle": "2023-06-28T00:12:50.202402Z", "shell.execute_reply": "2023-06-28T00:12:50.201338Z", "shell.execute_reply.started": "2023-06-28T00:12:49.719506Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/bin/bash: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8)\n", "/bin/bash: warning: setlocale: LC_ALL: cannot change locale (en_US.UTF-8)\n" ] } ], "source": [ "!cp /data/ref/mm10/genome/mm10.main20.chrom.sizes Tan2021_loop/\n", "!cp /home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/mm10.dipc.rowsum1000.blf50.merged.bed Tan2021_loop/\n" ] }, { "cell_type": "code", "execution_count": null, "id": "fbe88837", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "5bd3f8c3", "metadata": {}, "source": [ "## Domain\n", "\n", "Generate pseudobulk cool files of imputed contacts at 25kb resolution by summing up imputed matrices of single cells.\n", "\n", "The code below divides large cell groups into chunks of 200 cells. \"snakemake_cmd_step1.txt\" contains commands to generate pseudobulk matrices for each chunk and could be distributed across HPC. \n", "\n", "\"snakemake_cmd_step2.txt\" contains a command to merge chunks into cell groups and could be run directly on a single node." 
] }, { "cell_type": "code", "execution_count": 23, "id": "007f7e56", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:05:45.301531Z", "iopub.status.busy": "2023-06-27T21:05:45.301227Z", "iopub.status.idle": "2023-06-27T21:05:45.843624Z", "shell.execute_reply": "2023-06-27T21:05:45.843079Z", "shell.execute_reply.started": "2023-06-27T21:05:45.301508Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cool_pathcluster
cortex-p028-cb_116/data/test_schicluster/Tan2021/scool/impute/25...Cortical_L6_Pyramidal_Cell
cortex-visual-control-p007-b6_182/data/test_schicluster/Tan2021/scool/impute/25...Cortical_L6_Pyramidal_Cell
cortex-p028-cb_112/data/test_schicluster/Tan2021/scool/impute/25...Cortical_L6_Pyramidal_Cell
cortex-visual-control-p001-b6_061/data/test_schicluster/Tan2021/scool/impute/25...Unknown_Interneuron_2
cortex-p056-cb_216/data/test_schicluster/Tan2021/scool/impute/25...Microglia_Etc
.........
cortex-visual-control-p021-b6_090/data/test_schicluster/Tan2021/scool/impute/25...Mature_Oligodendrocyte
cortex-visual-control-p021-b6_012/data/test_schicluster/Tan2021/scool/impute/25...Cortical_L6_Pyramidal_Cell
hippocampus-p007-cb_046/data/test_schicluster/Tan2021/scool/impute/25...Microglia_Etc
cortex-visual-dark-p014-b6_106/data/test_schicluster/Tan2021/scool/impute/25...Microglia_Etc
cortex-visual-control-p021-b6_174/data/test_schicluster/Tan2021/scool/impute/25...Microglia_Etc
\n", "

3646 rows × 2 columns

\n", "
" ], "text/plain": [ " cool_path \\\n", "cortex-p028-cb_116 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-visual-control-p007-b6_182 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-p028-cb_112 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-visual-control-p001-b6_061 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-p056-cb_216 /data/test_schicluster/Tan2021/scool/impute/25... \n", "... ... \n", "cortex-visual-control-p021-b6_090 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-visual-control-p021-b6_012 /data/test_schicluster/Tan2021/scool/impute/25... \n", "hippocampus-p007-cb_046 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-visual-dark-p014-b6_106 /data/test_schicluster/Tan2021/scool/impute/25... \n", "cortex-visual-control-p021-b6_174 /data/test_schicluster/Tan2021/scool/impute/25... \n", "\n", " cluster \n", "cortex-p028-cb_116 Cortical_L6_Pyramidal_Cell \n", "cortex-visual-control-p007-b6_182 Cortical_L6_Pyramidal_Cell \n", "cortex-p028-cb_112 Cortical_L6_Pyramidal_Cell \n", "cortex-visual-control-p001-b6_061 Unknown_Interneuron_2 \n", "cortex-p056-cb_216 Microglia_Etc \n", "... ... 
\n", "cortex-visual-control-p021-b6_090 Mature_Oligodendrocyte \n", "cortex-visual-control-p021-b6_012 Cortical_L6_Pyramidal_Cell \n", "hippocampus-p007-cb_046 Microglia_Etc \n", "cortex-visual-dark-p014-b6_106 Microglia_Etc \n", "cortex-visual-control-p021-b6_174 Microglia_Etc \n", "\n", "[3646 rows x 2 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Collect the imputed 25K single-cell cool files and index each path by its cell name.\n",
 "coollist = glob('/data/test_schicluster/Tan2021/scool/impute/25K/*/*.cool')\n",
 "cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])\n",
 "# Keep the cells listed in `metadata` (in its row order) and attach their cluster labels.\n",
 "cell_table = cell_table.loc[metadata.index]\n",
 "cell_table['cluster'] = metadata['rnatype'].copy()\n",
 "# cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')\n",
 "cell_table\n"
 ] }, { "cell_type": "code", "execution_count": 24, "id": "c9de7992-e1e2-4745-931c-26175e62a726", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:05:46.851706Z", "iopub.status.busy": "2023-06-27T21:05:46.851400Z", "iopub.status.idle": "2023-06-27T21:05:46.854803Z", "shell.execute_reply": "2023-06-27T21:05:46.854276Z", "shell.execute_reply.started": "2023-06-27T21:05:46.851686Z" }, "tags": [] }, "outputs": [], "source": [
 "outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/domain/'\n"
 ] }, { "cell_type": "code", "execution_count": 25, "id": "b0753831", "metadata": { "ExecuteTime": { "end_time": "2022-03-05T20:28:34.445699Z", "start_time": "2022-03-05T20:28:34.352677Z" }, "execution": { "iopub.execute_input": "2023-06-27T21:05:47.121548Z", "iopub.status.busy": "2023-06-27T21:05:47.121316Z", "iopub.status.idle": "2023-06-27T21:05:47.147378Z", "shell.execute_reply": "2023-06-27T21:05:47.146832Z", "shell.execute_reply.started": "2023-06-27T21:05:47.121530Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Adult_Astrocyte 144\n", "Cajal-Retzius_Cell 23\n", "Cortical_L2-4_Pyramidal_Cell_Intermediate 37\n", "Cortical_L2-5_Pyramidal_Cell_Neonatal 211\n", "Cortical_L23_Pyramidal_Cell 204\n", "Cortical_L4_Pyramidal_Cell 195\n", "Cortical_L5_Pyramidal_Cell 98\n", "Cortical_L6_Pyramidal_Cell 333\n", "Hippocampal_CA1_Pyramidal_Cell 134\n", "Hippocampal_CA3_Pyramidal_Cell 70\n", "Hippocampal_Granuale_Cell 309\n", "Hippocampal_Pyramidal_Cell_Neonatal 93\n", "MEIS2_Interneuron 88\n", "Mature_Oligodendrocyte 210\n", "Medium_Spiny_Neuron 167\n", "Microglia_Etc 391\n", "NDNF_Interneuron 36\n", "Neonatal_Astrocyte 234\n", "Newly_Formed_Oligodendrocyte 27\n", "Oligodendrocyte_Progenitor 189\n", "PVSST_Interneuron_Neonatal 85\n", "PV_Interneuron 61\n", "SST_Interneuron 55\n", "Unknown_Interneuron_1 89\n", "Unknown_Interneuron_2 51\n", "VIP_Interneuron 112\n" ] } ], "source": [
 "# Write one headerless table (cell name, cool path, cluster) per cluster and print the cluster sizes.\n",
 "for cluster, sub_df in cell_table.groupby('cluster'):\n",
 "    cluster_dir = f'{outdir}{cluster}'\n",
 "    os.makedirs(cluster_dir, exist_ok=True)\n",
 "    sub_df.to_csv(f'{cluster_dir}/cell_table.csv', header=False, index=True)\n",
 "    print(cluster, sub_df.shape[0])\n"
 ] }, { "cell_type": "code", "execution_count": 26, "id": "254223e9-f60b-47ae-8371-37e7c0b1ae4a", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:05:48.058039Z", "iopub.status.busy": "2023-06-27T21:05:48.057773Z", "iopub.status.idle": "2023-06-27T21:05:48.061580Z", "shell.execute_reply": "2023-06-27T21:05:48.061027Z", "shell.execute_reply.started": "2023-06-27T21:05:48.058016Z" }, "tags": [] }, "outputs": [], "source": [
 "# Each entry is emitted verbatim as a `key = value` line in the group Snakefile (step-2 cell below),\n",
 "# so string values carry their own double quotes.\n",
 "params = {\n",
 "    'resolution': 25000,\n",
 "    'chrom_size_path': '\"/data/ref/mm10/genome/mm10.main20.chrom.sizes\"',\n",
 "}\n"
 ] }, { "cell_type": "code", "execution_count": 33, "id": "ec3ca915", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:07:27.922003Z", "iopub.status.busy": "2023-06-27T21:07:27.921758Z", "iopub.status.idle": "2023-06-27T21:07:28.107317Z", "shell.execute_reply": "2023-06-27T21:07:28.106533Z", "shell.execute_reply.started": "2023-06-27T21:07:27.921984Z" }, "tags": [] }, "outputs": [], "source": [
 "chunk_size = 200\n",
 "total_chunk_dirs = []\n",
 "group_chunks = {}\n",
 "\n",
 "with open(f'{PACKAGE_DIR}/cool/Snakefile_chunk_template') as tmp:\n",
 "    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()\n",
 "\n",
 "for group, group_df in cell_table.groupby('cluster'):\n",
 "    if group_df.shape[0] <= chunk_size:\n",
 "        # A group that fits in one chunk is passed on unchanged as chunk 0.\n",
 "        chunk_tables = [(0, group_df)]\n",
 "    else:\n",
 "        # Larger groups are cut into consecutive runs of `chunk_size` cells.\n",
 "        group_df = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunk_size)\n",
 "        chunk_tables = list(group_df.groupby('chunk'))\n",
 "\n",
 "    group_chunks[group] = []\n",
 "    for chunk, chunk_df in chunk_tables:\n",
 "        this_dir = f'{outdir}{group}_chunk{chunk}/'\n",
 "        # Per-chunk copy: the shared `params` dict is left untouched, so later cells can be re-run in any order.\n",
 "        chunk_params = {**params, 'cell_table_path': f'\"{this_dir}cell_table.csv\"'}\n",
 "        prepare_dir(this_dir, chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, chunk_params)\n",
 "        total_chunk_dirs.append(this_dir)\n",
 "        group_chunks[group].append(this_dir)\n"
 ] }, { "cell_type": "code", "execution_count": 31, "id": "f1eea08c", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:06:10.181363Z", "iopub.status.busy": "2023-06-27T21:06:10.181124Z", "iopub.status.idle": "2023-06-27T21:06:10.185033Z", "shell.execute_reply": "2023-06-27T21:06:10.184510Z", "shell.execute_reply.started": "2023-06-27T21:06:10.181343Z" }, "tags": [] }, "outputs": [], "source": [
 "# Step 1: one snakemake command per chunk directory.\n",
 "with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f:\n",
 "    for chunk_dir in total_chunk_dirs:\n",
 "        cmd = f'snakemake -d {chunk_dir} --snakefile {chunk_dir}Snakefile_master -j 5 --rerun-incomplete'\n",
 "        f.write(cmd + '\\n')\n"
 ] }, { "cell_type": "code", "execution_count": 32, "id": "6fbc3193", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:06:10.475843Z", "iopub.status.busy": "2023-06-27T21:06:10.475660Z", "iopub.status.idle": "2023-06-27T21:06:10.480996Z", "shell.execute_reply": "2023-06-27T21:06:10.480414Z", "shell.execute_reply.started": "2023-06-27T21:06:10.475826Z" }, "tags": [] }, "outputs": [], "source": [
 "# Step 2: group-level Snakefile. `cell_table_path` is chunk-specific, so it is filtered out here\n",
 "# without mutating `params`, which keeps this cell safe to re-run.\n",
 "params_str = '\\n'.join(f'{k} = {v}' for k, v in params.items() if k != 'cell_table_path')\n",
 "\n",
 "with open(f'{PACKAGE_DIR}/cool/Snakefile_group_template') as tmp:\n",
 "    GENERATE_MATRIX_GROUP_TEMPLATE = tmp.read()\n",
 "\n",
 "with open(f'{outdir}Snakefile', 'w') as f:\n",
 "    f.write(params_str + '\\n' + GENERATE_MATRIX_GROUP_TEMPLATE)\n",
 "\n",
 "with open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f:\n",
 "    cmd = f'snakemake -d {outdir} --snakefile {outdir}Snakefile -j 10 --rerun-incomplete'\n",
 "    f.write(cmd + '\\n')\n"
 ] }, { "cell_type": "markdown", "id": "b9cf347c-a225-4745-9c94-a1a674d8f096", "metadata": {}, "source": [ "### Note\n", "\n", "Merging one chunk of 200 cells with 5 CPUs, on a node with 32 CPUs and 128 GB of memory, takes ~10 minutes." ] }, { "cell_type": "code", "execution_count": null, "id": "9bb76f1b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "60b37180", "metadata": {}, "source": [ "## Compartment\n", "\n", "Generate pseudobulk cool files of imputed contacts at 100kb resolution by summing up imputed matrices of single cells.\n", "\n", "The code below divides large cell groups into chunks of 200 cells. \"snakemake_cmd_step1.txt\" contains commands to generate pseudobulk matrices for each chunk and could be distributed across HPC. \n", "\n", "\"snakemake_cmd_step2.txt\" contains a command to merge chunks into cell groups and could be run directly on a single node."
] }, { "cell_type": "code", "execution_count": 34, "id": "7a9299ff-eb58-4f51-95d0-7c21a4eee36c", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:46:16.250356Z", "iopub.status.busy": "2023-06-27T21:46:16.250097Z", "iopub.status.idle": "2023-06-27T21:46:16.797402Z", "shell.execute_reply": "2023-06-27T21:46:16.796856Z", "shell.execute_reply.started": "2023-06-27T21:46:16.250337Z" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
cool_pathcluster
cortex-p028-cb_116/data/test_schicluster/Tan2021/scool/impute/10...Cortical_L6_Pyramidal_Cell
cortex-visual-control-p007-b6_182/data/test_schicluster/Tan2021/scool/impute/10...Cortical_L6_Pyramidal_Cell
cortex-p028-cb_112/data/test_schicluster/Tan2021/scool/impute/10...Cortical_L6_Pyramidal_Cell
cortex-visual-control-p001-b6_061/data/test_schicluster/Tan2021/scool/impute/10...Unknown_Interneuron_2
cortex-p056-cb_216/data/test_schicluster/Tan2021/scool/impute/10...Microglia_Etc
.........
cortex-visual-control-p021-b6_090/data/test_schicluster/Tan2021/scool/impute/10...Mature_Oligodendrocyte
cortex-visual-control-p021-b6_012/data/test_schicluster/Tan2021/scool/impute/10...Cortical_L6_Pyramidal_Cell
hippocampus-p007-cb_046/data/test_schicluster/Tan2021/scool/impute/10...Microglia_Etc
cortex-visual-dark-p014-b6_106/data/test_schicluster/Tan2021/scool/impute/10...Microglia_Etc
cortex-visual-control-p021-b6_174/data/test_schicluster/Tan2021/scool/impute/10...Microglia_Etc
\n", "

3646 rows × 2 columns

\n", "
" ], "text/plain": [ " cool_path \\\n", "cortex-p028-cb_116 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-visual-control-p007-b6_182 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-p028-cb_112 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-visual-control-p001-b6_061 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-p056-cb_216 /data/test_schicluster/Tan2021/scool/impute/10... \n", "... ... \n", "cortex-visual-control-p021-b6_090 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-visual-control-p021-b6_012 /data/test_schicluster/Tan2021/scool/impute/10... \n", "hippocampus-p007-cb_046 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-visual-dark-p014-b6_106 /data/test_schicluster/Tan2021/scool/impute/10... \n", "cortex-visual-control-p021-b6_174 /data/test_schicluster/Tan2021/scool/impute/10... \n", "\n", " cluster \n", "cortex-p028-cb_116 Cortical_L6_Pyramidal_Cell \n", "cortex-visual-control-p007-b6_182 Cortical_L6_Pyramidal_Cell \n", "cortex-p028-cb_112 Cortical_L6_Pyramidal_Cell \n", "cortex-visual-control-p001-b6_061 Unknown_Interneuron_2 \n", "cortex-p056-cb_216 Microglia_Etc \n", "... ... 
\n", "cortex-visual-control-p021-b6_090 Mature_Oligodendrocyte \n", "cortex-visual-control-p021-b6_012 Cortical_L6_Pyramidal_Cell \n", "hippocampus-p007-cb_046 Microglia_Etc \n", "cortex-visual-dark-p014-b6_106 Microglia_Etc \n", "cortex-visual-control-p021-b6_174 Microglia_Etc \n", "\n", "[3646 rows x 2 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Collect the imputed 100K single-cell cool files and index each path by its cell name.\n",
 "coollist = glob('/data/test_schicluster/Tan2021/scool/impute/100K/*/*.cool')\n",
 "cell_table = pd.DataFrame(coollist, index=[xx.split('/')[-1].replace('.cool', '') for xx in coollist], columns=['cool_path'])\n",
 "# Keep the cells listed in `metadata` (in its row order) and attach their cluster labels.\n",
 "cell_table = cell_table.loc[metadata.index]\n",
 "cell_table['cluster'] = metadata['rnatype'].copy()\n",
 "# cell_table['cool_path'] = cell_table['cool_path'].str.replace('/data/test_schicluster', '/anvil/scratch/x-zhou')\n",
 "cell_table\n"
 ] }, { "cell_type": "code", "execution_count": 35, "id": "0b9f7563-d9be-4a8a-9bae-e8335005b1a7", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:46:24.796423Z", "iopub.status.busy": "2023-06-27T21:46:24.796166Z", "iopub.status.idle": "2023-06-27T21:46:24.799983Z", "shell.execute_reply": "2023-06-27T21:46:24.799314Z", "shell.execute_reply.started": "2023-06-27T21:46:24.796404Z" }, "tags": [] }, "outputs": [], "source": [
 "outdir = '/home/jzhou_salk_edu/sky_workdir/test_schicluster/Tan2021/compartment/'\n"
 ] }, { "cell_type": "code", "execution_count": 36, "id": "4f76c700-d6cd-4191-8833-0f863b20847b", "metadata": { "ExecuteTime": { "end_time": "2022-03-05T20:28:34.445699Z", "start_time": "2022-03-05T20:28:34.352677Z" }, "execution": { "iopub.execute_input": "2023-06-27T21:47:07.054984Z", "iopub.status.busy": "2023-06-27T21:47:07.054642Z", "iopub.status.idle": "2023-06-27T21:47:07.102660Z", "shell.execute_reply": "2023-06-27T21:47:07.093078Z", "shell.execute_reply.started": "2023-06-27T21:47:07.054952Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Adult_Astrocyte 144\n", "Cajal-Retzius_Cell 23\n", "Cortical_L2-4_Pyramidal_Cell_Intermediate 37\n", "Cortical_L2-5_Pyramidal_Cell_Neonatal 211\n", "Cortical_L23_Pyramidal_Cell 204\n", "Cortical_L4_Pyramidal_Cell 195\n", "Cortical_L5_Pyramidal_Cell 98\n", "Cortical_L6_Pyramidal_Cell 333\n", "Hippocampal_CA1_Pyramidal_Cell 134\n", "Hippocampal_CA3_Pyramidal_Cell 70\n", "Hippocampal_Granuale_Cell 309\n", "Hippocampal_Pyramidal_Cell_Neonatal 93\n", "MEIS2_Interneuron 88\n", "Mature_Oligodendrocyte 210\n", "Medium_Spiny_Neuron 167\n", "Microglia_Etc 391\n", "NDNF_Interneuron 36\n", "Neonatal_Astrocyte 234\n", "Newly_Formed_Oligodendrocyte 27\n", "Oligodendrocyte_Progenitor 189\n", "PVSST_Interneuron_Neonatal 85\n", "PV_Interneuron 61\n", "SST_Interneuron 55\n", "Unknown_Interneuron_1 89\n", "Unknown_Interneuron_2 51\n", "VIP_Interneuron 112\n" ] } ], "source": [
 "# Write one headerless table (cell name, cool path, cluster) per cluster and print the cluster sizes.\n",
 "for cluster, sub_df in cell_table.groupby('cluster'):\n",
 "    cluster_dir = f'{outdir}{cluster}'\n",
 "    os.makedirs(cluster_dir, exist_ok=True)\n",
 "    sub_df.to_csv(f'{cluster_dir}/cell_table.csv', header=False, index=True)\n",
 "    print(cluster, sub_df.shape[0])\n"
 ] }, { "cell_type": "code", "execution_count": 37, "id": "8dae177f-0a48-477b-aaa9-13dd8219011f", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:47:16.119307Z", "iopub.status.busy": "2023-06-27T21:47:16.119012Z", "iopub.status.idle": "2023-06-27T21:47:16.122981Z", "shell.execute_reply": "2023-06-27T21:47:16.122147Z", "shell.execute_reply.started": "2023-06-27T21:47:16.119284Z" }, "tags": [] }, "outputs": [], "source": [
 "# Each entry is emitted verbatim as a `key = value` line in the group Snakefile (step-2 cell below),\n",
 "# so string values carry their own double quotes.\n",
 "params = {\n",
 "    'resolution': 100000,\n",
 "    'chrom_size_path': '\"/data/ref/mm10/genome/mm10.main20.chrom.sizes\"',\n",
 "}\n"
 ] }, { "cell_type": "code", "execution_count": 38, "id": "692596a1-33e1-4149-8af8-84134cda5e4f", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:47:36.349320Z", "iopub.status.busy": "2023-06-27T21:47:36.349057Z", "iopub.status.idle": "2023-06-27T21:47:36.391662Z", "shell.execute_reply": "2023-06-27T21:47:36.391062Z", "shell.execute_reply.started": "2023-06-27T21:47:36.349295Z" }, "tags": [] }, "outputs": [], "source": [
 "chunk_size = 200\n",
 "total_chunk_dirs = []\n",
 "group_chunks = {}\n",
 "\n",
 "with open(f'{PACKAGE_DIR}/cool/Snakefile_chunk_template') as tmp:\n",
 "    GENERATE_MATRIX_CHUNK_TEMPLATE = tmp.read()\n",
 "\n",
 "for group, group_df in cell_table.groupby('cluster'):\n",
 "    if group_df.shape[0] <= chunk_size:\n",
 "        # A group that fits in one chunk is passed on unchanged as chunk 0.\n",
 "        chunk_tables = [(0, group_df)]\n",
 "    else:\n",
 "        # Larger groups are cut into consecutive runs of `chunk_size` cells.\n",
 "        group_df = group_df.assign(chunk=np.arange(group_df.shape[0]) // chunk_size)\n",
 "        chunk_tables = list(group_df.groupby('chunk'))\n",
 "\n",
 "    group_chunks[group] = []\n",
 "    for chunk, chunk_df in chunk_tables:\n",
 "        this_dir = f'{outdir}{group}_chunk{chunk}/'\n",
 "        # Per-chunk copy: the shared `params` dict is left untouched, so later cells can be re-run in any order.\n",
 "        chunk_params = {**params, 'cell_table_path': f'\"{this_dir}cell_table.csv\"'}\n",
 "        prepare_dir(this_dir, chunk_df, GENERATE_MATRIX_CHUNK_TEMPLATE, chunk_params)\n",
 "        total_chunk_dirs.append(this_dir)\n",
 "        group_chunks[group].append(this_dir)\n"
 ] }, { "cell_type": "code", "execution_count": 39, "id": "62039456-5537-47a9-9b05-eb6bbe4d2563", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:47:39.215234Z", "iopub.status.busy": "2023-06-27T21:47:39.214972Z", "iopub.status.idle": "2023-06-27T21:47:39.219096Z", "shell.execute_reply": "2023-06-27T21:47:39.218475Z", "shell.execute_reply.started": "2023-06-27T21:47:39.215214Z" }, "tags": [] }, "outputs": [], "source": [
 "# Step 1: one snakemake command per chunk directory.\n",
 "with open(f'{outdir}snakemake_cmd_step1.txt', 'w') as f:\n",
 "    for chunk_dir in total_chunk_dirs:\n",
 "        cmd = f'snakemake -d {chunk_dir} --snakefile {chunk_dir}Snakefile_master -j 5 --rerun-incomplete'\n",
 "        f.write(cmd + '\\n')\n"
 ] }, { "cell_type": "code", "execution_count": 40, "id": "23a8e050-e2ad-406c-8429-210b218182f7", "metadata": { "execution": { "iopub.execute_input": "2023-06-27T21:47:48.833676Z", "iopub.status.busy": "2023-06-27T21:47:48.833423Z", "iopub.status.idle": "2023-06-27T21:47:48.838666Z", "shell.execute_reply": "2023-06-27T21:47:48.838101Z", "shell.execute_reply.started": "2023-06-27T21:47:48.833656Z" }, "tags": [] }, "outputs": [], "source": [
 "# Step 2: group-level Snakefile. `cell_table_path` is chunk-specific, so it is filtered out here\n",
 "# without mutating `params`, which keeps this cell safe to re-run.\n",
 "params_str = '\\n'.join(f'{k} = {v}' for k, v in params.items() if k != 'cell_table_path')\n",
 "\n",
 "with open(f'{PACKAGE_DIR}/cool/Snakefile_group_template') as tmp:\n",
 "    GENERATE_MATRIX_GROUP_TEMPLATE = tmp.read()\n",
 "\n",
 "with open(f'{outdir}Snakefile', 'w') as f:\n",
 "    f.write(params_str + '\\n' + GENERATE_MATRIX_GROUP_TEMPLATE)\n",
 "\n",
 "with open(f'{outdir}snakemake_cmd_step2.txt', 'w') as f:\n",
 "    cmd = f'snakemake -d {outdir} --snakefile {outdir}Snakefile -j 10 --rerun-incomplete'\n",
 "    f.write(cmd + '\\n')\n"
 ] }, { "cell_type": "markdown", "id": "0b119dba-fb04-4bbe-9cfa-61086015867d", "metadata": {}, "source": [ "### Note\n", "\n", "Merging one chunk of 200 cells with 5 CPUs, on a node with 32 CPUs and 128 GB of memory, takes ~5 minutes." ] }, { "cell_type": "code", "execution_count": null, "id": "914ca144-da78-4292-b821-c97cf3e1ba37", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "hide_input": false, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": true } }, "nbformat": 4, "nbformat_minor": 5 }