ANOVA of loop matrices across cell types

ANOVA of loop matrices across cell types

import os
import cooler
import pathlib
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, save_npz, vstack, csr_matrix, triu
from scipy.stats import f, zscore, ranksums
from schicluster.cool.utilities import get_chrom_offsets
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor, as_completed
# Cell-type legend: maps a coarse group label to its member cell types.
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'], 
       'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'], 
       'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'], 
       'sub': ['SubCtx'], 
       'glia': ['ASC', 'ODC', 'OPC'], 
       'mgc': ['MGC'], 
       'smc': ['PC'], 
       'endo': ['EC'], 
       'fibro': ['VLMC'],
      }
# Composite entries: all neurons, and every cell type.
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
# Partition for the ANOVA: an underscore-joined name (e.g. 'exc_inh') pools
# each legend entry into one group; a plain name makes every member cell
# type of that entry its own group.
if '_' in group_name:
    ctgroup = [leg[part] for part in group_name.split('_')]
else:
    ctgroup = [[ct] for ct in leg[group_name]]

ctgroup
[['L23_IT'],
 ['L4_IT'],
 ['L5_IT'],
 ['L6_IT'],
 ['L6_IT_Car3'],
 ['L56_NP'],
 ['L6_CT'],
 ['L6b'],
 ['Amy'],
 ['Lamp5'],
 ['Lamp5_LHX6'],
 ['Sncg'],
 ['Vip'],
 ['Pvalb'],
 ['Pvalb_ChC'],
 ['Sst'],
 ['CHD7'],
 ['MSN_D1'],
 ['MSN_D2'],
 ['Foxp2'],
 ['SubCtx']]
# Input/output locations; loop matrices are binned at 10 kb resolution.
indir = '/home/jzhou_salk_edu/sky_workdir/hba/loop_majortype/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/loop_majortype/'
res = 10000
group = group_name
# Genome bins at `res` resolution and the cumulative bin offset of each
# chromosome, used later to lift per-chromosome matrices into genome-wide
# bin coordinates.
chrom_size_path = f'{indir}hg38_with_chrl.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
bins_df = cooler.binnify(chrom_sizes, res)
chrom_offset = get_chrom_offsets(bins_df)
# Blacklist intervals (BED-like: col 0 = chrom, 1 = start, 2 = end) whose
# neighborhoods are masked out of the ANOVA results.
bkl = pd.read_csv(f'{indir}M1C.rowsumpb1000.blf50.merged.bed', sep='\t', header=None, index_col=None)
def compute_anova(c, matrix):
    """Per-pixel one-way ANOVA F statistics across cell-type groups.

    For chromosome ``c`` and matrix type ``matrix`` ('Q', 'E' or 'T'), pools
    the per-cell-type sum (``.{matrix}.cool``) and sum-of-squares
    (``.{matrix}2.cool``) matrices according to the module-level ``ctgroup``
    partition, computes the F statistic for each pixel, drops pixels near
    blacklist regions or outside the 6-499 bin distance band, and saves the
    sparse result to ``{outdir}diff/{group}/majortype_{matrix}pv_{c}.npz``.

    Returns [c, matrix, total_number_of_cells].
    """
    ngene = int(chrom_sizes.loc[c] // res) + 1
    # Blacklist intervals on this chromosome, converted to bin units.
    bkl_tmp = bkl.loc[(bkl[0]==c), [1,2]].values // res
    # Mask of bins within 7 bins of any blacklist interval.
    cov = np.zeros(ngene)
    for xx, yy in bkl_tmp:
        # Clamp the start at 0: a negative start (xx < 7) would slice from
        # the END of the array and leave the chromosome start unmasked.
        cov[max(xx - 7, 0):yy + 7] = 1
    tot, last = 0, 0
    Esum, E2sum, Elast, E2last, ss_intra = [csr_matrix((ngene, ngene)) for i in range(5)]
    for ctlist in ctgroup:
        for ct in ctlist:
            # Cell-count-weighted running sums of E and E^2 over cell types.
            cool_e = cooler.Cooler(f'{indir}/{ct}/{ct}/{ct}.{matrix}.cool')
            E = triu(cool_e.matrix(balance=False, sparse=True).fetch(c))
            cool_e2 = cooler.Cooler(f'{indir}/{ct}/{ct}/{ct}.{matrix}2.cool')
            E2 = triu(cool_e2.matrix(balance=False, sparse=True).fetch(c))
            n = cool_e.info['group_n_cells']
            Esum += E * n
            E2sum += E2 * n
            tot += n
        # Within-group sum of squares: sum(x^2) - (sum x)^2 / n_group,
        # accumulated over groups using the deltas since the previous group.
        Egroup = Esum - Elast
        E2group = E2sum - E2last
        Egroup.data = Egroup.data ** 2 / (tot - last)
        ss_intra += (E2group - Egroup)
        Elast = Esum.copy()
        E2last = E2sum.copy()
        last = tot
    # Total sum of squares: sum(x^2) - (sum x)^2 / n_total.
    Esum.data = Esum.data ** 2 / tot
    ss_total = E2sum - Esum
    # F = (SS_between/(k-1)) / (SS_within/(N-k))
    #   = (SS_total/SS_within - 1) * (N-k)/(k-1), evaluated on sparse data.
    ss_intra.data = 1 / ss_intra.data
    ss_total = ss_total.multiply(ss_intra)

    ss_total.data = (ss_total.data - 1) * (tot - len(ctgroup)) / (len(ctgroup) - 1)
    ss_total = ss_total.tocoo()
    # Keep pixels whose anchors both avoid the blacklist mask and whose
    # separation is 6-499 bins (60 kb - 5 Mb at 10 kb resolution).
    bklfilter = np.logical_and(cov[ss_total.row]==0, cov[ss_total.col]==0)
    distfilter = np.logical_and((ss_total.col-ss_total.row)>5, (ss_total.col-ss_total.row)<500)
    idxfilter = np.logical_and(bklfilter, distfilter)
    ss_total = csr_matrix((ss_total.data[idxfilter], (ss_total.row[idxfilter], ss_total.col[idxfilter])), (ngene, ngene))
    save_npz(f'{outdir}diff/{group}/majortype_{matrix}pv_{c}.npz', ss_total)

    return [c, matrix, tot]
# Fan out one ANOVA job per (chromosome, matrix type) pair over a pool of
# worker processes, reporting each job as it completes.
cpu = 10
with ProcessPoolExecutor(cpu) as executor:
    futures = [
        executor.submit(compute_anova, c=chrom, matrix=mat)
        for chrom in chrom_sizes.index
        for mat in ['Q', 'E', 'T']
    ]
    for future in as_completed(futures):
        chrom, mat, _ = future.result()
        print(f'{chrom} {mat} finished')
chr3 E finished
chr4 Q finished
chr3 Q finished
chr3 T finished
chr1 E finished
chr1 Q finished
chr2 E finished
chr2 Q finished
chr1 T finished
chr2 T finished
chr7 Q finished
chr6 Q finished
chr7 E finished
chr6 E finished
chr5 E finished
chr5 Q finished
chr4 E finished
chr4 T finished
chr5 T finished
chr6 T finished
chr9 Q finished
chr8 Q finished
chr9 E finished
chr8 E finished
chr10 Q finished
chr9 T finished
chr10 T finished
chr8 T finished
chr10 E finished
chr7 T finished
chr14 Q finished
chr13 Q finished
chr13 E finished
chr11 Q finished
chr11 E finished
chr12 Q finished
chr13 T finished
chr12 E finished
chr11 T finished
chr12 T finished
chr16 Q finished
chr15 Q finished
chr14 T finished
chr14 E finished
chr16 E finished
chr15 E finished
chr17 E finished
chr15 T finished
chr16 T finished
chr17 Q finished
chr19 Q finished
chr18 Q finished
chr19 E finished
chr20 Q finished
chr19 T finished
chr20 T finished
chr18 E finished
chr17 T finished
chr18 T finished
chr20 E finished
chr21 Q finished
chr22 Q finished
chr21 E finished
chr21 T finished
chr22 E finished
chr22 T finished
chrX E finished
chrX Q finished
chrX T finished
def chrom_iterator(input_dir, chrom_order, chrom_offset, chunk_size=5000000):
    """Yield genome-wide pixel chunks from per-chromosome sparse matrices.

    Loads ``{input_dir}_{chrom}.npz`` for each chromosome in ``chrom_order``
    (silently skipping chromosomes with no file), keeps upper-triangular
    pixels (bin1_id <= bin2_id), shifts bin ids by ``chrom_offset[chrom]``
    into genome-wide coordinates, and yields DataFrames of at most
    ``chunk_size`` rows — the pixel-iterator format consumed by
    ``cooler.create_cooler``.
    """
    for chrom in chrom_order:
        output_path = f'{input_dir}_{chrom}.npz'
        if not pathlib.Path(output_path).exists():
            continue
        data = load_npz(output_path).tocoo()
        df = pd.DataFrame({'bin1_id': data.row, 'bin2_id': data.col, 'count': data.data})
        df = df[df['bin1_id'] <= df['bin2_id']]
        for chunk_start in range(0, df.shape[0], chunk_size):
            # .copy() so the in-place offset never writes through to `df`
            # (avoids SettingWithCopyWarning / ambiguous view semantics).
            chunk = df.iloc[chunk_start:chunk_start + chunk_size].copy()
            chunk.iloc[:, :2] += chrom_offset[chrom]
            yield chunk
# Assemble the per-chromosome ANOVA matrices into one genome-wide .cool file
# per matrix type, then delete the intermediate per-chromosome .npz files.
for matrix in ['Q', 'E', 'T']:
    output_path = f'{outdir}diff/{group}/majortype_{matrix}pv'
    cooler.create_cooler(cool_uri=f'{output_path}.cool',
                         bins=bins_df,
                         pixels=chrom_iterator(input_dir=output_path,
                                               chrom_order=chrom_sizes.index,
                                               chrom_offset=chrom_offset
                                              ),
                         ordered=True,
                         dtypes={'count': np.float32})
# Remove intermediates via pathlib instead of `os.system('rm ...')`:
# no shell word-splitting/quoting hazards on the interpolated path.
for npz_path in pathlib.Path(f'{outdir}diff/{group}').glob('majortype_*pv_c*.npz'):
    npz_path.unlink()
0