ANOVA of loop matrices across cell types
import os
import cooler
import pathlib
import numpy as np
import pandas as pd
from scipy.sparse import load_npz, save_npz, vstack, csr_matrix, triu
from scipy.stats import f, zscore, ranksums
from schicluster.cool.utilities import get_chrom_offsets
from multiprocessing import Pool
from concurrent.futures import ProcessPoolExecutor, as_completed
leg = {'exc': ['L23_IT', 'L4_IT', 'L5_IT', 'L6_IT', 'L6_IT_Car3', 'L56_NP', 'L6_CT', 'L6b', 'Amy'],
'inh': ['Lamp5', 'Lamp5_LHX6', 'Sncg', 'Vip', 'Pvalb', 'Pvalb_ChC', 'Sst', 'CHD7'],
'msn': ['MSN_D1', 'MSN_D2', 'Foxp2'],
'sub': ['SubCtx'],
'glia': ['ASC', 'ODC', 'OPC'],
'mgc': ['MGC'],
'smc': ['PC'],
'endo': ['EC'],
'fibro': ['VLMC'],
}
leg['neu'] = leg['exc'] + leg['inh'] + leg['msn'] + leg['sub']
leg['all'] = leg['neu'] + leg['glia'] + leg['mgc'] + leg['smc'] + leg['endo'] + leg['fibro']
group_name = 'neu'
ctgroup = []
# A compound group name joining legend keys with '_' pools the cell types of each key into one
# ANOVA group; a plain key such as 'neu' makes every cell type its own group.
if '_' in group_name:
    for xx in group_name.split('_'):
        ctgroup.append(leg[xx])
else:
    for xx in leg[group_name]:
        ctgroup.append([xx])
ctgroup
[['L23_IT'],
['L4_IT'],
['L5_IT'],
['L6_IT'],
['L6_IT_Car3'],
['L56_NP'],
['L6_CT'],
['L6b'],
['Amy'],
['Lamp5'],
['Lamp5_LHX6'],
['Sncg'],
['Vip'],
['Pvalb'],
['Pvalb_ChC'],
['Sst'],
['CHD7'],
['MSN_D1'],
['MSN_D2'],
['Foxp2'],
['SubCtx']]
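With group_name = 'neu' the second branch runs, so each of the 21 neuronal cell types becomes its own ANOVA group, as listed above. For comparison, a compound name joining legend keys with '_' would pool the cell types of each key into one group; a minimal sketch (the 'exc_inh' name is hypothetical and not used in this analysis):

# Hypothetical compound grouping: one ANOVA group per legend key
ctgroup_pooled = [leg[key] for key in 'exc_inh'.split('_')]
len(ctgroup_pooled), [len(g) for g in ctgroup_pooled]  # (2, [9, 8]): excitatory vs. inhibitory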
indir = '/home/jzhou_salk_edu/sky_workdir/hba/loop_majortype/'
outdir = '/home/jzhou_salk_edu/sky_workdir/hba/loop_majortype/'
res = 10000
group = group_name
chrom_size_path = f'{indir}hg38_with_chrl.main.chrom.sizes'
chrom_sizes = cooler.read_chromsizes(chrom_size_path, all_names=True)
bins_df = cooler.binnify(chrom_sizes, res)
chrom_offset = get_chrom_offsets(bins_df)
# merged regions to exclude (BED: chrom, start, end); bins overlapping them are masked in compute_anova below
bkl = pd.read_csv(f'{indir}M1C.rowsumpb1000.blf50.merged.bed', sep='\t', header=None, index_col=None)
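chrom_offset is used further down to shift per-chromosome bin ids into genome-wide bin ids when the merged cooler is written. A minimal equivalent built directly from bins_df, to make the mapping explicit (assuming get_chrom_offsets returns the index of each chromosome's first bin, which is how it is used in chrom_iterator below):

# Sketch: chromosome -> genome-wide index of its first 10 kb bin
manual_offset = {chrom: int((bins_df['chrom'] == chrom).idxmax()) for chrom in chrom_sizes.index}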
def compute_anova(c, matrix):
    ngene = int(chrom_sizes.loc[c] // res) + 1
    # mark bins overlapping excluded regions, padded by 7 bins (70 kb) on each side
    bkl_tmp = bkl.loc[(bkl[0]==c), [1,2]].values // res
    cov = np.zeros(ngene)
    for xx, yy in bkl_tmp:
        cov[max(xx - 7, 0):yy + 7] = 1
    tot, last = 0, 0
    Esum, E2sum, Elast, E2last, ss_intra = [csr_matrix((ngene, ngene)) for i in range(5)]
    for ctlist in ctgroup:
        for ct in ctlist:
            # per-cell-type averages of the pixel values (E) and of their squares (E2)
            cool_e = cooler.Cooler(f'{indir}/{ct}/{ct}/{ct}.{matrix}.cool')
            E = triu(cool_e.matrix(balance=False, sparse=True).fetch(c))
            cool_e2 = cooler.Cooler(f'{indir}/{ct}/{ct}/{ct}.{matrix}2.cool')
            E2 = triu(cool_e2.matrix(balance=False, sparse=True).fetch(c))
            n = cool_e.info['group_n_cells']
            # weight by cell count to recover per-group sums
            Esum += E * n
            E2sum += E2 * n
            tot += n
        # accumulate the within-group sum of squares for this group
        Egroup = Esum - Elast
        E2group = E2sum - E2last
        Egroup.data = Egroup.data ** 2 / (tot - last)
        ss_intra += (E2group - Egroup)
        Elast = Esum.copy()
        E2last = E2sum.copy()
        last = tot
    # total sum of squares, then the one-way ANOVA F statistic per pixel
    Esum.data = Esum.data ** 2 / tot
    ss_total = E2sum - Esum
    ss_intra.data = 1 / ss_intra.data
    ss_total = ss_total.multiply(ss_intra)
    ss_total.data = (ss_total.data - 1) * (tot - len(ctgroup)) / (len(ctgroup) - 1)
    # keep pixels outside masked bins and between 5 and 500 bins (10 kb each) off the diagonal
    ss_total = ss_total.tocoo()
    bklfilter = np.logical_and(cov[ss_total.row]==0, cov[ss_total.col]==0)
    distfilter = np.logical_and((ss_total.col-ss_total.row)>5, (ss_total.col-ss_total.row)<500)
    idxfilter = np.logical_and(bklfilter, distfilter)
    ss_total = csr_matrix((ss_total.data[idxfilter], (ss_total.row[idxfilter], ss_total.col[idxfilter])), (ngene, ngene))
    save_npz(f'{outdir}diff/{group}/majortype_{matrix}pv_{c}.npz', ss_total)
    return [c, matrix, tot]
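The sparse bookkeeping above is the standard one-way ANOVA written in terms of per-group sums. For each pixel, with $N$ cells (`tot`) split into $k$ groups (`len(ctgroup)`), the E and E2 coolers hold per-cell-type averages of the values and of their squares, so multiplying by `group_n_cells` recovers the sums, and

$$SS_{\mathrm{total}} = \sum_i x_i^2 - \frac{\left(\sum_i x_i\right)^2}{N}, \qquad SS_{\mathrm{within}} = \sum_{g=1}^{k}\left(\sum_{i \in g} x_i^2 - \frac{\left(\sum_{i \in g} x_i\right)^2}{n_g}\right),$$

$$F = \frac{(SS_{\mathrm{total}} - SS_{\mathrm{within}})/(k-1)}{SS_{\mathrm{within}}/(N-k)} = \left(\frac{SS_{\mathrm{total}}}{SS_{\mathrm{within}}} - 1\right)\frac{N-k}{k-1},$$

which is exactly what the last lines compute once `ss_intra.data` has been inverted in place. The saved matrices therefore contain F statistics rather than p-values, despite the `pv` suffix in the file names.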
cpu = 10
with ProcessPoolExecutor(cpu) as executor:
    # one job per (chromosome, matrix type) combination
    futures = []
    for x in chrom_sizes.index:
        for y in ['Q', 'E', 'T']:
            future = executor.submit(
                compute_anova,
                c=x,
                matrix=y,
            )
            futures.append(future)
    for future in as_completed(futures):
        tmp = future.result()
        print(f'{tmp[0]} {tmp[1]} finished')
chr3 E finished
chr4 Q finished
chr3 Q finished
chr3 T finished
chr1 E finished
chr1 Q finished
chr2 E finished
chr2 Q finished
chr1 T finished
chr2 T finished
chr7 Q finished
chr6 Q finished
chr7 E finished
chr6 E finished
chr5 E finished
chr5 Q finished
chr4 E finished
chr4 T finished
chr5 T finished
chr6 T finished
chr9 Q finished
chr8 Q finished
chr9 E finished
chr8 E finished
chr10 Q finished
chr9 T finished
chr10 T finished
chr8 T finished
chr10 E finished
chr7 T finished
chr14 Q finished
chr13 Q finished
chr13 E finished
chr11 Q finished
chr11 E finished
chr12 Q finished
chr13 T finished
chr12 E finished
chr11 T finished
chr12 T finished
chr16 Q finished
chr15 Q finished
chr14 T finished
chr14 E finished
chr16 E finished
chr15 E finished
chr17 E finished
chr15 T finished
chr16 T finished
chr17 Q finished
chr19 Q finished
chr18 Q finished
chr19 E finished
chr20 Q finished
chr19 T finished
chr20 T finished
chr18 E finished
chr17 T finished
chr18 T finished
chr20 E finished
chr21 Q finished
chr22 Q finished
chr21 E finished
chr21 T finished
chr22 E finished
chr22 T finished
chrX E finished
chrX Q finished
chrX T finished
def chrom_iterator(input_dir, chrom_order, chrom_offset):
    # stream the per-chromosome F-statistic matrices as pixel chunks,
    # shifting bin ids by the chromosome offset to genome-wide coordinates
    for chrom in chrom_order:
        output_path = f'{input_dir}_{chrom}.npz'
        if not pathlib.Path(output_path).exists():
            continue
        chunk_size = 5000000
        data = load_npz(output_path).tocoo()
        df = pd.DataFrame({'bin1_id': data.row, 'bin2_id': data.col, 'count': data.data})
        df = df[df['bin1_id'] <= df['bin2_id']]
        for i, chunk_start in enumerate(range(0, df.shape[0], chunk_size)):
            chunk = df.iloc[chunk_start:chunk_start + chunk_size]
            chunk.iloc[:, :2] += chrom_offset[chrom]
            yield chunk
for matrix in ['Q', 'E', 'T']:
    output_path = f'{outdir}diff/{group}/majortype_{matrix}pv'
    cooler.create_cooler(cool_uri=f'{output_path}.cool',
                         bins=bins_df,
                         pixels=chrom_iterator(input_dir=output_path,
                                               chrom_order=chrom_sizes.index,
                                               chrom_offset=chrom_offset
                                               ),
                         ordered=True,
                         dtypes={'count': np.float32})
os.system(f'rm {outdir}diff/{group}/majortype_*pv_c*.npz')
0
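The merged coolers store the per-pixel F statistics in the `count` column. A minimal sketch of how they might be converted to p-values with the `f` distribution already imported from `scipy.stats` at the top; the total cell count is rebuilt from the per-cell-type cooler metadata exactly as in compute_anova, and the downstream filtering is left open:

# Sketch: convert F statistics to upper-tail p-values (assumes the coolers written above exist)
k = len(ctgroup)
n_total = sum(cooler.Cooler(f'{indir}/{ct}/{ct}/{ct}.Q.cool').info['group_n_cells']
              for ctlist in ctgroup for ct in ctlist)
clr = cooler.Cooler(f'{outdir}diff/{group}/majortype_Qpv.cool')
pix = clr.pixels()[:]                                  # columns: bin1_id, bin2_id, count (= F)
pix['pval'] = f.sf(pix['count'], k - 1, n_total - k)   # survival function of the F distribution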