Source code for embed_mergechr

# Example command: time python /gale/ddn/snm3C/humanPFC/code/embed_mergechr.py --embed_list /gale/ddn/snm3C/humanPFC/smoothed_matrix/100kb_resolution/filelist/embedlist_pad1_std1_rp0.5_sqrtvc.txt --outprefix /gale/ddn/snm3C/humanPFC/smoothed_matrix/100kb_resolution/merged/pad1_std1_rp0.5_sqrtvc

import h5py
import argparse
import numpy as np
from sklearn.decomposition import TruncatedSVD


[docs]
def embed_mergechr(embed_list, outprefix, dim=20, norm_sig=True):
	"""Merge several embedding files column-wise, reduce with truncated SVD, save as HDF5.

	Parameters
	----------
	embed_list : str
		Path to a text file listing, one per line, the files to merge. Each
		listed file is read with ``np.load``. The arrays are concatenated
		along axis 1, so they must all have the same number of rows.
	outprefix : str
		Output prefix (including directory). The result is written to
		``{outprefix}.svd{dim}.hdf5``.
	dim : int, default 20
		Number of SVD components to keep.
	norm_sig : bool, default True
		If True, rows 1.. of the merged matrix are divided element-wise by
		row 0 before the SVD, and the SVD output is divided by the fitted
		singular values afterwards. If False, row 0 is simply dropped and
		the raw SVD projection is stored.

	Returns
	-------
	None
		The reduced matrix is only written to disk, as a gzip-compressed
		float32 dataset named ``data``.
	"""
	# Builtin `str` replaces the `np.str` alias, which was removed from NumPy.
	# `ndmin=1` keeps a list file with a single path iterable (otherwise
	# loadtxt returns a 0-d array).
	embed_paths = np.loadtxt(embed_list, dtype=str, ndmin=1)
	merged = np.concatenate([np.load(path) for path in embed_paths], axis=1)

	# Row 0 is a per-column scaling row, the remaining rows are the data.
	# NOTE(review): row 0 presumably holds the per-file singular values written
	# by the upstream per-chromosome step -- confirm against that producer.
	scale_row, data_rows = merged[0], merged[1:]
	if norm_sig:
		data_rows = data_rows / scale_row

	svd = TruncatedSVD(n_components=dim, algorithm='arpack')
	matrix_reduce = svd.fit_transform(data_rows)
	if norm_sig:
		# Rescale every component by its singular value.
		matrix_reduce = matrix_reduce / svd.singular_values_

	with h5py.File(f'{outprefix}.svd{dim}.hdf5', 'w') as f:
		dataset = f.create_dataset('data', matrix_reduce.shape, dtype='float32', compression='gzip')
		dataset[()] = matrix_reduce


# Disabled command-line entry point: the argparse code below is wrapped in a
# bare string literal, so it is never executed when this module is imported
# or run. It maps --embed_list, --outprefix, --dim and --use_pc (which sets
# norm_sig=False) onto the arguments of embed_mergechr().
'''
parser = argparse.ArgumentParser()
parser.add_argument('--embed_list', type=str, default=None, help='Full path of a file containing the full path to dimension reduction files of all chromosomes')
parser.add_argument('--outprefix', type=str, default=None, help='Prefix of final dimension reduction file including directory')
parser.add_argument('--dim', type=int, default=20, help='Number of dimensions to return from SVD')
parser.add_argument('--use_pc', dest='norm_sig', action='store_false', help='Not to normalize PCs by singular values')
parser.set_defaults(norm_sig=True)
opt = parser.parse_args()

embed_mergechr(opt.embed_list, opt.outprefix, opt.dim, opt.norm_sig)
'''