Source code for schicluster.diff.loop

import numpy as np
import pandas as pd
from scipy.stats import f as f_dist


[docs] def one_way_anova(chrom_loop_ds, da_name, value_type, group_n_dim='group_n', group_dim='sample_id'): """ Perform one-way ANOVA on a single-chrom loop dataset. Parameters ---------- chrom_loop_ds A single-chrom loop dataset. da_name The name of the data array to perform ANOVA on. value_type The value type of the data array to perform ANOVA on. both "{value_type}" and "{value_type}2" should be present in the "{da_name}_value_type" dimension. group_n_dim The name of the group number variable. group_dim The name of the group dimension. Returns ------- F statistics and P-values of the one-way ANOVA. """ loop_ds = chrom_loop_ds # number of cells per group n = loop_ds[group_n_dim] n_group = n.size total_n = int(n.sum()) # sum(x) / ni, ni is number of cells in i group x = loop_ds[da_name].sel({f'{da_name}_value_type': value_type}) # sum(x^2) / ni x2 = loop_ds[da_name].sel({f'{da_name}_value_type': f'{value_type}2'}) # total variance x_sum = (x * n).sum(dim=group_dim) x2_sum = (x2 * n).sum(dim=group_dim) sst = x2_sum - np.power(x_sum, 2) / total_n # within group variance ssw = ((x2 - np.power(x, 2)) * n).sum(dim=group_dim) # ssb between group variance; ssb = sst - ssw # ssb / ssw = sst / ssw - 1 # f = (sst / ssw - 1) * (N - k) / (k - 1) f = (sst / ssw - 1) * (total_n - n_group) / (n_group - 1) f = f.to_pandas() # p value from F distribution p = f_dist(n_group - 1, total_n - n_group).sf(f) p = pd.Series(p, index=f.index) return f, p
[docs] def merge_groups(loop_ds, group_map, da_name, group_dim='sample_id', group_n_dim='group_n'): """ Merge groups into larger groups in a loop dataset. Parameters ---------- loop_ds A loop dataset. group_map A pd.Series mapping from old group names to new group names. da_name The name of the data array to merge groups for. group_dim The name of the group dimension. group_n_dim The name of the group number variable. Returns ------- A loop dataset with merged groups. """ cell_count = loop_ds.coords[group_n_dim].to_pandas() loop_ds[da_name] = loop_ds[da_name] * loop_ds.coords[group_n_dim] loop_ds['_sample_group'] = group_map loop_ds = loop_ds.groupby('_sample_group').sum(dim=group_dim) sample_group_count = cell_count.groupby(group_map).sum() sample_group_count.index.name = '_sample_group' loop_ds.coords[group_n_dim] = sample_group_count loop_ds[da_name] = loop_ds[da_name] / loop_ds[group_n_dim] loop_ds = loop_ds.rename({ '_sample_group': group_dim }) return loop_ds