Source code for schicluster.diff.loop

import numpy as np
import pandas as pd
from scipy.stats import f as f_dist



[docs]
def one_way_anova(chrom_loop_ds, da_name, value_type, group_n_dim='group_n', group_dim='sample_id'):
    """
    Perform one-way ANOVA on a single-chrom loop dataset.

    Parameters
    ----------
    chrom_loop_ds
        A single-chrom loop dataset.
    da_name
        The name of the data array to perform ANOVA on.
    value_type
        The value type of the data array to perform ANOVA on.
        both "{value_type}" and "{value_type}2" should be present in the "{da_name}_value_type" dimension.
    group_n_dim
        The name of the group number variable.
    group_dim
        The name of the group dimension.

    Returns
    -------
    F statistics and P-values of the one-way ANOVA.
    """
    loop_ds = chrom_loop_ds
    # number of cells per group
    n = loop_ds[group_n_dim]
    n_group = n.size
    total_n = int(n.sum())

    # sum(x) / ni, ni is number of cells in i group
    x = loop_ds[da_name].sel({f'{da_name}_value_type': value_type})
    # sum(x^2) / ni
    x2 = loop_ds[da_name].sel({f'{da_name}_value_type': f'{value_type}2'})

    # total variance
    x_sum = (x * n).sum(dim=group_dim)
    x2_sum = (x2 * n).sum(dim=group_dim)
    sst = x2_sum - np.power(x_sum, 2) / total_n

    # within group variance
    ssw = ((x2 - np.power(x, 2)) * n).sum(dim=group_dim)

    # ssb between group variance; ssb = sst - ssw
    # ssb / ssw = sst / ssw - 1
    # f = (sst / ssw - 1) * (N - k) / (k - 1)
    f = (sst / ssw - 1) * (total_n - n_group) / (n_group - 1)
    f = f.to_pandas()

    # p value from F distribution
    p = f_dist(n_group - 1, total_n - n_group).sf(f)
    p = pd.Series(p, index=f.index)
    return f, p




[docs]
def merge_groups(loop_ds, group_map, da_name, group_dim='sample_id', group_n_dim='group_n'):
    """
    Merge groups into larger groups in a loop dataset.

    Parameters
    ----------
    loop_ds
        A loop dataset.
    group_map
        A pd.Series mapping from old group names to new group names.
    da_name
        The name of the data array to merge groups for.
    group_dim
        The name of the group dimension.
    group_n_dim
        The name of the group number variable.

    Returns
    -------
    A loop dataset with merged groups.
    """
    cell_count = loop_ds.coords[group_n_dim].to_pandas()
    loop_ds[da_name] = loop_ds[da_name] * loop_ds.coords[group_n_dim]

    loop_ds['_sample_group'] = group_map
    loop_ds = loop_ds.groupby('_sample_group').sum(dim=group_dim)

    sample_group_count = cell_count.groupby(group_map).sum()
    sample_group_count.index.name = '_sample_group'
    loop_ds.coords[group_n_dim] = sample_group_count

    loop_ds[da_name] = loop_ds[da_name] / loop_ds[group_n_dim]

    loop_ds = loop_ds.rename({
        '_sample_group': group_dim
    })
    return loop_ds