Source code for cascade_at.executor.mulcov_statistics

#!/usr/bin/env python
import logging
import sys
from typing import List, Optional

import numpy as np
import pandas as pd

from cascade_at.executor.args.arg_utils import ArgumentList
from cascade_at.executor.args.args import ModelVersionID, BoolArg, ListArg, StrArg, LogLevel
from cascade_at.context.model_context import Context
from cascade_at.core.log import get_loggers, LEVELS
from cascade_at.dismod.api.dismod_io import DismodIO

LOG = get_loggers(__name__)


# Command-line arguments accepted by this script; main() parses them and
# forwards the values to mulcov_statistics().
ARG_LIST = ArgumentList([
    ModelVersionID(),
    ListArg('--locations', help='The locations to pull mulcov statistics from', type=int, required=True),
    ListArg('--sexes', help='The sexes to pull mulcov statistics from', type=int, required=True),
    StrArg('--outfile-name', help='Filepath where mulcov statistics will be saved', required=False, default='mulcov_stats'),
    # Adjacent string literals are concatenated verbatim, so the first piece
    # must end with a space to keep "rather than" as two words.
    BoolArg('--sample', help='If true, the results will be pulled from the sample table rather '
                             'than the fit_var table'),
    BoolArg('--mean', help='Whether or not to compute the mean'),
    BoolArg('--std', help='Whether or not to compute the standard deviation'),
    ListArg('--quantile', help='Quantiles to compute', type=float),
    LogLevel()
])


def common_covariate_names(dbs: List['DismodIO']) -> set:
    """
    Get the covariate names that appear in every one of the databases.

    Parameters
    ----------
    dbs
        Objects whose ``covariate`` table has a ``c_covariate_name`` column

    Returns
    -------
    The set of names present in the covariate table of all ``dbs``;
    an empty set when ``dbs`` is empty.
    """
    name_sets = [set(db.covariate.c_covariate_name.tolist()) for db in dbs]
    # set.intersection() called with no arguments raises a TypeError, so the
    # "no databases" case has to be answered explicitly.
    if not name_sets:
        return set()
    return set.intersection(*name_sets)


def get_mulcovs(dbs: List['DismodIO'], covs: List[str],
                table: str = 'fit_var') -> pd.DataFrame:
    """
    Get mulcov values from all of the dbs, with all of the common covariates.

    Parameters
    ----------
    dbs
        A list of dismod i/o objects
    covs
        A list of covariate names
    table
        Name of the table to pull from (can be fit_var or sample)

    Returns
    -------
    A data frame with the columns ``c_covariate_name``, ``mulcov_type``,
    ``rate_name``, ``integrand_name`` and ``mulcov_value``, holding the rows
    of every database stacked on top of each other. An empty data frame is
    returned when no database contributed any rows.

    Raises
    ------
    ValueError
        If ``table`` is neither ``'fit_var'`` nor ``'sample'``.
    """
    if table == 'fit_var':
        id_col = 'fit_var_id'
        val_col = 'fit_var_value'
    elif table == 'sample':
        id_col = 'var_id'
        val_col = 'var_value'
    else:
        raise ValueError("Must pass tables fit_var or sample.")

    keep_cols = [
        'c_covariate_name', 'mulcov_type', 'rate_name',
        'integrand_name', 'mulcov_value'
    ]

    # Collect the per-database frames and concatenate once at the end:
    # DataFrame.append() no longer exists in pandas >= 2.0 and re-copied the
    # accumulated frame on every iteration.
    frames = []
    for db in dbs:
        mulcov = db.covariate[db.covariate.c_covariate_name.isin(covs)].merge(db.mulcov)
        try:
            df = db.var.merge(getattr(db, table), left_on='var_id', right_on=id_col)
            df = df.fillna(np.nan)
            df = df.merge(db.integrand, on='integrand_id', how='left')
            df = df.merge(db.rate, on='rate_id', how='left')
            # Cast the id columns to float before merging mulcov with the
            # var-based frame on their shared columns.
            mulcov = mulcov.astype({'integrand_id': 'float64', 'rate_id': 'float64'})
            df = mulcov.merge(df)
            df = df.rename(columns={val_col: 'mulcov_value'})
            df = df[keep_cols]
        except AttributeError:
            # Best effort: a database for which this lookup raises
            # AttributeError contributes no rows instead of aborting the run.
            continue
        frames.append(df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


def compute_statistics(df, mean=True, std=True, quantile=None):
    """
    Compute statistics on a data frame with covariate multipliers.

    The rows of ``df`` are grouped by ``c_covariate_name``, ``mulcov_type``,
    ``rate_name`` and ``integrand_name`` (missing values in these columns are
    replaced by the string ``'none'``), and the requested statistics of
    ``mulcov_value`` are computed for each group.

    Parameters
    ----------
    df
        Data frame with the four grouping columns and a ``mulcov_value`` column
    mean
        Whether or not to add a ``mean`` column
    std
        Whether or not to add a ``std`` column
    quantile
        An optional list of quantiles; each one adds a ``quantile_<q>`` column

    Returns
    -------
    A data frame with one row per group, in order of first appearance in
    ``df``, holding the grouping columns followed by the requested statistics.
    """
    group_cols = ['c_covariate_name', 'mulcov_type', 'rate_name', 'integrand_name']

    # Only the grouping keys get the 'none' placeholder. Filling the whole
    # frame would also turn missing mulcov values into strings and break the
    # numeric aggregations below.
    keyed = df.assign(**{col: df[col].fillna('none') for col in group_cols})
    values = keyed.groupby(group_cols, sort=False)['mulcov_value']

    # Start from a frame indexed by the group keys, so every statistic is
    # aligned by key rather than by row position.
    stats_df = pd.DataFrame(index=values.count().index)
    if mean:
        stats_df['mean'] = values.mean()
    if std:
        # NOTE(review): this was ``ddof=int(ngroups > len(df))``. A groupby
        # can never yield more groups than rows, so that was always 0. The
        # population standard deviation is kept (single-row groups give 0
        # rather than NaN) -- confirm whether a sample std was intended.
        stats_df['std'] = values.std(ddof=0)
    if quantile is not None:
        for q in quantile:
            stats_df[f'quantile_{q}'] = values.quantile(q=q)
    return stats_df.reset_index()


def mulcov_statistics(model_version_id: int, locations: List[int], sexes: List[int],
                      outfile_name: str, sample: bool = True,
                      mean: bool = True, std: bool = True,
                      quantile: Optional[List[float]] = None) -> None:
    """
    Compute statistics for the covariate multipliers on a dismod database,
    and save them to a file.

    Parameters
    ----------
    model_version_id
        The model version ID
    locations
        A list of locations that, when used in combination with sexes, point to the databases
        to pull covariate multiplier estimates from
    sexes
        A list of sexes that, when used in combination with locations, point to the databases
        to pull covariate multiplier estimates from
    outfile_name
        Name of the output file, without extension; the statistics are written to
        ``<outfile_name>.csv`` inside the context's outputs directory.
    sample
        Whether or not the results are stored in the sample table or the fit_var table.
    mean
        Whether or not to compute the mean
    std
        Whether or not to compute the standard deviation
    quantile
        An optional list of quantiles to compute
    """
    context = Context(model_version_id=model_version_id)

    # One database per (location, sex) combination.
    db_files = [DismodIO(context.db_file(location_id=loc, sex_id=sex))
                for loc in locations for sex in sexes]
    LOG.info(f"There are {len(db_files)} databases that will be aggregated.")

    common_covariates = common_covariate_names(db_files)
    LOG.info(f"The common covariates in the passed databases are {common_covariates}.")

    table_name = 'sample' if sample else 'fit_var'
    LOG.info(f"Will pull from the {table_name} table from each database.")

    mulcov_estimates = get_mulcovs(
        dbs=db_files, covs=common_covariates, table=table_name
    )
    if mulcov_estimates.empty:
        # Nothing to summarise: the empty frame itself is written out.
        stats = mulcov_estimates
    else:
        stats = compute_statistics(
            df=mulcov_estimates, mean=mean, std=std, quantile=quantile
        )
    LOG.info('Write to output file.')
    stats.to_csv(context.outputs_dir / f'{outfile_name}.csv', index=False)


def main():
    """
    Command-line entry point: parse the arguments in ``ARG_LIST``, configure
    logging at the requested level and run ``mulcov_statistics``.
    """
    parsed = ARG_LIST.parse_args(sys.argv[1:])
    logging.basicConfig(level=LEVELS[parsed.log_level])

    # Each of these parsed attributes is forwarded under the same keyword name.
    forwarded = (
        'model_version_id', 'locations', 'sexes', 'outfile_name',
        'sample', 'mean', 'std', 'quantile',
    )
    mulcov_statistics(**{name: getattr(parsed, name) for name in forwarded})


if __name__ == '__main__':
    main()