Source code for cascade_at.executor.mulcov_statistics

#!/usr/bin/env python
import logging
import sys
from typing import List, Optional

import numpy as np
import pandas as pd

from cascade_at.executor.args.arg_utils import ArgumentList
from cascade_at.executor.args.args import ModelVersionID, BoolArg, ListArg, StrArg, LogLevel
from cascade_at.context.model_context import Context
from cascade_at.core.log import get_loggers, LEVELS
from cascade_at.dismod.api.dismod_io import DismodIO

# Module-level logger, obtained from the project's logging helper using this module's name.
LOG = get_loggers(__name__)


# Command-line arguments accepted by this script; parsed in main() and
# forwarded to mulcov_statistics().
ARG_LIST = ArgumentList([
    ModelVersionID(),
    ListArg('--locations', help='The locations to pull mulcov statistics from', type=int, required=True),
    ListArg('--sexes', help='The sexes to pull mulcov statistics from', type=int, required=True),
    StrArg('--outfile-name', help='Filepath where mulcov statistics will be saved', required=False, default='mulcov_stats'),
    # The two literals are concatenated by the parser, so the first one must
    # end with a space (otherwise the help text reads "ratherthan").
    BoolArg('--sample', help='If true, the results will be pulled from the sample table rather '
                             'than the fit_var table'),
    BoolArg('--mean', help='Whether or not to compute the mean'),
    BoolArg('--std', help='Whether or not to compute the standard deviation'),
    ListArg('--quantile', help='Quantiles to compute', type=float),
    LogLevel()
])


def common_covariate_names(dbs):
    """
    Return the covariate names that appear in every one of the databases.

    Parameters
    ----------
    dbs
        An iterable of objects exposing a ``covariate`` table with a
        ``c_covariate_name`` column.

    Returns
    -------
    The set of names shared by all databases; an empty set when ``dbs``
    is empty.
    """
    name_sets = [set(db.covariate.c_covariate_name.tolist()) for db in dbs]
    # set.intersection() called with no arguments raises TypeError, so the
    # "no databases" case has to be handled explicitly.
    if not name_sets:
        return set()
    return set.intersection(*name_sets)


def get_mulcovs(dbs: List['DismodIO'], covs: List[str],
                table: str = 'fit_var') -> pd.DataFrame:
    """
    Get mulcov values from all of the dbs, restricted to the given covariates.

    Parameters
    ----------
    dbs
        A list of dismod i/o objects
    covs
        A collection of covariate names to keep
    table
        Name of the table to pull from (can be fit_var or sample)

    Returns
    -------
    A data frame with one row per matched covariate multiplier value and the
    columns ``c_covariate_name``, ``mulcov_type``, ``rate_name``,
    ``integrand_name`` and ``mulcov_value``. An empty data frame is returned
    when no database contributes any rows.

    Raises
    ------
    ValueError
        If ``table`` is neither ``'fit_var'`` nor ``'sample'``.
    """
    if table == 'fit_var':
        id_col = 'fit_var_id'
        val_col = 'fit_var_value'
    elif table == 'sample':
        id_col = 'var_id'
        val_col = 'var_value'
    else:
        raise ValueError("Must pass tables fit_var or sample.")

    out_cols = [
        'c_covariate_name', 'mulcov_type', 'rate_name', 'integrand_name', 'mulcov_value'
    ]

    # Collect the per-database frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    frames = []
    for db in dbs:
        mulcov = db.covariate[db.covariate.c_covariate_name.isin(covs)].merge(db.mulcov)
        try:
            df = db.var.merge(getattr(db, table), left_on='var_id', right_on=id_col)
            # NOTE(review): presumably normalizes missing values (e.g. None)
            # to NaN before the merges below -- confirm.
            df = df.fillna(np.nan)
            df = df.merge(db.integrand, on='integrand_id', how='left')
            df = df.merge(db.rate, on='rate_id', how='left')
            # NOTE(review): the cast presumably makes the id columns'
            # dtype match the var table's (which can hold NaN) -- confirm.
            mulcov = mulcov.astype({'integrand_id': 'float64', 'rate_id': 'float64'})
            df = mulcov.merge(df)
            df = df.rename(columns={val_col: 'mulcov_value'})
            df = df[out_cols]
        except AttributeError:
            # Best effort, as in the original: a database that raises
            # AttributeError here contributes no rows instead of aborting.
            continue
        frames.append(df)

    # pd.concat raises on an empty list, so keep the "nothing found" result
    # an empty data frame.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)
def compute_statistics(df: pd.DataFrame, mean: bool = True, std: bool = True,
                       quantile: Optional[List[float]] = None) -> pd.DataFrame:
    """
    Compute statistics on a data frame with covariate multipliers.

    Rows are grouped by covariate name, multiplier type, rate name and
    integrand name; the requested statistics of ``mulcov_value`` are computed
    within each group.

    Args:
        df: data frame with the columns ``c_covariate_name``, ``mulcov_type``,
            ``rate_name``, ``integrand_name`` and ``mulcov_value``
        mean: whether to add a ``mean`` column
        std: whether to add a ``std`` column
        quantile: optional list of quantiles; each adds a ``quantile_<q>`` column

    Returns:
        A data frame with one row per group (in order of first appearance),
        holding the grouping columns followed by the requested statistics.
        Missing grouping values are reported as the string ``'none'``.
    """
    group_cols = ['c_covariate_name', 'mulcov_type', 'rate_name', 'integrand_name']

    # groupby drops rows whose key is NaN, so missing keys are replaced with
    # a placeholder. Only the key columns are filled: filling the whole
    # frame would turn NaN multiplier values into the string 'none' and
    # break the numeric aggregations below.
    keyed = df.assign(**{col: df[col].fillna('none') for col in group_cols})
    values = keyed.groupby(group_cols, sort=False)['mulcov_value']

    stats_df = values.count().reset_index()[group_cols]

    if mean:
        stats_df['mean'] = values.mean().reset_index()['mulcov_value']
    if std:
        # NOTE(review): the original computed ddof as
        # int(ngroups > len(df)), which can never be true (every group has
        # at least one row), so ddof was always 0. The population standard
        # deviation is kept here to preserve the numeric output; confirm
        # whether the sample standard deviation (ddof=1) was intended.
        stats_df['std'] = values.std(ddof=0).reset_index()['mulcov_value']
    if quantile is not None:
        for q in quantile:
            stats_df[f'quantile_{q}'] = values.quantile(q=q).reset_index()['mulcov_value']
    return stats_df
def mulcov_statistics(model_version_id: int, locations: List[int], sexes: List[int],
                      outfile_name: str, sample: bool = True,
                      mean: bool = True, std: bool = True,
                      quantile: Optional[List[float]] = None) -> None:
    """
    Summarize the covariate multipliers across a set of dismod databases and
    write the summary to a CSV file in the model's outputs directory.

    Parameters
    ----------
    model_version_id
        The model version ID
    locations
        Locations which, crossed with ``sexes``, identify the databases to
        read covariate multiplier estimates from
    sexes
        Sexes which, crossed with ``locations``, identify the databases to
        read covariate multiplier estimates from
    outfile_name
        Name (without the ``.csv`` extension) of the file the statistics are
        saved to
    sample
        If true, read the estimates from the sample table; otherwise from
        the fit_var table
    mean
        Whether or not to compute the mean
    std
        Whether or not to compute the standard deviation
    quantile
        An optional list of quantiles to compute
    """
    context = Context(model_version_id=model_version_id)

    # One database per (location, sex) combination, locations varying slowest.
    databases = []
    for location_id in locations:
        for sex_id in sexes:
            db_path = context.db_file(location_id=location_id, sex_id=sex_id)
            databases.append(DismodIO(db_path))
    LOG.info(f"There are {len(databases)} databases that will be aggregated.")

    shared_covariates = common_covariate_names(databases)
    LOG.info(f"The common covariates in the passed databases are {shared_covariates}.")

    source_table = 'sample' if sample else 'fit_var'
    LOG.info(f"Will pull from the {source_table} table from each database.")

    estimates = get_mulcovs(dbs=databases, covs=shared_covariates, table=source_table)

    # With nothing to summarize, the empty frame itself is written out.
    if estimates.empty:
        stats = estimates
    else:
        stats = compute_statistics(df=estimates, mean=mean, std=std, quantile=quantile)

    LOG.info('Write to output file.')
    output_path = context.outputs_dir / f'{outfile_name}.csv'
    stats.to_csv(output_path, index=False)
def main():
    """
    Command-line entry point: parse the arguments, configure logging at the
    requested level and run mulcov_statistics with the parsed values.
    """
    args = ARG_LIST.parse_args(sys.argv[1:])
    logging.basicConfig(level=LEVELS[args.log_level])

    # Names of the parsed arguments that map one-to-one onto the keyword
    # parameters of mulcov_statistics.
    forwarded = (
        'model_version_id', 'locations', 'sexes', 'outfile_name',
        'sample', 'mean', 'std', 'quantile',
    )
    mulcov_statistics(**{name: getattr(args, name) for name in forwarded})


if __name__ == '__main__':
    main()