# Source code for cascade_at.dismod.api.dismod_extractor

import os
from typing import List, Optional, Dict
from copy import copy

import numpy as np
import pandas as pd

from cascade_at.core.log import get_loggers
from cascade_at.dismod.api import DismodAPIError
from cascade_at.dismod.api.dismod_io import DismodIO
from cascade_at.dismod.integrand_mappings import PRIMARY_INTEGRANDS_TO_RATES, integrand_to_gbd_measures
from cascade_at.inputs.utilities.gbd_ids import DEMOGRAPHIC_ID_COLS, format_age_time
from cascade_at.inputs.utilities.gbd_ids import SEX_NAME_TO_ID, StudyCovConstants

# Module-level logger, obtained from the project's ``get_loggers`` helper
# using this module's dotted name.
LOG = get_loggers(__name__)


class ExtractorCols:
    """Column-name constants used when extracting predictions."""

    # Names of the value columns in the returned frame: a single "mean"
    # column for a fit, or the prefix of the per-sample "draw_<i>" columns.
    VALUE_COL_FIT = "mean"
    VALUE_COL_SAMPLES = "draw"

    # Columns read from the raw prediction frame: the predicted value and
    # the index of the sample it belongs to.
    RESULT_COL = "avg_integrand"
    SAMPLE_COL = "sample_index"

    # Demographic identifiers. These are kept as lists (not tuples) because
    # callers copy them and append to the copy.
    REQUIRED_DEMOGRAPHIC_COLS = ["location_id", "sex_id"]
    OPTIONAL_DEMOGRAPHIC_COLS = ["year_id", "age_group_id"]


# Non-demographic columns that identify one prediction row: which integrand
# (and the rate mapped from it) plus the age/time interval it covers.
INDEX_COLS = [
    "integrand_id",
    "integrand_name",
    "rate",
    "time_lower",
    "time_upper",
    "age_lower",
    "age_upper",
]


class DismodExtractorError(DismodAPIError):
    """Raised when DismodExtractor cannot read or reshape the requested data."""


class DismodExtractor(DismodIO):
    """
    Sits on top of the DismodIO class, and extracts helpful
    data frames from the dismod database tables.
    """

    def __init__(self, path: str):
        """
        Parameters
        ----------
        path
            The database filepath

        Raises
        ------
        DismodExtractorError
            If there is no file at ``path``.
        """
        super().__init__(path=path)
        if not os.path.isfile(path):
            raise DismodExtractorError(f"SQLite file {path} has not been created or filled yet.")

    def _extract_raw_predictions(self, predictions: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Grab raw predictions from the predict table.
        Or, optionally merge some predictions on the avgint table and integrand table. This
        is a work-around when we've wanted to use a different prediction data frame (from using
        multithreading) because dismod_at does not allow you to set the predict table.

        Parameters
        ----------
        predictions
            An optional data frame to use instead of ``self.predict``. It is merged
            on ``avgint_id`` and is not modified.

        Returns
        -------
        The predictions merged with the avgint and integrand tables, with an added
        ``rate`` column (mapped from ``integrand_name``) and, when they were not
        already present, location columns (from the node table) and ``sex_id``.
        """
        if predictions is None:
            predictions = self.predict
        df = predictions.merge(self.avgint, on=['avgint_id'])
        df = df.merge(self.integrand, on=['integrand_id'])
        df['rate'] = df['integrand_name'].map(PRIMARY_INTEGRANDS_TO_RATES)
        # FIXME When running the pytests, the avgint table has node and covariate information included,
        # but when running the regular code, it does not.
        if not any('location_id' in c for c in df.columns):
            df = df.merge(self.node, on=['node_id'])
        if not any('sex_id' in c for c in df.columns):
            # Find the dismod covariate column that carries sex, then translate
            # its covariate values into sex IDs.
            sex_cov = self.covariate.loc[
                self.covariate.c_covariate_name.isin(['sex', 's_sex']), 'covariate_name'
            ].squeeze()
            sex_id_map = {
                v: SEX_NAME_TO_ID[k]
                for k, v in StudyCovConstants.SEX_COV_VALUE_MAP.items()
            }
            df['sex_id'] = df[sex_cov].replace(sex_id_map)
        return df

    def get_predictions(self, locations: Optional[List[int]] = None,
                        sexes: Optional[List[int]] = None,
                        samples: bool = False,
                        predictions: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Get the predictions from the predict table for locations and sexes.
        Will either return a column of 'mean' if not samples, otherwise one
        'draw_<sample_index>' column per sample (wide format).

        Parameters
        ----------
        locations
            Location IDs to keep; ``None`` keeps every location.
        sexes
            Sex IDs to keep; ``None`` keeps every sex.
        samples
            Whether the predictions contain samples (draws) rather than one fit.
        predictions
            An optional data frame with the predictions to use rather than
            reading them from the predict table.

        Returns
        -------
        Data frame with the demographic columns, ``INDEX_COLS`` and the value
        column(s), in that order.

        Raises
        ------
        DismodExtractorError
            If a requested location or sex is absent from the predictions, if a
            required demographic column is missing, if ``samples`` is requested
            but there is no usable sample index, or if the sample rows are not
            unique on the expected key columns.
        """
        df = self._extract_raw_predictions(predictions=predictions)
        if locations is not None:
            df = df.loc[df.c_location_id.isin(locations)].copy()
            # Requested minus found. The reverse difference is always empty
            # after the filter above and so could never report anything.
            missing_locations = set(locations) - set(df.c_location_id.values)
            if missing_locations:
                raise DismodExtractorError("The following locations you asked for were missing: "
                                           f"{missing_locations}.")
        df.rename(
            columns={'c_' + x: x for x in DEMOGRAPHIC_ID_COLS},
            inplace=True
        )
        if sexes is not None:
            df = df.loc[df.sex_id.isin(sexes)].copy()
            missing_sexes = set(sexes) - set(df.sex_id.values)
            if missing_sexes:
                raise DismodExtractorError(f"The following sexes you asked for were missing: {missing_sexes}.")

        demographic_cols = copy(ExtractorCols.REQUIRED_DEMOGRAPHIC_COLS)
        for col in ExtractorCols.REQUIRED_DEMOGRAPHIC_COLS:
            if col not in df.columns:
                raise DismodExtractorError(f"Cannot find required col {col} in the "
                                           f"predictions columns: {list(df.columns)}.")
        for col in ExtractorCols.OPTIONAL_DEMOGRAPHIC_COLS:
            if col in df.columns:
                demographic_cols.append(col)

        if samples:
            draw_col = ExtractorCols.VALUE_COL_SAMPLES
            if ExtractorCols.SAMPLE_COL not in df.columns:
                raise DismodExtractorError("Cannot find sample index column. Are you sure you created samples?")
            if df[ExtractorCols.SAMPLE_COL].isnull().all():
                raise DismodExtractorError("All sample index values are null. Are you sure you created samples?")
            # Label every row with the name of the wide column it will land in.
            df[draw_col] = df[ExtractorCols.SAMPLE_COL].apply(
                lambda x: f'{draw_col}_{x}'
            )
            # Order of first appearance; used only to order the output columns.
            value_cols = df[draw_col].unique().tolist()
            key_cols = INDEX_COLS + demographic_cols + [draw_col]
            df = df[key_cols + [ExtractorCols.RESULT_COL]]
            if df[key_cols].duplicated().any():
                raise DismodExtractorError("There are duplicate entries in the prediction data frame "
                                           "based on the expected columns. Please check the data.")
            wide = df.set_index(key_cols).unstack()
            # unstack() returns the draw labels sorted (draw_0, draw_1, draw_10, ...),
            # which need not match order of appearance. Take the labels from the
            # unstacked frame itself so each column keeps its own sample's values.
            wide.columns = wide.columns.get_level_values(-1).tolist()
            df = wide.reset_index()
        else:
            df.rename(columns={ExtractorCols.RESULT_COL: ExtractorCols.VALUE_COL_FIT}, inplace=True)
            value_cols = [ExtractorCols.VALUE_COL_FIT]
        return df[demographic_cols + INDEX_COLS + value_cols]

    def gather_draws_for_prior_grid(self,
                                    location_id: int,
                                    sex_id: int,
                                    rates: List[str],
                                    value: bool = True,
                                    dage: bool = False,
                                    dtime: bool = False,
                                    samples: bool = True) -> Dict[str, Dict[str, np.ndarray]]:
        """
        Takes draws and formats them for a prior grid for values, dage, and dtime.
        Assumes that age_lower == age_upper and time_lower == time_upper for all
        data rows. We might not want to do all value, dage, and dtime, so pass False
        if you want to skip those.

        Parameters
        ----------
        location_id
        sex_id
        rates
            list of rates to get the draws for
        value
            whether to calculate value priors
        dage
            whether to calculate dage priors
        dtime
            whether to calculate dtime priors
        samples
            whether the prior came from samples

        Returns
        -------
        Dictionary keyed by rate. Each entry holds 'ages', 'times' and 'n_draws',
        plus 3-d arrays (age x time x draw) under 'value', 'dage' and 'dtime'
        for whichever of those were requested.
        """
        rate_dict = {r: dict() for r in rates}

        df = self.get_predictions(locations=[location_id], sexes=[sex_id], samples=samples)
        if samples:
            draw_cols = [col for col in df if col.startswith(ExtractorCols.VALUE_COL_SAMPLES)]
        else:
            draw_cols = [ExtractorCols.VALUE_COL_FIT]

        assert (df.age_lower.values == df.age_upper.values).all(), \
            "Expected age_lower == age_upper for all rows."
        assert (df.time_lower.values == df.time_upper.values).all(), \
            "Expected time_lower == time_upper for all rows."

        n_draws = len(draw_cols)
        # Loop over rates, age, and time
        for r in rates:
            rate_df = df.loc[df.rate == r].copy()

            ages = np.asarray(sorted(rate_df.age_lower.unique().tolist()))
            times = np.asarray(sorted(rate_df.time_lower.unique().tolist()))

            # Save these for later for quality checks
            rate_dict[r]['ages'] = ages
            rate_dict[r]['times'] = times
            rate_dict[r]['n_draws'] = n_draws

            # Create template for filling in the draws
            draw_data = np.zeros((len(ages), len(times), n_draws))

            for age_idx, age in enumerate(ages):
                for time_idx, time in enumerate(times):
                    # Subset to the draws that we want from avg_integrand
                    # but only for this particular age and time
                    draws = rate_df.loc[
                        (rate_df.age_lower == age) & (rate_df.time_lower == time)
                    ][draw_cols].values.ravel()
                    # Check to make sure that the number of draws corresponds to
                    # the number of draws for the whole thing per age and time
                    # (i.e. exactly one row matched this age-time point).
                    assert len(draws) == n_draws, \
                        f"Expected {n_draws} draws at age {age}, time {time}; got {len(draws)}."
                    draw_data[age_idx, time_idx, :] = draws

            if value:
                rate_dict[r]['value'] = draw_data
            if dage:
                # First difference along the age axis.
                rate_dict[r]['dage'] = np.diff(draw_data, n=1, axis=0)
            if dtime:
                # First difference along the time axis.
                rate_dict[r]['dtime'] = np.diff(draw_data, n=1, axis=1)

        return rate_dict

    def format_predictions_for_ihme(self, gbd_round_id: int,
                                    locations: Optional[List[int]] = None,
                                    sexes: Optional[List[int]] = None,
                                    samples: bool = False,
                                    predictions: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Formats predictions from the prediction table and returns either the mean
        or draws, based on whether or not samples is False or True.

        Parameters
        ----------
        locations
            A list of locations to extract from the predictions
        sexes
            A list of sexes to extract from the predictions
        gbd_round_id
            The GBD round ID to format the predictions for
        samples
            Whether or not the predictions have draws (samples) or whether
            it is just one fit.
        predictions
            An optional data frame with the predictions to use rather than
            reading them directly from the database.

        Returns
        -------
        Data frame with predictions formatted for the IHME databases: the columns
        location_id, year_id, age_group_id, sex_id, measure_id and the value column(s).
        """
        pred = self.get_predictions(locations=locations, sexes=sexes, samples=samples, predictions=predictions)
        pred = format_age_time(df=pred, gbd_round_id=gbd_round_id)
        pred = integrand_to_gbd_measures(df=pred, integrand_col='integrand_name')
        if samples:
            value_cols = [col for col in pred.columns if col.startswith(ExtractorCols.VALUE_COL_SAMPLES)]
        else:
            value_cols = [ExtractorCols.VALUE_COL_FIT]
        return pred[[
            'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id'
        ] + value_cols]