# Source code for cascade_at.dismod.api.dismod_extractor
import os
from typing import List, Optional, Dict
from copy import copy
import numpy as np
import pandas as pd
from cascade_at.core.log import get_loggers
from cascade_at.dismod.api import DismodAPIError
from cascade_at.dismod.api.dismod_io import DismodIO
from cascade_at.dismod.integrand_mappings import PRIMARY_INTEGRANDS_TO_RATES, integrand_to_gbd_measures
from cascade_at.inputs.utilities.gbd_ids import DEMOGRAPHIC_ID_COLS, format_age_time
from cascade_at.inputs.utilities.gbd_ids import SEX_NAME_TO_ID, StudyCovConstants
# Module-level logger obtained from the project's logging helper, keyed by this module's name.
LOG = get_loggers(__name__)
class ExtractorCols:
    """Column-name constants shared by the extractor methods in this module."""

    # Demographic identifiers that must be present in a predictions data frame.
    REQUIRED_DEMOGRAPHIC_COLS = [
        'location_id',
        'sex_id',
    ]
    # Demographic identifiers that are carried along only when they are present.
    OPTIONAL_DEMOGRAPHIC_COLS = [
        'year_id',
        'age_group_id',
    ]

    # Column holding the predicted value for each row.
    RESULT_COL = 'avg_integrand'
    # Column holding the sample (draw) number of each row.
    SAMPLE_COL = 'sample_index'

    # Prefix / column name used for the value when predictions are samples.
    VALUE_COL_SAMPLES = 'draw'
    # Column name used for the value when predictions come from a single fit.
    VALUE_COL_FIT = 'mean'
# Non-demographic columns that, together with the demographic columns,
# identify one prediction row (integrand, rate, and the age/time bounds).
INDEX_COLS = [
    'integrand_id',
    'integrand_name',
    'rate',
    'time_lower',
    'time_upper',
    'age_lower',
    'age_upper',
]
class DismodExtractorError(DismodAPIError):
    """Raised when DismodExtractor cannot read or reshape the requested data."""
class DismodExtractor(DismodIO):
    """
    Sits on top of the DismodIO class and extracts helpful data frames
    from the dismod database tables.
    """

    def __init__(self, path: str):
        """
        Open the database at ``path`` and verify that the file exists.

        Parameters
        ----------
        path
            The database filepath

        Raises
        ------
        DismodExtractorError
            If ``path`` does not point to an existing file.
        """
        super().__init__(path=path)
        if not os.path.isfile(path):
            raise DismodExtractorError(f"SQLite file {path} has not been created or filled yet.")

    def _extract_raw_predictions(self, predictions: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Grab raw predictions from the predict table.

        Or, optionally merge some predictions on the avgint table and integrand table. This
        is a work-around when we've wanted to use a different prediction data frame (from using
        multithreading) because dismod_at does not allow you to set the predict table.

        Parameters
        ----------
        predictions
            Optional data frame to use instead of ``self.predict``. It is merged
            on ``avgint_id`` so it must carry that column.

        Returns
        -------
        The predictions merged with the avgint and integrand tables, with a ``rate``
        column mapped from ``integrand_name`` and, where they were not already present,
        the node table columns and a ``sex_id`` column.

        Raises
        ------
        DismodExtractorError
            If ``sex_id`` has to be derived but the covariate table does not contain
            exactly one covariate named ``sex`` or ``s_sex``.
        """
        if predictions is None:
            predictions = self.predict

        df = predictions.merge(self.avgint, on=['avgint_id'])
        df = df.merge(self.integrand, on=['integrand_id'])
        df['rate'] = df['integrand_name'].map(PRIMARY_INTEGRANDS_TO_RATES)

        # FIXME When running the pytests, the avgint table has node and covariate information included,
        #  but when running the regular code, it does not.
        if not any('location_id' in c for c in df.columns):
            df = df.merge(self.node, on=['node_id'])
        if not any('sex_id' in c for c in df.columns):
            is_sex_cov = self.covariate.c_covariate_name.isin(['sex', 's_sex'])
            sex_cov_names = self.covariate.loc[is_sex_cov, 'covariate_name']
            # Exactly one covariate column must encode sex; anything else cannot be
            # turned into a single sex_id column.
            if len(sex_cov_names) != 1:
                raise DismodExtractorError(
                    "Expected exactly one sex covariate ('sex' or 's_sex') in the covariate table, "
                    f"found {len(sex_cov_names)}."
                )
            sex_cov = sex_cov_names.iloc[0]
            # Translate the covariate values into sex IDs via the sex names.
            sex_id_map = {
                cov_value: SEX_NAME_TO_ID[sex_name]
                for sex_name, cov_value in StudyCovConstants.SEX_COV_VALUE_MAP.items()
            }
            df['sex_id'] = df[sex_cov].replace(sex_id_map)
        return df

    def get_predictions(self, locations: Optional[List[int]] = None,
                        sexes: Optional[List[int]] = None,
                        samples: bool = False,
                        predictions: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Get the predictions from the predict table for locations and sexes.

        Will either return a column of 'mean' if not samples, otherwise one
        'draw_<sample_index>' column per sample (wide format).

        Parameters
        ----------
        locations
            Location IDs to keep. ``None`` keeps every location.
        sexes
            Sex IDs to keep. ``None`` keeps every sex.
        samples
            Whether the predictions are samples (draws) rather than a single fit.
        predictions
            Optional data frame to use instead of the database's predict table.

        Returns
        -------
        Data frame with the demographic columns, ``INDEX_COLS`` and the value column(s).

        Raises
        ------
        DismodExtractorError
            If a requested location or sex has no rows, a required demographic
            column is absent, the sample index is absent or entirely null when
            ``samples`` is True, or the rows are not unique on the key columns.
        """
        df = self._extract_raw_predictions(predictions=predictions)

        if locations is not None:
            df = df.loc[df.c_location_id.isin(locations)].copy()
            # Requested minus found: what the caller asked for but did not get.
            missing_locations = set(locations) - set(df.c_location_id.values)
            if missing_locations:
                raise DismodExtractorError("The following locations you asked for were missing: "
                                           f"{missing_locations}.")

        df = df.rename(columns={'c_' + x: x for x in DEMOGRAPHIC_ID_COLS})

        if sexes is not None:
            df = df.loc[df.sex_id.isin(sexes)].copy()
            missing_sexes = set(sexes) - set(df.sex_id.values)
            if missing_sexes:
                raise DismodExtractorError(f"The following sexes you asked for were missing: {missing_sexes}.")

        for col in ExtractorCols.REQUIRED_DEMOGRAPHIC_COLS:
            if col not in df.columns:
                raise DismodExtractorError(f"Cannot find required col {col} in the "
                                           f"predictions columns: {df.columns.tolist()}.")
        demographic_cols = list(ExtractorCols.REQUIRED_DEMOGRAPHIC_COLS)
        demographic_cols.extend(
            col for col in ExtractorCols.OPTIONAL_DEMOGRAPHIC_COLS if col in df.columns
        )

        if samples:
            if ExtractorCols.SAMPLE_COL not in df.columns:
                raise DismodExtractorError("Cannot find sample index column. Are you sure you created samples?")
            if df[ExtractorCols.SAMPLE_COL].isnull().all():
                raise DismodExtractorError("All sample index values are null. Are you sure you created samples?")

            draw_col = ExtractorCols.VALUE_COL_SAMPLES
            df[draw_col] = df[ExtractorCols.SAMPLE_COL].apply(lambda x: f'{draw_col}_{x}')
            # Draw labels in order of first appearance; used for the output column order.
            value_cols = df[draw_col].unique().tolist()

            key_cols = INDEX_COLS + demographic_cols + [draw_col]
            df = df[key_cols + [ExtractorCols.RESULT_COL]]
            if df[key_cols].duplicated().any():
                raise DismodExtractorError("There are duplicate entries in the prediction data frame "
                                           "based on the expected columns. Please check the data.")

            # Reshape long -> wide with one column per draw. unstack() returns the draw
            # columns in sorted label order (draw_0, draw_1, draw_10, draw_2, ...), so
            # select them BY LABEL instead of renaming positionally, which would attach
            # the wrong draw name to the values.
            wide = df.set_index(key_cols)[ExtractorCols.RESULT_COL].unstack()
            wide = wide[value_cols]
            wide.columns.name = None
            df = wide.reset_index()
        else:
            df = df.rename(columns={ExtractorCols.RESULT_COL: ExtractorCols.VALUE_COL_FIT})
            value_cols = [ExtractorCols.VALUE_COL_FIT]

        return df[demographic_cols + INDEX_COLS + value_cols]

    def gather_draws_for_prior_grid(self,
                                    location_id: int,
                                    sex_id: int,
                                    rates: List[str],
                                    value: bool = True,
                                    dage: bool = False,
                                    dtime: bool = False,
                                    samples: bool = True) -> Dict[str, Dict[str, np.ndarray]]:
        """
        Takes draws and formats them for a prior grid for values, dage, and dtime.

        Assumes that age_lower == age_upper and time_lower == time_upper for all
        data rows. We might not want to do all value, dage, and dtime, so pass False
        if you want to skip those.

        Parameters
        ----------
        location_id
            The location to gather draws for
        sex_id
            The sex to gather draws for
        rates
            list of rates to get the draws for
        value
            whether to calculate value priors
        dage
            whether to calculate dage priors
        dtime
            whether to calculate dtime priors
        samples
            whether the prior came from samples

        Returns
        -------
        Dictionary keyed by rate. Each entry holds 'ages', 'times' and 'n_draws', plus
        3-d arrays (age x time x draw) under 'value', 'dage' and 'dtime' for
        whichever of those were requested.

        Raises
        ------
        AssertionError
            If any row has differing lower/upper age or time bounds, or an
            (age, time) cell does not contain exactly one value per draw.
        """
        rate_dict = {r: dict() for r in rates}

        df = self.get_predictions(locations=[location_id], sexes=[sex_id], samples=samples)
        if samples:
            draw_cols = [col for col in df if col.startswith(ExtractorCols.VALUE_COL_SAMPLES)]
        else:
            draw_cols = [ExtractorCols.VALUE_COL_FIT]
        n_draws = len(draw_cols)

        # The grid is indexed by point ages/times, so intervals are not supported.
        assert (df.age_lower.values == df.age_upper.values).all(), \
            "age_lower and age_upper differ for some rows"
        assert (df.time_lower.values == df.time_upper.values).all(), \
            "time_lower and time_upper differ for some rows"

        for r in rates:
            rate_df = df.loc[df.rate == r].copy()
            ages = np.asarray(sorted(rate_df.age_lower.unique().tolist()))
            times = np.asarray(sorted(rate_df.time_lower.unique().tolist()))

            # Save these for later for quality checks
            rate_dict[r]['ages'] = ages
            rate_dict[r]['times'] = times
            rate_dict[r]['n_draws'] = n_draws

            # Template that is filled cell by cell below.
            draw_data = np.zeros((len(ages), len(times), n_draws))
            for age_idx, age in enumerate(ages):
                for time_idx, time in enumerate(times):
                    in_cell = (rate_df.age_lower == age) & (rate_df.time_lower == time)
                    draws = rate_df.loc[in_cell][draw_cols].values.ravel()
                    # Exactly one row per (age, time) cell is expected, i.e. one
                    # value per draw column.
                    assert len(draws) == n_draws, \
                        f"expected {n_draws} draws for rate {r}, age {age}, time {time}; got {len(draws)}"
                    draw_data[age_idx, time_idx, :] = draws

            if value:
                rate_dict[r]['value'] = draw_data
            if dage:
                # First differences along the age axis.
                rate_dict[r]['dage'] = np.diff(draw_data, n=1, axis=0)
            if dtime:
                # First differences along the time axis.
                rate_dict[r]['dtime'] = np.diff(draw_data, n=1, axis=1)
        return rate_dict

    def format_predictions_for_ihme(self, gbd_round_id: int,
                                    locations: Optional[List[int]] = None,
                                    sexes: Optional[List[int]] = None,
                                    samples: bool = False,
                                    predictions: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        """
        Formats predictions from the prediction table and returns either the mean
        or draws, based on whether or not samples is False or True.

        Parameters
        ----------
        gbd_round_id
            The GBD round ID to format the predictions for
        locations
            A list of locations to extract from the predictions
        sexes
            A list of sexes to extract from the predictions
        samples
            Whether or not the predictions have draws (samples) or whether
            it is just one fit.
        predictions
            An optional data frame with the predictions to use rather than
            reading them directly from the database.

        Returns
        -------
        Data frame with predictions formatted for the IHME databases: location_id,
        year_id, age_group_id, sex_id, measure_id and the value column(s).
        """
        pred = self.get_predictions(locations=locations, sexes=sexes, samples=samples,
                                    predictions=predictions)
        pred = format_age_time(df=pred, gbd_round_id=gbd_round_id)
        pred = integrand_to_gbd_measures(df=pred, integrand_col='integrand_name')
        if samples:
            value_cols = [col for col in pred.columns if col.startswith(ExtractorCols.VALUE_COL_SAMPLES)]
        else:
            value_cols = [ExtractorCols.VALUE_COL_FIT]
        id_cols = ['location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id']
        return pred[id_cols + value_cols]