Source code for cascade_at.inputs.data

from typing import List, Optional

import pandas as pd

from cascade_at.core.db import elmo
from cascade_at.core.log import get_loggers
from cascade_at.dismod.integrand_mappings import make_integrand_map
from cascade_at.inputs.base_input import BaseInput
from cascade_at.inputs.demographics import Demographics
from cascade_at.inputs.uncertainty import stdev_from_crosswalk_version
from cascade_at.inputs.utilities import gbd_ids
from cascade_at.inputs.utilities.transformations import RELABEL_INCIDENCE_MAP

LOG = get_loggers(__name__)


class CrosswalkVersion(BaseInput):
    """Crosswalk version data pulled from the epi database and shaped for DisMod."""

    def __init__(self, crosswalk_version_id: int, exclude_outliers: bool,
                 demographics: Demographics, conn_def: str, gbd_round_id: int):
        """
        Pulls and formats all of the data from a crosswalk version in the epi database.

        Nothing is read from the database on construction; ``self.raw`` stays
        ``None`` until :py:meth:`get_raw` is called.

        Parameters
        ----------
        crosswalk_version_id
            The crosswalk version to pull from
        exclude_outliers
            whether to exclude outliers
        demographics
            The demographics object
        conn_def
            database connection definition
        gbd_round_id
            The GBD round
        """
        super().__init__(gbd_round_id=gbd_round_id)
        self.crosswalk_version_id = crosswalk_version_id
        self.exclude_outliers = exclude_outliers
        self.demographics = demographics
        self.conn_def = conn_def
        self.raw = None

    def get_raw(self) -> "CrosswalkVersion":
        """
        Pulls the raw crosswalk version from the database. These are the
        observations that will be used in the bundle.

        Returns
        -------
        This object, with ``self.raw`` set to whatever
        ``elmo.get_crosswalk_version`` returned, so calls can be chained.
        """
        # Kept as a function-scope import: the module does not import sys at the top.
        import sys

        LOG.info(f"Getting crosswalk version for {self.crosswalk_version_id}.")

        # On macOS the elmo call appears to redirect logging (see the FIXME
        # messages); bracket the call with diagnostics on that platform only.
        on_darwin = 'darwin' in sys.platform
        if on_darwin:
            LOG.error("FIXME gma -- this call to elmo.get_crosswalk_version "
                      "ought to contain an error_log_path argument.")
            LOG.error("FIXME gma -- START -- This call somehow switches "
                      "logging from stdout to a socket.")
        self.raw = elmo.get_crosswalk_version(crosswalk_version_id=self.crosswalk_version_id)
        if on_darwin:
            LOG.error(f"FIXME gma -- END -- Now logging to a socket. LOG.handlers: {LOG.handlers}")
        return self

    def configure_for_dismod(self, relabel_incidence: int,
                             measures_to_exclude: Optional[List[str]] = None) -> pd.DataFrame:
        """
        Configures the crosswalk version for DisMod.

        :py:meth:`get_raw` must have been called first; otherwise ``self.raw``
        is still ``None`` and this method fails with ``AttributeError``.

        Parameters
        ----------
        relabel_incidence
            how to label incidence -- see RELABEL_INCIDENCE_MAP
        measures_to_exclude
            list of parameters to exclude, by name

        Returns
        -------
        A data frame restricted to the locations and sexes in
        ``self.demographics``, with ``hold_out``, age/time bounds,
        ``meas_value``, ``meas_std`` and ``name`` columns added, then passed
        through ``keep_only_necessary_columns``.
        """
        df = self.raw.copy()
        if self.exclude_outliers:
            df = df.loc[df.is_outlier != 1].copy()

        # Attach sex_id / measure_id by joining on the name columns.
        sex_ids = gbd_ids.get_sex_ids()
        measure_ids = gbd_ids.get_measure_ids(conn_def=self.conn_def)
        df = df.merge(sex_ids, on='sex')
        df = df.merge(measure_ids, on='measure')

        df = df.loc[~df.input_type.isin(['parent', 'group_review'])].copy()
        # Restrict to model locations *before* mapping to integrands so that
        # the measure checks there only see rows that will be used. The copy
        # makes the column assignment below write to an independent frame.
        df = df.loc[df.location_id.isin(self.demographics.location_id)].copy()
        df['hold_out'] = 0

        df = self.map_to_integrands(df, relabel_incidence=relabel_incidence)

        if measures_to_exclude:
            # Excluded measures are flagged as held out, not dropped.
            df.loc[df.measure.isin(measures_to_exclude), 'hold_out'] = 1
            LOG.info(
                f"Filtering {df.hold_out.sum()} rows of of data where the measure has been excluded. "
                f"Measures marked for exclusion: {measures_to_exclude}. "
                f"{len(df)} rows remaining."
            )

        # The location filter is not repeated here: map_to_integrands only
        # removes rows, so the earlier filter still holds.
        df = df.loc[df.sex_id.isin(self.demographics.sex_id)].copy()

        df["age_lower"] = df["age_start"]
        df["time_lower"] = df["year_start"]
        df["age_upper"] = df["age_end"]
        df["time_upper"] = df["year_end"]
        # NOTE(review): helper is not defined in this class; presumably
        # inherited from BaseInput -- confirm what it does to the bounds.
        df = self.get_out_of_demographic_notation(df, columns=['age', 'time'])

        df["meas_value"] = df["mean"]
        df["meas_std"] = stdev_from_crosswalk_version(df)
        df["name"] = df.seq.astype(str)

        df = self.keep_only_necessary_columns(df)
        return df

    @staticmethod
    def map_to_integrands(df: pd.DataFrame, relabel_incidence: int) -> pd.DataFrame:
        """
        Maps the data from the IHME databases to the integrands expected by DisMod AT.

        The input frame is never modified; a new frame is returned.

        Parameters
        ----------
        df
            A data frame to map to integrands
        relabel_incidence
            A relabel incidence code. Can be found in
            :py:class:`~cascade_at.inputs.utilities.transformations.RELABEL_INCIDENCE_MAP`

        Returns
        -------
        A copy of ``df`` without measure_id 17 rows and with ``measure``
        replaced by the (relabelled) integrand name.

        Raises
        ------
        RuntimeError
            If a measure_id in the data has no entry in the integrand map.
        ValueError
            If any row is still labelled plain ``incidence`` after relabelling.
        """
        integrand_map = make_integrand_map()

        is_cfr = df.measure_id == 17
        if is_cfr.any():
            LOG.info(
                "Found case fatality rate, measure_id=17, in data. Ignoring it because it does not "
                "map to a Dismod-AT integrand and cannot be used by the model."
            )
        # Always work on an explicit copy: assigning columns below must neither
        # write through to the caller's frame nor hit a filtered view.
        df = df.loc[~is_cfr].copy()

        try:
            df["measure"] = df.measure_id.apply(lambda k: integrand_map[k].name)
        except KeyError as ke:
            raise RuntimeError(
                f"The bundle data uses measure {ke!s} which does not map "
                f"to an integrand. The map is {integrand_map}."
            ) from ke

        # Identity mapping for every measure present, overridden by the
        # relabelling rules selected through relabel_incidence.
        measure_dict = {measure: measure for measure in df.measure.unique().tolist()}
        measure_dict.update(RELABEL_INCIDENCE_MAP[relabel_incidence])
        df["measure"] = df["measure"].map(measure_dict)

        if (df.measure == 'incidence').any():
            LOG.error("Found incidence, measure_id=6, in data. Should be Tincidence or Sincidence.")
            raise ValueError("Measure ID cannot be 6 for incidence. Must be S or Tincidence.")

        return df