Source code for cascade_at.inputs.covariate_data

import pandas as pd
from typing import List

from cascade_at.core.db import db_queries
from cascade_at.core.log import get_loggers
from cascade_at.inputs.base_input import BaseInput
from cascade_at.inputs.demographics import Demographics

LOG = get_loggers(__name__)


class CovariateData(BaseInput):
    """
    Covariate estimates for one covariate ID, plus the helpers that expand
    them to the ages, sexes, and locations listed in a ``Demographics`` object.
    """

    def __init__(self, covariate_id: int, demographics: Demographics,
                 decomp_step: str, gbd_round_id: int):
        """
        Get covariate estimates, and map them to the necessary demographic
        ages and sexes. If only one age group is present in the covariate
        data then that means that it's not age-specific and we want to copy
        the values over to all the other age groups we're working with in
        demographics. Same with sex.

        Parameters
        ----------
        covariate_id
            Covariate to pull; forwarded to ``db_queries.get_covariate_estimates``.
        demographics
            Supplies the ``year_id``, ``age_group_id`` and ``location_id``
            lists that this class reads.
        decomp_step
            Forwarded to ``db_queries.get_covariate_estimates``.
        gbd_round_id
            Forwarded to ``db_queries.get_covariate_estimates`` and to the
            ``BaseInput`` constructor.
        """
        self.covariate_id = covariate_id
        self.demographics = demographics
        self.decomp_step = decomp_step
        self.gbd_round_id = gbd_round_id

        super().__init__(gbd_round_id=gbd_round_id)

        # Populated by get_raw(); configure_for_dismod() requires it.
        self.raw = None

    def get_raw(self) -> 'CovariateData':
        """
        Pulls the raw covariate data from the database.

        Stores the result of ``db_queries.get_covariate_estimates`` (restricted
        to the demographics' years) on ``self.raw``.

        Returns
        -------
        This instance, so that calls can be chained.
        """
        self.raw = db_queries.get_covariate_estimates(
            covariate_id=self.covariate_id,
            year_id=self.demographics.year_id,
            gbd_round_id=self.gbd_round_id,
            decomp_step=self.decomp_step
        )
        return self

    def configure_for_dismod(self, pop_df: pd.DataFrame, loc_df: pd.DataFrame):
        """
        Configures covariates for DisMod. Completes covariate
        ages, sexes, and locations based on what covariate data is already
        available.

        To fill in ages, it copies over all age or age standardized
        covariates into each of the specific age groups.

        To fill in sexes, it copies over any both sex covariates to
        the sex specific groups.

        To fill in locations, it takes a population-weighted average of child
        locations for parent locations all the way up the location hierarchy.

        Parameters
        ----------
        pop_df
            A data frame with population info for all ages, sexes, locations, and years
        loc_df
            A data frame with location hierarchy information

        Returns
        -------
        The completed covariate data after ``convert_to_age_lower_upper`` has
        been applied (that method is defined outside this class, presumably
        on ``BaseInput``).

        Raises
        ------
        RuntimeError
            If ``get_raw()`` has not been called yet, or if the covariate data
            has an unsupported combination of sex IDs.
        """
        if self.raw is None:
            # Fail with a clear message instead of "NoneType is not subscriptable".
            raise RuntimeError(
                "No raw covariate data available; call get_raw() before "
                "configure_for_dismod()."
            )

        df = self.raw[[
            'location_id', 'year_id', 'age_group_id', 'sex_id', 'mean_value'
        ]]
        df = self._complete_covariate_ages(cov_df=df)
        df = self._complete_covariate_sex(cov_df=df, pop_df=pop_df)
        df = self._complete_covariate_locations(
            cov_df=df, pop_df=pop_df, loc_df=loc_df,
            locations=self.demographics.location_id
        )
        df = self.convert_to_age_lower_upper(df)
        return df

    def _complete_covariate_ages(self, cov_df: pd.DataFrame) -> pd.DataFrame:
        """
        Adds on covariate ages for all age group IDs.

        If age group 22 or 27 appears in ``cov_df`` (presumably the "all ages"
        and "age standardized" groups -- confirm against the age group
        metadata), every row of ``cov_df`` is replicated once per age group in
        ``self.demographics.age_group_id``. Otherwise the data is returned
        as an unmodified copy.

        Parameters
        ----------
        cov_df
            Covariate data with at least an ``age_group_id`` column.

        Returns
        -------
        A new data frame; ``cov_df`` itself is never modified.
        """
        if not cov_df.age_group_id.isin([22, 27]).any():
            return cov_df.copy()

        # Build every per-age copy first and concatenate once, rather than
        # growing a frame inside the loop.
        per_age = [
            cov_df.assign(age_group_id=age)
            for age in self.demographics.age_group_id
        ]
        if not per_age:
            # No target ages: keep the columns so downstream merges still work.
            return cov_df.iloc[0:0].copy()
        return pd.concat(per_age)

    @staticmethod
    def _complete_covariate_locations(cov_df: pd.DataFrame, pop_df: pd.DataFrame, loc_df: pd.DataFrame,
                                      locations: List[int]) -> pd.DataFrame:
        """
        Completes the covariate locations that aren't in the database as a population-weighted average.

        Hierarchy levels (within ``locations``) that have no covariate data at
        all are filled from the deepest missing level upwards, so that a newly
        computed level can feed the level above it.

        Parameters
        ----------
        cov_df
            Covariate data with ``location_id``, ``year_id``, ``age_group_id``,
            ``sex_id`` and ``mean_value`` columns.
        pop_df
            Population data with ``location_id``, ``age_group_id``, ``sex_id``,
            ``year_id`` and ``population`` columns.
        loc_df
            Location hierarchy with ``location_id``, ``parent_id`` and ``level`` columns.
        locations
            Location IDs to consider when working out which levels are missing.

        Returns
        -------
        ``cov_df`` with the rows for the filled-in parent locations added at the end.
        """
        demo_cols = ['age_group_id', 'sex_id', 'year_id']

        # Only carry the columns that are actually used, so that unrelated
        # columns in pop_df / loc_df cannot collide during the merges below.
        pop = pop_df[['location_id'] + demo_cols + ['population']]
        parent_pop = pop.rename(
            columns={'location_id': 'parent_id', 'population': 'parent_population'}
        )

        loc_subset_df = loc_df.loc[loc_df.location_id.isin(locations)]
        all_levels = loc_subset_df.level.unique().tolist()
        cov_locations = cov_df.location_id.unique().tolist()
        cov_levels = loc_subset_df.loc[
            loc_subset_df.location_id.isin(cov_locations)
        ].level.unique().tolist()
        missing_levels = [x for x in all_levels if x not in cov_levels]

        df = cov_df.copy()

        # Deepest level first: each pass produces the data the next one needs.
        for level in sorted(missing_levels, reverse=True):
            LOG.info(f"Filling in covariate values at location hierarchy level {level}.")
            # Children sit one level below the level being filled.
            children = loc_subset_df.loc[
                loc_subset_df.level == level + 1, ['location_id', 'parent_id']
            ]

            # Left merges build out the full age-sex-year frame per child,
            # then attach the child's covariate value and the parent's population.
            child_pop = children.merge(pop, on=['location_id'], how='left')
            child_cov = child_pop.merge(df, on=['location_id'] + demo_cols, how='left')
            weighted = child_cov.merge(parent_pop, on=['parent_id'] + demo_cols, how='left')
            weighted = weighted.drop(columns='location_id')

            # Each child's share of the parent's population-weighted value.
            weighted['cov_weighted'] = (
                weighted.mean_value * weighted.population / weighted.parent_population
            )

            # NOTE(review): the group sum skips NaN, so a child without a
            # covariate value or population silently contributes 0 here.
            parent_cov = weighted.groupby([
                'parent_id', 'year_id', 'age_group_id', 'sex_id'
            ])['cov_weighted'].sum().reset_index()

            # Re-label the parent as a location so it can be used one level up the tree.
            parent_cov = parent_cov.rename(
                columns={'parent_id': 'location_id', 'cov_weighted': 'mean_value'}
            )
            df = pd.concat([df, parent_cov])

        return df

    @staticmethod
    def _complete_covariate_sex(cov_df: pd.DataFrame, pop_df: pd.DataFrame) -> pd.DataFrame:
        """
        Fills in missing sex values so that both is propagated to male and female if missing,
        and both is created as a pop-weighted average between male and female if both missing.

        Parameters
        ----------
        cov_df
            Covariate data with ``location_id``, ``year_id``, ``age_group_id``,
            ``sex_id`` and ``mean_value`` columns.
        pop_df
            Population data with ``location_id``, ``year_id``, ``age_group_id``,
            ``sex_id`` and ``population`` columns. Only read when ``cov_df``
            holds exactly sex IDs 1 and 2.

        Returns
        -------
        ``cov_df`` itself when sex IDs 1, 2 and 3 are all present; otherwise a
        new frame with the missing sex rows added after the original rows.

        Raises
        ------
        RuntimeError
            If the set of sex IDs is anything other than {1, 2, 3}, {3} or {1, 2}.
        """
        sex_ids = set(cov_df.sex_id)

        if sex_ids == {1, 2, 3}:
            return cov_df

        if sex_ids == {3}:
            # Only sex ID 3 present: reuse its values for sex IDs 1 and 2.
            return pd.concat([
                cov_df,
                cov_df.assign(sex_id=1),
                cov_df.assign(sex_id=2),
            ])

        if sex_ids == {1, 2}:
            keys = ['location_id', 'year_id', 'age_group_id']
            # Denominator: population recorded under sex ID 3.
            both_pop = pop_df.loc[
                pop_df.sex_id == 3, keys + ['population']
            ].rename(columns={'population': 'both_pop'})
            # Numerator weights: population for sex IDs 1 and 2.
            sex_pop = pop_df.loc[
                pop_df.sex_id.isin([1, 2]), keys + ['sex_id', 'population']
            ]

            both = cov_df.merge(both_pop, on=keys, how='left')
            both = both.merge(sex_pop, on=keys + ['sex_id'], how='left')
            both['cov_weighted'] = both.mean_value * both.population / both.both_pop
            both = both.groupby(keys)['cov_weighted'].sum().reset_index()
            both['sex_id'] = 3
            both = both.rename(columns={'cov_weighted': 'mean_value'})
            return pd.concat([cov_df, both])

        raise RuntimeError(f"Unknown covariate sex IDs {sex_ids}.")