# Source code for cascade_at.dismod.api.fill_extract_helpers.posterior_to_prior

import pandas as pd
from typing import Dict, List
import numpy as np
from scipy import stats

from cascade_at.dismod.api.fill_extract_helpers.utils import vec_to_midpoint
from cascade_at.model.utilities.grid_helpers import expand_grid
from cascade_at.dismod.constants import RateToIntegrand, IntegrandEnum, INTEGRAND_TO_WEIGHT
from cascade_at.inputs.utilities.gbd_ids import format_age_time
from cascade_at.dismod.integrand_mappings import RATE_TO_INTEGRAND, integrand_to_gbd_measures
from cascade_at.model.smooth_grid import SmoothGrid


def get_prior_avgint_grid(grids: Dict[str, Dict[str, np.ndarray]],
                          sexes: List[int],
                          locations: List[int],
                          midpoint: bool = False) -> pd.DataFrame:
    """
    Get a data frame to use for setting up posterior predictions on a grid.
    The grids are specified in the grids parameter.

    Will still need to have covariates added to it, and prep data from
    dismod.api.data_tables.prep_data_avgint to convert nodes and covariate names
    before it can be input into the avgint table in a database.

    Parameters
    ----------
    grids
        A dictionary of grids keyed by rate name (each key is mapped to an
        integrand through ``RateToIntegrand``). Each value is a dictionary
        with "age" and "time" entries.
    sexes
        A list of sexes
    locations
        A list of locations
    midpoint
        Whether to midpoint the grid lower and upper values (recommended for rates).

    Returns
    -------
    Dataframe with columns
        "integrand_id", "location_id", "weight_id", "subgroup_id",
        "age_lower", "age_upper", "time_lower", "time_upper", "sex_id"

    The row index of each per-rate frame is kept as-is (it is not renumbered
    across rates). If ``grids`` is empty, an empty data frame with the columns
    above is returned.
    """
    columns = [
        "integrand_id", "location_id", "weight_id", "subgroup_id",
        "age_lower", "age_upper", "time_lower", "time_upper", "sex_id"
    ]

    # Collect one frame per rate and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every iteration.
    frames = []
    for rate, grid in grids.items():
        if midpoint:
            time = vec_to_midpoint(grid['time'])
            age = vec_to_midpoint(grid['age'])
        else:
            time = grid['time']
            age = grid['age']

        frame = expand_grid({
            'age_lower': age,
            'time_lower': time,
            'location_id': locations,
            'sex_id': sexes
        })
        # Each row is a single age-time point, so upper equals lower.
        frame['time_upper'] = frame['time_lower']
        frame['age_upper'] = frame['age_lower']

        frame['rate'] = rate
        frame['integrand'] = frame['rate'].map(RateToIntegrand)
        frame['integrand_id'] = frame['integrand'].apply(
            lambda x: IntegrandEnum[x].value
        )
        frame['weight_id'] = frame['integrand'].apply(
            lambda x: INTEGRAND_TO_WEIGHT[x].value
        )
        frame['subgroup_id'] = 0

        frames.append(frame)

    if not frames:
        # pd.concat refuses an empty list; hand back a correctly-shaped frame.
        return pd.DataFrame(columns=columns)

    return pd.concat(frames)[columns]


def format_rate_grid_for_ihme(rates: Dict[str, "SmoothGrid"], gbd_round_id: int,
                              location_id: int, sex_id: int) -> pd.DataFrame:
    """
    Formats a grid of mean, upper, and lower for a prior rate
    for the IHME database. **Only does this for Gaussian priors.**
    (That restriction is the original author's note; this function does not
    itself inspect the prior density.)

    The lower and upper columns are overwritten with the 0.025 and 0.975
    quantiles of the value prior at each (age, time) grid point.

    Parameters
    ----------
    rates
         A dictionary of SmoothGrids, keyed by primary rates like "iota"
    gbd_round_id
        the GBD round
    location_id
        the location ID to append to this data frame
    sex_id
        the sex ID to append to this data frame

    Returns
    -------
    A data frame formatted for the IHME databases, with columns
        "location_id", "year_id", "age_group_id", "sex_id", "measure_id",
        "mean", "upper", "lower"
    Rates whose value grid is empty are skipped. If nothing is left
    (no rates, or every grid empty), an empty data frame with those
    columns is returned.
    """
    out_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'mean', 'upper', 'lower'
    ]

    dfs = []
    for rate, smooth_grid in rates.items():
        df = smooth_grid.value.grid.copy()
        if df.empty:
            continue

        # Point grid: lower and upper bounds coincide with the grid value.
        df['age_lower'] = df['age']
        df['age_upper'] = df['age']
        df['time_lower'] = df['time']
        df['time_upper'] = df['time']

        df = format_age_time(df=df, gbd_round_id=gbd_round_id)

        group_cols = ['age', 'time']
        # TODO: Once we can upgrade to pandas 1.1.0, then we can use the groupby(..., dropna=False)
        #  feature, which we need because eta and nu can be null and that's ok, but pandas drops them.
        #  In the meantime, we will group on age and time which means we're looping over each row,
        #  which in some cases will be x 30 more computation than necessary.
        #  Once we upgrade, use the group_cols below and it will skip duplicate computation.
        #  group_cols = ['mean', 'std', 'lower', 'upper', 'density', 'eta', 'nu']

        for _, group in df.groupby(group_cols):
            first = group.iloc[0]
            at_row = smooth_grid.value[first['age'], first['time']].quantiles([0.025, 0.975])
            df.loc[group.index, 'lower'] = at_row[0]
            df.loc[group.index, 'upper'] = at_row[1]

        df['integrand'] = RATE_TO_INTEGRAND[rate].name
        df = integrand_to_gbd_measures(df=df, integrand_col='integrand')

        df['location_id'] = location_id
        df['sex_id'] = sex_id

        dfs.append(df)

    if not dfs:
        # pd.concat raises ValueError on an empty list; return an empty,
        # correctly-shaped frame instead.
        return pd.DataFrame(columns=out_cols)

    # drop=True: the old index is not part of the output columns.
    return pd.concat(dfs, axis=0, sort=False).reset_index(drop=True)[out_cols]