import logging
import numpy as np
import pandas as pd
from genno import KeySeq
from message_ix_models import Context
from .config import Config
log = logging.getLogger(__name__)
def process_raw_ssp_data(context: Context, config: Config) -> pd.DataFrame:
    """Assemble regional SSP population and GDP data for :mod:`.tools.costs`.

    Data are requested through :func:`.exo_data.prepare_computer`, once for each
    of SSP1–SSP5 and for each of two measures (population and GDP). The SSP2
    data are used twice: once labelled "SSP2" and once labelled "LED".

    Parameters
    ----------
    context : Context
        Passed through unchanged to :func:`.exo_data.prepare_computer`.
    config : .Config
        Only :attr:`~.Config.ref_region` is read here: it names the region
        against which GDP per capita is indexed.

    Returns
    -------
    pandas.DataFrame
        sorted by (scenario, region, year), with the columns:

        - scenario_version: always the string "2023"
        - scenario: scenario label ("SSP1" … "SSP5", "LED")
        - region: region name
        - year: year of data
        - total_population: total population aggregated to the regional level
        - total_gdp: total GDP aggregated to the regional level
        - gdp_ppp_per_capita: total_gdp divided by total_population
        - gdp_ratio_reg_to_reference: gdp_ppp_per_capita indexed to the value
          for `config.ref_region`

    Raises
    ------
    AssertionError
        If gdp_ratio_reg_to_reference contains any NaN (only while assertions
        are enabled).
    """
    import xarray as xr
    from genno import Computer, Key, Quantity, quote

    # Not referenced below; the import is kept deliberately (hence the noqa),
    # presumably so that the data source is registered for prepare_computer()
    # — TODO confirm
    from message_ix_models.project.ssp.data import SSPUpdate  # noqa: F401
    from message_ix_models.tools.exo_data import prepare_computer

    # Holds all the computations added below
    computer = Computer()

    # Dimensions shared by the concatenated quantities
    dims = ("n", "y", "scenario")

    def scenario_label(name) -> Quantity:
        """Return a length-1 quantity along "scenario" with the label `name`.

        Multiplying any other quantity by this one adds a "scenario" dimension.
        """
        return Quantity(xr.DataArray([1.0], coords={"scenario": [name]}))

    computer.add("LED:scenario", scenario_label("LED"))

    # (measure, model) for each kind of data to retrieve
    sources = (
        ("POP", "IIASA-WiC POP 2023"),
        ("GDP", "OECD ENV-Growth 2023"),
    )

    # Keys of the single-scenario quantities, collected per measure
    keys_by_measure = {"pop": [], "gdp": []}

    for n in "12345":
        # Source/scenario identifier
        ssp = f"ICONICS:SSP(2024).{n}"

        # Quantity used to broadcast this SSP's own label
        own_label = f"SSP{n}"
        computer.add(f"{own_label}:scenario", scenario_label(own_label))

        # Scenario labels to attach to this SSP's data; SSP2 also serves as LED
        labels = [own_label]
        if n == "2":
            labels.append("LED")

        for measure, model in sources:
            m = measure.lower()
            source_kw = dict(measure=measure, model=model, name=f"_{m} {n}")

            # Add tasks to `computer` that retrieve the data; only the first
            # returned key is used
            key, *_ = prepare_computer(context, computer, ssp, source_kw, strict=False)

            # Add the "scenario" dimension, once per label
            for label in labels:
                keys_by_measure[m].append(
                    computer.add(f"{m} {label}", "mul", key, f"{label}:scenario")
                )

    # Concatenate the single-scenario data
    k_pop = Key("pop", dims)
    computer.add(k_pop, "concat", *keys_by_measure["pop"])
    k_gdp = KeySeq("gdp", dims)
    computer.add(k_gdp.base, "concat", *keys_by_measure["gdp"])

    # GDP per capita
    k_per_capita = k_gdp["cap"]
    computer.add(k_per_capita, "div", k_gdp.base, k_pop)

    # Ratio of GDP per capita to the value for the reference region
    k_indexed = k_gdp["indexed"]
    computer.add(
        k_indexed, "index_to", k_per_capita, quote("n"), quote(config.ref_region)
    )

    # Output column for each positional argument of merge(), in order
    column_names = (
        "total_population",
        "total_gdp",
        "gdp_ppp_per_capita",
        "gdp_ratio_reg_to_reference",
    )

    def merge(*dfs: pd.DataFrame) -> pd.DataFrame:
        """Merge data to a single data frame with the expected format.

        Each of the first four arguments must provide ``to_series()``.
        """
        series = [
            dfs[i].to_series().rename(name) for i, name in enumerate(column_names)
        ]
        wide = pd.concat(series, axis=1).reset_index()
        wide = wide.rename(columns={"n": "region", "y": "year"})
        wide = wide.sort_values(by=["scenario", "region", "year"])
        return wide.assign(scenario_version="2023")

    k_result = "data::pandas"
    computer.add(k_result, merge, k_pop, k_gdp.base, k_per_capita, k_indexed)

    # To show what would be done: log.debug(computer.describe(k_result))
    result = computer.get(k_result)

    # Ensure no NaN values in the ratio column
    assert not result["gdp_ratio_reg_to_reference"].isna().any()

    return result
def adjust_cost_ratios_with_gdp(region_diff_df, config: Config):
    """Calculate adjusted region-differentiated cost ratios.

    For every (scenario version, scenario, region, technology), the base-year
    cost ratio is extended to the other periods as a linear function of the
    region's GDP per capita relative to the reference region. The line passes
    through (1, 1) and through the base-year point
    (gdp_ratio_reg_to_reference, reg_cost_ratio).

    Parameters
    ----------
    region_diff_df : pandas.DataFrame
        Output of :func:`apply_regional_differentiation`. The columns "region",
        "message_technology" and "reg_cost_ratio" are used.
    config : .Config
        The function responds to, or passes on to other functions, the fields:
        :attr:`~.Config.base_year`,
        :attr:`~.Config.node`,
        :attr:`~.Config.ref_region`,
        :attr:`~.Config.scenario`,
        :attr:`~.Config.scenario_version`, and
        :attr:`~.Config.y0`.

    Returns
    -------
    pandas.DataFrame
        DataFrame with columns:

        - scenario_version: scenario version
        - scenario: SSP scenario
        - message_technology: message technology
        - region: region
        - year
        - gdp_ratio_reg_to_reference: ratio of GDP per capita in respective region to
          GDP per capita in reference region.
        - reg_cost_ratio_adj: adjusted region-differentiated cost ratio
    """
    from .projections import _maybe_query_scenario, _maybe_query_scenario_version

    # NB this modifies the most recent Context instance in place
    context = Context.get_instance(-1)
    context.model.regions = config.node

    # Retrieve GDP from the SSP data, including ratios (per capita; versus
    # ref_region)
    ssp = process_raw_ssp_data(context, config)
    # Keep only data from y₀ onwards
    ssp = ssp.query("year >= @config.y0")
    ssp = ssp.drop(columns=["total_gdp", "total_population"])
    # Map "scenario_version" strings to the desired output
    ssp = ssp.assign(
        scenario_version=np.where(
            ssp.scenario_version.str.contains("2013"),
            "Previous (2013)",
            "Review (2023)",
        )
    )
    # Set the dtype of the "year" column
    ssp = ssp.astype({"year": int})
    # Apply the two helpers from .projections, in this order; presumably they
    # filter on config.scenario and config.scenario_version — TODO confirm
    df_gdp = _maybe_query_scenario_version(_maybe_query_scenario(ssp, config), config)

    # If the base year does not exist in the GDP data, fall back to the earliest
    # year that does, and warn
    gdp_years = df_gdp.year.unique()
    base_year = config.base_year
    if base_year not in gdp_years:
        new_base_year = min(gdp_years)
        log.warning(f"Use year={new_base_year} GDP data as proxy for {base_year}")
        base_year = new_base_year

    def _constrain_cost_ratio(df: pd.DataFrame, base_year):
        """Constrain "reg_cost_ratio_adj" within one group.

        In cases where gdp_ratio_reg_to_reference is < 1 and reg_cost_ratio_adj > 1
        in the base period, cap reg_cost_ratio_adj in every period at its
        base-period value. Otherwise `df` is returned unchanged.
        """
        # First row for the base period; raises IndexError if the group has none
        ref = df.query("year == @base_year").iloc[0]

        needs_cap = ref.gdp_ratio_reg_to_reference < 1 and ref.reg_cost_ratio_adj > 1
        if not needs_cap:
            return df

        capped = df.reg_cost_ratio_adj.clip(upper=ref.reg_cost_ratio_adj)
        return df.assign(reg_cost_ratio_adj=capped)

    # Base-year "gdp_ratio_reg_to_reference", without the "year" column
    gdp_base = df_gdp.query("year == @base_year").drop(columns="year")

    # - Merge `region_diff_df` for "reg_cost_ratio".
    # - Compute slope and intercept of the line through (1, 1) and the base-year
    #   point.
    # - Drop "gdp_ratio_reg_to_reference", which at this point holds the
    #   base-period value only.
    coefficients = (
        gdp_base.merge(region_diff_df, on=["region"])
        .eval("slope = (reg_cost_ratio - 1) / (gdp_ratio_reg_to_reference - 1)")
        .eval("intercept = 1 - slope")
        .drop(columns="gdp_ratio_reg_to_reference")
    )

    # - Merge `df_gdp` again; this re-adds "year" and
    #   "gdp_ratio_reg_to_reference" with distinct values for each period.
    # - Compute reg_cost_ratio_adj.
    # - Fill 1.0 where the result is NaN, e.g. for the reference region, where
    #   the slope is 0 / 0.
    adjusted = (
        coefficients.merge(
            df_gdp, on=["scenario_version", "scenario", "region"], how="right"
        )
        .eval("reg_cost_ratio_adj = slope * gdp_ratio_reg_to_reference + intercept")
        .fillna({"reg_cost_ratio_adj": 1.0})
    )

    # Apply _constrain_cost_ratio() to each (sv, s, r, t) group
    groups = adjusted.groupby(
        ["scenario_version", "scenario", "region", "message_technology"],
        group_keys=False,
    )
    constrained = groups.apply(_constrain_cost_ratio, base_year)

    # Select the desired columns, in output order
    output_columns = [
        "scenario_version",
        "scenario",
        "message_technology",
        "region",
        "year",
        "gdp_ratio_reg_to_reference",
        "reg_cost_ratio_adj",
    ]
    return constrained[output_columns]