Source code for message_ix_models.model.material.data_petro

"""
Data and parameter generation for the petrochemicals sector in MESSAGEix models.

This module provides functions to read, process, and generate parameter data
for petrochemical technologies, demand, trade, emissions, and related constraints.
"""

from collections import defaultdict
from typing import TYPE_CHECKING

import pandas as pd
from message_ix import make_df

from message_ix_models import ScenarioInfo
from message_ix_models.model.material.data_util import (
    gen_chemicals_co2_ind_factors,
    gen_ethanol_to_ethylene_emi_factor,
    gen_plastics_emission_factors,
    read_timeseries,
)
from message_ix_models.model.material.demand import gen_demand_petro
from message_ix_models.model.material.util import (
    get_ssp_from_context,
    read_config,
)
from message_ix_models.util import (
    broadcast,
    merge_data,
    nodes_ex_world,
    package_data_path,
    same_node,
)

if TYPE_CHECKING:
    from message_ix import Scenario

    from message_ix_models.types import ParameterData
    from message_ix_models.util import Context

ssp_mode_map = {
    "SSP1": "CTS core",
    "SSP2": "RTS core",
    "SSP3": "RTS high",
    "SSP4": "CTS high",
    "SSP5": "RTS high",
    "LED": "CTS core",  # TODO: maybe move to OECD projection instead
}

iea_elasticity_map = {
    "CTS core": (0.75, 0.15),
    "CTS high": (0.9, 0.45),
    "RTS core": (0.75, 0.4),
    "RTS high": (0.95, 0.7),
}



[docs]
def read_data_petrochemicals(fname) -> pd.DataFrame:
    """Read and clean data from the petrochemicals techno-economic files.

    Returns
    -------
    pd.DataFrame
        Cleaned techno-economic data for petrochemicals.
    """
    # Read the file
    data_petro = pd.read_csv(
        package_data_path("material", "petrochemicals", f"data{fname}.csv")
    )
    # Clean the data
    data_petro = data_petro.drop(["Source", "Description"], axis=1)
    return data_petro




[docs]
def gen_mock_demand_petro(
    scenario: "Scenario",
    gdp_elasticity_2020: float,
    gdp_elasticity_2030: float,
) -> pd.DataFrame:
    """Generate petrochemicals demand time series for MESSAGEix regions using GDP
    elasticities.

    TODO: Remove this function since a copy was moved to demand.

    Parameters
    ----------
    scenario :
        Scenario instance to build demand on.
    gdp_elasticity_2020 :
        GDP elasticity for years before 2030.
    gdp_elasticity_2030 :
        GDP elasticity for years from 2030 onward.

    Returns
    -------
    pd.DataFrame
        DataFrame with demand for petrochemicals by region and year.
    """
    s_info = ScenarioInfo(scenario)
    modelyears = s_info.Y
    fy = scenario.firstmodelyear

    def get_demand_t1_with_income_elasticity(
        demand_t0, income_t0, income_t1, elasticity
    ):
        return (
            elasticity * demand_t0.mul(((income_t1 - income_t0) / income_t0), axis=0)
        ) + demand_t0

    gdp_mer = scenario.par("bound_activity_up", {"technology": "GDP"})
    mer_to_ppp = pd.read_csv(
        package_data_path("material", "other", "mer_to_ppp_default.csv")
    ).set_index(["node", "year"])
    # mer_to_ppp = scenario.par("MERtoPPP").set_index("node", "year")
    # TODO: might need to be re-activated for different SSPs
    gdp_mer = gdp_mer.merge(
        mer_to_ppp.reset_index()[["node", "year", "value"]],
        left_on=["node_loc", "year_act"],
        right_on=["node", "year"],
    )
    gdp_mer["gdp_ppp"] = gdp_mer["value_y"] * gdp_mer["value_x"]
    gdp_mer = gdp_mer[["year", "node_loc", "gdp_ppp"]].reset_index()
    gdp_mer["Region"] = gdp_mer["node_loc"]  # .str.replace("R12_", "")
    df_gdp_ts = gdp_mer.pivot(
        index="Region", columns="year", values="gdp_ppp"
    ).reset_index()
    num_cols = [i for i in df_gdp_ts.columns if isinstance(i, int)]
    hist_yrs = [i for i in num_cols if i < fy]
    df_gdp_ts = (
        df_gdp_ts.drop([i for i in hist_yrs if i in df_gdp_ts.columns], axis=1)
        .set_index("Region")
        .sort_index()
    )

    from message_ix_models.model.material.demand.__init__ import (
        read_base_demand,
    )

    df_demand_2020 = read_base_demand(
        package_data_path() / "material" / "petrochemicals/demand_petro.yaml"
    )
    df_demand_2020 = df_demand_2020.rename({"region": "Region"}, axis=1)
    df_demand = df_demand_2020.pivot(index="Region", columns="year", values="value")
    dem_next_yr = df_demand

    for i in range(len(modelyears) - 1):
        income_year1 = modelyears[i]
        income_year2 = modelyears[i + 1]

        if income_year2 >= 2030:
            dem_next_yr = get_demand_t1_with_income_elasticity(
                dem_next_yr,
                df_gdp_ts[income_year1],
                df_gdp_ts[income_year2],
                gdp_elasticity_2030,
            )
        else:
            dem_next_yr = get_demand_t1_with_income_elasticity(
                dem_next_yr,
                df_gdp_ts[income_year1],
                df_gdp_ts[income_year2],
                gdp_elasticity_2020,
            )
        df_demand[income_year2] = dem_next_yr

    df_melt = df_demand.melt(ignore_index=False).reset_index()

    return make_df(
        "demand",
        unit="t",
        level="demand",
        value=df_melt.value,
        time="year",
        commodity="HVC",
        year=df_melt.year,
        node=df_melt["Region"],
    )




[docs]
def gen_data_petro_ts(
    data_petro_ts: pd.DataFrame, results: dict[list], tec_ts: set[str], nodes: list[str]
) -> None:
    """Generate time-series parameter data for petrochemical technologies.

    Parameters
    ----------
    data_petro_ts :
        DataFrame with time-series parameter data.
    results :
        Dictionary to collect parameter DataFrames.
    tec_ts :
        Set of technology names.
    nodes :
        List of model nodes.
    """
    for t in tec_ts:
        common = dict(
            time="year",
            time_origin="year",
            time_dest="year",
        )

        param_name = data_petro_ts.loc[
            (data_petro_ts["technology"] == t), "parameter"
        ].unique()

        for p in set(param_name):
            val = data_petro_ts.loc[
                (data_petro_ts["technology"] == t) & (data_petro_ts["parameter"] == p),
                "value",
            ]
            mod = data_petro_ts.loc[
                (data_petro_ts["technology"] == t) & (data_petro_ts["parameter"] == p),
                "mode",
            ]
            yr = data_petro_ts.loc[
                (data_petro_ts["technology"] == t) & (data_petro_ts["parameter"] == p),
                "year",
            ]

            if p == "var_cost":
                df = make_df(
                    p,
                    technology=t,
                    value=val,
                    unit="t",
                    year_vtg=yr,
                    year_act=yr,
                    mode=mod,
                    **common,
                ).pipe(broadcast, node_loc=nodes)
            else:
                rg = data_petro_ts.loc[
                    (data_petro_ts["technology"] == t)
                    & (data_petro_ts["parameter"] == p),
                    "region",
                ]
                df = make_df(
                    p,
                    technology=t,
                    value=val,
                    unit="t",
                    year_vtg=yr,
                    year_act=yr,
                    mode=mod,
                    node_loc=rg,
                    **common,
                )

            results[p].append(df)




[docs]
def assign_input_outpt(
    split: str,
    param_name: str,
    regions: pd.DataFrame,
    val: float | int,
    t: str,
    rg: str,
    global_region: str,
    common: dict,
    nodes: list[str],
) -> pd.DataFrame:
    """Assign input/output parameters for petrochemical technologies.

    Parameters
    ----------
    split :
        Split parameter name.
    param_name :
        Parameter name.
    regions :
        Regions for the parameter.
    val :
        Parameter values.
    t :
        Technology name.
    rg :
        Region name
    global_region :
        Name of the global region.
    common :
        Common parameter dictionary.
    nodes :
        Model nodes.
    """
    com = split[1]
    lev = split[2]
    mod = split[3]

    if (param_name == "input") and (lev == "import"):
        df = make_df(
            param_name,
            technology=t,
            commodity=com,
            level=lev,
            mode=mod,
            value=val[regions[regions == rg].index[0]],
            unit="t",
            node_loc=rg,
            node_origin=global_region,
            **common,
        )
    elif (param_name == "output") and (lev == "export"):
        df = make_df(
            param_name,
            technology=t,
            commodity=com,
            level=lev,
            mode=mod,
            value=val[regions[regions == rg].index[0]],
            unit="t",
            node_loc=rg,
            node_dest=global_region,
            **common,
        )
    else:
        df = make_df(
            param_name,
            technology=t,
            commodity=com,
            level=lev,
            mode=mod,
            value=val[regions[regions == rg].index[0]],
            unit="t",
            node_loc=rg,
            **common,
        ).pipe(same_node)

    # Copy parameters to all regions, when node_loc is not GLB
    if (len(regions) == 1) and (rg != global_region):
        df["node_loc"] = None
        df = df.pipe(broadcast, node_loc=nodes)
        if (lev != "import") and (lev != "export"):
            df = df.pipe(same_node)
    return df




[docs]
def broadcast_to_regions(df: pd.DataFrame, global_region: str, nodes: list[str]):
    """Broadcast a DataFrame to all regions if node_loc is not the global region.

    Parameters
    ----------
    df :
        DataFrame to broadcast regions.
    global_region :
        Name of the global region.
    nodes :
        List of model nodes.
    """
    if "node_loc" in df.columns:
        if (
            len(set(df["node_loc"])) == 1
            and list(set(df["node_loc"]))[0] != global_region
        ):
            df["node_loc"] = None
            df = df.pipe(broadcast, node_loc=nodes)
    return df



def format_par_data(
    config: "Context",
    data_petro: pd.DataFrame,
    results: dict,
    yv_ya: pd.DataFrame,
    modelyears: list[int],
    nodes: list[str],
    global_region: str,
):
    for t in config["technology"]["add"]:
        # Retrieve the id if `t` is a Code instance; otherwise use str
        t = getattr(t, "id", t)
        # years = s_info.Y
        params = data_petro.loc[(data_petro["technology"] == t), "parameter"].unique()

        # Availability year of the technology
        av = data_petro.loc[(data_petro["technology"] == t), "availability"].values[0]
        modelyears = [year for year in modelyears if year >= av]
        yva = yv_ya.loc[yv_ya.year_vtg >= av,]

        # Iterate over parameters
        for par in params:
            split = par.split("|")
            param_name = par.split("|")[0]

            val = data_petro.loc[
                ((data_petro["technology"] == t) & (data_petro["parameter"] == par)),
                "value",
            ]

            regions = data_petro.loc[
                ((data_petro["technology"] == t) & (data_petro["parameter"] == par)),
                "Region",
            ]

            # Common parameters for all input and output tables
            # node_dest and node_origin are the same as node_loc

            common = dict(
                year_vtg=yva.year_vtg,
                year_act=yva.year_act,
                time="year",
                time_origin="year",
                time_dest="year",
            )

            for rg in regions:
                if len(split) > 1:
                    if (param_name == "input") | (param_name == "output"):
                        df = assign_input_outpt(
                            split,
                            param_name,
                            regions,
                            val,
                            t,
                            rg,
                            global_region,
                            common,
                            nodes,
                        )
                    elif param_name == "emission_factor":
                        emi = split[1]
                        mod = split[2]

                        df = make_df(
                            param_name,
                            technology=t,
                            value=val[regions[regions == rg].index[0]],
                            emission=emi,
                            mode=mod,
                            unit="t",
                            node_loc=rg,
                            **common,
                        )

                    elif param_name == "var_cost":
                        mod = split[1]
                        if rg != global_region:
                            df = (
                                make_df(
                                    param_name,
                                    technology=t,
                                    mode=mod,
                                    value=val[regions[regions == rg].index[0]],
                                    unit="t",
                                    **common,
                                )
                                .pipe(broadcast, node_loc=nodes)
                                .pipe(same_node)
                            )
                        else:
                            df = make_df(
                                param_name,
                                technology=t,
                                mode=mod,
                                value=val[regions[regions == rg].index[0]],
                                node_loc=rg,
                                unit="t",
                                **common,
                            ).pipe(same_node)

                    elif param_name == "share_mode_up":
                        mod = split[1]

                        df = (
                            make_df(
                                param_name,
                                technology=t,
                                mode=mod,
                                shares="steam_cracker",
                                value=val[regions[regions == rg].index[0]],
                                unit="-",
                                **common,
                            )
                            .pipe(broadcast, node_share=nodes)
                            .pipe(same_node)
                        )

                # Rest of the parameters apart from input, output and emission_factor

                else:
                    df = make_df(
                        param_name,
                        technology=t,
                        value=val[regions[regions == rg].index[0]],
                        unit="t",
                        node_loc=rg,
                        **common,
                    )
                    df = df.drop_duplicates()

                # Copy parameters to all regions
                if (len(regions) == 1) and (rg != global_region):
                    df = broadcast_to_regions(df, global_region, nodes)

                results[param_name].append(df)



[docs]
def gen_data_petro_chemicals(
    scenario: "Scenario", dry_run: bool = False
) -> "ParameterData":
    """Generate all MESSAGEix parameter data for the petrochemicals sector.

    Parameters
    ----------
    scenario :
        Scenario instance to build petrochemicals model on.
    dry_run :
        *Not used, but kept for compatibility.*
    """
    # Load configuration
    context = read_config()
    config = context["material"]["petro_chemicals"]
    ssp = get_ssp_from_context(context)
    # Information about scenario, e.g. node, year
    s_info = ScenarioInfo(scenario)

    if "R12_CHN" in s_info.N:
        fname_suffix = "_R12"
    else:
        fname_suffix = "_R11"

    # Techno-economic assumptions
    data_petro = read_data_petrochemicals(fname_suffix)
    data_petro_ts = read_timeseries(
        scenario, "petrochemicals", None, f"timeseries{fname_suffix}.csv"
    )
    # List of data frames, to be concatenated together at end
    results = defaultdict(list)

    # For each technology there are differnet input and output combinations
    # Iterate over technologies

    modelyears = s_info.Y  # s_info.Y is only for modeling years
    nodes = nodes_ex_world(s_info.N)
    global_region = [i for i in s_info.N if i.endswith("_GLB")][0]
    yv_ya = s_info.yv_ya

    format_par_data(
        config, data_petro, results, yv_ya, modelyears, nodes, global_region
    )

    share_dict = {
        "shares": "steam_cracker",
        "node_share": ["R12_MEA", "R12_NAM"],
        "technology": "steam_cracker_petro",
        "mode": "ethane",
        "year_act": "2020",
        "time": "year",
        "value": [0.4, 0.4],
        "unit": "-",
    }
    results["share_mode_lo"].append(make_df("share_mode_lo", **share_dict))

    default_gdp_elasticity_2020, default_gdp_elasticity_2030 = iea_elasticity_map[
        ssp_mode_map[ssp]
    ]
    df_demand = gen_demand_petro(
        scenario, "HVC", default_gdp_elasticity_2020, default_gdp_elasticity_2030
    )
    df_2025 = pd.read_csv(
        package_data_path("material", "petrochemicals", "demand_2025.csv")
    )
    df_demand = df_demand[df_demand["year"] != 2025]
    df_demand = pd.concat([df_2025, df_demand])
    results["demand"].append(df_demand)

    # Special treatment for time-varying params
    tec_ts = set(data_petro_ts.technology)  # set of tecs in timeseries sheet

    gen_data_petro_ts(data_petro_ts, results, tec_ts, nodes)

    results = {par_name: pd.concat(dfs) for par_name, dfs in results.items()}

    # modify steam cracker hist data (naphtha -> gasoil) to make model feasible
    df_cap = pd.read_csv(
        package_data_path(
            "material", "petrochemicals", "steam_cracking_hist_new_cap.csv"
        )
    )
    df_act = pd.read_csv(
        package_data_path("material", "petrochemicals", "steam_cracking_hist_act.csv")
    )
    df_act.loc[df_act["mode"] == "naphtha", "mode"] = "vacuum_gasoil"
    df = results["historical_activity"]
    results["historical_activity"] = pd.concat(
        [df.loc[df["technology"] != "steam_cracker_petro"], df_act]
    )
    df = results["historical_new_capacity"]
    results["historical_new_capacity"] = pd.concat(
        [df.loc[df["technology"] != "steam_cracker_petro"], df_cap]
    )

    # remove growth constraint for R12_AFR to make trade constraints feasible
    df = results["growth_activity_up"]
    results["growth_activity_up"] = df[
        ~(
            (df["technology"] == "steam_cracker_petro")
            & (df["node_loc"] == "R12_AFR")
            & (df["year_act"] == 2020)
        )
    ]

    # add 25% total trade bound
    df_dem = results["demand"]
    df_dem = df_dem.groupby("year").sum(numeric_only=True) * 0.25
    df_dem = df_dem.reset_index()
    df_dem = df_dem.rename({"year": "year_act"}, axis=1)

    par_dict = {
        "node_loc": "R12_GLB",
        "technology": "trade_petro",
        "mode": "M1",
        "time": "year",
        "unit": "-",
    }
    results["bound_activity_up"] = pd.concat(
        [
            results["bound_activity_up"],
            make_df("bound_activity_up", **df_dem, **par_dict),
        ]
    )

    # TODO: move this to input xlsx file
    df = scenario.par(
        "relation_activity",
        filters={"relation": "h2_scrub_limit", "technology": "gas_bio"},
    )
    df["value"] = -(1.33181 * 0.482)  # gas input * emission factor of gas
    df["technology"] = "gas_processing_petro"
    results["relation_activity"] = df

    meth_downstream_emi_top_down = gen_plastics_emission_factors(s_info, "HVCs")
    meth_downstream_emi_bot_up = gen_chemicals_co2_ind_factors(s_info, "HVCs")
    meth_downstream_emi_eth = gen_ethanol_to_ethylene_emi_factor(s_info)

    merge_data(
        results,
        meth_downstream_emi_top_down,
        meth_downstream_emi_bot_up,
        meth_downstream_emi_eth,
    )

    # TODO: move this to input xlsx file
    df_gro = results["growth_activity_up"]
    drop_idx = df_gro[
        (df_gro["technology"] == "steam_cracker_petro")
        & (df_gro["node_loc"] == "R12_RCPA")
        & (df_gro["year_act"] == 2020)
    ].index
    results["growth_activity_up"] = results["growth_activity_up"].drop(drop_idx)

    reduced_pdict = {}
    for k, v in results.items():
        if set(["year_act", "year_vtg"]).issubset(v.columns):
            v = v[(v["year_act"] - v["year_vtg"]) <= 40]
        reduced_pdict[k] = v.drop_duplicates().copy(deep=True)

    return reduced_pdict