Source code for message_ix_models.tools.bilateralize.pull_gem

# -*- coding: utf-8 -*-
"""
Historical Calibration
"""

# Import packages
import os
from pathlib import Path

import message_ix
import numpy as np
import pandas as pd
import yaml

from message_ix_models.tools.bilateralize.utils import load_config
from message_ix_models.util import package_data_path

oil_pipeline_file = "GEM-GOIT-Oil-NGL-Pipelines-2025-03.xlsx"
oil_pipeline_sheet = "Pipelines"

gas_pipeline_file = "GEM-GGIT-Gas-Pipelines-2024-12.xlsx"
gas_pipeline_sheet = "Gas Pipelines 2024-12-17"


# Set up MESSAGE regions
def gem_region(project_name: str | None = None, config_name: str | None = None):
    """
    Set up MESSAGE regions

    Args:
        project_name: Name of project
        config_name: Name of config file
    """
    config, config_path = load_config(
        project_name=project_name, config_name=config_name
    )
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    message_regions = config["scenario"]["regions"]

    full_path = package_data_path("node", message_regions + ".yaml")
    with open(full_path, "r", encoding="utf-8") as f:
        message_regions = yaml.safe_load(f)

    message_regions_list = [
        r for r in message_regions.keys() if r not in ["World", "GLB"]
    ]

    return message_regions_list, message_regions
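
# Illustrative usage sketch (not part of the original module): the project and
# config names below are placeholders. gem_region() returns the list of MESSAGE
# region codes (excluding "World"/"GLB") plus the full region mapping, whose
# "child" entries hold the ISO codes used to assign exporters and importers.
#
#     regions, region_map = gem_region(
#         project_name="my_project",  # placeholder
#         config_name="config.yaml",  # placeholder
#     )
#     iso_codes = region_map[regions[0]]["child"]
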
# Import files
def import_gem(
    input_file: str | Path,
    # input_sheet: str,
    trade_technology: str,
    flow_technology: str,
    flow_commodity: str,
    project_name: str | None = None,
    config_name: str | None = None,
    first_model_year: int = 2030,
):
    """
    Import Global Energy Monitor data

    Args:
        input_file: Name of input file
        input_sheet: Name of input sheet (currently unused; the CSV export is read)
        trade_technology: Name of trade technology
        flow_technology: Name of flow technology
        flow_commodity: Name of flow commodity
        project_name: Name of project
        config_name: Name of config file
        first_model_year: First model year
    """
    # Pull in configuration
    config, config_path = load_config(
        project_name=project_name, config_name=config_name
    )
    p_drive = config["p_drive_location"]

    # Data paths
    data_path = os.path.join(p_drive, "MESSAGE_trade")
    gem_path = os.path.join(data_path, "Global Energy Monitor")

    # df = pd.read_excel(os.path.join(gem_path, input_file), sheet_name=input_sheet)
    df = pd.read_csv(os.path.join(gem_path, input_file), encoding="utf-8")
    df = df[df["StopYear"].isnull()]  # Only continuing projects
    df = df[
        [
            "StartYear1",
            "StartCountry",
            "EndCountry",
            "CapacityBOEd",
            "CostUSD",
            "LengthMergedKm",
        ]
    ].drop_duplicates()

    # Clean up country codes
    cw = pd.read_csv(os.path.join(gem_path, "country_crosswalk.csv"))
    for i in ["Start", "End"]:
        df = df.merge(cw, left_on=i + "Country", right_on="GEM Country", how="left")
        df = df.rename(columns={"ISO": i + "ISO"})

    # Add MESSAGE regions
    message_regions_list, message_regions = gem_region(project_name, config_name)

    df["EXPORTER"] = ""
    df["IMPORTER"] = ""
    for r in message_regions_list:
        df["EXPORTER"] = np.where(
            df["StartISO"].isin(message_regions[r]["child"]), r, df["EXPORTER"]
        )
        df["IMPORTER"] = np.where(
            df["EndISO"].isin(message_regions[r]["child"]), r, df["IMPORTER"]
        )

    # Collapse
    df["CapacityBOEd"] = df["CapacityBOEd"].replace("--", "0", regex=True).astype(float)
    df["CostUSD"] = df["CostUSD"].replace("--", "0", regex=True).astype(float)
    df["LengthMergedKm"] = (
        df["LengthMergedKm"].replace("--", "0", regex=True).astype(float)
    )

    df = df[(~df["CapacityBOEd"].isnull()) & (~df["CostUSD"].isnull())]
    df = df[df["StartYear1"] < first_model_year]  # No planned capacity
    df["YEAR"] = (
        round(df["StartYear1"].astype(float) / 5) * 5
    )  # Round year to the nearest 5

    df = (
        df.groupby(["EXPORTER", "IMPORTER", "YEAR"])[
            ["CapacityBOEd", "CostUSD", "LengthMergedKm"]
        ]
        .sum()
        .reset_index()
    )

    # Convert units
    df["Capacity (BOEa)"] = df["CapacityBOEd"] * 365
    df["Capacity (TJ)"] = df["Capacity (BOEa)"] * 0.006  # BOEa to TJ
    df["Capacity (GWa)"] = df["Capacity (TJ)"] * (3.1712 * 1e-5)  # TJ to GWa

    # Generate investment costs
    df["InvCost (USD/km)"] = (df["CostUSD"]) / df["LengthMergedKm"]
    # TODO: Add industry-specific deflators

    # Generate capacity
    df["Capacity (GWa/km)"] = (df["Capacity (GWa)"]) / df["LengthMergedKm"]

    # Cut down
    df = df[df["EXPORTER"] != df["IMPORTER"]]
    df = df[(df["EXPORTER"] != "") & (df["IMPORTER"] != "")]

    # Base file with all historical timesteps
    hist_base = pd.DataFrame()
    for y in list(range(2000, 2030, 5)):
        ydf = df[["EXPORTER", "IMPORTER"]].drop_duplicates().copy()
        ydf["YEAR"] = y
        hist_base = pd.concat([hist_base, ydf])
    hist_base = hist_base[(hist_base["EXPORTER"] != "") & (hist_base["IMPORTER"] != "")]

    df = hist_base.merge(
        df,
        left_on=["EXPORTER", "IMPORTER", "YEAR"],
        right_on=["EXPORTER", "IMPORTER", "YEAR"],
        how="outer",
    )

    # Set 2000 values to 0 if missing
    for c in [c for c in df.columns if c not in ["EXPORTER", "IMPORTER", "YEAR"]]:
        df[c] = np.where((df[c].isnull()) & (df["YEAR"] == 2000), 0, df[c])
df["trade_technology"] = trade_technology df["flow_technology"] = flow_technology # Output to trade_technology export_dir = package_data_path("bilateralize", trade_technology) gem_dir_out = os.path.join(os.path.dirname(export_dir), trade_technology, "GEM") trade_dir = os.path.join(os.path.dirname(gem_dir_out), "edit_files") flow_dir = os.path.join( os.path.dirname(gem_dir_out), "edit_files", "flow_technology" ) trade_dir_out = os.path.join(os.path.dirname(gem_dir_out), "bare_files") flow_dir_out = os.path.join( os.path.dirname(gem_dir_out), "bare_files", "flow_technology" ) if not os.path.isdir(gem_dir_out): os.makedirs(Path(gem_dir_out)) df.to_csv(os.path.join(gem_dir_out, "GEM.csv")) # Investment Costs inv_cost = ( df.groupby(["EXPORTER", "IMPORTER"])[["CostUSD", "LengthMergedKm"]] .sum() .reset_index() ) inv_cost["InvCost (USD/km)"] = (inv_cost["CostUSD"]) / inv_cost["LengthMergedKm"] inv_cost = inv_cost[["EXPORTER", "IMPORTER", "InvCost (USD/km)"]].drop_duplicates() inv_cost["node_loc"] = inv_cost["EXPORTER"] inv_cost["technology"] = ( flow_technology + "_" + inv_cost["IMPORTER"].str.lower().str.split("_").str[-1] ) inv_cost["value_update"] = inv_cost["InvCost (USD/km)"] / 1e6 # in MUSD/km inv_cost = inv_cost[["node_loc", "technology", "value_update"]] inv_cost.to_csv(os.path.join(gem_dir_out, "inv_cost_GEM.csv"), index=False) basedf = pd.read_csv(os.path.join(flow_dir, "inv_cost.csv")) basedf["value"] = 10 inv_cost = basedf.merge( inv_cost, left_on=["node_loc", "technology"], right_on=["node_loc", "technology"], how="left", ) inv_cost["value"] = np.where( inv_cost["value_update"] > 0, round(inv_cost["value_update"], 0), inv_cost["value"], ) inv_cost["year_vtg"] = "broadcast" inv_cost["unit"] = "USD/km" inv_cost = inv_cost[ ["node_loc", "technology", "year_vtg", "value", "unit"] ].drop_duplicates() inv_cost.to_csv(os.path.join(flow_dir, "inv_cost.csv"), index=False) inv_cost.to_csv(os.path.join(flow_dir_out, "inv_cost.csv"), index=False) # Historical activity (flow) hist_act = df[["EXPORTER", "IMPORTER", "YEAR", "LengthMergedKm"]].drop_duplicates() hist_act["LengthMergedKm"] = np.where( hist_act["LengthMergedKm"].isnull(), 0, hist_act["LengthMergedKm"] ) hist_act["LengthMergedKm"] = hist_act.groupby(["EXPORTER", "IMPORTER"])[ "LengthMergedKm" ].transform(pd.Series.cumsum) hist_act["node_loc"] = hist_act["EXPORTER"] hist_act["technology"] = ( flow_technology + "_" + hist_act["IMPORTER"].str.lower().str.split("_").str[-1] ) hist_act["value"] = round(hist_act["LengthMergedKm"], 0) hist_act["year_act"] = hist_act["YEAR"].astype(int) hist_act["unit"] = "km" hist_act["mode"] = "M1" hist_act["time"] = "year" hist_act = hist_act[ ["node_loc", "technology", "year_act", "value", "unit", "mode", "time"] ] hist_act.to_csv( os.path.join(gem_dir_out, "historical_activity_GEM.csv"), index=False ) hist_act.to_csv(os.path.join(flow_dir, "historical_activity.csv"), index=False) hist_act.to_csv(os.path.join(flow_dir_out, "historical_activity.csv"), index=False) # Historical activity (trade for oil pipelines only) # Share of oil pipeline capacity used for commodity (crude/light oil/fuel oil) share_oil_pipelines = {"crudeoil_piped": 0.8, "loil_piped": 0.1, "foil_piped": 0.1} if trade_technology in ["crudeoil_piped", "loil_piped", "foil_piped"]: hist_tra = df[ ["EXPORTER", "IMPORTER", "YEAR", "Capacity (GWa)"] ].drop_duplicates() hist_tra["Capacity (GWa)"] = np.where( hist_tra["Capacity (GWa)"].isnull(), 0, hist_tra["Capacity (GWa)"] ) hist_tra["Capacity (GWa)"] = hist_tra.groupby(["EXPORTER", "IMPORTER"])[ 
"Capacity (GWa)" ].transform(pd.Series.cumsum) hist_tra["node_loc"] = hist_tra["EXPORTER"] hist_tra["technology"] = ( trade_technology + "_exp_" + hist_tra["IMPORTER"].str.lower().str.split("_").str[-1] ) hist_tra["value"] = round(hist_tra["Capacity (GWa)"], 0) hist_tra["value"] = hist_tra["value"] * share_oil_pipelines[trade_technology] hist_tra["year_act"] = hist_tra["YEAR"].astype(int) hist_tra["unit"] = "GWa" hist_tra["mode"] = "M1" hist_tra["time"] = "year" hist_tra = hist_tra[ ["node_loc", "technology", "year_act", "value", "unit", "mode", "time"] ] hist_tra["year_act"] = hist_tra["year_act"].astype(int) hist_tra.to_csv( os.path.join(gem_dir_out, "historical_activity_trade_GEM.csv"), index=False ) hist_tra.to_csv(os.path.join(trade_dir, "historical_activity.csv"), index=False) hist_tra.to_csv( os.path.join(trade_dir_out, "historical_activity.csv"), index=False ) # Historical new capacity hist_cap = df[["EXPORTER", "IMPORTER", "YEAR", "LengthMergedKm"]] hist_cap = hist_cap.rename(columns={"LengthMergedKm": "CAPACITY_KM"}) hist_cap = ( hist_cap.groupby(["EXPORTER", "IMPORTER", "YEAR"])["CAPACITY_KM"] .sum() .reset_index() ) hist_cap = hist_cap.sort_values(by=["EXPORTER", "IMPORTER", "YEAR"], ascending=True) hist_cap = hist_cap[ (hist_cap["YEAR"] < first_model_year) & (hist_cap["YEAR"] > 1999) ] hist_cap = hist_cap.sort_values( by=["EXPORTER", "IMPORTER", "YEAR"], ascending=False ) hist_cap = hist_cap[["EXPORTER", "IMPORTER", "YEAR", "CAPACITY_KM"]] hist_cap["CAPACITY_KM"] = hist_cap["CAPACITY_KM"] / 5 # Duration time is 5 years hist_cap["node_loc"] = hist_cap["EXPORTER"] hist_cap["technology"] = ( flow_technology + "_" + hist_cap["IMPORTER"].str.lower().str.split("_").str[-1] ) hist_cap["value"] = round(hist_cap["CAPACITY_KM"], 0) hist_cap["YEAR"] = hist_cap["YEAR"].astype(int) hist_cap = message_ix.make_df( "historical_new_capacity", node_loc=hist_cap["node_loc"], technology=hist_cap["technology"], year_vtg=hist_cap["YEAR"], value=hist_cap["value"], unit="km", ) hist_cap.to_csv( os.path.join(gem_dir_out, "historical_new_capacity_GEM.csv"), index=False ) hist_cap.to_csv(os.path.join(flow_dir, "historical_new_capacity.csv"), index=False) hist_cap.to_csv( os.path.join(flow_dir_out, "historical_new_capacity.csv"), index=False ) # Input inputdf = ( df.groupby(["EXPORTER", "IMPORTER"])[["Capacity (GWa)", "LengthMergedKm"]] .sum() .reset_index() ) inputdf["Capacity (GWa/km)"] = inputdf["Capacity (GWa)"] / inputdf["LengthMergedKm"] inputdf = inputdf[["EXPORTER", "IMPORTER", "Capacity (GWa/km)"]].drop_duplicates() inputdf["node_loc"] = inputdf["EXPORTER"] inputdf["technology"] = ( trade_technology + "_exp_" + inputdf["IMPORTER"].str.lower().str.split("_").str[-1] ) inputdf["value_update"] = round((1 / inputdf["Capacity (GWa/km)"]), 0) inputdf["commodity"] = ( flow_commodity + "_" + inputdf["IMPORTER"].str.lower().str.split("_").str[-1] ) inputdf = inputdf[["node_loc", "technology", "value_update", "commodity"]] # km/GWa basedf = pd.read_csv(os.path.join(trade_dir, "input.csv")) # The largest capacity pipelines have maximum 300,000GWh (~30bcm) annually basedf["value"] = np.where( basedf["commodity"].str.contains(flow_commodity), 30, basedf["value"] ) inputdf = basedf.merge( inputdf, left_on=["node_loc", "technology", "commodity"], right_on=["node_loc", "technology", "commodity"], how="left", ) inputdf["value"] = np.where( (~inputdf["value_update"].isnull()) & (inputdf["value_update"] < 10000) & (inputdf["commodity"].str.contains(flow_commodity)), inputdf["value_update"], inputdf["value"], ) 
    inputdf = inputdf.drop(["value_update"], axis=1)
    inputdf.to_csv(os.path.join(gem_dir_out, "inputs_GEM.csv"), index=False)
    inputdf.to_csv(os.path.join(trade_dir, "input.csv"), index=False)
    inputdf.to_csv(os.path.join(trade_dir_out, "input.csv"), index=False)
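
# Illustrative usage sketch (not part of the original module): the file name is
# assumed to be a CSV export of the GEM workbook named above, since the reader
# uses pd.read_csv. flow_technology, flow_commodity, project and config names are
# placeholders; "crudeoil_piped" matches one of the trade technologies handled in
# share_oil_pipelines.
#
#     import_gem(
#         input_file="GEM-GOIT-Oil-NGL-Pipelines-2025-03.csv",  # assumed CSV export
#         trade_technology="crudeoil_piped",
#         flow_technology="crudeoil_pipeline",  # placeholder
#         flow_commodity="crudeoil",  # placeholder
#         project_name="my_project",  # placeholder
#         config_name="config.yaml",  # placeholder
#     )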