Source code for message_ix_models.model.transport.CHN_IND

"""Retrieve transport activity data for China (Statistical Yearbook of the National
Bureau of Statistics) and for India (iTEM)."""

import numpy as np
import pandas as pd
from iam_units import registry
from item import historical  # type: ignore [import-not-found]

from message_ix_models.util import package_data_path

UNITS = {
    "Population": (1.0e-6, None, "dimensionless"),
    "Vehicle Stock": (1.0e4, "vehicle", "thousand vehicle"),
    "Passenger-Kilometers": (100, "megapkm", "gigapkm"),
    "Freight Ton-Kilometers": (100, "megatkm", "gigatkm"),
}


SDMX_MAP = {
    "REF_AREA": "ISO_code",
    "VARIABLE": "Variable",
    "VEHICLE": "Vehicle type",
    "MODE": "Mode/vehicle type",
    "UNIT": "Units",
    "TIME_PERIOD": "Year",
    "VALUE": "Value",
}
#: Files containing data for China ("name_of_file", rows_to_skip_from_the_bottom)
FILES = {
    "Passenger activity": ("CHN_activity-passenger.csv", 4),
    "Vehicle stock civil": ("CHN_stock-civil.csv", 5),
    "Vehicle stock private": ("CHN_stock-private.csv", 1),
    "Freight activity": ("CHN_activity-freight.csv", 5),
}

POP_FILE = "CHN_IND_population.csv"

# Rail sub-categories to be removed from Chinese dataset
RAIL_SUB_CAT = [
    "Freight Ton-Kilometers of National Railways(100 million ton-km)",
    "Freight Ton-Kilometers of Local Railways(100 million ton-km)",
    "Freight Ton-Kilometers of Joint-venture Railways(100 million ton-km)",
    "Passenger-Kilometers of National Railways(100 million passenger-km)",
    "Passenger-Kilometers of Local Railways(100 million passenger-km)",
    "Passenger-Kilometers of Joint-venture Railways(100 million passenger-km)",
]

# Mapping of variables to vehicle modes
FILL_VALUES = {
    "Passenger-Kilometers": "Total passenger transport",
    "Freight Ton-Kilometers": "Total freight transport",
}


# TODO: migrate to convert_units.py, as an extra argument: e.g. axis=1 being the
#  current way (converting per var names in columns) or axis=0 (converting per rows
#  as coded in _covert below).
def _convert(group_df):
    """ ""Convert units of a pandas.groupby object based on UNITS."""
    info = UNITS.get(group_df["Variable"].unique()[0], None)
    if not info:
        return group_df
    factor, unit_in, unit_out = info
    qty = registry.Quantity(factor * group_df["Value"].values, unit_in).to(
        unit_out or unit_in
    )
    return group_df.assign(Value=qty.magnitude, Units=qty.units)



[docs]
def split_variable(s) -> pd.DataFrame:
    """Split strings in :class:`pandas.Series` *s* into Variable and Mode.

    Parameters
    ----------
    s : pandas.Series

    Returns
    -------
    DataFrame : pandas.DataFrame
        DataFrame with two columns: Variable names and their respective Mode.
    """
    # Split str in *s* into variable name and units
    df = s.str.rsplit(pat=" of ", n=1, expand=True)
    # Remove residual parentheses from variable column, still present in some entries
    for col in list(df.columns):
        df[col] = df[col].str.rsplit(pat="(", n=1, expand=True)[0]
    # Assign labels to columns
    df.columns = ["Var", "Mode/vehicle type"]
    # Use mapping FILL_VALUES to replace NaNs in "Mode/vehicle type" column
    for key, value in FILL_VALUES.items():
        df[df["Var"] == key] = df[df["Var"] == key].fillna(value)
    # Alternative not working:
    # def func(df):
    #     a_func = lambda group: group["target_col"].fillna(FILL_VALUES.get(group[0]))
    #     return df.assign(target_col=a_func)
    # df.groupby(0).pipe(func)
    return df




[docs]
def get_ind_item_data() -> pd.DataFrame:
    """Retrieve activity data for rail and road transport for India from iTEM.

    Data is obtained from iTEM database's file ``T000.csv`` and filtered for the
    period 2000-2018. Data for China is not retrieved since it comes from the NBSC.

    Returns
    -------
    DataFrame : pandas.DataFrame
        DataFrame with transport data for India.
    """
    # Import data from iTEM database historical files:
    all_data = pd.DataFrame()
    for x in range(0, 13, 1):
        # Exclude non-existent historical file 11
        if x != 11:
            df = historical.process(x)
            df.drop(
                columns=[x for x in list(df.columns) if x not in list(SDMX_MAP.keys())],
                inplace=True,
            )
            # Rename columns using SDMX_MAP dict, to match dataset format in
            # get_chn_ind_data()
            df.columns = df.columns.to_series().map(SDMX_MAP)
            # Filter values for IND between 2000-2018
            df = df[
                (df["ISO_code"].isin(["IND"]))
                & (df["Year"].isin(list(np.arange(2000, 2019))))
            ].sort_values(
                ["ISO_code", "Mode/vehicle type", "Vehicle type"], ignore_index=True
            )
            # Concat whenever data is retrieved
            all_data = pd.concat([all_data, df], ignore_index=True)

    return all_data




[docs]
def get_chn_ind_pop() -> pd.DataFrame:
    """Retrieve population data for China and India.

    The dataset is a ``.csv`` file in */data* and was retrieved from `OECD
    <https://stats.oecd.org/Index.aspx?#>`_ website, filtering data for China and India.

    Returns
    -------
    DataFrame : pandas.DataFrame
        DataFrame containing population data for China and India.
    """
    # Read csv file
    pop = pd.read_csv(package_data_path("transport", POP_FILE), header=1)
    # Drop irrelevant columns and rename when necessary
    pop = pop.drop(
        [x for x in pop.columns if x not in ["LOCATION", "Time", "Value"]],
        axis=1,
    ).rename(columns={"LOCATION": "ISO_code", "Time": "Year"})
    # Add "Variable" name column
    pop["Variable"] = "Population"
    return pop




[docs]
def get_chn_ind_data(private_vehicles=False) -> pd.DataFrame:
    """Read transport activity and vehicle stock data for China and India.

    The data is read from ``data/transport`` folder (data for China from NBSC) and
    imported from iTEM project (data for India). Then, it is processed into the same
    format as the IEA's EEI datasets -to be used for MESSAGEix-Transport calibration.

    Parameters
    ----------
    private_vehicles : bool, optional
        If ``True``, also data for private vehicles will be parsed. However,
        it is set by default to ``False`` since this category is included within the
        civil vehicles, which are read from another file. Therefore, in absolute
        terms, only the civil vehicles are relevant for top-level stock insights.

    Returns
    -------
    DataFrame : pandas.DataFrame
        DataFrame with processed transport data for China and India.
    """
    if not private_vehicles:
        del FILES["Vehicle stock private"]
    # Load and process data from China
    df = pd.DataFrame()
    for file, skip_footer in FILES.values():
        # Read excel sheet
        df_aux = pd.read_csv(
            package_data_path("transport", file),
            skipfooter=skip_footer,
            header=2,
        )
        df = pd.concat([df, df_aux], ignore_index=True)
    # Drop rows containing sub-categories of rail transport
    df = df.drop(df[df["Indicators"].isin(RAIL_SUB_CAT)].index).reset_index(drop=True)
    df = pd.concat(
        [
            df.drop("Indicators", axis=1),
            df["Indicators"].str.extract(r"(?P<variable>.*) \((?P<units>.*)\)"),
        ],
        axis=1,
    )
    df["Variable"] = df["Variable"].str.replace("Possession", "Vehicle Stock")
    # Reach **tidy data** structure
    df = df.melt(
        id_vars=["Variable", "Units"], var_name="Year", value_name="Value"
    ).sort_values("Year")

    # Add "ISO_code" column to *df*, and move to first position
    df["ISO_code"] = "CHN"
    df = df.set_index("ISO_code").reset_index()

    df["Year"] = pd.to_numeric(df["Year"])

    # Drop 2019 values so it can be concatenated with population values
    df.drop(df[df["Year"] == 2019].index, inplace=True)

    # Split Variable column into Variable and Mode/vehicle type
    df = (
        pd.concat([df, split_variable(df["Variable"])], axis=1)
        # Drop "Variable" and then rename "Var" to "Variable", since "Variable" column
        # was previously returned with that label by split_units()
        .drop(["Variable"], axis=1)
        .rename(columns={"Var": "Variable"})
    )

    # Reorder columns of *df*
    cols = df.columns.tolist()
    cols = [cols[0]] + cols[-2:] + cols[1:4]
    df = df[cols]

    # Concat population values
    df = pd.concat([df, get_chn_ind_pop()], ignore_index=True).sort_values(
        ["ISO_code", "Year", "Variable", "Mode/vehicle type"], ignore_index=True
    )

    # Import IND data from iTEM database
    df_item = get_ind_item_data()

    df_item.drop(columns="Vehicle type", inplace=True)

    # Concat CHN and IND data
    df = pd.concat([df, df_item], ignore_index=True)

    # Convert units
    df = df.groupby("Variable").apply(_convert).reset_index(drop=True)

    # NB: CHN data for *Waterways* includes *Ocean* and inland freight transport.
    # In IND, vehicle type *Container* would map to *Ocean* from CHN and *Inland* to
    # (Waterways minus Ocean) from CHN.

    return df