Source code for message_ix_models.model.transport.CHN_IND

"""Retrieve transport activity data for China (Statistical Yearbook of the National
Bureau of Statistics) and for India (iTEM)."""

import numpy as np
import pandas as pd
from iam_units import registry
from item import historical  # type: ignore [import-not-found]

from message_ix_models.util import package_data_path

UNITS = {
    "Population": (1.0e-6, None, "dimensionless"),
    "Vehicle Stock": (1.0e4, "vehicle", "thousand vehicle"),
    "Passenger-Kilometers": (100, "megapkm", "gigapkm"),
    "Freight Ton-Kilometers": (100, "megatkm", "gigatkm"),
}


SDMX_MAP = {
    "REF_AREA": "ISO_code",
    "VARIABLE": "Variable",
    "VEHICLE": "Vehicle type",
    "MODE": "Mode/vehicle type",
    "UNIT": "Units",
    "TIME_PERIOD": "Year",
    "VALUE": "Value",
}
#: Files containing data for China ("name_of_file", rows_to_skip_from_the_bottom)
FILES = {
    "Passenger activity": ("CHN_activity-passenger.csv", 4),
    "Vehicle stock civil": ("CHN_stock-civil.csv", 5),
    "Vehicle stock private": ("CHN_stock-private.csv", 1),
    "Freight activity": ("CHN_activity-freight.csv", 5),
}

POP_FILE = "CHN_IND_population.csv"

# Rail sub-categories to be removed from Chinese dataset
RAIL_SUB_CAT = [
    "Freight Ton-Kilometers of National Railways(100 million ton-km)",
    "Freight Ton-Kilometers of Local Railways(100 million ton-km)",
    "Freight Ton-Kilometers of Joint-venture Railways(100 million ton-km)",
    "Passenger-Kilometers of National Railways(100 million passenger-km)",
    "Passenger-Kilometers of Local Railways(100 million passenger-km)",
    "Passenger-Kilometers of Joint-venture Railways(100 million passenger-km)",
]

# Mapping of variables to vehicle modes
FILL_VALUES = {
    "Passenger-Kilometers": "Total passenger transport",
    "Freight Ton-Kilometers": "Total freight transport",
}


# TODO: migrate to convert_units.py, as an extra argument: e.g. axis=1 being the
#  current way (converting per var names in columns) or axis=0 (converting per rows
#  as coded in _covert below).
def _convert(group_df):
    """ ""Convert units of a pandas.groupby object based on UNITS."""
    info = UNITS.get(group_df["Variable"].unique()[0], None)
    if not info:
        return group_df
    factor, unit_in, unit_out = info
    qty = registry.Quantity(factor * group_df["Value"].values, unit_in).to(
        unit_out or unit_in
    )
    return group_df.assign(Value=qty.magnitude, Units=qty.units)


[docs] def split_variable(s) -> pd.DataFrame: """Split strings in :class:`pandas.Series` *s* into Variable and Mode. Parameters ---------- s : pandas.Series Returns ------- DataFrame : pandas.DataFrame DataFrame with two columns: Variable names and their respective Mode. """ # Split str in *s* into variable name and units df = s.str.rsplit(pat=" of ", n=1, expand=True) # Remove residual parentheses from variable column, still present in some entries for col in list(df.columns): df[col] = df[col].str.rsplit(pat="(", n=1, expand=True)[0] # Assign labels to columns df.columns = ["Var", "Mode/vehicle type"] # Use mapping FILL_VALUES to replace NaNs in "Mode/vehicle type" column for key, value in FILL_VALUES.items(): df[df["Var"] == key] = df[df["Var"] == key].fillna(value) # Alternative not working: # def func(df): # a_func = lambda group: group["target_col"].fillna(FILL_VALUES.get(group[0])) # return df.assign(target_col=a_func) # df.groupby(0).pipe(func) return df
[docs] def get_ind_item_data() -> pd.DataFrame: """Retrieve activity data for rail and road transport for India from iTEM. Data is obtained from iTEM database's file ``T000.csv`` and filtered for the period 2000-2018. Data for China is not retrieved since it comes from the NBSC. Returns ------- DataFrame : pandas.DataFrame DataFrame with transport data for India. """ # Import data from iTEM database historical files: all_data = pd.DataFrame() for x in range(0, 13, 1): # Exclude non-existent historical file 11 if x != 11: df = historical.process(x) df.drop( columns=[x for x in list(df.columns) if x not in list(SDMX_MAP.keys())], inplace=True, ) # Rename columns using SDMX_MAP dict, to match dataset format in # get_chn_ind_data() df.columns = df.columns.to_series().map(SDMX_MAP) # Filter values for IND between 2000-2018 df = df[ (df["ISO_code"].isin(["IND"])) & (df["Year"].isin(list(np.arange(2000, 2019)))) ].sort_values( ["ISO_code", "Mode/vehicle type", "Vehicle type"], ignore_index=True ) # Concat whenever data is retrieved all_data = pd.concat([all_data, df], ignore_index=True) return all_data
[docs] def get_chn_ind_pop() -> pd.DataFrame: """Retrieve population data for China and India. The dataset is a ``.csv`` file in */data* and was retrieved from `OECD <https://stats.oecd.org/Index.aspx?#>`_ website, filtering data for China and India. Returns ------- DataFrame : pandas.DataFrame DataFrame containing population data for China and India. """ # Read csv file pop = pd.read_csv(package_data_path("transport", POP_FILE), header=1) # Drop irrelevant columns and rename when necessary pop = pop.drop( [x for x in pop.columns if x not in ["LOCATION", "Time", "Value"]], axis=1, ).rename(columns={"LOCATION": "ISO_code", "Time": "Year"}) # Add "Variable" name column pop["Variable"] = "Population" return pop
[docs] def get_chn_ind_data(private_vehicles=False) -> pd.DataFrame: """Read transport activity and vehicle stock data for China and India. The data is read from ``data/transport`` folder (data for China from NBSC) and imported from iTEM project (data for India). Then, it is processed into the same format as the IEA's EEI datasets -to be used for MESSAGEix-Transport calibration. Parameters ---------- private_vehicles : bool, optional If ``True``, also data for private vehicles will be parsed. However, it is set by default to ``False`` since this category is included within the civil vehicles, which are read from another file. Therefore, in absolute terms, only the civil vehicles are relevant for top-level stock insights. Returns ------- DataFrame : pandas.DataFrame DataFrame with processed transport data for China and India. """ if not private_vehicles: del FILES["Vehicle stock private"] # Load and process data from China df = pd.DataFrame() for file, skip_footer in FILES.values(): # Read excel sheet df_aux = pd.read_csv( package_data_path("transport", file), skipfooter=skip_footer, header=2, ) df = pd.concat([df, df_aux], ignore_index=True) # Drop rows containing sub-categories of rail transport df = df.drop(df[df["Indicators"].isin(RAIL_SUB_CAT)].index).reset_index(drop=True) df = pd.concat( [ df.drop("Indicators", axis=1), df["Indicators"].str.extract(r"(?P<variable>.*) \((?P<units>.*)\)"), ], axis=1, ) df["Variable"] = df["Variable"].str.replace("Possession", "Vehicle Stock") # Reach **tidy data** structure df = df.melt( id_vars=["Variable", "Units"], var_name="Year", value_name="Value" ).sort_values("Year") # Add "ISO_code" column to *df*, and move to first position df["ISO_code"] = "CHN" df = df.set_index("ISO_code").reset_index() df["Year"] = pd.to_numeric(df["Year"]) # Drop 2019 values so it can be concatenated with population values df.drop(df[df["Year"] == 2019].index, inplace=True) # Split Variable column into Variable and Mode/vehicle type df = ( pd.concat([df, split_variable(df["Variable"])], axis=1) # Drop "Variable" and then rename "Var" to "Variable", since "Variable" column # was previously returned with that label by split_units() .drop(["Variable"], axis=1) .rename(columns={"Var": "Variable"}) ) # Reorder columns of *df* cols = df.columns.tolist() cols = [cols[0]] + cols[-2:] + cols[1:4] df = df[cols] # Concat population values df = pd.concat([df, get_chn_ind_pop()], ignore_index=True).sort_values( ["ISO_code", "Year", "Variable", "Mode/vehicle type"], ignore_index=True ) # Import IND data from iTEM database df_item = get_ind_item_data() df_item.drop(columns="Vehicle type", inplace=True) # Concat CHN and IND data df = pd.concat([df, df_item], ignore_index=True) # Convert units df = df.groupby("Variable").apply(_convert).reset_index(drop=True) # NB: CHN data for *Waterways* includes *Ocean* and inland freight transport. # In IND, vehicle type *Container* would map to *Ocean* from CHN and *Inland* to # (Waterways minus Ocean) from CHN. return df