Source code for message_ix_models.tools.newclimate

"""Handle data from the NewClimate Institute's Climate Policy Database (CPDB).

This module provides:

- :class:`.NewClimatePolicy`, a concrete subclass of the abstract/generic
  :class:`.Policy`, that reflects the data model appearing in the CPDB.

  - Enumerations that reflect values appearing in fields of the database which appear to
    be enumerated (as opposed to free text):
    :class:`HIGH_IMPACT`,
    :class:`JURISDICTION`,
    :class:`OBJECTIVE`,
    :class:`SECTOR`,
    :class:`STATUS`,
    :class:`STRINGENCY`,
    :class:`TYPE`, and
    :class:`UPDATE`.

  - A method :meth:`.NewClimatePolicy.from_csv_dict` that interprets the CSV data
    format in which the database is expressed.

- Functions to :func:`fetch` versions of the database from Zenodo, :func:`read` into
  collections of Python objects, or do both (:func:`get`).

These enable programmatic use of the information in the database. For example:

.. code-block:: python

   from message_ix_models.tools.newclimate import SECTOR, get
   from pycountry import countries

   # Fetch and parse the 2024 edition of the database
   policies = get("2024")
   print(len(policies))  # 6507 objects

   # Filter the dict to a list of policy objects matching a certain sector
   p_transport = list(filter(lambda p: SECTOR.Transport in p.sector, policies.values()))
   print(len(p_transport))  # 1298 objects

   # Filter for any policies concerning the country of Austria, or the EU
   match = {pycountry.lookup("Austria"), "European Union"}
   p_AUT = list(filter(lambda p: set(p.geo) & match, policies.values()))
   print(len(p_AUT)))  # 259 objects

.. todo:: Extend the module:

   - Serialize :class:`.NewClimatePolicy` objects in 1 or more formats, preferably
     standards-based.
   - :func:`fetch` versions of the database more recent than the latest Zenodo record,
     using the `cpdb_api package
     <https://github.com/https-github-com-NewClimateInstitute/CPDB-API>`_ or other code.
   - Convert to/from other data models.
"""

import csv
import logging
from functools import cache
from typing import TYPE_CHECKING

from .structure import (
    HIGH_IMPACT,
    JURISDICTION,
    OBJECTIVE,
    SECTOR,
    STATUS,
    STRINGENCY,
    TYPE,
    UPDATE,
    NewClimatePolicy,
)

if TYPE_CHECKING:
    from pathlib import Path

__all__ = [
    "HIGH_IMPACT",
    "JURISDICTION",
    "NewClimatePolicy",
    "OBJECTIVE",
    "SECTOR",
    "STATUS",
    "STRINGENCY",
    "TYPE",
    "UPDATE",
    "read",
    "get",
    "fetch",
]

log = logging.getLogger(__name__)

#: Pooch information for fetching files from the static version of the database.
SOURCE = {  # noqa: E501
    "newclimate-2024": dict(
        pooch_args=dict(
            base_url="doi:10.5281/zenodo.15432946",
            registry={
                "ClimatePolicyDatabase_v2024.csv": (
                    "sha256:e893745bc26d225d8e91d063eb1fdbcbb5da4a51ce05d28ce5b9f51f6ef4408f"
                ),
            },
        ),
    ),
    "newclimate-2023": dict(
        pooch_args=dict(
            base_url="doi:10.5281/zenodo.10869734",
            registry={
                "ClimatePolicyDatabase_v2023.xlsx": (
                    "sha256:bdce700c6b0c2eeb7fa06584cb8523793b64ec5799d91ae65818209aaf9de682"
                ),
            },
        ),
    ),
    "newclimate-2022": dict(
        pooch_args=dict(
            base_url="doi:10.5281/zenodo.7774473",
            registry={
                "ClimatePolicyDatabase_v2022.csv": (
                    "sha256:fe431e41c4c2fb8513d6718fba6ba3bc0a1fd2c5b9016256a106b998f5f48946"
                ),
            },
        ),
    ),
    "newclimate-2021": dict(
        pooch_args=dict(
            base_url="doi:10.5281/zenodo.7774471",
            registry={
                "ClimatePolicyDatabase_v2021.xlsx": (
                    "sha256:d880c2c94c7d8da84bb9cf8d315faf7230e4965cbc679ac1783222ecfe84062a"
                ),
            },
        ),
    ),
    "newclimate-2020": dict(
        pooch_args=dict(
            base_url="doi:10.5281/zenodo.7774462",
            registry={
                "ClimatePolicyDatabase_v2020.xlsx": (
                    "sha256:08818156401200ec094985c34250ef65cea6ff5246cbbeb1d0ade317f8fdaa0c"
                ),
            },
        ),
    ),
    "newclimate-2019": dict(
        pooch_args=dict(
            base_url="doi:10.5281/zenodo.7774110",
            registry={
                "ClimatePolicyDatabase _v2019.xlsx": (
                    "sha256:c28cdd613496d503ae00bacf637fc052128e04361580110829843b4bf0235368"
                ),
            },
        )
    ),
}


[docs] def fetch(version: str) -> "Path": """Retrieve data for `version` of the Climate Policy Database from Zenodo.""" from message_ix_models.util import pooch # Ensure sources for this module are registered pooch.SOURCE.update(SOURCE) # Construct the key source_id = f"newclimate-{version}" return pooch.fetch(**pooch.SOURCE[source_id], extra_cache_path="newclimate")[0]
[docs] def get(version: str) -> dict[str, NewClimatePolicy]: """:func:`fetch` and then :func:`read` data for `version` of the database.""" f_source = fetch(version) if f_source.suffix == ".xlsx": # Convert Excel to CSV import pandas as pd f_read = f_source.with_suffix(".csv") if not f_read.exists(): log.info(f"Unpack {f_source} to {f_read}") pd.read_excel(f_source).to_csv(f_read, index=False) else: f_read = f_source # - Force use of UTF-8 on macOS and Windows. # - The 2022 CSV file is not in UTF-8 format; use a different encoding. kwargs = dict(encoding="latin-1" if version == "2022" else "utf-8") try: return read(f_read, **kwargs) except Exception as e: if version in ("2021", "2020", "2019"): raise NotImplementedError("Read 2021 and earlier data format") from e else: # pragma: no cover raise
[docs] @cache def read(path: "Path", **kwargs) -> dict[str, NewClimatePolicy]: """Read a CSV file into a :class:`dict` of Policy objects. Returns ------- dict Keys are :attr:`.NewClimatePolicy.id`. If the file contains records with the same IDs, only the last appears, and a warning is logged. """ with open(path, **kwargs) as f: policies = [NewClimatePolicy.from_csv_dict(row) for row in csv.DictReader(f)] result = {p.id: p for p in policies} if len(result) < len(policies): log.warning(f"{len(policies) - len(result)} duplicate IDs in `path`") return result