"""Tools for World Bank data."""
import logging
from collections import defaultdict
from collections.abc import MutableMapping
from functools import lru_cache
from typing import TYPE_CHECKING, Optional
import pandas as pd
if TYPE_CHECKING:
import sdmx.model.common
log = logging.getLogger(__name__)
# FIXME Reduce complexity from 12 → ≤11
[docs]
def assign_income_groups(  # noqa: C901
    cl_node: "sdmx.model.common.Codelist",
    cl_income_group: "sdmx.model.common.Codelist",
    method: str = "population",
    replace: Optional[dict[str, str]] = None,
) -> None:
    """Annotate `cl_node` with income groups.

    Each node that has children is assigned an :class:`.Annotation` with
    :py:`id="wb-income-group"`, according to the income groups of its children
    (countries), as reflected in `cl_income_group` (see
    :func:`.get_income_group_codelist`). Any existing annotation with the same ID on
    the node is replaced. Nodes without children, and nodes for which no child has a
    known income group, are left untouched.

    Parameters
    ----------
    cl_node :
        Code list of nodes; modified in place.
    cl_income_group :
        Code list in which country codes carry a "wb-income-group" annotation.
    method : "population" or "count"
        Method for aggregation:

        - :py:`"population"` (default): the WB World Development Indicators (WDI) 2020
          population for each country is used as a weight, so that the node's income
          group is the income group of the plurality of the population of its
          children. Countries without population data get a weight of 0.
        - :py:`"count"`: each country is weighted equally, so that the node's income
          group is the mode (most frequently occurring value) of its children's.
    replace : dict
        Mapping from wb-income-group annotation text appearing in `cl_income_group` to
        texts to be attached to `cl_node`. Mapping two keys to the same value
        effectively combines or aggregates those groups. See :func:`.make_map`.

    Raises
    ------
    ValueError
        If `method` is not one of the supported values.

    Example
    -------
    Annotate the R12 node list with income group information, mapping high income
    countries (HIC) and upper-middle income countries (UMC) into one group and
    aggregating by population.

    >>> cl_node = get_codelist("node/R12")
    >>> cl_ig = get_income_group_codelist()
    >>> replace = make_map({"HIC": "HMIC", "UMC": "HMIC"})
    >>> assign_income_groups(cl_node, cl_ig, replace=replace)
    >>> cl_node["R12_NAM"].get_annotation(id="wb-income-group").text
    HMIC
    """
    import sdmx
    import sdmx.model.v21 as m

    replace = replace or dict()

    if method == "count":

        def get_weight(code: "sdmx.model.common.Code") -> float:
            """Weight of the country `code` in aggregation."""
            return 1.0

    elif method == "population":
        # Work around khaeru/sdmx#191: ensure the HTTPS URL is used
        client = sdmx.Client("WB_WDI")
        client.source.url = client.source.url.replace("http://", "https://")

        # Retrieve WB_WDI data for SERIES=SP_POP_TOTAL (Population, total)
        dm = client.data(
            "WDI", key="A.SP_POP_TOTL.", params=dict(startPeriod=2020, endPeriod=2020)
        )

        # Convert to pd.Series with multi-index with levels: REF_AREA, SERIES, FREQ,
        # TIME_PERIOD. Because of the query, there is only 1 value for each unique
        # REF_AREA.
        df = sdmx.to_pandas(dm.data[0])

        def get_weight(code: "sdmx.model.common.Code") -> float:
            """Return a weight for the country `code`: its total population."""
            try:
                return df[code.id].item()
            except KeyError:
                # No population data for this country → contributes nothing
                return 0.0

    else:  # pragma: no cover
        raise ValueError(f"method={method!r}")

    weight_info = {}  # For debugging

    # Iterate over nodes
    for node in cl_node:
        if not len(node.child):
            continue  # Country → skip

        # Total weight of different income groups among `node`'s countries
        weight = _income_group_weights(node, cl_income_group, replace, get_weight)

        if {None} == set(weight):
            continue  # World node → no direct children that are countries

        # Sort from largest weight to smallest; ties are broken alphabetically by
        # group ID. The key never compares None with str (which would raise
        # TypeError when weights tie); None sorts after any str with equal weight.
        weight_sorted = sorted(
            weight.items(),
            key=lambda item: (-item[1], item[0] is None, item[0] or ""),
        )
        weight_info[node.id] = pd.Series(dict(weight_sorted))

        # Identify the income group with the largest weight; not None
        ig = next(group for group, _ in weight_sorted if group is not None)

        try:
            # Remove any existing annotation
            node.pop_annotation(id="wb-income-group")
        except KeyError:
            pass

        # Annotate the node
        node.annotations.append(m.Annotation(id="wb-income-group", text=ig))

    # pd.concat() raises ValueError on an empty mapping, and building the table is
    # wasted work unless it will actually be emitted
    if weight_info and log.isEnabledFor(logging.DEBUG):
        log.debug(
            "(node, group) weights:\n"
            + pd.concat(weight_info, axis=1).fillna(0).to_string()
        )


def _income_group_weights(
    node: "sdmx.model.common.Code",
    cl_income_group: "sdmx.model.common.Codelist",
    replace: dict[str, str],
    get_weight,
) -> MutableMapping[Optional[str], float]:
    """Return the total weight of each income group among the children of `node`.

    `get_weight` is called with each child code and returns its weight. Children that
    are not in `cl_income_group`, or that lack a "wb-income-group" annotation, are
    accumulated under the key :obj:`None`.
    """
    weight: MutableMapping[Optional[str], float] = defaultdict(lambda: 0.0)

    for country in node.child:
        # Identify the income group of `country` from an annotation
        try:
            ig: Optional[str] = str(
                cl_income_group[country.id].get_annotation(id="wb-income-group").text
            )
        except KeyError:
            # country.id is not in cl_income_group, or no such annotation
            ig = None
        else:
            # Apply replacement to `ig`
            ig = replace.get(ig, ig)

        weight[ig] += get_weight(country)

    return weight
[docs]
def fetch_codelist(id: str) -> "sdmx.model.common.Codelist":
    """Return one code list related to the WB World Development Indicators.

    In principle this could be done with :py:`sdmx.Client("WB_WDI").codelist(id)`, but
    the World Bank SDMX REST API does not support queries for a specific code list. See
    https://datahelpdesk.worldbank.org/knowledgebase/articles/1886701-sdmx-api-queries.

    Instead, the structure message containing *all* code lists is retrieved from
    https://api.worldbank.org/v2/sdmx/rest/codelist/WB/ using :mod:`pooch`, and the
    code list with the given `id` is extracted from it and returned.

    Parameters
    ----------
    id : str
        ID of the code list to return, for instance "CL_REF_AREA_WDI".
    """
    import pooch
    import sdmx

    # Local path of the retrieved structure message
    path = pooch.retrieve(
        url="https://api.worldbank.org/v2/sdmx/rest/codelist/WB/",
        known_hash=None,
    )

    # Parse the SDMX StructureMessage, then pick out the requested code list
    message = sdmx.read_sdmx(path)
    codelists = message.codelist
    return codelists[id]
[docs]
@lru_cache()
def get_income_group_codelist() -> "sdmx.model.common.Codelist":
    """Return a :class:`.Codelist` with World Bank income group information.

    The returned code list is a modified version of the one with URN
    ``…Codelist=WB:CL_REF_AREA_WDI(1.0)``, via :func:`.fetch_codelist`.

    This is augmented with information about the income group and lending category
    concepts as described at
    https://datahelpdesk.worldbank.org/knowledgebase/articles/906519

    The information is stored two ways:

    - Existing codes in the list like "HIC: High income" that designate groups of
      countries are associated with child codes that are designated as members of that
      group. These can be accessed at :attr:`Code.child
      <sdmx.model.common.Item.child>`.
    - Existing codes in the list like "ABW: Aruba" are annotated with:

      - :py:`id="wb-income-group"`: the URN of the income group code, for instance
        "urn:sdmx:org.sdmx.infomodel.codelist.Code=WB:CL_REF_AREA_WDI(1.0).HIC". This
        is an unambiguous reference to a code in the same list.
      - :py:`id="wb-lending-category"`: the name of the lending category, if any.

      These can be accessed using :attr:`Code.annotations
      <sdmx.model.common.AnnotableArtefact.annotations>`, :attr:`Code.get_annotation
      <sdmx.model.common.AnnotableArtefact.get_annotation>`, and other methods.

    The result is cached with :func:`functools.lru_cache`, so repeated calls return
    the same object.
    """
    import pooch
    import sdmx.model.v21 as m

    cl = fetch_codelist("CL_REF_AREA_WDI")

    @lru_cache()
    def urn_for(name: str) -> str:
        """Return the URN of a code in `cl`, given its `name`."""
        for code in cl:
            if str(code.name) == name:
                return code.urn
        raise ValueError(name)  # pragma: no cover

    # Fetch the file containing the classification
    file = pooch.retrieve(
        url="https://datacatalogfiles.worldbank.org/ddh-published/0037712/DR0090755/"
        "CLASS.xlsx",
        known_hash="sha256:"
        "1418a4fd6badb7c26ae2bc3a9bfef4903f3d9c54c1679f856e1dece3c729e935",
    )

    # Read every needed sheet while the workbook is open; the context manager ensures
    # the underlying file handle is closed even if parsing fails
    with pd.ExcelFile(file) as ef:
        economies = (
            pd.read_excel(ef, sheet_name="List of economies")
            .drop(["Economy", "Region"], axis=1)
            .dropna(subset=["Income group"], axis=0)
            .set_index("Code")
        )
        groups = pd.read_excel(ef, sheet_name="Groups").drop(
            ["GroupName", "CountryName", "Unnamed: 4"], axis=1
        )
        notes = pd.read_excel(ef, sheet_name="Notes").dropna()["Notes"]

    # "List of economies" sheet → store wb-{income-group,lending-category}
    for code in cl:
        try:
            row = economies.loc[code.id, :]
        except KeyError:
            # Not in the "List of economies" sheet → nothing to annotate
            continue

        # Annotate wb-income-group; map a value like "Low income" to a URN
        code.annotations.append(
            m.Annotation(id="wb-income-group", text=urn_for(row["Income group"]))
        )

        try:
            code.annotations.append(
                m.Annotation(id="wb-lending-category", text=row["Lending category"])
            )
        except ValueError:
            pass  # text was None → no value

    # "Groups" sheet → assign hierarchy
    for group_id, group_df in groups.groupby("GroupCode"):
        try:
            # Identify the Code for this group ID
            group = cl[group_id]
        except KeyError:
            # The group is not in the code list → skip
            continue

        for child_id in sorted(group_df["CountryCode"]):
            try:
                group.append_child(cl[child_id])
            except KeyError:
                # No code for this child → skip
                continue

    # "Notes" sheet → append to description of `cl`
    notes_text = "\n\n".join(notes)

    # Ensure the "en" localization exists
    cl.description.localizations.setdefault("en", "")
    cl.description.localizations["en"] += (
        "\n\nThis code list has been modified from the official version by the "
        "'message-ix-models' Python package to add annotations and hierarchy parsed "
        "from the World Bank income groups and lending categories as described at "
        "https://datahelpdesk.worldbank.org/knowledgebase/articles/906519. The original"
        f" Excel file parsed includes the following descriptive text:\n\n{notes_text}"
    )

    return cl
[docs]
def make_map(
    source: dict[str, str], expand_key_urn: bool = True, expand_value_urn: bool = False
) -> dict[str, str]:
    """Prepare the :py:`replace` parameter of :func:`assign_income_groups`.

    The result is a new :class:`dict` with one (`key`, `value`) pair for each pair in
    `source`; `source` itself is not modified.

    Parameters
    ----------
    source : dict
        Mapping from code IDs to code IDs or other text.
    expand_key_urn : bool
        If :obj:`True` (the default), replace each `key` from `source` with the URN for
        the code in ``CL_REF_AREA_WDI`` with :py:`id=key`.
    expand_value_urn : bool
        If :obj:`True`, replace each `value` from `source` with the URN for the code in
        ``CL_REF_AREA_WDI`` with :py:`id=value`.

    Raises
    ------
    KeyError
        If expansion is requested for a key or value that is not the ID of a code in
        ``CL_REF_AREA_WDI``.
    """
    if not (expand_key_urn or expand_value_urn):
        # Nothing to expand → avoid retrieving and parsing the code list at all
        return dict(source)

    # Retrieve the code list
    cl = fetch_codelist("CL_REF_AREA_WDI")

    return {
        (cl[key].urn if expand_key_urn else key): (
            cl[value].urn if expand_value_urn else value
        )
        for key, value in source.items()
    }