Source code for message_ix_models.tests.test_util

"""Tests of :mod:`message_ix_models.util`."""

import logging
import re
from importlib.metadata import version
from pathlib import Path

import numpy as np
import pandas as pd
import pytest
from iam_units import registry
from ixmp.testing import assert_logs
from message_ix import Scenario, make_df
from message_ix.testing import make_dantzig
from packaging.version import parse
from pandas.testing import assert_series_equal

from message_ix_models import ScenarioInfo
from message_ix_models.util import (
    MESSAGE_DATA_PATH,
    MESSAGE_MODELS_PATH,
    as_codes,
    broadcast,
    check_support,
    convert_units,
    copy_column,
    ffill,
    iter_parameters,
    load_package_data,
    load_private_data,
    local_data_path,
    make_source_tech,
    maybe_query,
    package_data_path,
    path_fallback,
    private_data_path,
    replace_par_data,
    same_node,
    same_time,
    strip_par_data,
)

_actual_package_data = Path(__file__).parents[1].joinpath("data")



[docs]
def test_as_codes():
    """Forward reference to a child is silently dropped."""
    data = dict(
        foo=dict(child=["bar"]),
        bar=dict(name="Bar!"),
    )
    result = as_codes(data)
    assert result[1] not in result[0].child

    # With Codes already, the function is a pass-through
    assert result == as_codes(result)




[docs]
def test_broadcast(caplog):
    # Base data frame to be broadcast, with 2 rows and dimensions:
    # - a: length 2
    # - b, c, d: missing
    N_a = 2
    base = pd.DataFrame([["a0", 1.2], ["a1", 3.4]], columns=["a", "value"]).assign(
        b=None, c=None, d=None
    )

    # broadcast works with DataFrame.pipe(), using keyword arguments
    result = base.pipe(
        broadcast, b="b0 b1 b2".split(), c="c0 c1 c2 c3".split(), d=["d0"]
    )
    # Results have the expected length: original × cartesian product of 3, 4, and 1
    assert N_a * 3 * 4 * 1 == len(result)
    # Resulting array is completely full, no missing labels
    assert not result.isna().any(axis=None)

    # Length zero labels for one dimension—debug message is logged
    with caplog.at_level(logging.DEBUG, logger="message_ix_models"):
        result = base.pipe(broadcast, b="b0 b1".split(), c="c0 c1".split(), d=[])

    # Debug message is logged
    assert "Don't broadcast over 'd'; labels [] have length 0" in caplog.messages
    caplog.clear()
    assert N_a * 2 * 2 * 1 == len(result)  # Expected length
    assert result["d"].isna().all()  # Dimension d remains empty
    assert not result.drop("d", axis=1).isna().any(axis=None)  # Others completely full

    # Using a DataFrame as the first/only positional argument, plus keyword arguments
    labels = pd.DataFrame(dict(b="b0 b1 b2".split(), c="c0 c1 c2".split()))

    result = base.pipe(broadcast, labels, d="d0 d1".split())
    assert N_a * 3 * 2 == len(result)  # (b, c) dimensions linked with 3 pairs of labels
    assert not result.isna().any(axis=None)  # Completely full

    # Using a positional argument with only 1 column
    result = base.pipe(broadcast, labels[["b"]], c="c0 c1 c2 c3".split(), d=["d0"])
    assert N_a * 3 * 4 * 1 == len(result)  # Expected length
    assert not result.isna().any(axis=None)  # Completely full

    # Overlap between columns in the positional argument and keywords
    with pytest.raises(ValueError):
        result = base.pipe(broadcast, labels, c="c0 c1 c2 c3".split(), d=["d0"])

    # Extra, invalid dimensions result in ValueError
    with pytest.raises(ValueError):
        base.pipe(broadcast, b="b0 b1 b2".split(), c="c0 c1 c2 c3".split(), e=["e0"])

    labels["e"] = "e0 e1 e2".split()

    with pytest.raises(ValueError):
        base.pipe(broadcast, labels, d=["d0"])




[docs]
@pytest.mark.parametrize(
    "data",
    (
        set(),
        # dict() with a value that is not a str or a further dict()
        dict(foo="foo", bar=[1, 2, 3]),
    ),
)
def test_as_codes_invalid(data):
    """as_codes() rejects invalid data."""
    with pytest.raises(TypeError):
        as_codes(data)




[docs]
def test_check_support(test_context):
    """:func:`.check_support` raises an exception for missing/non-matching values."""
    args = [test_context, dict(regions=["R11", "R14"]), "Test data available"]

    # Setting not set → KeyError
    with pytest.raises(KeyError, match="baz"):
        check_support(test_context, dict(baz=["baz"]), "Baz is not set")

    # Accepted value
    test_context.regions = "R11"
    check_support(*args)

    # Wrong setting
    test_context.regions = "FOO"
    with pytest.raises(
        NotImplementedError,
        match=re.escape("Test data available for ['R11', 'R14']; got 'FOO'"),
    ):
        check_support(*args)




[docs]
def test_convert_units(recwarn):
    """:func:`.convert_units` works."""
    # Common arguments
    args = [pd.Series([1.1, 10.2, 100.3], name="bar"), dict(bar=(10.0, "lb", "kg"))]

    exp = pd.Series(
        [registry("4.9895 kg"), registry("46.2664 kg"), registry("454.9531 kg")],
    )

    # With store="quantity", a series of pint.Quantity is returned
    result = convert_units(*args, store="quantity")
    # Will raise a DimensionalityError if units are not equal
    ratios = [(a / b) for a, b in zip(exp.values, result.values)]
    # Assert equal units and sufficiently close values
    for ratio in ratios:
        assert ratio.dimensionless and np.isclose(ratio, 1, atol=1e-4)

    # With store="magnitude", a series of floats
    exp = pd.Series([q.magnitude for q in exp.values], name="bar")
    assert_series_equal(exp, convert_units(*args, store="magnitude"), check_dtype=False)

    N = len(recwarn)

    # Other values for store= are errors
    with pytest.raises(ValueError, match="store = 'foo'"):
        convert_units(*args, store="foo")

    # series_of_pint_quantity() successfully caught warnings
    assert N == len(recwarn)




[docs]
def test_copy_column():
    df = pd.DataFrame([[0, 1], [2, 3]], columns=["a", "b"])
    df = df.assign(c=copy_column("a"), d=4)
    assert all(df["c"] == [0, 2])
    assert all(df["d"] == 4)




[docs]
def test_ffill():
    years = list(range(6))

    df = (
        make_df(
            "fix_cost",
            year_act=[0, 2, 4],
            year_vtg=[0, 2, 4],
            technology=["foo", "bar", "baz"],
            unit="USD",
        )
        .pipe(broadcast, node_loc=["A", "B", "C"])
        .assign(value=list(map(float, range(9))))
    )

    # Function completes
    result = ffill(df, "year_vtg", years, "year_act = year_vtg")

    assert 2 * len(df) == len(result)
    assert years == sorted(result["year_vtg"].unique())

    # Cannot ffill on "value" and "unit" dimensions
    with pytest.raises(ValueError, match="value"):
        ffill(df, "value", [])


    # TODO test some specific values



[docs]
@pytest.mark.skipif(
    parse(version("ixmp")) > parse("3.7.0"), reason="Not used with ixmp > 3.7.0"
)
def test_iter_parameters(test_context):
    """Parameters indexed by set 'node' can be retrieved."""
    result = list(iter_parameters("node"))
    assert result[0] == "abs_cost_activity_soft_lo"
    assert result[-1] == "var_cost"
    # The length of this list depends on message_ix. Changes in message_ix may increase
    # the number of parameters, so use <= to future-proof. See the method comments.
    assert 99 <= len(result)




[docs]
@pytest.mark.parametrize("path", _actual_package_data.rglob("*.yaml"))
def test_load_package_data(path):
    """Existing package data can be loaded."""
    load_package_data(*path.relative_to(_actual_package_data).parts)




[docs]
def test_load_package_data_twice(caplog):
    """Loading the same data twice logs a message."""
    caplog.set_level(logging.DEBUG, logger="message_ix_models")
    load_package_data("node", "R11")
    load_package_data("node", "R11")
    assert "'node R11' already loaded; skip" in caplog.messages




[docs]
def test_load_package_data_invalid():
    """load_package_data() raises an exception for an unsupported file type."""
    with pytest.raises(ValueError):
        load_package_data("test.xml")




[docs]
@pytest.mark.xfail(
    condition=MESSAGE_DATA_PATH is None, reason="Requires message_data to be installed."
)
def test_load_private_data(*parts, suffix=None):
    load_private_data("sources.yaml")



_MST_COMMON = dict(
    commodity="commodity",
    level="level",
    mode="mode",
    technology="technology",
    time="time",
    time_dest="time",
    unit="unit",
)
_MST_VALUES = dict(
    capacity_factor=1.0,
    output=2.0,
    var_cost=3.0,
    technical_lifetime=4.0,
)



[docs]
def test_make_source_tech0():
    info = ScenarioInfo()
    info.set["node"] = ["World", "node0", "node1"]
    info.set["year"] = [1, 2, 3]

    values = _MST_VALUES.copy()

    # Code runs
    result = make_source_tech(info, _MST_COMMON, **values)

    # Result is dictionary with the expected keys
    assert isinstance(result, dict)
    assert set(result.keys()) == set(values.keys())

    # "World" node does not appear in results
    assert set(result["output"]["node_loc"].unique()) == set(info.N[1:])

    for df in result.values():
        # Results have 2 nodes × 3 years
        assert len(df) == 2 * 3
        # No empty values
        assert not df.isna().any(axis=None)

    del values["var_cost"]
    with pytest.raises(ValueError, match=re.escape("needs values for {'var_cost'}")):
        make_source_tech(info, _MST_COMMON, **values)




[docs]
def test_make_source_tech1(test_mp):
    """Test make_source_tech() with a Scenario object as input."""
    s = Scenario(test_mp, model="model", scenario="scenario", version="new")
    s.add_set("node", ["World", "node0", "node1"])
    s.add_set("technology", ["t"])
    s.add_horizon([1, 2, 3])
    s.commit("")

    make_source_tech(s, _MST_COMMON, **_MST_VALUES)




[docs]
def test_maybe_query():
    """:func:`.maybe_query` works as intended."""
    s = pd.Series(
        [0, 1, 2, 3],
        index=pd.MultiIndex.from_product(
            [["a", "b"], ["c", "d"]], names=["foo", "bar"]
        ),
    )

    # No-op
    assert_series_equal(s, maybe_query(s, None))

    # Select a few rows
    assert 2 == len(maybe_query(s, "bar == 'c'"))




[docs]
def test_local_data_path(tmp_path_factory, session_context):
    assert tmp_path_factory.getbasetemp().joinpath(
        "data0", "foo", "bar"
    ) == local_data_path("foo", "bar")




[docs]
def test_package_data_path():
    assert MESSAGE_MODELS_PATH.joinpath("data", "foo", "bar") == package_data_path(
        "foo", "bar"
    )




[docs]
def test_path_fallback(caplog):
    # Can be called with where=list() including both strings and paths
    result = path_fallback(
        "test", "macro", "kgdp.csv", where=["private", package_data_path()]
    )
    assert package_data_path("test", "macro", "kgdp.csv") == result

    assert 1 <= len(caplog.messages)
    assert caplog.messages[-1].startswith("Not found: ")
    caplog.clear()

    # "package", "private", and "test" each expanded to a path
    with pytest.raises(ValueError, match=r"'foo.bar' not found in any of \["):
        path_fallback("foo", "bar", where="cache local package private test")

    assert 4 <= len(caplog.messages)
    assert caplog.messages[-1].startswith("Not found: ")
    caplog.clear()

    # Empty argument raises an exception
    with pytest.raises(ValueError, match="No directories identified among ''"):
        path_fallback("foo", "bar")
    assert 0 == len(caplog.messages)




[docs]
@pytest.mark.xfail(
    condition=MESSAGE_DATA_PATH is None, reason="Requires message_data to be installed."
)
def test_private_data_path():
    assert MESSAGE_DATA_PATH.joinpath("data", "foo", "bar") == private_data_path(
        "foo", "bar"
    )




[docs]
@pytest.mark.parametrize(
    "name, func, col", [("node", same_node, "node_loc"), ("time", same_time, "time")]
)
def test_same(name, func, col):
    """Test both :func:`.same_node` and :func:`.same_time`."""
    df_in = pd.DataFrame(
        {
            col: ["foo", "bar", "baz"],
            f"{name}_dest": None,
            f"{name}_origin": None,
            "value": [1.1, 2.2, 3.3],
        }
    )

    df_out = func(df_in)

    assert not df_out.isna().any(axis=None)
    assert_series_equal(df_out[f"{name}_dest"], df_in[col], check_names=False)
    assert_series_equal(df_out[f"{name}_origin"], df_in[col], check_names=False)




[docs]
def test_replace_par_data(caplog, test_context):
    """Test :func:`.replace_par_data`."""
    # Generate a scenario. This scenario has 3 data points in each of "input" and
    # "output" with technology="transport_from_seattle".
    s = make_dantzig(test_context.get_platform())

    # Arguments to replace_par_data()
    parameters = ["input", "output"]
    filters = dict(mode=["to_chicago", "to_topeka"])
    to_replace = dict(technology={"transport_from_seattle": "tfs"})

    with s.transact("Add a new set element, to which values will be renamed"):
        s.add_set("technology", "tfs")

    # Function runs
    replace_par_data(s, parameters, filters=filters, to_replace=to_replace)

    for data in map(lambda n: s.par(n, filters=dict(node_loc="seattle")), parameters):
        # Data points selected by `filters` have been relabeled
        assert 2 == len(data.query("technology == 'tfs'"))

        # Data points not selected by `filters` are not affected
        assert 1 == len(data.query("technology == 'transport_from_seattle'"))




[docs]
def test_strip_par_data(caplog, test_context):
    """Test the "dry run" feature of :func:`.strip_par_data`."""
    s = make_dantzig(test_context.get_platform())

    N = len(s.par("output"))
    strip_par_data(s, "technology", "canning_plant", dry_run=True, dump=dict())

    assert_logs(
        caplog,
        [
            "Remove data with technology='canning_plant' (DRY RUN)",
            "2 rows in 'output'",
            "with commodity=['cases']",
            "with level=['supply']",
        ],
    )
    # Nothing was actually removed
    assert N == len(s.par("output"))