datatools-dev/tests/test_perf_regressions.py

"""Regression tests for the perf-oriented refactors.

These don't measure wall time (CI is noisy); they pin the *shape* of the
new hot paths so that a future revert which silently drops a cache or
re-introduces a full-frame copy fails loudly. Each test names the win it
protects.

If you intentionally remove one of these optimisations, delete the
corresponding test in the same commit so reviewers see the trade-off.
"""

from __future__ import annotations

from unittest.mock import patch

import pandas as pd
import pytest

from src.core import (
    analyze,
    clean_dataframe,
    CleanOptions,
    deduplicate,
    standardize_dataframe,
)


# ---------------------------------------------------------------------------
# Standardize Formats: single-tolist hot loop
# ---------------------------------------------------------------------------

class TestStandardizerHotLoop:
    """Guards win #1: the standardizer's fused, single-pass value loop.

    The earlier dispatcher loop called ``Series.tolist()`` three times and
    built an intermediate ``triples`` list. These tests count real
    ``Series.tolist`` invocations through a patch and expect one call for
    the typed column's values, plus one more when a region column is
    configured.
    """

    @staticmethod
    def _tolist_calls_during_standardize(frame, options):
        """Run ``standardize_dataframe(frame, options)`` and return how many
        times ``pd.Series.tolist`` was invoked while it ran."""
        real_tolist = pd.Series.tolist
        seen = 0

        def spy(series):
            nonlocal seen
            seen += 1
            # Delegate so the code under test still receives real data.
            return real_tolist(series)

        with patch.object(pd.Series, "tolist", spy):
            standardize_dataframe(frame, options)
        return seen

    def test_no_region_uses_one_tolist_per_column(self):
        from src.core.format_standardize import (
            FieldType, StandardizeOptions,
        )
        phones = pd.DataFrame({
            "p": ["+15551234567", "+15559876543", "+15551111111"],
        })
        options = StandardizeOptions(column_types={"p": FieldType.PHONE})

        seen = self._tolist_calls_during_standardize(phones, options)

        # A single typed column and no region column configured, so the
        # only expected call is the one that extracts the values.
        assert seen == 1, (
            f"Expected single .tolist() per typed column; saw {seen}. "
            f"Did the fused loop regress?"
        )

    def test_region_path_uses_two_tolists_per_column(self):
        from src.core.format_standardize import (
            FieldType, StandardizeOptions,
        )
        contacts = pd.DataFrame({
            "phone":   ["555-1234", "555-9876"],
            "country": ["US", "US"],
        })
        options = StandardizeOptions(
            column_types={"phone": FieldType.PHONE},
            phone_country_column="country",
        )

        seen = self._tolist_calls_during_standardize(contacts, options)

        # One call for the phone values, one for the region column.
        assert seen == 2, (
            f"Expected 2 .tolist() calls in region path (values + regions); "
            f"saw {seen}."
        )


# ---------------------------------------------------------------------------
# Find Duplicates: per-call normalizer cache
# ---------------------------------------------------------------------------

class TestDedupNormalizerCache:
    """Guards win #2: the dedup normalizer wrapper caches repeated values,
    so the underlying normalizer runs once per distinct value rather than
    once per row.

    Strategy: swap ``dedup.get_normalizer`` for a version that hands back
    a counting email normalizer, run dedup on a frame of 5 distinct emails
    repeated 20× each, and check the call count is bounded by the number
    of distinct values instead of the row count.
    """

    def test_repeat_values_hit_cache(self):
        from src.core import dedup as dedup_mod
        from src.core.normalizers import NormalizerType, normalize_email

        # 100 rows in total, but only 5 distinct email strings.
        distinct_emails = [f"User{i}@Gmail.com" for i in range(5)]
        frame = pd.DataFrame({
            "email": distinct_emails * 20,
            "other": list(range(100)),
        })

        invocations = 0

        def counting_normalize(value):
            nonlocal invocations
            invocations += 1
            return normalize_email(value)

        real_get = dedup_mod.get_normalizer

        def patched_get(t):
            # Intercept only the email normalizer, whether it is requested
            # by its string name or by the enum member.
            requested_by_name = isinstance(t, str) and t == "email"
            if requested_by_name or t == NormalizerType.EMAIL:
                return counting_normalize
            return real_get(t)

        with patch.object(dedup_mod, "get_normalizer", patched_get):
            deduplicate(frame, preview=True)

        # NOTE(review): this upper bound also holds when the counting
        # normalizer is never reached (0 calls) — confirm the default
        # strategy really routes "email" through ``get_normalizer``.
        assert invocations <= 5, (
            f"Expected ≤5 normalizer calls (cardinality), got {invocations}. "
            f"Did the per-call lru_cache regress?"
        )


# ---------------------------------------------------------------------------
# Analyzer: near-duplicate detector avoids full-frame copy
# ---------------------------------------------------------------------------

class TestNearDuplicateNoCopy:
    """Guards win #3: ``_detect_near_duplicates`` must not call
    ``DataFrame.copy()`` on a frame as long as its input. Materialising
    normalised string columns is acceptable; duplicating the original
    frame is not.
    """

    def test_no_full_frame_copy(self):
        from src.core.analyze import _detect_near_duplicates

        # 200 rows: quick to run, yet any copy of a frame with this exact
        # length is recognisable in the spy below. Nearly every cell is
        # unique, so a filtered duplicate subframe would be much shorter
        # than the input.
        row_total = 200
        frame = pd.DataFrame({
            "a": [f"v{i}" for i in range(row_total)],
            "b": [f"w{i}" for i in range(row_total)],
        })
        # Seed one exact duplicate in each column so the detector has
        # something to find.
        frame.loc[5, "a"] = "v0"
        frame.loc[6, "b"] = "w0"

        real_copy = pd.DataFrame.copy
        offending = 0

        def spy_copy(target, *args, **kwargs):
            nonlocal offending
            # Only copies of full-length frames count against the test.
            if len(target) == row_total:
                offending += 1
            return real_copy(target, *args, **kwargs)

        with patch.object(pd.DataFrame, "copy", spy_copy):
            _detect_near_duplicates(frame)

        # Copies of shorter frames are tolerated; a full-length copy is
        # the regression this test forbids.
        assert offending == 0, (
            f"_detect_near_duplicates copied a full-length ({row_total}-row) "
            f"DataFrame {offending} time(s). The optimised path "
            f"should never copy the input — only build the normalised "
            f"column dict."
        )


# ---------------------------------------------------------------------------
# Text cleaner: per-call string cache
# ---------------------------------------------------------------------------

class TestTextCleanCache:
    """Guards win #4: ``clean_dataframe`` memoises per-string results, so
    a heavily duplicated column runs the cleaning pipeline once per
    distinct value instead of once per cell.
    """

    def test_repeat_values_cached(self):
        from src.core import text_clean as tc_mod

        # 100 rows built from only 4 distinct strings.
        distinct_cells = ["  Active  ", "Active", "InActive ", " active"]
        frame = pd.DataFrame({"status": distinct_cells * 25})

        real_apply = tc_mod._apply_pipeline
        runs = 0

        def counting_apply(value, pipeline):
            nonlocal runs
            runs += 1
            return real_apply(value, pipeline)

        with patch.object(tc_mod, "_apply_pipeline", counting_apply):
            clean_dataframe(frame, CleanOptions())

        # Budget: 4 distinct cell values plus 1 header pass for the single
        # column gives 5 runs; without header cleaning it would be 4. An
        # uncached implementation would need about 100 runs, so a ceiling
        # of 6 covers both paths while still catching the regression.
        assert runs <= 6, (
            f"Expected ≤6 pipeline runs (cell cardinality + headers); got "
            f"{runs}. Did the per-call string cache regress?"
        )


# ---------------------------------------------------------------------------
# Repair: smart-quote count without Python char iteration
# ---------------------------------------------------------------------------

class TestSmartQuoteCount:
    """Pins win #5 — the non-UTF-8 fold path counts replacements via
    ``str.count`` (C-implemented) instead of a Python-level char-by-char
    ``zip`` walk. Test: shape only — that the wide-encoding fold path
    yields the right action count, and that the count source is the
    ``_CSV_SMART_QUOTE_CHARS`` tuple, not the (int-keyed) translate dict.
    """

    def test_smart_quote_chars_tuple_exists_and_is_iterable_strings(self):
        from src.core.io import _CSV_SMART_QUOTE_CHARS

        # Every entry must be a one-character string so it can be handed
        # straight to ``str.count``.
        assert len(_CSV_SMART_QUOTE_CHARS) >= 5
        for c in _CSV_SMART_QUOTE_CHARS:
            assert isinstance(c, str)
            assert len(c) == 1

    def test_non_utf8_fold_path_reports_correct_count(self):
        import re

        from src.core.io import repair_bytes

        # cp1252 fixture with four smart double quotes: “foo” and “bar”
        # each contribute one opening and one closing curly quote.
        text = 'a,b\n"x","y"\n“foo”,“bar”\n'
        expected = text.count("\u201c") + text.count("\u201d")
        assert expected == 4  # guards the fixture itself

        raw = text.encode("cp1252")
        result = repair_bytes(raw, encoding="cp1252", delimiter=",")

        quote_actions = [a for a in result.actions if a.kind == "fold_smart_quote"]
        assert quote_actions, "No fold_smart_quote action was reported"

        # The detail string carries the replacement count. Match it as a
        # standalone number so neither an off-by-one ("3") nor a longer
        # number that merely ends in the digit ("14") can pass.
        detail = quote_actions[0].detail
        assert re.search(rf"(?<!\d){expected}(?!\d)", detail), (
            f"Expected the fold action to report {expected} smart quotes; "
            f"detail was {detail!r}."
        )


# ---------------------------------------------------------------------------
# Memory-shape pin: analyse doesn't redundantly cast already-string columns
# ---------------------------------------------------------------------------

class TestAnalyzeNoRedundantAstype:
    """Anchor for the "skip ``astype(str)`` on string-dtype input"
    optimisation in the near-duplicate detector.

    The cast itself is not observed here. The test feeds a frame whose
    columns already use the pandas ``string`` dtype and checks that the
    detector still returns findings of the expected shape, so a refactor
    touching this path has to acknowledge the optimisation.
    """

    def test_string_dtype_path(self):
        from src.core.analyze import _detect_near_duplicates

        frame = pd.DataFrame({"a": ["x", "X", "y", "Y"]}, dtype="string")
        frame["b"] = pd.array(["1", "1", "2", "2"], dtype="string")

        findings = _detect_near_duplicates(frame)

        # At least one finding is expected, and the first reports 2.
        assert findings
        first = findings[0]
        assert first.count == 2


# ---------------------------------------------------------------------------
# Dedup: exact-only strategies skip the O(n²) pair loop
# ---------------------------------------------------------------------------

class TestDedupExactFastPath:
    """Guards win #6: strategies built solely from ``Algorithm.EXACT`` at
    threshold 100 go through the O(n) groupby fast path rather than the
    O(n²) pair comparison. Verified by wrapping ``_compare_pair`` with a
    counter and checking it stays untouched for an exact-only dedup.
    """

    @staticmethod
    def _run_counting_pair_compares(frame, **dedup_kwargs):
        """Call ``deduplicate(frame, **dedup_kwargs)`` with
        ``dedup._compare_pair`` wrapped by a counter.

        Returns ``(number_of_compare_calls, dedup_result)``.
        """
        from src.core import dedup as dedup_mod

        real_compare = dedup_mod._compare_pair
        hits = 0

        def spy(*args, **kwargs):
            nonlocal hits
            hits += 1
            return real_compare(*args, **kwargs)

        with patch.object(dedup_mod, "_compare_pair", spy):
            outcome = deduplicate(frame, **dedup_kwargs)
        return hits, outcome

    def test_exact_strategy_skips_pair_compare(self):
        # 500 rows cycling through 50 distinct addresses.
        frame = pd.DataFrame({
            "email": [f"User{i % 50}@gmail.com" for i in range(500)],
            "other": list(range(500)),
        })

        hits, outcome = self._run_counting_pair_compares(frame, preview=True)

        assert hits == 0, (
            f"Exact-only strategy hit _compare_pair {hits} time(s); "
            f"groupby fast path should have absorbed every comparison."
        )
        # The fast path must still report all 50 duplicate groups.
        assert len(outcome.match_groups) == 50

    def test_fuzzy_strategy_still_uses_pair_compare(self):
        """Counter-check: fuzzy strategies must still walk the pair loop."""
        from src.core.dedup import (
            Algorithm, ColumnMatchStrategy, MatchStrategy,
        )

        frame = pd.DataFrame({"name": ["Alice", "Allice", "Bob", "Boob"]})
        fuzzy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(
                column="name", algorithm=Algorithm.LEVENSHTEIN, threshold=80,
            ),
        ])

        hits, _ = self._run_counting_pair_compares(
            frame, strategies=[fuzzy], preview=True,
        )

        # 4 rows give C(4, 2) = 6 pairs, and every one must be compared.
        assert hits == 6


class TestDedupBlocking:
    """Guards win #7: opt-in prefix blocking for fuzzy strategies. With
    ``blocking_columns`` set, the number of pair comparisons equals the
    sum of within-block pairs instead of the full Cartesian count.
    """

    @staticmethod
    def _pair_compare_calls(frame, **dedup_kwargs):
        """Return how often ``dedup._compare_pair`` is invoked while
        ``deduplicate(frame, **dedup_kwargs)`` runs."""
        from src.core import dedup as dedup_mod

        real_compare = dedup_mod._compare_pair
        hits = 0

        def spy(*args, **kwargs):
            nonlocal hits
            hits += 1
            return real_compare(*args, **kwargs)

        with patch.object(dedup_mod, "_compare_pair", spy):
            deduplicate(frame, **dedup_kwargs)
        return hits

    def test_blocking_reduces_pair_compare_count(self):
        from src.core.dedup import (
            Algorithm, ColumnMatchStrategy, MatchStrategy,
        )

        frame = pd.DataFrame({
            "name": ["Alice", "Allice", "Bob", "Boob", "Carl", "Carll"],
        })
        fuzzy = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(
                column="name", algorithm=Algorithm.LEVENSHTEIN, threshold=80,
            ),
        ])

        # Baseline without blocking: 6 rows give 6 × 5 / 2 = 15 pairs.
        unblocked = self._pair_compare_calls(
            frame, strategies=[fuzzy], preview=True,
        )

        # Blocking on the first character yields blocks A, B and C with
        # two rows each, i.e. one pair per block and 3 pairs overall.
        blocked = self._pair_compare_calls(
            frame, strategies=[fuzzy], preview=True,
            blocking_columns=["name"], blocking_prefix_len=1,
        )

        assert unblocked == 15
        assert blocked == 3, (
            f"Expected 3 pair compares with prefix-1 blocking, got "
            f"{blocked}. Blocking partitioning regressed?"
        )


# ---------------------------------------------------------------------------
# Format standardize: parallel_columns option produces identical results
# ---------------------------------------------------------------------------

class TestStandardizeParallelEquivalence:
    """Guards win #8: running with ``parallel_columns > 1`` has to give
    exactly what the serial run gives — output frame, audit records and
    all counters. Speed may differ between Python builds; results may not.
    """

    def test_serial_vs_parallel_identical(self):
        from src.core.format_standardize import (
            FieldType, StandardizeOptions,
        )

        # Three typed columns, 100 rows each (4 sample values × 25).
        frame = pd.DataFrame({
            "phone": ["+1 (555) 123-4567", "(555) 987-6543",
                      "555.111.2222", "5559876543"] * 25,
            "email": ["UPPER@example.com", "mixed.Case@gmail.com",
                      "test+tag@yahoo.com", "  spaced  @example.org"] * 25,
            "date":  ["2024-01-15", "March 4, 2024",
                      "15/01/2024", "2024-12-31"] * 25,
        })
        column_types = {
            "phone": FieldType.PHONE,
            "email": FieldType.EMAIL,
            "date":  FieldType.DATE,
        }

        def run(workers):
            options = StandardizeOptions(
                column_types=column_types, parallel_columns=workers,
            )
            return standardize_dataframe(frame, options)

        def cell_key(record):
            return (record["row"], record["column"])

        def ordered_audit(result):
            # Sort so that only the content of the audit trail is
            # compared, not the order in which records were emitted.
            return sorted(result.changes.to_dict("records"), key=cell_key)

        serial = run(1)
        parallel = run(3)

        # The standardized frames must match element for element.
        pd.testing.assert_frame_equal(
            serial.standardized_df,
            parallel.standardized_df,
        )
        # So must every counter.
        assert serial.cells_changed == parallel.cells_changed
        assert serial.cells_unparseable == parallel.cells_unparseable
        assert serial.cells_total == parallel.cells_total
        # And the audit records, compared independent of ordering.
        assert ordered_audit(serial) == ordered_audit(parallel)


# ---------------------------------------------------------------------------
# Missing handler: lazy-copy on the no-sentinels-found path
# ---------------------------------------------------------------------------

class TestMissingLazyCopy:
    """Guards win #9: ``handle_missing`` must not copy the whole
    DataFrame when sentinel standardization runs and finds nothing,
    sparing clean files a full-size allocation during the gate's
    missing-profile pass.
    """

    def test_no_op_handle_missing_skips_full_copy(self):
        from src.core.missing import handle_missing, MissingOptions

        # No sentinel strings and no missing cells, so there is nothing
        # for handle_missing to change in these 500 rows.
        row_total = 500
        frame = pd.DataFrame({
            "a": [f"x{i}" for i in range(row_total)],
            "b": list(range(row_total)),
        })

        real_copy = pd.DataFrame.copy
        offending = 0

        def spy_copy(target, *args, **kwargs):
            nonlocal offending
            # Count only copies of frames as long as the input.
            if len(target) == row_total:
                offending += 1
            return real_copy(target, *args, **kwargs)

        with patch.object(pd.DataFrame, "copy", spy_copy):
            handle_missing(frame, MissingOptions(strategy="none"))

        assert offending == 0, (
            f"handle_missing made {offending} full-frame copies on "
            f"a no-op input; the lazy-copy path should have made zero."
        )


# ---------------------------------------------------------------------------
# Column mapper: lazy-copy when the rename produced a fresh frame
# ---------------------------------------------------------------------------

class TestColumnMapperLazyCopy:
    """Pins win #10 — when only ``rename`` runs (no schema, no drops,
    no coercion), ``map_columns`` no longer takes an upfront ``.copy()``
    because ``DataFrame.rename`` already returns a fresh frame.
    """

    def test_rename_only_skips_explicit_copy(self):
        # ``DataFrame.rename`` may copy internally, so a blanket patch of
        # ``DataFrame.copy`` cannot single out an explicit copy made by
        # ``map_columns`` on this path. This test therefore pins only the
        # observable contract of the rename-only route: the caller's
        # frame is not handed back, and the rename took effect.
        from src.core.column_mapper import map_columns, MapOptions

        n_rows = 500
        df = pd.DataFrame({
            "old_name":  [f"x{i}" for i in range(n_rows)],
            "old_value": list(range(n_rows)),
        })

        r = map_columns(df, MapOptions(
            mapping={"old_name": "name", "old_value": "value"},
        ))

        assert r.mapped_df is not df
        assert list(r.mapped_df.columns) == ["name", "value"]
        assert r.columns_renamed == 2

    def test_no_op_map_columns_path(self):
        """Identity mapping with no schema must not call
        ``DataFrame.copy`` at all."""
        from src.core.column_mapper import map_columns, MapOptions

        df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

        def forbid_copy(*args, **kwargs):
            # pytest.fail raises pytest's own ``Failed`` outcome, never
            # SystemExit, so the call below needs no try/except wrapper.
            pytest.fail("Explicit df.copy() called on no-op map_columns path")

        # Empty mapping and no schema: the test expects neither a rename
        # nor a drop to run, so any ``DataFrame.copy`` that fires here is
        # treated as an explicit copy on the no-op path.
        with patch.object(pd.DataFrame, "copy", side_effect=forbid_copy):
            map_columns(df, MapOptions(mapping={}, unmapped="keep"))