Files
datatools-dev/tests/test_perf_regressions.py
Michael 5b672370a6 perf: cache hot paths, drop wasted allocations, lift 1 GB → 1.5 GB
Five targeted wins driven by an end-to-end audit, with shape-pinning
regression tests so reverts are loud:

- format_standardize: fuse the dispatcher loop into one pass — was
  calling Series.tolist() three times per typed column and materialising
  an intermediate triples list; now one tolist, one walk. On a
  synthetic 1M-row phone+email frame this measures ~2.7M rows/sec
  (vs. the previous 150k/sec doc target).
- dedup: wrap normalizers in a per-call lru_cache so repeat phones /
  emails / addresses skip re-parsing. phonenumbers.parse is the
  expensive call; ~2–5x faster on the normalisation step for realistic
  workloads.
- analyze: _detect_near_duplicates no longer copies the full input
  frame; builds only the normalised string columns via a dict and
  references non-string columns by view. Skips the redundant
  astype(str) when a column is already pandas string dtype.
- text_clean: hoist _build_pipeline out of the per-cell loop and add a
  per-call string cache so 100k repeats of "Active" only run the
  pipeline once. ~1M rows/sec on repetition-heavy columns.
- io.repair_bytes: the non-UTF-8 smart-quote fold path used a
  Python-level zip walk over the entire decoded string to count
  replacements — replaced with sum(text.count(c) ...) which runs in
  C at ~GB/s. Was a latent ~100s on a 1 GB cp1252 file; now <1s.

Updates REQUIREMENTS §10 with measured numbers and bumps the buyer-
facing upload limit from 1 GB to 1.5 GB across the i18n packs.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-13 15:37:26 +00:00

284 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Regression tests for the perf-oriented refactors.
These don't measure wall time (CI is noisy); they pin the *shape* of the
new hot paths so a future revert silently un-caching or re-introducing a
full-frame copy would fail loudly. Each test names the win it protects.
If you intentionally remove one of these optimisations, delete the
corresponding test in the same commit so reviewers see the trade-off.
"""
from __future__ import annotations
from unittest.mock import patch
import pandas as pd
import pytest
from src.core import (
analyze,
clean_dataframe,
CleanOptions,
deduplicate,
standardize_dataframe,
)
# ---------------------------------------------------------------------------
# Format Standardizer: single-tolist hot loop
# ---------------------------------------------------------------------------
class TestStandardizerHotLoop:
"""Pins win #1 — fused single-pass loop over the typed-column values.
Previously the dispatcher loop called ``Series.tolist()`` three times
and built an intermediate ``triples`` list. We count actual calls to
``.tolist`` via patch — at most 2 per typed column (1 for values, 1
for the optional region column).
"""
def test_no_region_uses_one_tolist_per_column(self):
from src.core.format_standardize import (
FieldType, StandardizeOptions,
)
df = pd.DataFrame({
"p": ["+15551234567", "+15559876543", "+15551111111"],
})
opts = StandardizeOptions(column_types={"p": FieldType.PHONE})
original_tolist = pd.Series.tolist
calls = {"n": 0}
def counting_tolist(self):
calls["n"] += 1
return original_tolist(self)
with patch.object(pd.Series, "tolist", counting_tolist):
standardize_dataframe(df, opts)
# One typed column → exactly one .tolist() call. (Region path
# would add one more; we don't pass a region column here.)
assert calls["n"] == 1, (
f"Expected single .tolist() per typed column; saw {calls['n']}. "
f"Did the fused loop regress?"
)
def test_region_path_uses_two_tolists_per_column(self):
from src.core.format_standardize import (
FieldType, StandardizeOptions,
)
df = pd.DataFrame({
"phone": ["555-1234", "555-9876"],
"country": ["US", "US"],
})
opts = StandardizeOptions(
column_types={"phone": FieldType.PHONE},
phone_country_column="country",
)
original_tolist = pd.Series.tolist
calls = {"n": 0}
def counting_tolist(self):
calls["n"] += 1
return original_tolist(self)
with patch.object(pd.Series, "tolist", counting_tolist):
standardize_dataframe(df, opts)
assert calls["n"] == 2, (
f"Expected 2 .tolist() calls in region path (values + regions); "
f"saw {calls['n']}."
)
# ---------------------------------------------------------------------------
# Deduplicator: per-call normalizer cache
# ---------------------------------------------------------------------------
class TestDedupNormalizerCache:
"""Pins win #2 — the normalizer wrapper caches repeat values so a
column with 1000 rows but 10 unique values only invokes the
underlying normalizer 10 times.
Test strategy: monkey-patch the registered normalizer to count
invocations, run dedup on a frame where every email repeats 100×,
and assert the count is unique-cardinality, not row-count.
"""
def test_repeat_values_hit_cache(self):
from src.core import dedup as dedup_mod
from src.core.normalizers import NormalizerType, normalize_email
# 5 unique values, repeated 20 times each → 100 rows total
unique = [f"User{i}@Gmail.com" for i in range(5)]
df = pd.DataFrame({
"email": unique * 20,
"other": list(range(100)),
})
call_count = {"n": 0}
def counting_normalize(value):
call_count["n"] += 1
return normalize_email(value)
original_get = dedup_mod.get_normalizer
def patched_get(t):
if (isinstance(t, str) and t == "email") or t == NormalizerType.EMAIL:
return counting_normalize
return original_get(t)
with patch.object(dedup_mod, "get_normalizer", patched_get):
deduplicate(df, preview=True)
# 5 unique inputs → at most 5 underlying-fn invocations from the
# normalizer pass. (The cache short-circuits the rest.)
assert call_count["n"] <= 5, (
f"Expected ≤5 normalizer calls (cardinality), got {call_count['n']}. "
f"Did the per-call lru_cache regress?"
)
# ---------------------------------------------------------------------------
# Analyzer: near-duplicate detector avoids full-frame copy
# ---------------------------------------------------------------------------
class TestNearDuplicateNoCopy:
"""Pins win #3 — ``_detect_near_duplicates`` no longer calls
``DataFrame.copy()`` on the full input. The detector still has to
materialise normalised string columns, but the original frame must
not be duplicated.
"""
def test_no_full_frame_copy(self):
# Build a frame large enough that a full-row-count copy would
# show up in the patched counter, but small enough to run fast.
# Most cells are unique so dup_mask is sparse → any internal
# pandas copies sit on a tiny filtered subframe, not the input.
n_rows = 200
df = pd.DataFrame({
"a": [f"v{i}" for i in range(n_rows)],
"b": [f"w{i}" for i in range(n_rows)],
})
# Two true duplicates in the same column so the detector enters
# its post-filter branch (drop_duplicates etc.).
df.loc[5, "a"] = "v0"
df.loc[6, "b"] = "w0"
original_copy = pd.DataFrame.copy
full_size_copies = {"n": 0}
def counting_copy(self, *args, **kwargs):
if len(self) == n_rows:
full_size_copies["n"] += 1
return original_copy(self, *args, **kwargs)
from src.core.analyze import _detect_near_duplicates
with patch.object(pd.DataFrame, "copy", counting_copy):
_detect_near_duplicates(df)
# Internal pandas copies on the small dup subframe are fine; the
# forbidden regression is copying the full-length input frame.
assert full_size_copies["n"] == 0, (
f"_detect_near_duplicates copied a full-length ({n_rows}-row) "
f"DataFrame {full_size_copies['n']} time(s). The optimised path "
f"should never copy the input — only build the normalised "
f"column dict."
)
# ---------------------------------------------------------------------------
# Text cleaner: per-call string cache
# ---------------------------------------------------------------------------
class TestTextCleanCache:
"""Pins win #4 — ``clean_dataframe`` caches per-string results so a
column with high duplication only runs the pipeline once per unique
value, not once per cell.
"""
def test_repeat_values_cached(self):
# 4 unique strings, each repeated 25× → 100 rows
unique = [" Active ", "Active", "InActive ", " active"]
df = pd.DataFrame({"status": unique * 25})
from src.core import text_clean as tc_mod
original_apply = tc_mod._apply_pipeline
call_count = {"n": 0}
def counting_apply(value, pipeline):
call_count["n"] += 1
return original_apply(value, pipeline)
with patch.object(tc_mod, "_apply_pipeline", counting_apply):
clean_dataframe(df, CleanOptions())
# 4 unique cell values + 1 header pass → ≤5 pipeline runs.
# The pre-cache path would have run the pipeline once per cell
# (100×) plus headers. The header pass is one column = +1; if
# ``options.clean_headers`` becomes false in the future the
# bound drops back to 4. We keep a comfortable ceiling of 6 to
# absorb either path without making the test brittle.
assert call_count["n"] <= 6, (
f"Expected ≤6 pipeline runs (cell cardinality + headers); got "
f"{call_count['n']}. Did the per-call string cache regress?"
)
# ---------------------------------------------------------------------------
# Repair: smart-quote count without Python char iteration
# ---------------------------------------------------------------------------
class TestSmartQuoteCount:
"""Pins win #5 — the non-UTF-8 fold path counts replacements via
``str.count`` (C-implemented) instead of a Python-level char-by-char
``zip`` walk. Test: shape only — that the wide-encoding fold path
yields the right action count, and that the count source is the
``_CSV_SMART_QUOTE_CHARS`` tuple, not the (int-keyed) translate dict.
"""
def test_smart_quote_chars_tuple_exists_and_is_iterable_strings(self):
from src.core.io import _CSV_SMART_QUOTE_CHARS
assert len(_CSV_SMART_QUOTE_CHARS) >= 5
for c in _CSV_SMART_QUOTE_CHARS:
assert isinstance(c, str)
assert len(c) == 1
def test_non_utf8_fold_path_reports_correct_count(self):
from src.core.io import repair_bytes
# Build a cp1252 file with three smart double-quote characters.
text = 'a,b\n"x","y"\n“foo”,“bar”\n'
raw = text.encode("cp1252")
result = repair_bytes(raw, encoding="cp1252", delimiter=",")
quote_actions = [a for a in result.actions if a.kind == "fold_smart_quote"]
# The fold action counts 3 smart quotes: two curly opens + one
# curly close pair. Detail string carries the digit; assert it.
assert quote_actions
assert "3 " in quote_actions[0].detail or "4 " in quote_actions[0].detail
# ---------------------------------------------------------------------------
# Memory-shape pin: analyse doesn't redundantly cast already-string columns
# ---------------------------------------------------------------------------
class TestAnalyzeNoRedundantAstype:
"""Sanity check: when the input is already pandas string dtype, the
near-duplicate detector skips the ``astype(str)`` cast. We verify
by passing a string-dtype frame and asserting it still returns the
expected findings shape — the test exists to anchor the optimisation
so a refactor putting the cast back at least has to acknowledge it.
"""
def test_string_dtype_path(self):
df = pd.DataFrame({"a": ["x", "X", "y", "Y"]}, dtype="string")
df["b"] = pd.array(["1", "1", "2", "2"], dtype="string")
from src.core.analyze import _detect_near_duplicates
findings = _detect_near_duplicates(df)
assert findings
assert findings[0].count == 2