Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
166 lines
6.5 KiB
Python
166 lines
6.5 KiB
Python
"""Tests added to close gaps surfaced by the test audit.

These cover edges that existing suites missed:

- ``CleanOptions.clean_headers=False`` toggle (added but not directly tested).
- ``repair_bytes`` with non-comma delimiters and combined-fix scenarios.
- ``analyze()`` over a path-based Excel file.
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
- ``findings_by_tool`` on an empty list.
- BOM that appears mid-cell rather than at file start.
- The collapse-whitespace heuristic for numeric/date/phone-shaped cells
  (spec §4.17), now wired in via ``_smart_collapse_whitespace``.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import io
|
||
|
||
import pandas as pd
|
||
import pytest
|
||
|
||
from src.core.analyze import analyze, findings_by_tool
|
||
from src.core.io import RepairAction, repair_bytes
|
||
from src.core.text_clean import CleanOptions, clean_dataframe
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# clean_headers toggle
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestCleanHeadersToggle:
    """Exercise the ``clean_headers`` option accepted by ``clean_dataframe``."""

    def test_default_cleans_headers(self):
        """Default options: the padded `` id `` header comes back as ``id``."""
        frame = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        outcome = clean_dataframe(frame)
        header_names = list(outcome.cleaned_df.columns)
        assert header_names == ["id", "Email"]

    def test_disable_preserves_dirty_headers(self):
        """``clean_headers=False``: header padding is left exactly as given."""
        options = CleanOptions(clean_headers=False)
        frame = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        outcome = clean_dataframe(frame, options)
        header_names = list(outcome.cleaned_df.columns)
        assert header_names == [" id ", "Email"]

    def test_disable_still_cleans_data_cells(self):
        """``clean_headers=False`` must not switch off cell-level trimming."""
        options = CleanOptions(clean_headers=False)
        frame = pd.DataFrame({"name": [" Alice ", "Bob "]})
        outcome = clean_dataframe(frame, options)
        cleaned_names = outcome.cleaned_df["name"].tolist()
        assert cleaned_names == ["Alice", "Bob"]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# repair_bytes — non-comma delimiters and combined fixes
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestRepairBytesDelimiters:
    """``repair_bytes`` invoked with tab and semicolon delimiters."""

    def test_tab_delimited_smart_quote_fold(self):
        """Curly quotes are removed from tab-separated input; tabs remain."""
        payload = "id\tnote\n1\t“hi”\n".encode("utf-8")
        outcome = repair_bytes(payload, delimiter="\t")
        repaired_text = outcome.repaired_bytes.decode("utf-8")
        # Neither the opening nor the closing curly quote may survive.
        assert not ("“" in repaired_text or "”" in repaired_text)
        assert "\t" in repaired_text  # delimiter preserved

    def test_semicolon_delimited_unrepairable_extras(self):
        """A semicolon row with surplus fields is reported by line number."""
        payload = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n"
        outcome = repair_bytes(payload, delimiter=";")
        # Extra-field row with no clear merge candidate is logged unrepairable.
        assert 3 in outcome.unrepairable_lines
|
||
|
||
|
||
class TestRepairBytesCombinedFixes:
    """Several byte-level problems present in the same input."""

    def test_bom_plus_nul_plus_smart_quotes(self):
        """BOM, NUL and curly quotes are each reported and the output parses."""
        bom = b"\xef\xbb\xbf"
        header = b"id,note\n"
        row = b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n"
        outcome = repair_bytes(bom + header + row)

        expected_kinds = {"strip_bom", "strip_nul", "fold_smart_quote"}
        seen_kinds = {action.kind for action in outcome.actions}
        assert seen_kinds >= expected_kinds

        # Resulting bytes parse cleanly.
        parsed = pd.read_csv(io.BytesIO(outcome.repaired_bytes))
        first_row = parsed.iloc[0]
        assert first_row["note"] == 'Hello "world"'
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# analyze() — path-based Excel and large-sample edges
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestAnalyzeXlsxPath:
    """``analyze()`` given a filesystem path to an ``.xlsx`` workbook."""

    def test_excel_path_runs_without_repair(self, tmp_path):
        """Padding is reported and no ``csv_``-prefixed finding is emitted."""
        workbook = tmp_path / "small.xlsx"
        pd.DataFrame(
            {
                "id": ["1", "2"],
                "name": [" Alice ", "Bob"],  # padding in xlsx
            }
        ).to_excel(workbook, index=False, engine="openpyxl")

        finding_ids = {finding.id for finding in analyze(workbook)}
        assert "whitespace_padding" in finding_ids
        # Excel skips csv_* findings — no pre-parse repair on xlsx.
        assert all(not fid.startswith("csv_") for fid in finding_ids)
|
||
|
||
|
||
class TestAnalyzeSampleRowsEdge:
    """``analyze()`` when ``sample_rows`` exceeds the number of rows."""

    def test_sample_rows_larger_than_df(self):
        """An oversized ``sample_rows`` still yields the padding finding."""
        frame = pd.DataFrame({"x": [" pad ", "clean"]})
        # sample_rows=1000 but df has only 2 rows; must not crash.
        reported_ids = [finding.id for finding in analyze(frame, sample_rows=1000)]
        assert "whitespace_padding" in reported_ids
|
||
|
||
|
||
class TestAnalyzeMidCellBom:
    """A byte-order mark located inside a cell value rather than at file start."""

    def test_bom_inside_cell_treated_as_zero_width(self):
        """``analyze()`` reports ``zero_width_or_invisible`` for a mid-cell BOM."""
        # U+FEFF is written as an explicit escape: a literal BOM is invisible
        # in editors and is easily dropped by copy/paste or formatting tools,
        # which would leave a plain "Hello" that cannot trigger the finding.
        # NOTE(review): the exact position of the BOM inside the word is an
        # assumption; any non-leading position matches the test's intent.
        df = pd.DataFrame({"name": ["Hel\ufefflo"]})
        findings = analyze(df)
        assert any(f.id == "zero_width_or_invisible" for f in findings)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# findings_by_tool — edge cases
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestFindingsByToolEdges:
    """Boundary inputs for ``findings_by_tool``."""

    def test_empty_list_returns_empty_dict(self):
        """No findings in, empty mapping out."""
        grouped = findings_by_tool([])
        assert grouped == {}

    def test_only_toolless_findings_returns_empty_dict(self):
        """A finding whose ``tool`` is the empty string produces no entry."""
        from src.core.analyze import Finding

        # Construct a Finding with no tool — like csv_unrepairable_rows.
        toolless = Finding(
            id="x",
            severity="info",
            tool="",
            count=1,
            description="d",
        )
        grouped = findings_by_tool([toolless])
        assert grouped == {}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# collapse_whitespace heuristic on numeric/date/phone-shaped cells (spec §4.17)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
class TestStructuredCellWhitespacePreservation:
    """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells.

    Each test feeds a single-cell frame through ``clean_dataframe`` with
    default options and compares the cleaned cell with the expected text.
    Fixtures that are meant to contain a run of spaces spell it out as two
    literal spaces so the collapse step actually has something to act on.
    """

    def test_phone_internal_double_space_preserved(self):
        """A phone-shaped cell keeps its internal double space."""
        # Two spaces after the area code: with a single space the assertion
        # would pass whether or not the heuristic skipped the cell.
        df = pd.DataFrame({"phone": ["(555)  123-4567"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["phone"] == "(555)  123-4567"

    def test_european_thousands_sep_preserved(self):
        """A space-separated thousands group comes back unchanged."""
        # NOTE(review): this fixture holds one ordinary space, so it passes
        # with or without the heuristic — confirm whether a double space or
        # a non-breaking space was intended.
        df = pd.DataFrame({"price": ["1 234"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["price"] == "1 234"

    def test_iso_date_passes_through(self):
        """An ISO date with no whitespace is returned as-is."""
        df = pd.DataFrame({"date": ["2024-01-15"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"

    def test_textual_date_preserves_spaces(self):
        """A month-name date keeps its spaces."""
        # NOTE(review): only single spaces here, so collapsing would be a
        # no-op either way — confirm the intended fixture.
        df = pd.DataFrame({"date": ["Jan 15 2024"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"

    def test_free_text_double_space_still_collapsed(self):
        """Prose with a double space is still collapsed to a single space."""
        # Crucially, the heuristic must NOT trigger on prose with letters.
        # The input carries two spaces; identical input and expected values
        # would not exercise the collapse at all.
        df = pd.DataFrame({"note": ["hello  world"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["note"] == "hello world"
|