Compare commits
10 Commits
54f92ae47e
...
794d4cda94
| Author | SHA1 | Date | |
|---|---|---|---|
| 794d4cda94 | |||
| 8dfc6ad8ae | |||
| 0671ef277e | |||
| 0b959dee93 | |||
| 4687cf87b4 | |||
| a8943f29eb | |||
| 5c62fb6117 | |||
| edf6ccf90b | |||
| b8a9fa1b09 | |||
| c349a90e18 |
20
pytest.ini
Normal file
20
pytest.ini
Normal file
@@ -0,0 +1,20 @@
|
||||
; pytest configuration shared by tox, run_tests.py, and direct pytest calls.
|
||||
|
||||
[pytest]
|
||||
testpaths = tests
|
||||
python_files = test_*.py
|
||||
python_classes = Test*
|
||||
python_functions = test_*
|
||||
|
||||
# Custom markers used by run_tests.py --quick and the e2e/install groupings.
|
||||
markers =
|
||||
slow: tests that take longer than ~1s (skipped under --quick)
|
||||
e2e: end-to-end CLI / integration tests
|
||||
install: import / dependency sanity tests
|
||||
fixture_sweep: parametrized sweep over the test-cases/ folder
|
||||
|
||||
# Warnings discipline: fail on unexpected DeprecationWarning from our own
# code, but tolerate third-party deprecations that we can't fix.
# Order matters: when a warning matches several entries, pytest applies the
# LAST matching one, so the blanket ignore must come before the narrower
# error rule for modules under ``src``.
filterwarnings =
    ignore::DeprecationWarning
    error::DeprecationWarning:src
|
||||
177
run_tests.py
Executable file
177
run_tests.py
Executable file
@@ -0,0 +1,177 @@
|
||||
#!/usr/bin/env python3
|
||||
"""DataTools test runner — single entry point with category flags.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Run everything (default)::
|
||||
|
||||
python run_tests.py
|
||||
|
||||
Run a single tool's tests::
|
||||
|
||||
python run_tests.py --tool dedup
|
||||
python run_tests.py --tool text_clean
|
||||
python run_tests.py --tool analyze
|
||||
python run_tests.py --tool io
|
||||
python run_tests.py --tool cli
|
||||
|
||||
Categories::
|
||||
|
||||
python run_tests.py --unit # unit tests only (no e2e, no install)
|
||||
python run_tests.py --e2e # end-to-end smoke tests
|
||||
python run_tests.py --install # install / dependency sanity
|
||||
python run_tests.py --fixtures # corpus + dropped-file sweep
|
||||
python run_tests.py --coverage # add a coverage report
|
||||
python run_tests.py --quick # skip @pytest.mark.slow
|
||||
python run_tests.py -v / --verbose # verbose pytest output
|
||||
|
||||
Multiple flags compose. ``--tool X --quick`` runs that tool's quick tests.
|
||||
|
||||
Dropping a new fixture into ``test-cases/`` is automatic: the fixture sweep
|
||||
test (``tests/test_fixtures_sweep.py``) parametrizes over every CSV/XLSX in
|
||||
that directory (excluding ``text-cleaner-corpus/`` which has its own suite).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent
|
||||
|
||||
# Tool name -> matching pytest -k expression. Keep aligned with test_*.py
|
||||
# filenames; run_tests.py --tool foo translates to ``-k foo``.
|
||||
_TOOL_MAP: dict[str, str] = {
|
||||
"dedup": "test_dedup or test_cli.py",
|
||||
"text_clean": "test_text_clean or test_cli_text_clean or test_corpus",
|
||||
"analyze": "test_analyze or test_cli_analyze",
|
||||
"io": "test_io",
|
||||
"cli": "test_cli or test_cli_text_clean or test_cli_analyze",
|
||||
"config": "test_config",
|
||||
"normalizers": "test_normalizers",
|
||||
}
|
||||
|
||||
_CATEGORY_PATHS: dict[str, list[str]] = {
|
||||
"unit": ["tests/"], # all tests are unit unless marked otherwise
|
||||
"e2e": ["tests/test_e2e.py"],
|
||||
"install": ["tests/test_install.py"],
|
||||
"fixtures": ["tests/test_corpus.py", "tests/test_fixtures_sweep.py"],
|
||||
}
|
||||
|
||||
|
||||
def _build_pytest_args(args: argparse.Namespace) -> list[str]:
|
||||
cmd: list[str] = [sys.executable, "-m", "pytest"]
|
||||
|
||||
# Verbosity
|
||||
if args.verbose:
|
||||
cmd.append("-vv")
|
||||
else:
|
||||
cmd.append("-q")
|
||||
|
||||
# Coverage
|
||||
if args.coverage:
|
||||
cmd.extend(["--cov=src", "--cov-report=term-missing"])
|
||||
|
||||
# Quick: skip anything marked slow.
|
||||
if args.quick:
|
||||
cmd.extend(["-m", "not slow"])
|
||||
|
||||
# Tool filter via -k expression.
|
||||
if args.tool:
|
||||
if args.tool not in _TOOL_MAP:
|
||||
print(
|
||||
f"unknown --tool '{args.tool}'. "
|
||||
f"available: {', '.join(sorted(_TOOL_MAP))}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(2)
|
||||
cmd.extend(["-k", _TOOL_MAP[args.tool]])
|
||||
|
||||
# Category selection (--unit/--e2e/--install/--fixtures). When several
|
||||
# categories are requested they're OR'd by passing all paths.
|
||||
paths: list[str] = []
|
||||
selected_categories = [
|
||||
c for c in ("unit", "e2e", "install", "fixtures")
|
||||
if getattr(args, c)
|
||||
]
|
||||
if selected_categories:
|
||||
for cat in selected_categories:
|
||||
paths.extend(_CATEGORY_PATHS[cat])
|
||||
elif args.path:
|
||||
paths.extend(args.path)
|
||||
else:
|
||||
paths.append("tests/")
|
||||
|
||||
cmd.extend(paths)
|
||||
return cmd
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Parse runner flags, launch pytest from the project root, return its status.

    Parameters
    ----------
    argv
        Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns
    -------
    int
        The pytest subprocess's return code, or 2 when pytest cannot be
        imported by the running interpreter.
    """
    parser = argparse.ArgumentParser(
        prog="run_tests.py",
        description=(
            "DataTools test runner. With no flags runs every test. Use "
            "--tool to scope to one tool, --unit/--e2e/--install/--fixtures "
            "to scope by category. Combine flags freely."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "Available tools: " + ", ".join(sorted(_TOOL_MAP)) + "\n\n"
            "To add a new fixture-driven test: drop a CSV or XLSX into "
            "test-cases/ and re-run. tests/test_fixtures_sweep.py picks up "
            "new files automatically — no test code changes required."
        ),
    )
    parser.add_argument("--tool", help="Limit tests to one tool.")
    parser.add_argument("--unit", action="store_true",
                        help="Unit tests only (default scope).")
    parser.add_argument("--e2e", action="store_true",
                        help="End-to-end CLI/integration smoke tests.")
    parser.add_argument("--install", action="store_true",
                        help="Install / import / entry-point sanity tests.")
    parser.add_argument("--fixtures", action="store_true",
                        help="Run the corpus + dropped-fixture sweep.")
    parser.add_argument("--coverage", action="store_true",
                        help="Emit a coverage report (term-missing).")
    parser.add_argument("--quick", action="store_true",
                        help="Skip tests marked @pytest.mark.slow.")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Verbose pytest output.")
    parser.add_argument("path", nargs="*",
                        help="Optional explicit test paths (override category).")

    args = parser.parse_args(argv)

    # The subprocess always runs with cwd=PROJECT_ROOT so relative test paths
    # resolve; tell the user when that differs from where they invoked us.
    cwd_target = PROJECT_ROOT
    if Path.cwd() != cwd_target:
        print(f"running from {cwd_target}")

    # The suite is launched as ``sys.executable -m pytest``, so what matters
    # is whether THIS interpreter can import pytest. A ``pytest`` script found
    # on PATH may belong to a different environment and must not satisfy the
    # check.
    if not _python_has_pytest():
        print(
            "pytest is not installed. Install dev deps:\n"
            " pip install -r requirements-dev.txt",
            file=sys.stderr,
        )
        return 2

    cmd = _build_pytest_args(args)
    if args.verbose:
        print("→", " ".join(cmd))
    proc = subprocess.run(cmd, cwd=cwd_target)
    return proc.returncode
|
||||
|
||||
|
||||
def _python_has_pytest() -> bool:
|
||||
try:
|
||||
__import__("pytest")
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
158
src/cli_analyze.py
Normal file
158
src/cli_analyze.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""CLI for the DataTools upload-time analyzer.
|
||||
|
||||
Usage:
|
||||
python -m src.cli_analyze input.csv # human-readable report
|
||||
python -m src.cli_analyze input.csv --json # JSON to stdout
|
||||
python -m src.cli_analyze input.csv --sample-rows 5000
|
||||
|
||||
The analyzer is purely advisory; exit code is always 0 on a successful scan
|
||||
even when findings are present. Use --strict to exit non-zero on warnings.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
|
||||
from src.core.analyze import analyze, findings_by_tool, to_dict
|
||||
|
||||
app = typer.Typer(
|
||||
name="analyze",
|
||||
help=(
|
||||
"Scan a CSV or Excel file and report data quality issues with the "
|
||||
"tools that can fix each one. Read-only and advisory.\n\n"
|
||||
"Examples:\n\n"
|
||||
" # Default scan (first 1000 rows, human-readable)\n"
|
||||
" python -m src.cli_analyze customers.csv\n\n"
|
||||
" # Machine-readable output for piping\n"
|
||||
" python -m src.cli_analyze customers.csv --json\n\n"
|
||||
" # Scan more rows on a large file\n"
|
||||
" python -m src.cli_analyze big.csv --sample-rows 50000\n\n"
|
||||
" # Exit non-zero when warnings exist (CI gate)\n"
|
||||
" python -m src.cli_analyze customers.csv --strict\n"
|
||||
),
|
||||
add_completion=False,
|
||||
no_args_is_help=True,
|
||||
)
|
||||
|
||||
|
||||
# Tool id -> friendly display name. Kept in the CLI module since the GUI has
|
||||
# its own version; both stay in lockstep with the actual script lineup.
|
||||
_TOOL_DISPLAY = {
|
||||
"01_deduplicator": "Deduplicator",
|
||||
"02_text_cleaner": "Text Cleaner",
|
||||
"03_format_standardizer": "Format Standardizer",
|
||||
"04_missing_handler": "Missing Value Handler",
|
||||
"05_column_mapper": "Column Mapper",
|
||||
"06_outlier_detector": "Outlier Detector",
|
||||
"07_multi_file_merger": "Multi-File Merger",
|
||||
"08_validator_reporter": "Validator & Reporter",
|
||||
"09_pipeline_runner": "Pipeline Runner",
|
||||
}
|
||||
|
||||
|
||||
def _tool_label(tool_id: str) -> str:
|
||||
return _TOOL_DISPLAY.get(tool_id, tool_id) if tool_id else "—"
|
||||
|
||||
|
||||
_SEVERITY_STYLE = {
|
||||
"info": "cyan",
|
||||
"warn": "yellow",
|
||||
"error": "red",
|
||||
}
|
||||
|
||||
|
||||
@app.command()
def scan(
    input_file: str = typer.Argument(
        ...,
        help="Path to the CSV or Excel file to scan.",
    ),
    sample_rows: int = typer.Option(
        1000,
        "--sample-rows",
        "-n",
        help="Cap on rows scanned. Default 1000.",
    ),
    json_out: bool = typer.Option(
        False,
        "--json",
        help="Print findings as a JSON array on stdout.",
    ),
    strict: bool = typer.Option(
        False,
        "--strict",
        help="Exit non-zero when any 'warn' or 'error' finding is reported.",
    ),
) -> None:
    # Scan one file and report the analyzer's findings, either as JSON or as
    # per-tool tables. Exits with code 2 when the path does not exist; with
    # --strict, exits with code 1 when a warn/error finding is present.
    # (Kept as comments rather than a docstring so the command's help output
    # is unchanged.)
    path = Path(input_file)
    if not path.exists():
        typer.echo(f"File not found: {path}", err=True)
        raise typer.Exit(code=2)

    findings = analyze(path, sample_rows=sample_rows)

    # Machine-readable mode: dump and stop before any console rendering.
    if json_out:
        payload = [to_dict(item) for item in findings]
        typer.echo(json.dumps(payload, indent=2))
        _maybe_strict_exit(findings, strict)
        return

    console = Console()
    if not findings:
        console.print(f"[green]✓[/green] No issues detected in {path.name}.")
        return

    grouped = findings_by_tool(findings)
    untargeted = [item for item in findings if not item.tool]

    # Top-line summary: one coloured "<count> <severity>" chunk per level
    # that actually occurs, most severe first.
    severities = [item.severity for item in findings]
    summary_parts = []
    for level in ("error", "warn", "info"):
        occurrences = severities.count(level)
        if occurrences:
            style = _SEVERITY_STYLE[level]
            summary_parts.append(f"[{style}]{occurrences} {level}[/{style}]")
    console.print(
        f"[bold]Scanned[/bold] {path.name}: "
        f"{len(findings)} finding(s) ({', '.join(summary_parts)})."
    )
    console.print()

    # One table per tool, in sorted tool-id order.
    for tool_id in sorted(grouped):
        _render_tool_table(console, tool_id, grouped[tool_id])

    # Findings with no tool attached get their own trailing table.
    if untargeted:
        _render_tool_table(console, "", untargeted, header="Informational / file-level")

    _maybe_strict_exit(findings, strict)
|
||||
|
||||
|
||||
def _render_tool_table(console: Console, tool_id: str, items, header: str | None = None) -> None:
    """Print one table of findings followed by a blank line.

    Parameters
    ----------
    console
        Console the table is printed to.
    tool_id
        Tool id used to build the default title.
    items
        Findings to list; each row shows severity, id, count and description.
    header
        Title to use instead of the tool-derived one, when non-empty.
    """
    title = header if header else f"→ {_tool_label(tool_id)}"
    table = Table(title=title, title_style="bold", show_lines=False, expand=True)

    column_specs = (
        ("Severity", {"width": 8}),
        ("Finding", {"width": 32}),
        ("Count", {"justify": "right", "width": 7}),
        ("Description", {}),
    )
    for heading, options in column_specs:
        table.add_column(heading, **options)

    for finding in items:
        style = _SEVERITY_STYLE[finding.severity]
        table.add_row(
            f"[{style}]{finding.severity}[/{style}]",
            finding.id,
            str(finding.count),
            finding.description,
        )

    console.print(table)
    console.print()
|
||||
|
||||
|
||||
def _maybe_strict_exit(findings, strict: bool) -> None:
|
||||
if not strict:
|
||||
return
|
||||
if any(f.severity in ("warn", "error") for f in findings):
|
||||
raise typer.Exit(code=1)
|
||||
|
||||
|
||||
# Entrypoint when run via `python -m src.cli_analyze`. Typer's no_args_is_help
|
||||
# kicks in when the user invokes without args; we expose the single command at
|
||||
# the top level for convenience: ``python -m src.cli_analyze input.csv``.
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
@@ -280,6 +280,10 @@ def clean(
|
||||
encoding=encoding_override,
|
||||
header_row=header_row,
|
||||
sheet_name=sheet_arg if sheet_arg is not None else 0,
|
||||
# Bypass byte-level repair so the user's preset/flag choices
|
||||
# remain authoritative. The cell-level cleaner does the
|
||||
# smart-quote / NUL / BOM work itself.
|
||||
repair=False,
|
||||
)
|
||||
if not isinstance(df, pd.DataFrame):
|
||||
df = pd.concat(list(df), ignore_index=True)
|
||||
|
||||
@@ -51,8 +51,18 @@ from .io import (
|
||||
detect_encoding,
|
||||
detect_header_row,
|
||||
list_sheets,
|
||||
read_csv_repaired,
|
||||
read_file,
|
||||
repair_bytes,
|
||||
write_file,
|
||||
RepairAction,
|
||||
RepairResult,
|
||||
)
|
||||
from .analyze import (
|
||||
Finding,
|
||||
analyze,
|
||||
findings_by_tool,
|
||||
to_dict,
|
||||
)
|
||||
from .config import (
|
||||
ColumnStrategyConfig,
|
||||
@@ -105,6 +115,15 @@ __all__ = [
|
||||
"detect_encoding",
|
||||
"detect_delimiter",
|
||||
"detect_header_row",
|
||||
"read_csv_repaired",
|
||||
"repair_bytes",
|
||||
"RepairAction",
|
||||
"RepairResult",
|
||||
# Analyzer
|
||||
"Finding",
|
||||
"analyze",
|
||||
"findings_by_tool",
|
||||
"to_dict",
|
||||
# Config
|
||||
"DeduplicationConfig",
|
||||
"StrategyConfig",
|
||||
|
||||
619
src/core/analyze.py
Normal file
619
src/core/analyze.py
Normal file
@@ -0,0 +1,619 @@
|
||||
"""Upload-time data quality analyzer.
|
||||
|
||||
Runs a fast, read-only scan over an uploaded file (or DataFrame) and
|
||||
returns a list of :class:`Finding` objects. Each finding names the issue,
|
||||
how many cells/rows are affected, and which downstream tool can address
|
||||
it. The GUI consumes findings to badge tool nav items; the CLI prints
|
||||
them as a table.
|
||||
|
||||
The analyzer is *purely advisory*: it never mutates data, never runs a
|
||||
tool, and is safe to skip. Treat it as a guided onboarding step, not a
|
||||
hard gate on the upload flow.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, Literal, Optional
|
||||
|
||||
import pandas as pd
|
||||
from pandas.api import types as pdtypes
|
||||
|
||||
from .io import RepairResult, repair_bytes, detect_encoding, detect_delimiter
|
||||
|
||||
Severity = Literal["info", "warn", "error"]
|
||||
|
||||
|
||||
# Identifiers of the tools a finding can point at. They follow the
# 0N_<name> convention of the script set and live here so the detectors
# do not depend on the GUI's display layer.
TOOL_DEDUPLICATOR = "01_deduplicator"
TOOL_TEXT_CLEANER = "02_text_cleaner"
TOOL_FORMAT_STANDARDIZER = "03_format_standardizer"
TOOL_MISSING_HANDLER = "04_missing_handler"
|
||||
|
||||
|
||||
@dataclass
class Finding:
    """A single data-quality issue reported by the analyzer.

    Attributes
    ----------
    id
        Stable, never-localized identifier such as
        ``"smart_quotes_in_data"``; used for GUI lookup and in JSON exports.
    severity
        One of ``"info"`` (FYI), ``"warn"`` (likely needs cleanup) or
        ``"error"`` (will block downstream work).
    tool
        Id of the tool that can address the finding; an empty string marks
        a purely informational finding.
    count
        How many cells (or rows) are affected.
    description
        One-sentence human-readable summary for banners and tooltips.
    column
        Name of the column the finding is scoped to, or ``None`` for
        whole-frame / file-level findings.
    samples
        A handful of ``(row, column, value)`` tuples for display. Producers
        are expected to cap this at five to keep JSON exports compact.
    """

    id: str
    severity: Severity
    tool: str
    count: int
    description: str
    column: Optional[str] = field(default=None)
    samples: list[tuple[int, str, str]] = field(default_factory=list)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-cell character classes (kept independent of text_clean to avoid an
|
||||
# import cycle and to keep the analyzer self-contained).
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_SMART_QUOTE_CHARS = set("“”‘’„‟«»′″")
|
||||
_DASH_ELLIPSIS_CHARS = set("–—―−…")
|
||||
_NBSP_LIKE_CHARS = set(" ")
|
||||
_ZERO_WIDTH_CHARS = set("")
|
||||
|
||||
_NULL_LIKE = {
|
||||
"n/a", "na", "nan", "null", "none", "#n/a", "#na", "-", "--",
|
||||
"tbd", "unknown", "n.a.", "(null)",
|
||||
}
|
||||
|
||||
# Mojibake fingerprints: classic UTF-8-as-cp1252 corruptions.
|
||||
_MOJIBAKE_PATTERNS = re.compile(
|
||||
r"Ã[©¨¢¤¶]" # café -> café, étage -> étage etc.
|
||||
r"|â€[™œžs˜“”]" # don't -> don’t
|
||||
r"|Â[ -¿]"
|
||||
)
|
||||
|
||||
_LEADING_ZERO_ID_RE = re.compile(r"^0\d{2,}$")
|
||||
_DIGITS_RE = re.compile(r"^\d+$")
|
||||
_EMAIL_LIKE_COL = re.compile(r"e?[ -_]?mail|^email|address$", re.IGNORECASE)
|
||||
|
||||
|
||||
def _has_any(text: str, chars: set[str]) -> bool:
|
||||
return any(c in chars for c in text)
|
||||
|
||||
|
||||
def _samples(rows: Iterable[tuple[int, str, str]], limit: int = 5) -> list[tuple[int, str, str]]:
|
||||
out: list[tuple[int, str, str]] = []
|
||||
for item in rows:
|
||||
out.append(item)
|
||||
if len(out) >= limit:
|
||||
break
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Detectors
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _detect_smart_punctuation(df: pd.DataFrame) -> list[Finding]:
|
||||
affected_cells = 0
|
||||
sample_rows: list[tuple[int, str, str]] = []
|
||||
for col in df.columns:
|
||||
for row_idx, val in enumerate(df[col].tolist()):
|
||||
if not isinstance(val, str):
|
||||
continue
|
||||
if _has_any(val, _SMART_QUOTE_CHARS) or _has_any(val, _DASH_ELLIPSIS_CHARS):
|
||||
affected_cells += 1
|
||||
if len(sample_rows) < 5:
|
||||
sample_rows.append((row_idx, str(col), val))
|
||||
if not affected_cells:
|
||||
return []
|
||||
return [Finding(
|
||||
id="smart_punctuation_in_data",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=affected_cells,
|
||||
description=(
|
||||
f"{affected_cells} cell(s) contain curly quotes, em/en dashes, "
|
||||
f"or ellipsis characters. These break string equality joins and "
|
||||
f"regex patterns."
|
||||
),
|
||||
samples=sample_rows,
|
||||
)]
|
||||
|
||||
|
||||
def _detect_invisible_chars(df: pd.DataFrame) -> list[Finding]:
|
||||
nbsp_cells = 0
|
||||
zw_cells = 0
|
||||
nbsp_samples: list[tuple[int, str, str]] = []
|
||||
zw_samples: list[tuple[int, str, str]] = []
|
||||
for col in df.columns:
|
||||
for row_idx, val in enumerate(df[col].tolist()):
|
||||
if not isinstance(val, str):
|
||||
continue
|
||||
if _has_any(val, _NBSP_LIKE_CHARS):
|
||||
nbsp_cells += 1
|
||||
if len(nbsp_samples) < 5:
|
||||
nbsp_samples.append((row_idx, str(col), val))
|
||||
if _has_any(val, _ZERO_WIDTH_CHARS):
|
||||
zw_cells += 1
|
||||
if len(zw_samples) < 5:
|
||||
zw_samples.append((row_idx, str(col), val))
|
||||
findings: list[Finding] = []
|
||||
if nbsp_cells:
|
||||
findings.append(Finding(
|
||||
id="nbsp_or_unicode_whitespace",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=nbsp_cells,
|
||||
description=(
|
||||
f"{nbsp_cells} cell(s) contain non-breaking or other Unicode "
|
||||
f"spaces. These look identical to a regular space but break "
|
||||
f"join keys."
|
||||
),
|
||||
samples=nbsp_samples,
|
||||
))
|
||||
if zw_cells:
|
||||
findings.append(Finding(
|
||||
id="zero_width_or_invisible",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=zw_cells,
|
||||
description=(
|
||||
f"{zw_cells} cell(s) contain zero-width or invisible "
|
||||
f"characters (ZWSP, ZWJ, soft hyphen, BOM, bidi marks)."
|
||||
),
|
||||
samples=zw_samples,
|
||||
))
|
||||
# Headers carry the same risks; flag separately so the user sees that
|
||||
# df["Email"] vs df["Email"] is the issue.
|
||||
bad_headers = [
|
||||
c for c in df.columns
|
||||
if isinstance(c, str) and (
|
||||
c != c.strip()
|
||||
or _has_any(c, _NBSP_LIKE_CHARS)
|
||||
or _has_any(c, _ZERO_WIDTH_CHARS)
|
||||
or _has_any(c, _SMART_QUOTE_CHARS)
|
||||
)
|
||||
]
|
||||
if bad_headers:
|
||||
findings.append(Finding(
|
||||
id="dirty_column_headers",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=len(bad_headers),
|
||||
description=(
|
||||
f"{len(bad_headers)} column header(s) contain whitespace, "
|
||||
f"smart quotes, or invisible characters. These break "
|
||||
f"df['col'] lookups."
|
||||
),
|
||||
samples=[(0, h, h) for h in bad_headers[:5]],
|
||||
))
|
||||
return findings
|
||||
|
||||
|
||||
def _detect_whitespace_padding(df: pd.DataFrame) -> list[Finding]:
|
||||
affected = 0
|
||||
samples: list[tuple[int, str, str]] = []
|
||||
for col in df.columns:
|
||||
for row_idx, val in enumerate(df[col].tolist()):
|
||||
if not isinstance(val, str) or not val:
|
||||
continue
|
||||
if val != val.strip() or " " in val:
|
||||
affected += 1
|
||||
if len(samples) < 5:
|
||||
samples.append((row_idx, str(col), val))
|
||||
if not affected:
|
||||
return []
|
||||
return [Finding(
|
||||
id="whitespace_padding",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=affected,
|
||||
description=(
|
||||
f"{affected} cell(s) have leading/trailing whitespace or "
|
||||
f"multi-space internal runs. Common cause of failed joins."
|
||||
),
|
||||
samples=samples,
|
||||
)]
|
||||
|
||||
|
||||
def _detect_null_like_sentinels(df: pd.DataFrame) -> list[Finding]:
|
||||
affected = 0
|
||||
samples: list[tuple[int, str, str]] = []
|
||||
cols_with_sentinels: set[str] = set()
|
||||
for col in df.columns:
|
||||
for row_idx, val in enumerate(df[col].tolist()):
|
||||
if not isinstance(val, str):
|
||||
continue
|
||||
if val.strip().lower() in _NULL_LIKE:
|
||||
affected += 1
|
||||
cols_with_sentinels.add(str(col))
|
||||
if len(samples) < 5:
|
||||
samples.append((row_idx, str(col), val))
|
||||
if not affected:
|
||||
return []
|
||||
return [Finding(
|
||||
id="null_like_sentinels",
|
||||
severity="info",
|
||||
tool=TOOL_MISSING_HANDLER,
|
||||
count=affected,
|
||||
description=(
|
||||
f"{affected} cell(s) across {len(cols_with_sentinels)} column(s) "
|
||||
f"look like disguised nulls (N/A, NaN, None, '-'). Decide what "
|
||||
f"counts as missing in the missing-value handler."
|
||||
),
|
||||
samples=samples,
|
||||
)]
|
||||
|
||||
|
||||
def _detect_mojibake(df: pd.DataFrame) -> list[Finding]:
|
||||
affected = 0
|
||||
samples: list[tuple[int, str, str]] = []
|
||||
for col in df.columns:
|
||||
for row_idx, val in enumerate(df[col].tolist()):
|
||||
if not isinstance(val, str):
|
||||
continue
|
||||
if _MOJIBAKE_PATTERNS.search(val):
|
||||
affected += 1
|
||||
if len(samples) < 5:
|
||||
samples.append((row_idx, str(col), val))
|
||||
if not affected:
|
||||
return []
|
||||
return [Finding(
|
||||
id="suspected_mojibake",
|
||||
severity="info",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=affected,
|
||||
description=(
|
||||
f"{affected} cell(s) match common UTF-8-as-cp1252 mojibake "
|
||||
f"patterns (é, ’, etc.). Auto-repair is opt-in (Tier 2)."
|
||||
),
|
||||
samples=samples,
|
||||
)]
|
||||
|
||||
|
||||
def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
|
||||
findings: list[Finding] = []
|
||||
for col in df.columns:
|
||||
if not isinstance(col, str) or not _EMAIL_LIKE_COL.search(col):
|
||||
continue
|
||||
values = [v for v in df[col].tolist() if isinstance(v, str) and v.strip()]
|
||||
if not values:
|
||||
continue
|
||||
has_upper = any(any(c.isupper() for c in v) for v in values)
|
||||
has_lower = any(any(c.islower() for c in v) for v in values)
|
||||
if has_upper and has_lower:
|
||||
samples = [(i, col, v) for i, v in enumerate(values[:5])]
|
||||
findings.append(Finding(
|
||||
id="mixed_case_email_column",
|
||||
severity="info",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=len(values),
|
||||
description=(
|
||||
f"Column '{col}' has mixed case across email values. "
|
||||
f"Lowercasing emails before dedup avoids false negatives."
|
||||
),
|
||||
column=col,
|
||||
samples=samples,
|
||||
))
|
||||
return findings
|
||||
|
||||
|
||||
def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
|
||||
"""Detect duplicate rows that differ only in case or padding.
|
||||
|
||||
Cheap pass: lowercase + strip every string column, then count
|
||||
``df.duplicated()``. Catches the most common dedup signal (the same
|
||||
customer entered twice with subtle formatting differences) without
|
||||
paying the cost of fuzzy matching. Anything more sophisticated belongs
|
||||
in tool 01.
|
||||
"""
|
||||
if len(df) < 2:
|
||||
return []
|
||||
norm = df.copy()
|
||||
for col in norm.columns:
|
||||
if pdtypes.is_object_dtype(norm[col]) or pdtypes.is_string_dtype(norm[col]):
|
||||
norm[col] = (
|
||||
norm[col].astype(str).str.strip().str.lower()
|
||||
)
|
||||
dup_mask = norm.duplicated(keep=False)
|
||||
n_dupes = int(dup_mask.sum())
|
||||
if n_dupes < 2:
|
||||
return []
|
||||
# Count *extra* copies, not total members of duplicate groups.
|
||||
n_groups = int(norm[dup_mask].drop_duplicates().shape[0])
|
||||
samples: list[tuple[int, str, str]] = []
|
||||
for i in df[dup_mask].index[:5]:
|
||||
# Render the first textual column's value as a sample.
|
||||
col_name = next(
|
||||
(c for c in df.columns if isinstance(df[c].iloc[i], str)),
|
||||
df.columns[0],
|
||||
)
|
||||
samples.append((int(i), str(col_name), str(df[col_name].iloc[i])))
|
||||
return [Finding(
|
||||
id="near_duplicate_rows",
|
||||
severity="info",
|
||||
tool=TOOL_DEDUPLICATOR,
|
||||
count=n_dupes,
|
||||
description=(
|
||||
f"{n_dupes} row(s) across ~{n_groups} group(s) are duplicates "
|
||||
f"after stripping whitespace and lowercasing string columns. "
|
||||
f"Run the deduplicator to merge or remove."
|
||||
),
|
||||
samples=samples,
|
||||
)]
|
||||
|
||||
|
||||
def _detect_leading_zero_ids(df: pd.DataFrame) -> list[Finding]:
|
||||
"""Informational: a column where most values are zero-padded digit IDs.
|
||||
|
||||
Worth surfacing because Excel re-opens often strip them — the user
|
||||
should know they're there before any Excel round-trip.
|
||||
"""
|
||||
findings: list[Finding] = []
|
||||
for col in df.columns:
|
||||
values = [v for v in df[col].tolist() if isinstance(v, str) and v.strip()]
|
||||
if len(values) < 5:
|
||||
continue
|
||||
digit_count = sum(1 for v in values if _DIGITS_RE.match(v))
|
||||
leading_zero_count = sum(1 for v in values if _LEADING_ZERO_ID_RE.match(v))
|
||||
# >80% are zero-padded digit IDs of the same length-ish.
|
||||
if digit_count >= 0.8 * len(values) and leading_zero_count >= 0.5 * len(values):
|
||||
samples = [
|
||||
(i, str(col), v)
|
||||
for i, v in enumerate(values[:5])
|
||||
if _LEADING_ZERO_ID_RE.match(v)
|
||||
][:5]
|
||||
findings.append(Finding(
|
||||
id="leading_zero_ids",
|
||||
severity="info",
|
||||
tool="",
|
||||
count=leading_zero_count,
|
||||
description=(
|
||||
f"Column '{col}' contains zero-padded numeric IDs "
|
||||
f"({leading_zero_count}/{len(values)}). Excel will strip "
|
||||
f"the zeros on round-trip unless saved as text."
|
||||
),
|
||||
column=str(col),
|
||||
samples=samples,
|
||||
))
|
||||
return findings
|
||||
|
||||
|
||||
def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
|
||||
"""Flag files that mix CRLF, LF, and bare CR line terminators.
|
||||
|
||||
Mixed endings are a classic disaster pattern after multi-source concat
|
||||
(Windows + macOS + Linux exports stitched together). Operates on raw
|
||||
bytes only — DataFrame-mode :func:`analyze` skips this detector.
|
||||
"""
|
||||
if not raw:
|
||||
return []
|
||||
n_crlf = raw.count(b"\r\n")
|
||||
# Count standalone \r and \n (not part of \r\n) by subtracting overlaps.
|
||||
n_lf = raw.count(b"\n") - n_crlf
|
||||
n_cr = raw.count(b"\r") - n_crlf
|
||||
kinds_present = sum(1 for n in (n_crlf, n_lf, n_cr) if n > 0)
|
||||
if kinds_present <= 1:
|
||||
return []
|
||||
breakdown = []
|
||||
if n_crlf:
|
||||
breakdown.append(f"{n_crlf} CRLF")
|
||||
if n_lf:
|
||||
breakdown.append(f"{n_lf} LF")
|
||||
if n_cr:
|
||||
breakdown.append(f"{n_cr} CR")
|
||||
return [Finding(
|
||||
id="mixed_line_endings",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=kinds_present,
|
||||
description=(
|
||||
f"File mixes {kinds_present} line-ending styles "
|
||||
f"({', '.join(breakdown)}). Naive splits on one style produce "
|
||||
f"ghost rows or merged lines. Run the text cleaner to normalize."
|
||||
),
|
||||
)]
|
||||
|
||||
|
||||
def _findings_from_repair(repair: RepairResult) -> list[Finding]:
|
||||
"""Synthesize findings from a :class:`RepairResult`.
|
||||
|
||||
Each repair kind maps to a single info-severity finding so the GUI
|
||||
shows the user what the parser quietly fixed before they reached the
|
||||
tool pages.
|
||||
"""
|
||||
if not repair.changed and not repair.unrepairable_lines:
|
||||
return []
|
||||
summary = repair.summary()
|
||||
findings: list[Finding] = []
|
||||
if "strip_bom" in summary:
|
||||
findings.append(Finding(
|
||||
id="csv_bom_stripped",
|
||||
severity="info",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=1,
|
||||
description="UTF-8 BOM at file start was removed before parsing.",
|
||||
))
|
||||
if "strip_nul" in summary:
|
||||
nul_action = next(a for a in repair.actions if a.kind == "strip_nul")
|
||||
findings.append(Finding(
|
||||
id="csv_nul_stripped",
|
||||
severity="warn",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=1,
|
||||
description=(
|
||||
f"Embedded NUL bytes in the file were stripped before "
|
||||
f"parsing ({nul_action.detail})."
|
||||
),
|
||||
))
|
||||
if "fold_smart_quote" in summary:
|
||||
action = next(a for a in repair.actions if a.kind == "fold_smart_quote")
|
||||
findings.append(Finding(
|
||||
id="csv_smart_quotes_folded",
|
||||
severity="info",
|
||||
tool=TOOL_TEXT_CLEANER,
|
||||
count=1,
|
||||
description=(
|
||||
f"Smart double quotes were folded to ASCII before parsing "
|
||||
f"({action.detail})."
|
||||
),
|
||||
))
|
||||
if "quote_unquoted_delim" in summary:
|
||||
n = summary["quote_unquoted_delim"]
|
||||
findings.append(Finding(
|
||||
id="csv_unquoted_delimiters_repaired",
|
||||
severity="warn",
|
||||
tool="",
|
||||
count=n,
|
||||
description=(
|
||||
f"{n} row(s) had a delimiter inside an unquoted field "
|
||||
f"(e.g. '$1,500.00') and were merged during pre-parse repair."
|
||||
),
|
||||
))
|
||||
if repair.unrepairable_lines:
|
||||
n = len(repair.unrepairable_lines)
|
||||
findings.append(Finding(
|
||||
id="csv_unrepairable_rows",
|
||||
severity="error",
|
||||
tool="",
|
||||
count=n,
|
||||
description=(
|
||||
f"{n} row(s) had ambiguous structural problems and were "
|
||||
f"left as-is. Inspect lines: "
|
||||
f"{repair.unrepairable_lines[:10]}"
|
||||
),
|
||||
))
|
||||
return findings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def analyze(
    source: pd.DataFrame | str | Path,
    *,
    sample_rows: int = 1000,
    repair_result: Optional[RepairResult] = None,
) -> list[Finding]:
    """Scan *source* with every detector and collect the findings.

    Parameters
    ----------
    source
        A DataFrame already held in memory, or the path (``str`` or
        :class:`~pathlib.Path`) of a CSV/Excel file. Paths are loaded through
        :func:`_load_for_analysis`, which auto-detects encoding and delimiter
        for delimited text.
    sample_rows
        Upper bound on the number of rows handed to the detectors. A
        DataFrame longer than this is truncated with ``head``; the input
        frame itself is never mutated because the detectors work on a copy.
    repair_result
        Optional :class:`RepairResult` from an earlier pre-parse pass. When
        given it takes precedence over the one produced while loading a
        path, because the caller may have used non-default repair flags.

    Returns
    -------
    list[Finding]
        Findings in a fixed order: those synthesized from the repair result
        (``csv_*``), then the mixed-line-ending scan (only when raw bytes
        are available, i.e. delimited files loaded from a path), then the
        per-cell detectors.
    """
    raw_bytes: Optional[bytes] = None
    if not isinstance(source, (str, Path)):
        # In-memory frame: work on a private copy, capped at sample_rows.
        if len(source) > sample_rows:
            df = source.head(sample_rows).copy()
        else:
            df = source.copy()
    else:
        df, loaded_repair, raw_bytes = _load_for_analysis(
            Path(source), sample_rows=sample_rows,
        )
        # Only fall back to the loader's repair log when the caller did not
        # bring their own.
        if repair_result is None:
            repair_result = loaded_repair

    findings: list[Finding] = []
    if repair_result is not None:
        findings.extend(_findings_from_repair(repair_result))
    if raw_bytes is not None:
        findings.extend(_detect_mixed_line_endings(raw_bytes))

    # Per-cell detectors, run in the order their findings should be listed.
    cell_detectors = (
        _detect_smart_punctuation,
        _detect_invisible_chars,
        _detect_whitespace_padding,
        _detect_null_like_sentinels,
        _detect_mojibake,
        _detect_mixed_case_email,
        _detect_leading_zero_ids,
        _detect_near_duplicates,
    )
    for detect in cell_detectors:
        findings.extend(detect(df))
    return findings
|
||||
|
||||
|
||||
def _load_for_analysis(
    path: Path, *, sample_rows: int,
) -> tuple[pd.DataFrame, Optional[RepairResult], Optional[bytes]]:
    """Load *path* for scanning, applying the pre-parse repair to CSV input.

    Parameters
    ----------
    path
        File to load. ``.xlsx`` / ``.xls`` are read as Excel workbooks; any
        other suffix is treated as delimited text.
    sample_rows
        Maximum number of data rows to parse into the returned DataFrame.

    Returns
    -------
    tuple
        ``(df, repair_result, raw_bytes)``. The repair result and raw bytes
        are *None* for Excel files since the byte-level repair step
        (BOM/NUL/smart-quote folding) and line-ending scan are CSV-specific.

    Notes
    -----
    Only *parsing* is capped at *sample_rows*. For delimited text the whole
    file is still read into memory, because :func:`repair_bytes` and the
    line-ending scan operate on the complete byte string.
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        # openpyxl only understands the OOXML (.xlsx) format and rejects
        # legacy .xls workbooks, so only pin it for .xlsx and let pandas
        # pick the appropriate engine for .xls.
        df = pd.read_excel(
            path, dtype=str, keep_default_na=False,
            engine="openpyxl" if suffix == ".xlsx" else None,
            nrows=sample_rows,
        )
        return df, None, None

    enc = detect_encoding(path)
    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)

    # Local import keeps this helper self-contained.
    import io as _io

    # repair_bytes always emits UTF-8, regardless of the detected encoding.
    df = pd.read_csv(
        _io.BytesIO(repair.repaired_bytes),
        encoding="utf-8", delimiter=delim,
        dtype=str, keep_default_na=False, on_bad_lines="warn",
        nrows=sample_rows,
    )
    return df, repair, raw
|
||||
|
||||
|
||||
def to_dict(finding: Finding) -> dict[str, Any]:
    """Convert *finding* into a plain dict for the CLI ``--json`` output.

    Scalar attributes are copied under their own names; each
    ``(row, column, value)`` sample triple becomes a small dict with those
    three keys.
    """
    scalar_fields = ("id", "severity", "tool", "count", "description", "column")
    payload: dict[str, Any] = {
        name: getattr(finding, name) for name in scalar_fields
    }

    samples = []
    for row, column, value in finding.samples:
        samples.append({"row": row, "column": column, "value": value})
    payload["samples"] = samples
    return payload
|
||||
|
||||
|
||||
def findings_by_tool(findings: list[Finding]) -> dict[str, list[Finding]]:
    """Bucket *findings* by their ``tool`` id (used for GUI sidebar badges).

    Findings whose ``tool`` is empty are not attributed to any tool and are
    left out. Both the tool ids and the findings within each bucket keep
    their first-seen order.
    """
    grouped: dict[str, list[Finding]] = {}
    for finding in findings:
        tool_id = finding.tool
        if tool_id:
            if tool_id not in grouped:
                grouped[tool_id] = []
            grouped[tool_id].append(finding)
    return grouped
|
||||
343
src/core/io.py
343
src/core/io.py
@@ -4,6 +4,8 @@ from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Generator, Optional
|
||||
|
||||
@@ -135,6 +137,7 @@ def read_file(
|
||||
header_row: Optional[int] = None,
|
||||
sheet_name: Optional[str | int] = 0,
|
||||
chunk_size: Optional[int] = None,
|
||||
repair: bool = True,
|
||||
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
||||
"""Read a CSV, TSV, or Excel file into a DataFrame.
|
||||
|
||||
@@ -145,7 +148,13 @@ def read_file(
|
||||
delimiter : override detected delimiter (CSV only)
|
||||
header_row : 0-based row index for the header; auto-detected if *None*
|
||||
sheet_name : Excel sheet (name or 0-based index). Ignored for CSV.
|
||||
chunk_size : if set, return a generator of DataFrames (CSV only).
|
||||
chunk_size : if set, return a generator of DataFrames (CSV only). When
|
||||
*chunk_size* is set, *repair* is forced off because the pre-parse
|
||||
pass loads the entire file into memory.
|
||||
repair : run :func:`repair_bytes` over the raw CSV before parsing
|
||||
(default ``True``). Excel files always skip this step. Pass
|
||||
``repair=False`` when you specifically need pandas' raw view of
|
||||
the input.
|
||||
|
||||
Returns a DataFrame (or generator when *chunk_size* is set).
|
||||
"""
|
||||
@@ -163,6 +172,7 @@ def read_file(
|
||||
delimiter=delimiter,
|
||||
header_row=header_row,
|
||||
chunk_size=chunk_size,
|
||||
repair=repair,
|
||||
)
|
||||
|
||||
|
||||
@@ -173,15 +183,56 @@ def _read_csv(
|
||||
delimiter: Optional[str] = None,
|
||||
header_row: Optional[int] = None,
|
||||
chunk_size: Optional[int] = None,
|
||||
repair: bool = True,
|
||||
) -> pd.DataFrame | Generator[pd.DataFrame, None, None]:
|
||||
enc = encoding or detect_encoding(path)
|
||||
delim = delimiter or detect_delimiter(path, enc)
|
||||
hdr = header_row if header_row is not None else detect_header_row(path, enc, delim)
|
||||
|
||||
logger.debug("Reading CSV {} (encoding={}, delimiter={!r}, header_row={})",
|
||||
path.name, enc, delim, hdr)
|
||||
logger.debug(
|
||||
"Reading CSV {} (encoding={}, delimiter={!r}, header_row={}, repair={})",
|
||||
path.name, enc, delim, hdr, repair,
|
||||
)
|
||||
|
||||
kwargs: dict = dict(
|
||||
if chunk_size:
|
||||
# Streaming reads can't share memory with the repair pass; fall back
|
||||
# to direct pandas read so chunked workflows on huge files still
|
||||
# work.
|
||||
return pd.read_csv(
|
||||
filepath_or_buffer=path,
|
||||
encoding=enc,
|
||||
delimiter=delim,
|
||||
header=hdr,
|
||||
dtype=str,
|
||||
keep_default_na=False,
|
||||
on_bad_lines="warn",
|
||||
chunksize=chunk_size,
|
||||
)
|
||||
|
||||
if repair:
|
||||
raw = path.read_bytes()
|
||||
repair_result = repair_bytes(raw, encoding=enc, delimiter=delim)
|
||||
if repair_result.changed:
|
||||
logger.info(
|
||||
"Pre-parse repair on {}: {}", path.name, repair_result.summary(),
|
||||
)
|
||||
if repair_result.unrepairable_lines:
|
||||
logger.warning(
|
||||
"Pre-parse repair on {}: {} unrepairable line(s) at {}",
|
||||
path.name, len(repair_result.unrepairable_lines),
|
||||
repair_result.unrepairable_lines[:10],
|
||||
)
|
||||
return pd.read_csv(
|
||||
io.BytesIO(repair_result.repaired_bytes),
|
||||
encoding="utf-8",
|
||||
delimiter=delim,
|
||||
header=hdr,
|
||||
dtype=str,
|
||||
keep_default_na=False,
|
||||
on_bad_lines="warn",
|
||||
)
|
||||
|
||||
return pd.read_csv(
|
||||
filepath_or_buffer=path,
|
||||
encoding=enc,
|
||||
delimiter=delim,
|
||||
@@ -191,11 +242,6 @@ def _read_csv(
|
||||
on_bad_lines="warn",
|
||||
)
|
||||
|
||||
if chunk_size:
|
||||
return pd.read_csv(**kwargs, chunksize=chunk_size)
|
||||
|
||||
return pd.read_csv(**kwargs)
|
||||
|
||||
|
||||
def _read_excel(
|
||||
path: Path,
|
||||
@@ -245,3 +291,282 @@ def write_file(
|
||||
df.to_csv(out, index=False, encoding=encoding)
|
||||
logger.info("Wrote {} rows to {}", len(df), out)
|
||||
return out
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pre-parse repair (CSV / delimited text)
|
||||
# ---------------------------------------------------------------------------
|
||||
#
|
||||
# Some pollution patterns confuse pandas' parser before the cleaner can ever
|
||||
# see the data. Smart double quotes inside an unquoted field, NUL bytes, and
|
||||
# unquoted delimiters embedded in numeric/currency cells all cause structural
|
||||
# parse failures or silent truncation. These helpers operate on raw bytes
|
||||
# (or decoded text) and produce a parseable byte stream plus an audit log.
|
||||
#
|
||||
# Design notes:
|
||||
# - Single curly quotes (U+2018/U+2019) are NOT folded here: they don't
|
||||
# conflict with the default CSV quote char and the cell-level cleaner
|
||||
# handles them more accurately. Only double-quote-equivalents are folded.
|
||||
# - Delimiter-row repair only attempts the unambiguous case (one extra
|
||||
# field, one merge candidate that looks like currency/thousands-sep).
|
||||
# Anything else is logged as unrepairable and the line is left alone.
|
||||
|
||||
# Smart double-quote characters that confuse CSV parsing.
|
||||
_CSV_SMART_QUOTE_TRANS = str.maketrans({
|
||||
"“": '"', # LEFT DOUBLE QUOTATION MARK
|
||||
"”": '"', # RIGHT DOUBLE QUOTATION MARK
|
||||
"„": '"', # DOUBLE LOW-9 QUOTATION MARK
|
||||
"‟": '"', # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
|
||||
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"″": '"', # DOUBLE PRIME
|
||||
})
|
||||
|
||||
# A merged value is "currency-shaped" when it looks like $1,500.00 or 1.234,56
|
||||
# (i.e., a sequence of digits, separators, and an optional currency sigil).
|
||||
_CURRENCY_SHAPED = re.compile(r"^\s*[$€£¥]?\s*\d{1,3}([,.\s]\d{3})+([,.]\d+)?\s*$")
|
||||
# Or a plain decimal with thousands grouping (no currency sigil).
|
||||
_THOUSANDS_SHAPED = re.compile(r"^\s*\d{1,3}(,\d{3})+(\.\d+)?\s*$")
|
||||
|
||||
|
||||
@dataclass
class RepairAction:
    """A single modification applied to the input by the pre-parse pass."""

    # Category of the repair, e.g. "strip_bom", "strip_nul",
    # "fold_smart_quote" or "quote_unquoted_delim".
    kind: str
    # 1-indexed source line the repair applies to; None for file-level repairs.
    line: Optional[int]
    # Human-readable description of what was changed.
    detail: str
|
||||
|
||||
|
||||
@dataclass
class RepairResult:
    """What :func:`repair_bytes` hands back to its caller."""

    # The repaired file content.
    repaired_bytes: bytes
    # Audit log of every repair that was applied, in the order recorded.
    actions: list[RepairAction] = field(default_factory=list)
    # Line numbers of rows with a structural problem that was left untouched.
    unrepairable_lines: list[int] = field(default_factory=list)

    @property
    def changed(self) -> bool:
        """Whether at least one repair action was recorded."""
        return True if self.actions else False

    def summary(self) -> dict[str, int]:
        """Return how many actions of each kind were recorded."""
        counts: dict[str, int] = {}
        for action in self.actions:
            kind = action.kind
            if kind in counts:
                counts[kind] += 1
            else:
                counts[kind] = 1
        return counts
|
||||
|
||||
|
||||
def _merge_score(left: str, right: str, delimiter: str) -> int:
    """Score how likely ``left + delimiter + right`` was originally one field.

    Returns one of three tiers (higher = more confident):

    - ``3``: the re-joined value matches the strict currency-shaped or
      thousands-shaped pattern.
    - ``1``: loose signal only; the delimiter is ``,`` or ``.``, *left*
      contains a currency sigil followed by a digit or ends in a digit, and
      *right* starts with a digit.
    - ``0``: nothing suggests the two pieces belong together.

    The tiers let the caller disambiguate rows such as ``" $1,500.00 ,7"``,
    where both adjacent pairs look vaguely numeric but only the first one
    re-joins into a strict currency shape.
    """
    candidate = f"{left}{delimiter}{right}"
    if _CURRENCY_SHAPED.match(candidate) or _THOUSANDS_SHAPED.match(candidate):
        return 3

    # The loose heuristic only makes sense for separators used inside numbers.
    if delimiter not in ".,":
        return 0
    if re.match(r"\s*\d", right) is None:
        return 0

    sigil_then_digit = re.search(r"[$€£¥]\s*\d", left) is not None
    ends_in_digit = re.search(r"\d\s*$", left) is not None
    return 1 if (sigil_then_digit or ends_in_digit) else 0
|
||||
|
||||
|
||||
def _repair_extra_field_row(
|
||||
fields: list[str], expected: int, delimiter: str,
|
||||
) -> Optional[list[str]]:
|
||||
"""Try to merge one adjacent pair so the row has *expected* fields.
|
||||
|
||||
Returns the repaired field list, or *None* if no unambiguous merge exists.
|
||||
"""
|
||||
if len(fields) != expected + 1:
|
||||
return None
|
||||
scores = [
|
||||
(i, _merge_score(fields[i], fields[i + 1], delimiter))
|
||||
for i in range(len(fields) - 1)
|
||||
]
|
||||
best = max(s for _, s in scores)
|
||||
if best == 0:
|
||||
return None
|
||||
winners = [i for i, s in scores if s == best]
|
||||
if len(winners) != 1:
|
||||
return None
|
||||
i = winners[0]
|
||||
merged = f"{fields[i]}{delimiter}{fields[i + 1]}"
|
||||
return fields[:i] + [merged] + fields[i + 2:]
|
||||
|
||||
|
||||
def _nul_is_single_byte(encoding: str) -> bool:
    """True when *encoding* stores U+0000 as exactly one ``0x00`` byte."""
    try:
        return "\x00".encode(encoding) == b"\x00"
    except (LookupError, UnicodeEncodeError):
        # Unknown / non-text codec: the decode step falls back to UTF-8,
        # where NUL is a single byte.
        return True


def repair_bytes(
    raw: bytes,
    *,
    encoding: str = "utf-8",
    delimiter: str = ",",
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> RepairResult:
    """Pre-parse repair on a raw delimited file.

    Performs, in order:

    1. Strip a leading UTF-8 BOM (always).
    2. Strip embedded NUL characters (*strip_nul*); the C parser truncates
       fields at NUL.
    3. Fold smart double quotes (curly, guillemet, double-prime) to ASCII
       ``"`` (*fold_quotes*).
    4. Per-row repair when one rogue delimiter is embedded in a field that
       looks like currency or thousands-grouped digits (*repair_delims*).

    Single curly quotes and other punctuation are deferred to the cell-level
    cleaner; this layer only fixes things that break CSV *parsing*.

    Parameters
    ----------
    raw
        File content exactly as read from disk.
    encoding
        Encoding used to decode *raw*. If decoding fails (or the codec is
        unknown) the data is decoded as UTF-8 with replacement characters and
        a ``decode_replaced`` action is recorded.
    delimiter
        Field delimiter used by the per-row repair.
    fold_quotes, strip_nul, repair_delims
        Toggles for steps 3, 2 and 4 respectively.

    Returns
    -------
    RepairResult
        ``repaired_bytes`` is always UTF-8 encoded, whatever *encoding* was.
    """
    actions: list[RepairAction] = []
    unrepairable: list[int] = []
    data = raw

    # 1. BOM
    if data.startswith(b"\xef\xbb\xbf"):
        data = data[3:]
        actions.append(RepairAction(kind="strip_bom", line=None, detail="UTF-8 BOM removed"))

    # 2. NUL
    # In encodings with multi-byte code units (UTF-16/UTF-32) 0x00 bytes are
    # part of ordinary characters, so removing them at byte level would
    # corrupt the whole file. For those encodings NULs are stripped after
    # decoding instead; for everything else the byte-level strip is kept.
    strip_before_decode = strip_nul and _nul_is_single_byte(encoding)
    if strip_before_decode and b"\x00" in data:
        before = data.count(b"\x00")
        data = data.replace(b"\x00", b"")
        actions.append(RepairAction(
            kind="strip_nul", line=None,
            detail=f"removed {before} NUL byte(s)",
        ))

    # Decode for character-level work.
    try:
        text = data.decode(encoding)
    except (UnicodeDecodeError, LookupError):
        text = data.decode("utf-8", errors="replace")
        actions.append(RepairAction(
            kind="decode_replaced", line=None,
            detail=f"decode errors under {encoding}; replaced with U+FFFD",
        ))

    # 2b. NUL characters for wide encodings (see note above).
    if strip_nul and not strip_before_decode and "\x00" in text:
        before = text.count("\x00")
        text = text.replace("\x00", "")
        actions.append(RepairAction(
            kind="strip_nul", line=None,
            detail=f"removed {before} NUL byte(s)",
        ))

    # 3. Smart double quotes
    if fold_quotes:
        folded = text.translate(_CSV_SMART_QUOTE_TRANS)
        if folded != text:
            # The translation is one-to-one, so a positional comparison
            # counts exactly how many characters were replaced.
            n = sum(1 for a, b in zip(text, folded) if a != b)
            actions.append(RepairAction(
                kind="fold_smart_quote", line=None,
                detail=f"replaced {n} smart double-quote char(s) with ASCII '\"'",
            ))
            text = folded

    # 4. Per-row delimiter repair
    if repair_delims:
        text, row_actions, unrepairable = _repair_rows(text, delimiter)
        actions.extend(row_actions)

    return RepairResult(
        repaired_bytes=text.encode("utf-8"),
        actions=actions,
        unrepairable_lines=unrepairable,
    )
|
||||
|
||||
|
||||
def _repair_rows(
|
||||
text: str, delimiter: str,
|
||||
) -> tuple[str, list[RepairAction], list[int]]:
|
||||
"""Per-line field-count repair. Operates on already-decoded text."""
|
||||
actions: list[RepairAction] = []
|
||||
unrepairable: list[int] = []
|
||||
|
||||
reader = csv.reader(io.StringIO(text), delimiter=delimiter)
|
||||
rows = list(reader)
|
||||
if not rows:
|
||||
return text, actions, unrepairable
|
||||
|
||||
expected = len(rows[0])
|
||||
repaired_rows: list[list[str]] = [rows[0]]
|
||||
needs_rewrite = False
|
||||
|
||||
for idx, row in enumerate(rows[1:], start=2): # 1-indexed; header is line 1
|
||||
if len(row) == expected or not row:
|
||||
repaired_rows.append(row)
|
||||
continue
|
||||
if len(row) > expected:
|
||||
fixed = _repair_extra_field_row(row, expected, delimiter)
|
||||
if fixed is not None:
|
||||
repaired_rows.append(fixed)
|
||||
needs_rewrite = True
|
||||
actions.append(RepairAction(
|
||||
kind="quote_unquoted_delim", line=idx,
|
||||
detail=(
|
||||
f"line {idx}: merged adjacent fields to fix "
|
||||
f"unquoted '{delimiter}' (saw {len(row)} fields, "
|
||||
f"expected {expected})"
|
||||
),
|
||||
))
|
||||
continue
|
||||
unrepairable.append(idx)
|
||||
repaired_rows.append(row)
|
||||
else:
|
||||
# Too few fields: leave alone, log info-level only.
|
||||
unrepairable.append(idx)
|
||||
repaired_rows.append(row)
|
||||
|
||||
if not needs_rewrite:
|
||||
return text, actions, unrepairable
|
||||
|
||||
buf = io.StringIO()
|
||||
writer = csv.writer(buf, delimiter=delimiter, lineterminator="\n")
|
||||
for row in repaired_rows:
|
||||
writer.writerow(row)
|
||||
return buf.getvalue(), actions, unrepairable
|
||||
|
||||
|
||||
def read_csv_repaired(
    path: str | Path,
    *,
    encoding: Optional[str] = None,
    delimiter: Optional[str] = None,
    header_row: Optional[int] = None,
    fold_quotes: bool = True,
    strip_nul: bool = True,
    repair_delims: bool = True,
) -> tuple[pd.DataFrame, RepairResult]:
    """Run :func:`repair_bytes` over a CSV file, then parse the result.

    Parameters
    ----------
    path
        Location of the delimited file.
    encoding, delimiter
        Explicit overrides; when omitted they are auto-detected from the file.
    header_row
        0-based index of the header row; the first row when *None*.
    fold_quotes, strip_nul, repair_delims
        Passed straight through to :func:`repair_bytes`.

    Returns
    -------
    tuple
        ``(df, repair_result)`` so callers can surface the action log. All
        columns are read as strings with no NA inference.
    """
    csv_path = Path(path)
    resolved_encoding = encoding or detect_encoding(csv_path)
    resolved_delimiter = delimiter or detect_delimiter(csv_path, resolved_encoding)

    result = repair_bytes(
        csv_path.read_bytes(),
        encoding=resolved_encoding,
        delimiter=resolved_delimiter,
        fold_quotes=fold_quotes,
        strip_nul=strip_nul,
        repair_delims=repair_delims,
    )

    header_index = 0 if header_row is None else header_row
    # The repaired stream is always UTF-8, whatever the source encoding was.
    frame = pd.read_csv(
        io.BytesIO(result.repaired_bytes),
        encoding="utf-8",
        delimiter=resolved_delimiter,
        header=header_index,
        dtype=str,
        keep_default_na=False,
        on_bad_lines="warn",
    )
    if result.actions:
        logger.info("Pre-parse repair on {}: {}", csv_path.name, result.summary())
    return frame, result
|
||||
|
||||
@@ -40,6 +40,10 @@ _SMART_CHARS: dict[str, str] = {
|
||||
"―": "-", # HORIZONTAL BAR
|
||||
"−": "-", # MINUS SIGN
|
||||
"…": "...", # HORIZONTAL ELLIPSIS
|
||||
"′": "'", # PRIME (foot / minute marker)
|
||||
"″": '"', # DOUBLE PRIME (inch / second marker)
|
||||
"«": '"', # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
"»": '"', # RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
|
||||
" ": " ", # NO-BREAK SPACE
|
||||
" ": " ", # NARROW NO-BREAK SPACE
|
||||
" ": " ", # THIN SPACE
|
||||
@@ -62,6 +66,7 @@ _ZERO_WIDTH = (
|
||||
"" # LEFT-TO-RIGHT MARK
|
||||
"" # RIGHT-TO-LEFT MARK
|
||||
"" # ZERO WIDTH NO-BREAK SPACE / BOM
|
||||
"" # SOFT HYPHEN
|
||||
)
|
||||
_ZERO_WIDTH_RE = re.compile(f"[{_ZERO_WIDTH}]")
|
||||
|
||||
@@ -87,12 +92,76 @@ def collapse_whitespace(s: str) -> str:
|
||||
Preserves leading/trailing whitespace boundaries (use ``trim`` to remove
|
||||
them). Tabs and other whitespace inside the string become a single
|
||||
regular space.
|
||||
|
||||
This is the *raw* operation — it always collapses. The cell-level
|
||||
pipeline uses :func:`_smart_collapse_whitespace` instead, which skips
|
||||
cells that look structured (numeric, dated, or phone-shaped) per
|
||||
TEST-CASES.md §4.17.
|
||||
"""
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
return _WHITESPACE_RUN_RE.sub(" ", s)
|
||||
|
||||
|
||||
# Cell-shape predicates used to skip ``collapse_whitespace`` on values that
|
||||
# carry meaningful internal whitespace (European thousands separators,
|
||||
# phone formatting, dates with space-separated tokens).
|
||||
|
||||
# Numeric: optional sign / currency, digits with optional thousand-grouping
|
||||
# by comma, dot, or single space, and optional decimal portion.
|
||||
_NUMERIC_SHAPED = re.compile(
|
||||
r"^\s*[$€£¥]?\s*[+-]?\d{1,3}(?:[, ]\d{3})+(?:[.,]\d+)?\s*$"
|
||||
r"|^\s*[$€£¥]?\s*[+-]?\d+(?:[.,]\d+)?\s*$"
|
||||
)
|
||||
# Date: ISO, slash, or dot separators with two- or four-digit year, plus
|
||||
# the ``Mon DD YYYY`` / ``DD Mon YYYY`` shapes.
|
||||
_DATE_SHAPED = re.compile(
|
||||
r"^\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}\s*$"
|
||||
r"|^\s*\d{1,2}\s+[A-Za-z]{3,9}\s+\d{2,4}\s*$"
|
||||
r"|^\s*[A-Za-z]{3,9}\s+\d{1,2}\s+\d{2,4}\s*$"
|
||||
)
|
||||
# Phone: a string that's mostly digits with parens / dots / dashes / +
|
||||
# / spaces, contains at least 7 digits total, and has no letters.
|
||||
_PHONE_DIGIT_RE = re.compile(r"\d")
|
||||
_PHONE_NON_DIGIT_RE = re.compile(r"[A-Za-z]")
|
||||
_PHONE_ALLOWED_RE = re.compile(r"^[\d\s().+\-]+$")
|
||||
|
||||
|
||||
def _looks_structured(s: str) -> bool:
|
||||
"""True when *s* looks numeric, dated, or phone-shaped.
|
||||
|
||||
Used by the pipeline-level collapse to leave meaningful internal
|
||||
whitespace alone (``1 234`` European thousand-sep, ``(555) 123-4567``
|
||||
phone formatting, ``Jan 15 2024`` date, etc.). Conservative on purpose:
|
||||
a false negative just means the cell gets collapsed (the existing
|
||||
behavior); a false positive leaves intentional double spaces in free
|
||||
text, which is a worse outcome.
|
||||
"""
|
||||
if not s or not isinstance(s, str):
|
||||
return False
|
||||
stripped = s.strip()
|
||||
if not stripped:
|
||||
return False
|
||||
if _NUMERIC_SHAPED.match(stripped) or _DATE_SHAPED.match(stripped):
|
||||
return True
|
||||
if (
|
||||
_PHONE_ALLOWED_RE.match(stripped)
|
||||
and not _PHONE_NON_DIGIT_RE.search(stripped)
|
||||
and len(_PHONE_DIGIT_RE.findall(stripped)) >= 7
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def _smart_collapse_whitespace(s: str) -> str:
|
||||
"""Pipeline variant of :func:`collapse_whitespace` that skips structured cells."""
|
||||
if not isinstance(s, str):
|
||||
return s
|
||||
if _looks_structured(s):
|
||||
return s
|
||||
return _WHITESPACE_RUN_RE.sub(" ", s)
|
||||
|
||||
|
||||
def to_nfc(s: str) -> str:
|
||||
"""Apply Unicode NFC (canonical composition)."""
|
||||
if not isinstance(s, str):
|
||||
@@ -159,27 +228,37 @@ def _is_all_caps_token(token: str) -> bool:
|
||||
def smart_title_case(s: str) -> str:
|
||||
"""Title-case that preserves all-caps tokens and lowercases mid-string particles.
|
||||
|
||||
- ``USA`` stays ``USA``.
|
||||
- ``USA`` stays ``USA`` when surrounded by mixed-case words (acronym).
|
||||
- ``ALICE SMITH`` becomes ``Alice Smith`` (entire string is shouting).
|
||||
- ``of``, ``and``, ``the``, etc. stay lowercase except as the first/last word.
|
||||
- Apostrophes inside words don't restart capitalization (``O'Neil``).
|
||||
"""
|
||||
if not isinstance(s, str) or not s:
|
||||
return s
|
||||
tokens = s.split(" ")
|
||||
# If every cased token is all-caps, treat the whole string as SHOUT and
|
||||
# title-case it. Otherwise preserve all-caps tokens as acronyms.
|
||||
cased_tokens = [t for t in tokens if any(c.isalpha() for c in t)]
|
||||
all_shouting = bool(cased_tokens) and all(
|
||||
not any(c.islower() for c in t) for t in cased_tokens
|
||||
)
|
||||
out: list[str] = []
|
||||
last_idx = len(tokens) - 1
|
||||
for i, tok in enumerate(tokens):
|
||||
if not tok:
|
||||
out.append(tok)
|
||||
continue
|
||||
if _is_all_caps_token(tok):
|
||||
if not all_shouting and _is_all_caps_token(tok):
|
||||
out.append(tok)
|
||||
continue
|
||||
lowered = tok.lower()
|
||||
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
|
||||
out.append(lowered)
|
||||
continue
|
||||
# Capitalize first cased character; preserve apostrophes/hyphens
|
||||
# Capitalize first cased character. Inside a token, preserve the
|
||||
# original capitalization of the letter immediately after an
|
||||
# apostrophe so name patterns like ``O'Connor``/``D'Angelo`` survive
|
||||
# while ``o'neil`` -> ``O'neil`` stays lowercase.
|
||||
chars = list(tok)
|
||||
capitalized = False
|
||||
for j, c in enumerate(chars):
|
||||
@@ -187,6 +266,11 @@ def smart_title_case(s: str) -> str:
|
||||
if not capitalized:
|
||||
chars[j] = c.upper()
|
||||
capitalized = True
|
||||
else:
|
||||
prev = chars[j - 1] if j > 0 else ""
|
||||
if prev == "'" and c.isupper():
|
||||
# Preserve original uppercase after apostrophe.
|
||||
pass
|
||||
else:
|
||||
chars[j] = c.lower()
|
||||
out.append("".join(chars))
|
||||
@@ -291,6 +375,11 @@ class CleanOptions:
|
||||
strip_control: bool = True
|
||||
normalize_line_endings: bool = True
|
||||
|
||||
# Apply the same character-level pipeline to column headers. Headers carry
|
||||
# the same pollution as data cells (NBSP padding, smart quotes, ZWSP);
|
||||
# not cleaning them silently breaks df["col"] lookups downstream.
|
||||
clean_headers: bool = True
|
||||
|
||||
# Case conversion: either a single mode applied to all selected columns,
|
||||
# or a dict mapping column name -> mode for per-column control.
|
||||
case: Optional[CaseMode] = None
|
||||
@@ -373,7 +462,9 @@ def _build_pipeline(options: CleanOptions) -> list[tuple[str, Callable[[str], st
|
||||
if options.strip_zero_width:
|
||||
ops.append(("strip_zero_width", strip_zero_width))
|
||||
if options.collapse_whitespace:
|
||||
ops.append(("collapse_whitespace", collapse_whitespace))
|
||||
# The pipeline uses the structured-cell-aware variant so phone /
|
||||
# date / numeric cells keep their meaningful internal whitespace.
|
||||
ops.append(("collapse_whitespace", _smart_collapse_whitespace))
|
||||
if options.trim:
|
||||
ops.append(("trim", trim))
|
||||
return ops
|
||||
@@ -440,6 +531,15 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
|
||||
out = df.copy()
|
||||
columns = _select_columns(out, options)
|
||||
|
||||
if options.clean_headers:
|
||||
new_columns = [clean_value(c, options)[0] for c in out.columns]
|
||||
if new_columns != list(out.columns):
|
||||
# Track column mapping so case_columns/columns/skip_columns based
|
||||
# on the original (dirty) names continue to work after rename.
|
||||
rename = dict(zip(out.columns, new_columns))
|
||||
columns = [rename.get(c, c) for c in columns]
|
||||
out.columns = new_columns
|
||||
|
||||
case_per_col: dict[str, CaseMode] = dict(options.case_columns)
|
||||
if options.case is not None:
|
||||
for c in columns:
|
||||
|
||||
@@ -21,7 +21,11 @@ if str(_project_root) not in sys.path:
|
||||
# Page config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import (
|
||||
findings_count_for_tool,
|
||||
hide_streamlit_chrome,
|
||||
upload_and_analyze_section,
|
||||
)
|
||||
|
||||
st.set_page_config(
|
||||
page_title="DataTools — Data Cleaning Mastery",
|
||||
@@ -41,6 +45,14 @@ st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular d
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Upload & analyze (optional onboarding step)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
upload_and_analyze_section()
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tool cards
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -52,13 +64,15 @@ TOOLS = [
|
||||
"description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
|
||||
"status": "Ready",
|
||||
"page": "1_Deduplicator",
|
||||
"tool_id": "01_deduplicator",
|
||||
},
|
||||
{
|
||||
"icon": "✂️",
|
||||
"name": "Text Cleaner",
|
||||
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
|
||||
"status": "Coming Soon",
|
||||
"status": "Ready",
|
||||
"page": "2_Text_Cleaner",
|
||||
"tool_id": "02_text_cleaner",
|
||||
},
|
||||
{
|
||||
"icon": "📐",
|
||||
@@ -66,6 +80,7 @@ TOOLS = [
|
||||
"description": "Standardize dates, currencies, names, phone numbers, and addresses.",
|
||||
"status": "Coming Soon",
|
||||
"page": "3_Format_Standardizer",
|
||||
"tool_id": "03_format_standardizer",
|
||||
},
|
||||
{
|
||||
"icon": "🕳️",
|
||||
@@ -73,6 +88,7 @@ TOOLS = [
|
||||
"description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
|
||||
"status": "Coming Soon",
|
||||
"page": "4_Missing_Values",
|
||||
"tool_id": "04_missing_handler",
|
||||
},
|
||||
{
|
||||
"icon": "🗂️",
|
||||
@@ -80,6 +96,7 @@ TOOLS = [
|
||||
"description": "Rename columns, enforce a target schema, and coerce types.",
|
||||
"status": "Coming Soon",
|
||||
"page": "5_Column_Mapper",
|
||||
"tool_id": "05_column_mapper",
|
||||
},
|
||||
{
|
||||
"icon": "📊",
|
||||
@@ -87,6 +104,7 @@ TOOLS = [
|
||||
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
|
||||
"status": "Coming Soon",
|
||||
"page": "6_Outlier_Detector",
|
||||
"tool_id": "06_outlier_detector",
|
||||
},
|
||||
{
|
||||
"icon": "📎",
|
||||
@@ -94,6 +112,7 @@ TOOLS = [
|
||||
"description": "Combine multiple CSV/Excel files with schema alignment.",
|
||||
"status": "Coming Soon",
|
||||
"page": "7_Multi_File_Merger",
|
||||
"tool_id": "07_multi_file_merger",
|
||||
},
|
||||
{
|
||||
"icon": "✅",
|
||||
@@ -101,6 +120,7 @@ TOOLS = [
|
||||
"description": "Validate against rules and generate PDF/Excel quality reports.",
|
||||
"status": "Coming Soon",
|
||||
"page": "8_Validator_Reporter",
|
||||
"tool_id": "08_validator_reporter",
|
||||
},
|
||||
{
|
||||
"icon": "⚙️",
|
||||
@@ -108,10 +128,13 @@ TOOLS = [
|
||||
"description": "Chain tools in recommended order and pass output between steps.",
|
||||
"status": "Coming Soon",
|
||||
"page": "9_Pipeline_Runner",
|
||||
"tool_id": "09_pipeline_runner",
|
||||
},
|
||||
]
|
||||
|
||||
# Render tool cards in a 3-column grid
|
||||
# Render tool cards in a 3-column grid. Cards picked up by the analyzer get a
|
||||
# coloured "N findings" badge so the user can see at a glance which tools
|
||||
# would help with the just-uploaded file.
|
||||
for row_start in range(0, len(TOOLS), 3):
|
||||
cols = st.columns(3)
|
||||
for i, col in enumerate(cols):
|
||||
@@ -121,8 +144,12 @@ for row_start in range(0, len(TOOLS), 3):
|
||||
tool = TOOLS[idx]
|
||||
with col:
|
||||
status_color = "green" if tool["status"] == "Ready" else "orange"
|
||||
badge = ""
|
||||
n = findings_count_for_tool(tool.get("tool_id", ""))
|
||||
if n:
|
||||
badge = f" :red-background[**{n} finding{'s' if n != 1 else ''}**]"
|
||||
st.markdown(
|
||||
f"### {tool['icon']} {tool['name']}\n\n"
|
||||
f"### {tool['icon']} {tool['name']}{badge}\n\n"
|
||||
f"{tool['description']}\n\n"
|
||||
f":{status_color}[**{tool['status']}**]"
|
||||
)
|
||||
|
||||
@@ -686,3 +686,293 @@ def _build_match_groups_csv(
|
||||
|
||||
groups_df = pd.DataFrame(rows)
|
||||
return groups_df.to_csv(index=False).encode("utf-8-sig")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Analyzer integration (upload-time data quality findings)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# ---------------------------------------------------------------------------
# Lookup tables shared by the findings panel and the home-page tool grid.
# All tool-keyed tables use the stable tool id ("NN_name") as their key.
# ---------------------------------------------------------------------------

# Stable tool id -> human-readable name shown in the GUI. The CLI keeps its
# own copy of this table so each entrypoint stays self-contained.
TOOL_DISPLAY_NAMES: dict[str, str] = {
    "01_deduplicator": "Deduplicator",
    "02_text_cleaner": "Text Cleaner",
    "03_format_standardizer": "Format Standardizer",
    "04_missing_handler": "Missing Value Handler",
    "05_column_mapper": "Column Mapper",
    "06_outlier_detector": "Outlier Detector",
    "07_multi_file_merger": "Multi-File Merger",
    "08_validator_reporter": "Validator & Reporter",
    "09_pipeline_runner": "Pipeline Runner",
}

# Finding severity -> emoji prefix used when rendering a finding.
_SEVERITY_ICON: dict[str, str] = {
    "info": "ℹ️",
    "warn": "⚠️",
    "error": "🛑",
}

# Finding severity -> Streamlit markdown colour name (used as ``:color[...]``).
_SEVERITY_COLOR: dict[str, str] = {
    "info": "blue",
    "warn": "orange",
    "error": "red",
}

# Stable tool id -> Streamlit page path, relative to src/gui/. A tool id that
# is absent from this table has no page; lookups then yield "" and the
# "Open" link is omitted.
_TOOL_PAGE_PATHS: dict[str, str] = {
    "01_deduplicator": "pages/1_Deduplicator.py",
    "02_text_cleaner": "pages/2_Text_Cleaner.py",
    "03_format_standardizer": "pages/3_Format_Standardizer.py",
    "04_missing_handler": "pages/4_Missing_Values.py",
    "05_column_mapper": "pages/5_Column_Mapper.py",
    "06_outlier_detector": "pages/6_Outlier_Detector.py",
    "07_multi_file_merger": "pages/7_Multi_File_Merger.py",
    "08_validator_reporter": "pages/8_Validator_Reporter.py",
    "09_pipeline_runner": "pages/9_Pipeline_Runner.py",
}
|
||||
|
||||
|
||||
def tool_display_name(tool_id: str) -> str:
    """Return the GUI display name for a stable tool id.

    An empty (falsy) *tool_id* yields ``"Informational"``. A non-empty id
    that has no entry in ``TOOL_DISPLAY_NAMES`` is returned unchanged.
    """
    if not tool_id:
        return "Informational"
    return TOOL_DISPLAY_NAMES.get(tool_id, tool_id)
|
||||
|
||||
|
||||
def _tool_page_slug(tool_id: str) -> str:
    """Return the page path registered for *tool_id*, or ``""`` if none is."""
    try:
        return _TOOL_PAGE_PATHS[tool_id]
    except KeyError:
        return ""
|
||||
|
||||
|
||||
def render_findings_panel(findings, *, header: str = "Detected issues") -> None:
    """Render a list of :class:`Finding` objects grouped by tool.

    Layout, top to bottom:

    * a success message and early return when *findings* is empty;
    * the *header* plus a one-line severity tally (errors, warnings, info);
    * one expander per targeted tool, auto-expanded when it holds at least
      one ``"error"`` finding, followed by a link to the tool page when a
      page path is registered for it;
    * a collapsed "Other / file-level" expander for findings whose ``tool``
      attribute is falsy.

    Args:
        findings: Iterable of finding objects. This function reads their
            ``severity`` and ``tool`` attributes; the remaining attributes
            are consumed by ``_render_one_finding``.
        header: Markdown heading text shown above the panel.
    """
    from collections import Counter

    from src.core.analyze import findings_by_tool  # local import to avoid cycle

    if not findings:
        st.success("No issues detected. Open any tool below to start working.")
        return

    by_sev = Counter(f.severity for f in findings)
    sev_summary = " · ".join(
        f"{_SEVERITY_ICON[s]} {by_sev[s]} {s}"
        for s in ("error", "warn", "info")
        if by_sev[s]
    )
    st.markdown(f"### {header}")
    st.caption(sev_summary)

    grouped = findings_by_tool(findings)
    untargeted = [f for f in findings if not f.tool]

    # Findings without a tool are rendered once, in the "Other" expander
    # below. Skip any falsy key the grouping helper may emit for them so they
    # are not shown twice (and so a ``None`` key cannot break ``sorted``).
    for tool_id in sorted(t for t in grouped if t):
        items = grouped[tool_id]
        display_name = tool_display_name(tool_id)
        with st.expander(
            f"{display_name} — {len(items)} finding(s)",
            expanded=any(f.severity == "error" for f in items),
        ):
            for f in items:
                _render_one_finding(f)
        page_slug = _tool_page_slug(tool_id)
        if page_slug:
            # NOTE(review): st.page_link resolves paths relative to the app's
            # entrypoint script -- confirm "src/gui/" is the right prefix for
            # how this app is launched.
            st.page_link(f"src/gui/{page_slug}", label=f"Open {display_name} →")

    if untargeted:
        with st.expander(
            f"Other / file-level — {len(untargeted)} finding(s)",
            expanded=False,
        ):
            for f in untargeted:
                _render_one_finding(f)
|
||||
|
||||
|
||||
def _render_one_finding(f) -> None:
    """Render a single finding: a severity-coloured headline plus samples.

    The headline shows the severity icon, the finding id, the column (when
    the finding has a truthy ``column`` attribute) and the description. When
    ``f.samples`` is non-empty it is shown as a three-column table
    (row / column / value).

    A severity that is not in the severity tables is styled like ``"info"``
    instead of raising ``KeyError``, so one unexpected finding cannot abort
    rendering of the whole panel.
    """
    color = _SEVERITY_COLOR.get(f.severity, _SEVERITY_COLOR["info"])
    icon = _SEVERITY_ICON.get(f.severity, _SEVERITY_ICON["info"])
    column_part = f" in `{f.column}`" if getattr(f, "column", None) else ""
    st.markdown(
        f"{icon} :{color}[**{f.id}**]{column_part} — {f.description}"
    )
    if f.samples:
        sample_df = pd.DataFrame(
            f.samples, columns=["row", "column", "value"],
        )
        st.dataframe(sample_df, use_container_width=True, hide_index=True)
|
||||
|
||||
|
||||
def upload_and_analyze_section() -> None:
    """Render the upload + analyze panel for the home page.

    Stashes the uploaded file (name, size, bytes) and the analysis findings
    in ``st.session_state`` so individual tool pages can pick them up if they
    want to skip their own uploader. Each tool page already has its own
    uploader today, so this is purely additive.

    Session-state keys written: ``home_uploaded_name``,
    ``home_uploaded_size``, ``home_uploaded_bytes``, ``home_findings`` and
    ``home_skipped``.

    If reading or analyzing the file fails, the error is shown inline with
    ``st.error`` and any previous findings are cleared, rather than letting
    the exception take down the whole home page.
    """
    st.markdown("### 📤 Upload a file to start")
    st.caption(
        "Optional: scan an uploaded file for data quality issues and see "
        "which tools can fix each one. Skip if you already know what you need."
    )

    uploaded = st.file_uploader(
        "Upload CSV or Excel",
        type=["csv", "tsv", "xlsx", "xls"],
        key="home_upload",
    )
    if uploaded is None:
        return

    # Stash on every fresh upload so all tool pages can pick it up. A change
    # of name or size is what counts as "fresh" here.
    if (
        st.session_state.get("home_uploaded_name") != uploaded.name
        or st.session_state.get("home_uploaded_size") != uploaded.size
    ):
        st.session_state["home_uploaded_name"] = uploaded.name
        st.session_state["home_uploaded_size"] = uploaded.size
        st.session_state["home_uploaded_bytes"] = uploaded.getvalue()
        # Drop stale findings on a new upload.
        st.session_state.pop("home_findings", None)
        st.session_state.pop("home_skipped", None)

    col_run, col_skip, _ = st.columns([1, 1, 4])
    with col_run:
        run_clicked = st.button("Run analysis", type="primary", key="home_run_analysis")
    with col_skip:
        skip_clicked = st.button("Skip", key="home_skip_analysis")

    if skip_clicked:
        st.session_state["home_findings"] = []
        st.session_state["home_skipped"] = True

    if run_clicked:
        try:
            with st.spinner("Scanning…"):
                findings = _run_analysis_on_upload(uploaded)
        except Exception as exc:  # UI boundary: report the failure, don't crash
            # Findings from an earlier run no longer describe a usable file.
            st.session_state.pop("home_findings", None)
            st.session_state.pop("home_skipped", None)
            st.error(f"Could not analyze **{uploaded.name}**: {exc}")
            return
        st.session_state["home_findings"] = findings
        st.session_state["home_skipped"] = False

    findings = st.session_state.get("home_findings")
    if findings is None:
        # Neither button has been used for this upload yet.
        return

    if st.session_state.get("home_skipped"):
        st.info("Analysis skipped. Open any tool below to start working.")
        return

    st.divider()
    render_findings_panel(findings)
|
||||
|
||||
|
||||
def _run_analysis_on_upload(uploaded):
    """Read the uploaded file with pre-parse repair, then analyze.

    Excel files (``.xlsx`` / ``.xls``) are read directly with every cell as a
    string. Anything else is treated as delimited text: the delimiter is
    sniffed from the first 4 KiB, the bytes go through ``repair_bytes``, and
    the repaired bytes are parsed with every cell as a string.

    Args:
        uploaded: Upload object exposing ``.name`` and ``.getvalue()``.

    Returns:
        Whatever ``analyze`` returns for the parsed frame. For delimited text
        the repair result is passed along so the user sees csv_* findings.
    """
    from src.core.analyze import analyze
    from src.core.io import repair_bytes

    name = uploaded.name
    data = uploaded.getvalue()
    suffix = name.rsplit(".", 1)[-1].lower() if "." in name else ""

    if suffix in ("xlsx", "xls"):
        df = pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)
        return analyze(df)

    # CSV / TSV: run repair_bytes so the user sees csv_* findings.
    text_head = data[:4096].decode("utf-8", errors="replace")
    delim = "\t" if suffix == "tsv" else ","
    if delim == ",":
        # Choose the MOST frequent alternative, not merely the first one over
        # the threshold: a single stray tab must not beat the semicolons of a
        # semicolon-delimited file that contains no commas. ``max`` returns
        # the earliest candidate on a tie, keeping the tab > ; > | priority.
        best = max(("\t", ";", "|"), key=text_head.count)
        if text_head.count(best) > text_head.count(",") * 1.5:
            delim = best
    repair = repair_bytes(data, encoding="utf-8", delimiter=delim)
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8", delimiter=delim,
        dtype=str, keep_default_na=False, on_bad_lines="warn",
    )
    return analyze(df, repair_result=repair)
|
||||
|
||||
|
||||
def findings_count_for_tool(tool_id: str) -> int:
    """How many findings in session state target *tool_id*; 0 when none.

    Used by the home-page tool grid to badge cards that have actionable
    findings without re-running the analyzer.

    An empty *tool_id* always yields 0. Without that guard, a card with no
    tool id would match findings whose ``tool`` is the empty string and be
    badged with the count of file-level findings.
    """
    if not tool_id:
        return 0
    findings = st.session_state.get("home_findings") or []
    return sum(1 for f in findings if f.tool == tool_id)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cross-page upload pickup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _StashedUpload(io.BytesIO):
    """In-memory stand-in for Streamlit's ``UploadedFile``.

    Streamlit's ``UploadedFile`` is itself an ``io.BytesIO`` subclass, so
    this shim subclasses ``io.BytesIO`` as well. Besides ``.name``, ``.size``
    and ``.getvalue()`` it therefore supports the rest of the binary-stream
    protocol (``read(n)``, ``seek``, ``tell``, line iteration), which tool
    pages may use when they hand the upload to a parser as a file-like
    object.

    Tool pages that previously consumed a Streamlit ``UploadedFile`` can
    accept this in its place without changes.

    Attributes:
        name: Original file name of the stashed upload.
        size: Length of the stashed payload in bytes.
    """

    def __init__(self, name: str, data: bytes) -> None:
        super().__init__(data)
        self.name = name
        self.size = len(data)
|
||||
|
||||
|
||||
def pickup_or_upload(
    *,
    label: str,
    key: str,
    types: list[str],
    help: str | None = None,
):
    """Return an upload object, preferring the home-page upload when present.

    Behavior:

    - If ``st.session_state['home_uploaded_bytes']`` is set and the user
      hasn't asked for a different file on this page, render a banner
      ("Using *<name>* from the upload screen.") plus a "Use a different
      file" button, and return a :class:`_StashedUpload` shim.
    - Otherwise render the standard ``st.file_uploader`` with the supplied
      *label*, *key*, and *types*. Returns the Streamlit ``UploadedFile``
      directly (or ``None`` if nothing uploaded). While the uploader is
      empty and a home-page upload exists, a "Switch back to upload-screen
      file" button is offered.

    The per-page choice is remembered in session state under
    ``f"{key}__override"``.

    Args:
        label: Label for the fallback file uploader.
        key: Widget key for the uploader; also the prefix for the override
            flag and the two button keys.
        types: Allowed file extensions for the fallback uploader.
        help: Optional tooltip for the fallback uploader.

    Returns:
        A :class:`_StashedUpload`, a Streamlit ``UploadedFile``, or ``None``.
    """
    override_key = f"{key}__override"
    has_session_upload = st.session_state.get("home_uploaded_bytes") is not None
    use_session = has_session_upload and not st.session_state.get(override_key, False)

    if use_session:
        name = st.session_state.get("home_uploaded_name", "uploaded file")
        st.info(f"Using **{name}** from the upload screen.")
        if st.button("Use a different file", key=f"{key}__pick_diff"):
            st.session_state[override_key] = True
            st.rerun()
        return _StashedUpload(name, st.session_state["home_uploaded_bytes"])

    uploaded = st.file_uploader(label, type=types, key=key, help=help)

    # The override flag is intentionally left set while the user works with a
    # file of their own: clearing it here would make the next rerun fall back
    # to the home-page upload and ignore the file chosen on this page. It is
    # only reset through the explicit "switch back" button below.
    if uploaded is None and st.session_state.get(override_key) and has_session_upload:
        if st.button("Switch back to upload-screen file", key=f"{key}__switch_back"):
            st.session_state[override_key] = False
            st.rerun()
    return uploaded
|
||||
|
||||
@@ -21,6 +21,7 @@ from src.gui.components import (
|
||||
config_panel,
|
||||
hide_streamlit_chrome,
|
||||
match_group_card,
|
||||
pickup_or_upload,
|
||||
results_summary,
|
||||
)
|
||||
|
||||
@@ -56,11 +57,11 @@ st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel fil
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"Upload CSV or Excel file",
|
||||
type=["csv", "tsv", "xlsx", "xls"],
|
||||
help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
|
||||
uploaded = pickup_or_upload(
|
||||
label="Upload CSV or Excel file",
|
||||
key="dedup_file_upload",
|
||||
types=["csv", "tsv", "xlsx", "xls"],
|
||||
help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
|
||||
)
|
||||
|
||||
if uploaded is not None:
|
||||
|
||||
@@ -14,7 +14,7 @@ _project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.gui.components import hide_streamlit_chrome
|
||||
from src.gui.components import hide_streamlit_chrome, pickup_or_upload
|
||||
from src.core.text_clean import (
|
||||
PRESETS,
|
||||
CleanOptions,
|
||||
@@ -38,10 +38,10 @@ st.caption(
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"Upload CSV or Excel file",
|
||||
type=["csv", "tsv", "xlsx", "xls"],
|
||||
uploaded = pickup_or_upload(
|
||||
label="Upload CSV or Excel file",
|
||||
key="textclean_file_upload",
|
||||
types=["csv", "tsv", "xlsx", "xls"],
|
||||
)
|
||||
|
||||
if uploaded is None:
|
||||
|
||||
51
test-cases/text-cleaner-corpus/README.md
Normal file
51
test-cases/text-cleaner-corpus/README.md
Normal file
@@ -0,0 +1,51 @@
|
||||
# Text Cleaner Test Corpus
|
||||
|
||||
Test fixtures for `02_text_cleaner.py` (Excel & CSV Data Cleaning Mastery Bundle).
|
||||
|
||||
## Layout
|
||||
|
||||
```
|
||||
text-cleaner-corpus/
|
||||
├── README.md # This file
|
||||
├── TEST-CASES.md # Full taxonomy and expected behavior per test
|
||||
├── generate_test_data.py # Regenerates the 20 CSV inputs and expected outputs
|
||||
├── generate_xlsx.py # Regenerates the multi-sheet XLSX fixture
|
||||
├── test_data/ # Inputs (21 fixtures: 20 CSV + 1 XLSX)
|
||||
└── expected/ # Expected outputs (with default and flag variants)
|
||||
```
|
||||
|
||||
## Quick start
|
||||
|
||||
Read `TEST-CASES.md` from top to bottom. Sections 1 (scope boundary) and 2 (default config assumed) are load-bearing; the per-test details in Section 4 don't make sense without them.
|
||||
|
||||
To regenerate the test files (e.g., after editing the generator):
|
||||
```bash
|
||||
python generate_test_data.py
|
||||
python generate_xlsx.py
|
||||
```
|
||||
|
||||
To use as pytest fixtures: see Section 6 of `TEST-CASES.md`.
|
||||
|
||||
## Coverage summary
|
||||
|
||||
| Category | Fixtures |
|
||||
|---|---|
|
||||
| Whitespace (ASCII + Unicode) | 01, 02 |
|
||||
| Smart punctuation | 03 |
|
||||
| Unicode normalization | 04 |
|
||||
| Invisible / zero-width / control | 05, 06 |
|
||||
| BOM | 07 |
|
||||
| Line endings (file-level + embedded) | 08, 09, 10, 11 |
|
||||
| Case operations (opt-in) | 12 |
|
||||
| International script preservation | 13 |
|
||||
| Mojibake | 14 |
|
||||
| Boundary with script 04 (missing values) | 15 |
|
||||
| Headers | 16, 19 |
|
||||
| Negative tests (must NOT touch) | 17 |
|
||||
| File-level edge cases | 18, 19 |
|
||||
| Integration | 20 |
|
||||
| Excel-specific (multi-sheet, Alt+Enter) | 21 |
|
||||
|
||||
## Out of scope
|
||||
|
||||
Documented in `TEST-CASES.md` Section 5: encoding detection, large-file performance, GUI behavior, file-locking, CLI argument parsing. Each needs its own test layer.
|
||||
509
test-cases/text-cleaner-corpus/TEST-CASES.md
Normal file
509
test-cases/text-cleaner-corpus/TEST-CASES.md
Normal file
@@ -0,0 +1,509 @@
|
||||
# TEST-CASES.md - `02_text_cleaner.py` Test Corpus
|
||||
|
||||
**Version**: 1.0
|
||||
**Last updated**: April 29, 2026
|
||||
**Companion to**: TECHNICAL.md Section 9 (script boundaries) and the per-script functional spec template introduced in TECHNICAL.md Section 10.1.
|
||||
|
||||
## Purpose of this document
|
||||
|
||||
Defines the complete set of behaviors `02_text_cleaner.py` is expected to exhibit, with one test fixture per behavior. Used as:
|
||||
|
||||
1. The build target when porting the (currently skeleton) script to working state.
|
||||
2. The pytest input set once the script ships.
|
||||
3. The acceptance criteria for the GUI port (every fixture must produce its expected output through both CLI and Streamlit GUI).
|
||||
|
||||
Each test case has an input file in `test_data/` and (where exact-diff comparison applies) an expected-output file in `expected/`.
|
||||
|
||||
---
|
||||
|
||||
## 1. Scope boundary (what 02 owns vs what it doesn't)
|
||||
|
||||
This is the load-bearing decision. Every contested case routes back to it.
|
||||
|
||||
**02 owns: character-level hygiene only.**
|
||||
|
||||
- Whitespace normalization (outer trim + internal collapse for text columns).
|
||||
- Unicode normalization (NFC by default, NFKC opt-in).
|
||||
- Smart-punctuation ASCII-fication (curly quotes, em/en dash, ellipsis, primes).
|
||||
- Invisible / zero-width character stripping.
|
||||
- Control character stripping (with explicit allowlist for tab/newline inside quoted cells).
|
||||
- BOM detection on input, never written on output.
|
||||
- Line-ending normalization at the file level AND inside multi-line cells.
|
||||
- Optional case operations (per-column, opt-in only).
|
||||
|
||||
**02 does NOT own:**
|
||||
|
||||
| Concern | Owned by |
|
||||
|---|---|
|
||||
| Detecting and replacing nulls / sentinel codes | `04_missing_value_handler` |
|
||||
| Reformatting dates, currencies, phones, names, addresses | `03_format_standardizer` |
|
||||
| Outlier detection or domain-rule violations | `06_outlier_detector` |
|
||||
| Renaming or reordering columns | `05_column_mapper_enforcer` |
|
||||
| Deduplication (even though dedup normalizes internally) | `01_deduplicator` |
|
||||
| File encoding detection on read | The shared I/O layer in `src/core/io.py` |
|
||||
|
||||
**Invariant 02 must preserve:** after running 02, the schema (column count, column order, row count) is unchanged. 02 changes cell *content*, never *structure*. The one nuance: a cell containing only whitespace becomes an empty string, but the cell still exists and the row is not dropped.
|
||||
|
||||
---
|
||||
|
||||
## 2. Default configuration assumed by these tests
|
||||
|
||||
Tests assume the default config below. Any test that exercises a non-default flag explicitly says so in its description.
|
||||
|
||||
| Setting | Default | Notes |
|
||||
|---|---|---|
|
||||
| `--trim` | on | Strip leading/trailing whitespace including Unicode whitespace (NBSP, NNBSP, ideographic space, etc.) |
|
||||
| `--collapse-internal` | on (text columns only) | Collapse runs of internal whitespace to a single ASCII space, ONLY in cells that don't parse as numeric, date, or phone-shaped |
|
||||
| `--unicode-form` | NFC | NFKC available as opt-in; folds ligatures and fullwidth |
|
||||
| `--smart-quotes` | on | Curly to straight, em/en dash to hyphen, ellipsis to `...`, primes to `'`/`"` |
|
||||
| `--strip-zero-width` | on | ZWSP, ZWJ, ZWNJ, LRM, RLM, soft hyphen, word joiner |
|
||||
| `--strip-controls` | on | Strip C0 (except `\t\n\r` inside quoted cells) and DEL |
|
||||
| `--strip-bom` | on | BOM removed on read; never written on output |
|
||||
| `--line-endings` | LF | File-level AND embedded-cell line endings normalized to LF |
|
||||
| `--case` | none | Case operations are opt-in per column |
|
||||
| `--fix-mojibake` | off | Logged as warning by default; opt-in repair via ftfy |
|
||||
| `--columns` | all | All text columns processed; `--columns name,email` restricts |
|
||||
|
||||
**Idempotency requirement:** for any input X, `clean(clean(X)) == clean(X)`. This is a property test, not a fixture-comparison test. Every fixture below should be run through the cleaner twice and produce identical output both times.
|
||||
|
||||
---
|
||||
|
||||
## 3. Test case index
|
||||
|
||||
| # | File | Category | What it tests | Diff-testable |
|
||||
|---|---|---|---|---|
|
||||
| 01 | `01_whitespace_basic.csv` | Whitespace | ASCII space + tab, leading/trailing/internal | Yes |
|
||||
| 02 | `02_whitespace_unicode.csv` | Whitespace | NBSP, narrow NBSP, ideographic, em/thin space | Yes |
|
||||
| 03 | `03_smart_punctuation.csv` | Punctuation | Curly quotes, em/en dash, ellipsis, primes | Yes |
|
||||
| 04 | `04_unicode_forms.csv` | Unicode | NFC vs NFD, ligatures, fullwidth, presentation forms | Yes |
|
||||
| 05 | `05_zero_width_invisible.csv` | Invisible | ZWSP, ZWJ, ZWNJ, LRM, RLM, soft hyphen | Yes |
|
||||
| 06 | `06_control_characters.csv` | Control | NUL, BEL, BS, VT, FF, ESC, DEL | Yes |
|
||||
| 07 | `07_bom_utf8.csv` | Encoding | UTF-8 BOM at file start | Yes (byte-exact) |
|
||||
| 08 | `08_line_endings_crlf.csv` | Line endings | All CRLF (Windows) | Yes (byte-exact) |
|
||||
| 09 | `09_line_endings_cr.csv` | Line endings | All CR (classic Mac) | Yes (byte-exact) |
|
||||
| 10 | `10_line_endings_mixed.csv` | Line endings | CRLF + LF + CR mixed in one file | Yes (byte-exact) |
|
||||
| 11 | `11_embedded_newlines.csv` | Line endings | Newlines inside quoted cells (preserve, normalize) | Yes |
|
||||
| 12 | `12_case_variations.csv` | Case | Mixed case across name/email/product columns | 3 outputs (default + 2 modes) |
|
||||
| 13 | `13_non_latin_scripts.csv` | Preservation | Chinese, Japanese, Arabic, Russian, emoji | Yes |
|
||||
| 14 | `14_mojibake.csv` | Encoding | Double-encoded UTF-8 (warn-by-default; fix opt-in) | 2 outputs (default + fixed) |
|
||||
| 15 | `15_whitespace_only_cells.csv` | Boundary (vs 04) | Cells containing only whitespace become empty | Yes |
|
||||
| 16 | `16_dirty_headers.csv` | Headers | Headers themselves have whitespace, BOM, smart quotes | Yes |
|
||||
| 17 | `17_preserve_intended.csv` | Negative | Things 02 must NOT touch | Yes |
|
||||
| 18 | `18_empty_file.csv` | Edge | Zero-byte file | Yes |
|
||||
| 19 | `19_headers_only.csv` | Edge | Headers but no data rows | Yes |
|
||||
| 20 | `20_kitchen_sink.csv` | Integration | Everything combined in one file | Yes |
|
||||
| 21 | `21_excel_pollution.xlsx` | Excel-specific | Multi-sheet, Alt+Enter cells, force-text, copy-paste pollution | No (manual) |
|
||||
|
||||
---
|
||||
|
||||
## 4. Per-test details
|
||||
|
||||
### 01 - Whitespace basic
|
||||
|
||||
**File**: `test_data/01_whitespace_basic.csv` -> `expected/01_whitespace_basic.csv`
|
||||
|
||||
Tests the core whitespace contract on ASCII space and tab characters. Every kind of placement: leading-only, trailing-only, both, internal-multiple, tab-padded, multiple internal multi-space runs in one cell, all of the above combined.
|
||||
|
||||
**Expected behavior:**
|
||||
- Leading and trailing whitespace stripped from every cell.
|
||||
- Internal runs of whitespace collapsed to a single ASCII space.
|
||||
- Tabs treated as whitespace by both rules.
|
||||
|
||||
**Why it matters:** This is the highest-frequency real-world pollution. Trailing-space pollution alone is what the v1.5 audit identified as the gap that motivated creating script 02 in the first place (DECISIONS.md v1.5 entry).
|
||||
|
||||
---
|
||||
|
||||
### 02 - Whitespace, Unicode
|
||||
|
||||
**File**: `test_data/02_whitespace_unicode.csv` -> `expected/02_whitespace_unicode.csv`
|
||||
|
||||
The whitespace pretenders. Python's `str.strip()` with no argument actually does strip these in 3.x, but a lot of cleaners written by people who were burned in 2.x explicitly pass `' \t\n'` and miss them. Excel and Word produce these constantly when you copy from a styled document.
|
||||
|
||||
Characters covered: NBSP (U+00A0), narrow NBSP (U+202F), ideographic space (U+3000), em space (U+2003), thin space (U+2009).
|
||||
|
||||
**Expected behavior:** treated identically to ASCII space - trimmed at edges, collapsed internally.
|
||||
|
||||
**Why it matters:** "It looks fine but the join doesn't match" debugging sessions almost always end here. NBSP-padded keys are the silent killer.
|
||||
|
||||
---
|
||||
|
||||
### 03 - Smart punctuation
|
||||
|
||||
**File**: `test_data/03_smart_punctuation.csv` -> `expected/03_smart_punctuation.csv`
|
||||
|
||||
Curly quotes, dashes, ellipsis, primes - the autocorrect-as-you-type damage from Word/Excel. ASCII-fy where round-trip-safe.
|
||||
|
||||
| Input | Output | Notes |
|
||||
|---|---|---|
|
||||
| `\u201c` `\u201d` (curly double) | `"` | |
|
||||
| `\u2018` `\u2019` (curly single) | `'` | Includes apostrophe |
|
||||
| `\u2014` (em-dash) | `-` | |
|
||||
| `\u2013` (en-dash) | `-` | |
|
||||
| `\u2026` (ellipsis) | `...` | |
|
||||
| `\u2032` (prime) | `'` | |
|
||||
| `\u2033` (double prime) | `"` | |
|
||||
| `\u00ab` `\u00bb` (guillemets) | `"` | |
|
||||
| `\u00d7` (multiplication sign) | **preserved** | Not safely round-trip-able to ASCII; `x` would be wrong |
|
||||
| `\u00b1` (plus-minus) | **preserved** | Same reasoning |
|
||||
|
||||
**Why it matters:** smart-quote pollution breaks regex, breaks downstream parsers, and breaks string equality joins. The two preservation cases (multiplication, plus-minus) are deliberate - they have no faithful ASCII equivalent and forcing one is destructive.
|
||||
|
||||
---
|
||||
|
||||
### 04 - Unicode normalization forms
|
||||
|
||||
**File**: `test_data/04_unicode_forms.csv` -> `expected/04_unicode_forms.csv`
|
||||
|
||||
`café` can be encoded two ways:
|
||||
|
||||
- NFC: `caf\u00e9` (one code point, e-acute as a unit)
|
||||
- NFD: `cafe\u0301` (two code points, plain e + combining accent)
|
||||
|
||||
These render identically. They compare unequal. They have different lengths. macOS filesystem defaults to NFD, which means a CSV exported from a Mac and joined against a CSV from Excel can silently fail.
|
||||
|
||||
Default normalization: NFC (most compact, what Excel emits, what most Western databases expect).
|
||||
|
||||
**Cases covered:**
|
||||
- Pre-composed (NFC) e-acute and i-diaeresis.
|
||||
- Decomposed (NFD) versions of the same.
|
||||
- The `\uFB03` `ffi` ligature - **preserved** under NFC (NFKC would fold it to `ffi`).
|
||||
- Fullwidth Latin letters (`\uFF21\uFF22\uFF23` = `ABC`) - **preserved** under NFC.
|
||||
- Roman numeral nine character (`\u2168`) - **preserved** under NFC.
|
||||
|
||||
After cleaning, rows 1 and 2 must produce identical bytes (NFC and NFD both normalized to NFC). Same for rows 3 and 4.
|
||||
|
||||
**Why it matters:** Mac-vs-Windows data joins. Catches "they look the same but won't match" bugs.
|
||||
|
||||
**Opt-in `--unicode-form=NFKC` test:** not provided as a fixture but should exist as a unit test. Under NFKC, ligature folds to `ffi`, fullwidth folds to ASCII `ABC`, roman numeral folds to `IX`. NFKC is destructive for some legitimate text (mathematical notation, some CJK content) so it stays opt-in.
|
||||
|
||||
---
|
||||
|
||||
### 05 - Zero-width and invisible characters
|
||||
|
||||
**File**: `test_data/05_zero_width_invisible.csv` -> `expected/05_zero_width_invisible.csv`
|
||||
|
||||
These characters show up from rich-text copy/paste, from RTL text, from accidentally-included U+FEFF in the middle of a cell (yes, this happens), and from some web-form pastes.
|
||||
|
||||
Characters covered: U+200B (ZWSP), U+200C (ZWNJ), U+200D (ZWJ), U+200E (LRM), U+200F (RLM), U+00AD (soft hyphen), U+2060 (word joiner).
|
||||
|
||||
**Expected behavior:** all stripped unconditionally. None of these has a legitimate role in tabular data cells, even when there's a domain reason for them in prose (typesetting Arabic, hyphenation hints in long-form text). For a CSV, they're noise.
|
||||
|
||||
**Why it matters:** these are the *truly invisible* polluters. You can stare at the cell forever and not see them. They break joins, they bloat string lengths, they hash differently. The first time a buyer hits a zero-width-space in a customer name, this test is what saves them.
|
||||
|
||||
---
|
||||
|
||||
### 06 - Control characters
|
||||
|
||||
**File**: `test_data/06_control_characters.csv` -> `expected/06_control_characters.csv`
|
||||
|
||||
The C0 controls (U+0000 through U+001F) plus DEL (U+007F). Test cases: NUL, BEL, BS, VT, FF, ESC, DEL, and a multi-control combination.
|
||||
|
||||
**Expected behavior:** all stripped from cell content.
|
||||
|
||||
**The exception:** tab (U+0009), LF (U+000A), and CR (U+000D) are NOT stripped from inside quoted cells. Tab might be intentional formatting; LF/CR are handled by line-ending normalization (case 11). Outside of quoted cells, tab is whitespace and gets normalized like space.
|
||||
|
||||
**Why it matters:** real-world exports from broken systems, half-corrupted database dumps, copy-paste from terminals (including ANSI escape sequences starting with ESC), and binary data accidentally exported as text all leave these in cells. A NUL byte mid-string breaks C-string-based parsers; a BEL makes terminals beep when you `cat` the file; ESC sequences corrupt logs.
|
||||
|
||||
---
|
||||
|
||||
### 07 - UTF-8 BOM
|
||||
|
||||
**File**: `test_data/07_bom_utf8.csv` -> `expected/07_bom_utf8.csv` (byte-exact comparison)
|
||||
|
||||
File starts with the three-byte sequence `EF BB BF`. Excel writes UTF-8 with BOM by default. Pandas `read_csv` usually handles this but leaves the BOM as part of the first column header name unless you pass `encoding='utf-8-sig'`. Result: a mystery column called `\ufeffid` that breaks every `df["id"]` lookup downstream.
|
||||
|
||||
**Expected behavior:**
|
||||
- BOM stripped on read.
|
||||
- First column header is the clean string `id`, not `\ufeffid`.
|
||||
- Output file is written WITHOUT a BOM.
|
||||
|
||||
**Diff target:** byte-for-byte equality with `expected/07_bom_utf8.csv`. The expected file must NOT have the BOM.
|
||||
|
||||
**Why it matters:** Excel-origin data is the dominant input for the target buyer. Getting BOM handling wrong silently breaks the rest of the pipeline.
|
||||
|
||||
---
|
||||
|
||||
### 08, 09, 10 - Line endings: CRLF, CR-only, mixed
|
||||
|
||||
**Files**: `08_line_endings_crlf.csv`, `09_line_endings_cr.csv`, `10_line_endings_mixed.csv`
|
||||
|
||||
- 08: every line ends with CRLF (`\r\n`). Standard Windows.
|
||||
- 09: every line ends with CR (`\r`) only. Classic Mac. Rare but seen.
|
||||
- 10: the same file mixes all three styles; its lines end, in order, with CRLF, LF, CR, CRLF, LF.
|
||||
|
||||
**Expected behavior on output:** all lines end with LF (`\n`). Byte-exact match to the expected files.
|
||||
|
||||
**Why LF as the default output:** it's what Linux uses, what every modern code editor handles, what Git stores by default, and what Streamlit / pandas write by default. CRLF is an option for buyers who specifically need Windows-style output, but the default minimizes round-trip surprises.
|
||||
|
||||
**Why it matters:** mixed line endings cause "ghost rows" in some parsers, blank lines in some editors, and silent data loss in any tool that splits on one specific newline pattern. Case 10 is the disaster scenario - multi-source concat - and is the most important of the three.
|
||||
|
||||
---
|
||||
|
||||
### 11 - Embedded newlines inside quoted cells
|
||||
|
||||
**File**: `test_data/11_embedded_newlines.csv` -> `expected/11_embedded_newlines.csv`
|
||||
|
||||
The trap. File-level line-ending normalization must NOT collapse intentional newlines inside multi-line cells (addresses, notes columns). But the embedded line endings *should still* be normalized to LF for consistency.
|
||||
|
||||
**Expected behavior:**
|
||||
- File-level line endings: LF.
|
||||
- Embedded CRLF inside a quoted cell: normalized to LF.
|
||||
- Embedded CR inside a quoted cell: normalized to LF.
|
||||
- Cell stays multi-line; the number of line breaks inside the cell is preserved (each CRLF or CR becomes exactly one LF).
|
||||
|
||||
**Why it matters:** an address column with `123 Main St\r\nApt 4B\r\nNew York` is the canonical legitimate multi-line cell. A naive `text.replace('\r\n', '\n')` works correctly. A naive `text.split('\n')` to "remove blank lines" destroys the address. The cleaner must understand CSV quoting.
|
||||
|
||||
---
|
||||
|
||||
### 12 - Case operations (opt-in)
|
||||
|
||||
**Files**: input `12_case_variations.csv`; three expected outputs:
|
||||
- `expected/12_case_variations__default.csv` (no flag - identity)
|
||||
- `expected/12_case_variations__email_lower.csv` (`--case email=lower`)
|
||||
- `expected/12_case_variations__name_title.csv` (`--case name=title`)
|
||||
|
||||
Default behavior is **preserve case**. Case operations are opt-in per column because:
|
||||
|
||||
- Lowercasing emails is almost always right (per RFC 5321 the domain is case-insensitive; the local part is technically case-sensitive, but virtually every mail provider ignores its case).
|
||||
- Title-casing names is almost always right (`ALICE SMITH` -> `Alice Smith`), but must handle apostrophes correctly (`O'Connor` -> `O'Connor`, not `O'connor`).
|
||||
- Lowercasing product codes is almost always WRONG (`SKU-A1B2` is a code, not prose).
|
||||
|
||||
So the tool offers per-column case ops, never a global one. The expected outputs cover the two most common configurations.
|
||||
|
||||
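**Tricky case to verify:** row 4 name `DAN O'CONNOR`. Under `--case=title` this must become `Dan O'Connor`, not `Dan O'connor`. Python's `string.capwords()` gets this wrong: it only capitalizes after whitespace, so it yields `Dan O'connor`. `str.title()` happens to get this row right, but it capitalizes after every non-letter and so mangles contractions and possessives (`don't` -> `Don'T`). Implementations should use a regex that respects apostrophes inside words.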
|
||||
|
||||
**Why it matters:** dedup quality (case 01 in the deduplicator) depends on consistent case in the comparison columns. Buyers running 02 before 01 expect this to "just work" for the email column.
|
||||
|
||||
---
|
||||
|
||||
### 13 - Non-Latin scripts and emoji (preservation negative test)
|
||||
|
||||
**File**: `test_data/13_non_latin_scripts.csv` -> `expected/13_non_latin_scripts.csv`
|
||||
|
||||
Negative test: cleaning must not damage characters outside the Latin/punctuation block. Trim and NFC still apply (row 1 has leading and trailing space, which gets trimmed).
|
||||
|
||||
Coverage: Chinese (Beijing), Japanese (katakana test), Arabic RTL, Cyrillic Russian, supplementary-plane emoji (party popper U+1F389, rocket U+1F680 - single code points above U+FFFF, four bytes each in UTF-8), accent + emoji combo (`café ☕`).
|
||||
|
||||
**Expected behavior:** only whitespace and NFC normalization apply. All script-significant characters preserved exactly.
|
||||
|
||||
**Why it matters:** the cleaner must be safe on international buyer data. Stripping "weird-looking" characters because they're outside ASCII is a textbook bug. Emoji in particular are in the supplementary planes (above U+FFFF) and naive byte-level filters often mangle them.
|
||||
|
||||
---
|
||||
|
||||
### 14 - Mojibake
|
||||
|
||||
**Files**: input `14_mojibake.csv`; two expected outputs:
|
||||
- `expected/14_mojibake__default.csv` (no flag - bytes preserved, warning logged)
|
||||
- `expected/14_mojibake__fixed.csv` (`--fix-mojibake` - heuristic repair)
|
||||
|
||||
Mojibake is the result of UTF-8 bytes being interpreted as cp1252 or Latin-1 and re-saved as UTF-8. Classic patterns:
|
||||
|
||||
- `café` becomes `cafÃ©`
|
||||
- `München` becomes `MÃ¼nchen`
|
||||
- `naïve` becomes `naÃ¯ve`
|
||||
- The smart apostrophe in `don’t` becomes `donâ€™t`
|
||||
|
||||
**Default behavior: warn, do NOT auto-fix.** Reasoning: mojibake repair is heuristic, and the heuristic can false-positive on legitimate strings that happen to contain `Ã` followed by another Latin-1 character. The right call for a tool sold to non-experts is to flag the suspicious pattern in the log and let the user opt in.
|
||||
|
||||
**With `--fix-mojibake` (uses ftfy or equivalent):** repair attempted. The expected output shows fully repaired text including the smart-apostrophe-via-cp1252 case, which ftfy specifically handles.
|
||||
|
||||
**Why it matters:** mojibake is silent corruption. The customer doesn't know it happened until a name shows up wrong on a printed invoice. Flagging it is the responsible default.
|
||||
|
||||
---
|
||||
|
||||
### 15 - Whitespace-only cells (the 02-vs-04 boundary)
|
||||
|
||||
**File**: `test_data/15_whitespace_only_cells.csv` -> `expected/15_whitespace_only_cells.csv`
|
||||
|
||||
Per TECHNICAL.md Section 9.3: 02 trims whitespace first, leaving an empty string. Script 04 then detects empty strings as disguised null. So 02's job in this file is to convert `" "`, `"\t\t"`, `"\u00A0\u00A0"`, and mixed-whitespace cells all into `""`.
|
||||
|
||||
**What 02 does NOT do here:**
|
||||
- Does not decide whether the cell is "missing." That's 04's call.
|
||||
- Does not write `NaN` or `N/A` or any other sentinel. Just produces empty string.
|
||||
- Does not drop the row. Schema is invariant.
|
||||
|
||||
**Expected behavior:** every whitespace-only cell becomes empty. Row count unchanged. Headers untouched.
|
||||
|
||||
**Why it matters:** this is the single most-relitigated boundary in the bundle. Documenting it via fixture prevents drift.
|
||||
|
||||
---
|
||||
|
||||
### 16 - Dirty headers
|
||||
|
||||
**File**: `test_data/16_dirty_headers.csv` -> `expected/16_dirty_headers.csv`
|
||||
|
||||
Headers themselves are subject to all the same pollution as data cells. A header `" Email "` (NBSP-padded) breaks `df["Email"]` lookups because the actual column name has NBSP padding. Smart-quoted header `"\u201cEmail\u201d"` is even worse.
|
||||
|
||||
**Expected behavior:** headers cleaned by the same rules as data. Note that the smart-quoted header `"Email"` (with surrounding quotes) becomes a header value containing literal ASCII double quotes, which then requires CSV-quoting in the output. The expected file is written with proper CSV escaping.
|
||||
|
||||
**Why it matters:** broken column names break every downstream join, every selectbox in the GUI, and every CLI flag that takes a column name. Cleaning headers is non-negotiable.
|
||||
|
||||
---
|
||||
|
||||
### 17 - Preserve-intended (negative tests)
|
||||
|
||||
**File**: `test_data/17_preserve_intended.csv` -> `expected/17_preserve_intended.csv`
|
||||
|
||||
The negative-test file. Things 02 must NOT touch because they belong to other scripts:
|
||||
|
||||
| Cell content | What 02 does | What 02 does NOT do |
|
||||
|---|---|---|
|
||||
| ` 100 ` | Trims to `100` | Doesn't reformat as `$100.00` (that's 03) |
|
||||
| `1 234` | Preserves as `1 234` | Doesn't collapse internal space (looks numeric, European thousand-sep) |
|
||||
| `$1,500.00` | Trims outer whitespace | Doesn't reformat currency (that's 03) |
|
||||
| `2024-01-15` | Trims outer whitespace | Doesn't reformat date (that's 03) |
|
||||
| `(555) 123-4567` | Trims outer whitespace | Doesn't reformat phone (that's 03); does not collapse internal space |
|
||||
| `+1 555 123 4567` | Trims outer whitespace | Same; phone-shaped, leave internal spacing alone |
|
||||
| `N/A` | Trims to `N/A` | Doesn't replace with empty or NaN (that's 04) |
|
||||
| `nan` | Trims to `nan` | Doesn't replace with empty or NaN (that's 04) |
|
||||
|
||||
The internal-whitespace heuristic: if a cell parses as numeric, looks like a date, or matches a phone-shape regex (digits + common separators), do NOT collapse internal whitespace. Only collapse in cells classified as free text. This requires a per-cell check; document it in the implementation.
|
||||
|
||||
**Why it matters:** scope discipline. If 02 starts reformatting dates because "while we're trimming whitespace anyway", it stops being 02 and starts being a worse 03. The DECISIONS.md Section 4a rule (functional scope) cuts the other way too: 02 must not reach into other scripts' territory.
|
||||
|
||||
---
|
||||
|
||||
### 18 - Empty file
|
||||
|
||||
**File**: `test_data/18_empty_file.csv` (zero bytes) -> `expected/18_empty_file.csv` (zero bytes)
|
||||
|
||||
**Expected behavior:** graceful no-op. Either produces an empty output file with a logged warning, or emits a clean error message naming the problem ("Input file is empty"). What it MUST NOT do: crash with `pandas.errors.EmptyDataError` traceback in the GUI.
|
||||
|
||||
**Why it matters:** error UX standard from DECISIONS.md Section 4b - errors that name the problem and the fix, not stack traces.
|
||||
|
||||
---
|
||||
|
||||
### 19 - Headers only (no data rows)
|
||||
|
||||
**File**: `test_data/19_headers_only.csv` -> `expected/19_headers_only.csv`
|
||||
|
||||
Just headers, no data. Headers themselves are dirty (whitespace + NBSP + ZWSP).
|
||||
|
||||
**Expected behavior:** headers cleaned, output is clean headers + no data rows. No crash, no warning required (it's a legitimate state).
|
||||
|
||||
**Why it matters:** template files often look like this. The buyer might be cleaning a template before populating it. Don't punish them for it.
|
||||
|
||||
---
|
||||
|
||||
### 20 - Kitchen sink (integration)
|
||||
|
||||
**File**: `test_data/20_kitchen_sink.csv` -> `expected/20_kitchen_sink.csv`
|
||||
|
||||
The integration test. Combines:
|
||||
|
||||
- UTF-8 BOM at file start.
|
||||
- CRLF line endings throughout.
|
||||
- Headers with leading/trailing space, NBSP, smart quotes, ZWSP.
|
||||
- Data cells with NBSP, internal multi-space, smart quotes, em-dash, ellipsis, primes (foot/inch markers).
|
||||
- A whitespace-only cell that should become empty.
|
||||
- Multiplication sign (preserved).
|
||||
|
||||
**Expected output:** every transformation applied correctly, schema unchanged, file written as UTF-8 (no BOM) with LF line endings.
|
||||
|
||||
**Why it matters:** this is the one fixture that catches transformation-order bugs. If smart-quote replacement runs before whitespace trim, you get different output than the other order. Picking and locking the order is part of the implementation; the fixture verifies it.
|
||||
|
||||
**Recommended transformation pipeline order** (informative, not normative):
|
||||
|
||||
1. Decode bytes -> strip BOM at file level.
|
||||
2. Normalize file-level line endings -> LF.
|
||||
3. Parse CSV (with proper quoting for embedded newlines).
|
||||
4. Per cell, in order:
|
||||
a. Unicode NFC normalize.
|
||||
b. Strip zero-width and control characters.
|
||||
c. Strip BOM if it appears mid-cell.
|
||||
d. Smart-quote ASCII-fy.
|
||||
e. Normalize embedded line endings to LF.
|
||||
f. Whitespace trim (outer).
|
||||
g. Internal whitespace collapse (text columns only - check after trim).
|
||||
h. Per-column case op (if configured).
|
||||
5. Headers go through the same per-cell pipeline.
|
||||
6. Write as UTF-8, LF line endings, no BOM.
|
||||
|
||||
---
|
||||
|
||||
### 21 - Excel pollution (multi-sheet XLSX)
|
||||
|
||||
**File**: `test_data/21_excel_pollution.xlsx` (no expected file - manual / programmatic verification per sheet)
|
||||
|
||||
Four sheets, each isolating an Excel-specific concern:
|
||||
|
||||
**Sheet `Customers`** - dirty headers (NBSP, smart quotes, ZWSP) and dirty data cells (NBSP padding, tab padding, smart apostrophe in `O'Connor`, em-dash). One whitespace-only `name` cell to verify the 02/04 boundary applies on XLSX too.
|
||||
|
||||
**Sheet `Notes`** - multi-line cells from Alt+Enter (LF inside cell), plus a cell with mixed CRLF inside (from someone pasting Windows text into Excel). Cells have wrap_text formatting set so the line breaks render in Excel. After cleaning, all in-cell line breaks should be LF.
|
||||
|
||||
**Sheet `International`** - non-Latin scripts and emoji with surrounding whitespace. Verifies the preservation contract from case 13 holds for XLSX.
|
||||
|
||||
**Sheet `ForceText`** - leading-zero IDs (e.g., `0001234`). These must not be stripped of leading zeros (that's not 02's job - it doesn't change semantic content). Row 3 has a leaked apostrophe (`'9999999`) from a force-text cell - this is a judgment call but the default is to preserve it; trying to detect "leaked apostrophe" is too error-prone.
|
||||
|
||||
**Why it matters:** XLSX has pollution patterns that don't appear in CSV (Alt+Enter cells, force-text apostrophes, sheet structure). The XLSX reader path needs the same cleaning logic as the CSV reader path; this fixture verifies that.
|
||||
|
||||
---
|
||||
|
||||
## 5. What this corpus does NOT cover
|
||||
|
||||
Listed so the gap is explicit, not hidden:
|
||||
|
||||
1. **Encoding detection** (cp1252 input, Latin-1 input, UTF-16). That's the I/O layer's job, not 02's transformation logic. Once the reader produces a Python `str`, 02 operates the same regardless of source encoding. Add I/O-layer fixtures separately when that layer is built.
|
||||
2. **Performance / large files**. No multi-GB fixture is included because it bloats the repo. Add a benchmark (not a unit test) targeting a 500MB CSV; verify processing completes without OOM via chunked reads.
|
||||
3. **Streamlit UI behavior**. The fixtures verify cleaning correctness; verifying the GUI shows the right preview, applies the right defaults, and renders cleaning in the diff view is a separate test layer (probably manual, possibly Playwright).
|
||||
4. **Concurrency / file-locking** (e.g., user has the input file open in Excel). Expected to fail with a clean error, not corrupt data. Add a manual test, not a fixture.
|
||||
5. **CLI argument parsing** for the various flags. Each flag should have a Typer-level test, separate from the fixtures here.
|
||||
|
||||
---
|
||||
|
||||
## 6. How to use this corpus
|
||||
|
||||
### As a build target
|
||||
Each fixture is one piece of the spec. Implement the cleaner against fixture 01, run, diff, fix, repeat. Move to 02. By the time fixture 20 passes, the script is done.
|
||||
|
||||
### As pytest fixtures
|
||||
```python
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from src.core.text_cleaner import clean_csv
|
||||
|
||||
CORPUS = Path("tests/corpus") # wherever this folder lands
|
||||
|
||||
@pytest.mark.parametrize("name", [
|
||||
"01_whitespace_basic",
|
||||
"02_whitespace_unicode",
|
||||
"03_smart_punctuation",
|
||||
"04_unicode_forms",
|
||||
"05_zero_width_invisible",
|
||||
"06_control_characters",
|
||||
"07_bom_utf8",
|
||||
"08_line_endings_crlf",
|
||||
"09_line_endings_cr",
|
||||
"10_line_endings_mixed",
|
||||
"11_embedded_newlines",
|
||||
"13_non_latin_scripts",
|
||||
"15_whitespace_only_cells",
|
||||
"16_dirty_headers",
|
||||
"17_preserve_intended",
|
||||
"18_empty_file",
|
||||
"19_headers_only",
|
||||
"20_kitchen_sink",
|
||||
])
|
||||
def test_default_config(name, tmp_path):
|
||||
inp = CORPUS / "test_data" / f"{name}.csv"
|
||||
expected = (CORPUS / "expected" / f"{name}.csv").read_bytes()
|
||||
out = tmp_path / "out.csv"
|
||||
clean_csv(inp, out) # default config
|
||||
assert out.read_bytes() == expected
|
||||
|
||||
# Cases 12 and 14 have multiple expected files; parametrize them separately
|
||||
# with the relevant flags.
|
||||
|
||||
# Idempotency property test - applies to every fixture:
|
||||
@pytest.mark.parametrize("name", [...same list...])
|
||||
def test_idempotent(name, tmp_path):
|
||||
inp = CORPUS / "test_data" / f"{name}.csv"
|
||||
out1 = tmp_path / "out1.csv"
|
||||
out2 = tmp_path / "out2.csv"
|
||||
clean_csv(inp, out1)
|
||||
clean_csv(out1, out2)
|
||||
assert out1.read_bytes() == out2.read_bytes()
|
||||
```
|
||||
|
||||
### Regenerating fixtures
|
||||
If a default policy changes (e.g., switch the default Unicode form from NFC to NFKC, which would be a meaningful policy decision), the fixtures in `expected/` need regenerating. Edit `generate_test_data.py` and re-run. Document the policy change in DECISIONS.md before doing this.
|
||||
@@ -0,0 +1,8 @@
|
||||
id,name,city
|
||||
1,Alice,New York
|
||||
2,Bob,Chicago
|
||||
3,Carol,San Francisco
|
||||
4,Dan Smith,Austin
|
||||
5,Eve,Boston
|
||||
6,Frank van der Berg,Denver
|
||||
7,Grace Hopper,Palo Alto
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,label,note
|
||||
1,Premium,NBSP padding
|
||||
2,Discount,narrow NBSP
|
||||
3,Standard,ideographic space
|
||||
4,Tier One,em-space internal
|
||||
5,Cost Plus,thin-space internal
|
||||
6,mixed,ascii + NBSP combined
|
||||
|
@@ -0,0 +1,6 @@
|
||||
id,quote,measurement
|
||||
1,"""Hello world""","5' 11"""
|
||||
2,it's working,-
|
||||
3,2020-2024,from 'a' to 'z'
|
||||
4,wait...,3 × 4
|
||||
5,"""quoted""",5 ± 0.1
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,name,description
|
||||
1,café,NFC form (single code point)
|
||||
2,café,NFD form (e + combining accent)
|
||||
3,naïve,NFC i-diaeresis
|
||||
4,naïve,NFD i + combining diaeresis
|
||||
5,office,fi-ligature (ffi)
|
||||
6,ABC,fullwidth ABC
|
||||
7,Ⅸ century,roman numeral nine (single code point)
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value,note
|
||||
1,Hello,zero-width space inside word
|
||||
2,Leading,leading + internal ZWSP
|
||||
3,Trail,trailing ZWSP
|
||||
4,abc,ZWNJ and ZWJ
|
||||
5,Marked,LTR + RTL marks bracketing
|
||||
6,cooperate,soft hyphen
|
||||
7,nobreak,word joiner
|
||||
|
@@ -0,0 +1,9 @@
|
||||
id,value,note
|
||||
1,HelloWorld,NUL byte inside
|
||||
2,BellSound,BEL character
|
||||
3,Backspace,backspace
|
||||
4,VertTab,vertical tab
|
||||
5,FormFeed,form feed
|
||||
6,Escape,ESC character
|
||||
7,Delete,DEL character
|
||||
8,Mixedjunk,multiple controls in one cell
|
||||
|
3
test-cases/text-cleaner-corpus/expected/07_bom_utf8.csv
Normal file
3
test-cases/text-cleaner-corpus/expected/07_bom_utf8.csv
Normal file
@@ -0,0 +1,3 @@
|
||||
id,name,city
|
||||
1,Alice,New York
|
||||
2,Bob,Chicago
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
4,Dan
|
||||
|
@@ -0,0 +1,9 @@
|
||||
id,address,notes
|
||||
1,"123 Main St
|
||||
Apt 4B
|
||||
New York, NY","line1
|
||||
line2"
|
||||
2,Single line,"contains
|
||||
classic mac
|
||||
internal"
|
||||
3,normal,no newlines here
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,ALICE SMITH,Alice@Example.COM,Widget
|
||||
2,bob jones,BOB@example.com,GADGET
|
||||
3,Carol Brown,carol@EXAMPLE.com,wIdGeT
|
||||
4,DAN O'CONNOR,Dan@Example.com,gizmo
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,ALICE SMITH,alice@example.com,Widget
|
||||
2,bob jones,bob@example.com,GADGET
|
||||
3,Carol Brown,carol@example.com,wIdGeT
|
||||
4,DAN O'CONNOR,dan@example.com,gizmo
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,Alice Smith,Alice@Example.COM,Widget
|
||||
2,Bob Jones,BOB@example.com,GADGET
|
||||
3,Carol Brown,carol@EXAMPLE.com,wIdGeT
|
||||
4,Dan O'Connor,Dan@Example.com,gizmo
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,name,note
|
||||
1,中国北京,Beijing in Chinese (with leading/trailing space)
|
||||
2,テスト,Japanese katakana (test)
|
||||
3,تجربة,Arabic (test) - RTL
|
||||
4,Москва,Russian (Moscow)
|
||||
5,🎉 launch 🚀,emoji preserved
|
||||
6,café ☕,emoji + accent combo
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,city
|
||||
1,café,München
|
||||
2,naïve,résumé
|
||||
3,don’t,smart-apostrophe mojibake
|
||||
4,Alice,New York
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,city
|
||||
1,café,München
|
||||
2,naïve,résumé
|
||||
3,don't,smart-apostrophe mojibake
|
||||
4,Alice,New York
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value
|
||||
1,real
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,actual value
|
||||
|
@@ -0,0 +1,3 @@
|
||||
id,Customer Name,"""Email""",Phone
|
||||
1,Alice,alice@example.com,555-1234
|
||||
2,Bob,bob@example.com,555-5678
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,price,european_number,date,phone,quantity
|
||||
1,100,1 234,2024-01-15,(555) 123-4567,42
|
||||
2,"$1,500.00",12 345,15/01/2024,555.123.4567,7
|
||||
3,N/A,nan,Jan 15 2024,+1 555 123 4567,0
|
||||
|
@@ -0,0 +1 @@
|
||||
id,Name,Email
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,Name,"""Email""",Notes
|
||||
1,Alice Smith,Alice@Example.COM,"""VIP"" customer - contact ASAP..."
|
||||
2,Bob Jones,bob@example.com,"it's 5'6"" tall"
|
||||
3,Carol Brown,CAROL@EXAMPLE.COM,3 × 4 = 12 (preserve ×)
|
||||
4,,empty@example.com,whitespace-only name (becomes empty)
|
||||
|
545
test-cases/text-cleaner-corpus/generate_test_data.py
Normal file
545
test-cases/text-cleaner-corpus/generate_test_data.py
Normal file
@@ -0,0 +1,545 @@
|
||||
"""
|
||||
Generator for the 02_text_cleaner test corpus.
|
||||
|
||||
Writes raw bytes where exact control over encoding/line-endings/invisible
|
||||
characters matters. Do not edit the output files in a text editor that
|
||||
"helpfully" normalizes anything; it will silently break the tests.
|
||||
|
||||
Run from the corpus root:
|
||||
python generate_test_data.py
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
# Corpus layout: inputs live in test_data/, golden outputs in expected/,
# both directly beside this script.
ROOT = Path(__file__).parent
TD, EX = ROOT / "test_data", ROOT / "expected"

# Create both fixture folders up front; an already-existing folder is fine.
for fixture_dir in (TD, EX):
    fixture_dir.mkdir(exist_ok=True)
|
||||
|
||||
|
||||
def write_bytes(path, data):
    """Write *data* to *path* verbatim, replacing any existing file.

    Args:
        path: Destination file; anything accepted by ``pathlib.Path``.
        data: Bytes-like payload, written without any translation.
    """
    target = Path(path)
    target.write_bytes(data)
|
||||
|
||||
|
||||
def write_text(path, text, encoding="utf-8", newline="\n"):
    """Encode *text* and write it to *path* as raw bytes.

    The file is written as bytes rather than opened in text mode, so the
    line terminators on disk are exactly the ones produced here.

    Args:
        path: Destination file; anything accepted by ``pathlib.Path``.
        text: Content to write, using LF as its line terminator.
        encoding: Codec used to encode *text*.
        newline: Terminator substituted for every LF in *text*; the
            default (LF) leaves *text* unchanged.
    """
    # Only rewrite terminators when a non-LF convention was requested.
    payload = text if newline == "\n" else text.replace("\n", newline)
    Path(path).write_bytes(payload.encode(encoding))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 01 Whitespace - basic (ASCII space + tab)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_text(TD / "01_whitespace_basic.csv", (
|
||||
"id,name,city\n"
|
||||
"1, Alice ,New York\n" # leading + trailing spaces
|
||||
"2,Bob, Chicago\n" # leading spaces
|
||||
"3,Carol ,San Francisco \n" # trailing spaces
|
||||
"4,Dan Smith,Austin\n" # internal multi-space
|
||||
"5,\tEve\t,\tBoston\t\n" # tab padding
|
||||
"6,Frank van der Berg,Denver\n" # multiple internal multi-space runs
|
||||
"7, Grace Hopper , Palo Alto \n" # everything at once
|
||||
))
|
||||
|
||||
write_text(EX / "01_whitespace_basic.csv", (
|
||||
"id,name,city\n"
|
||||
"1,Alice,New York\n"
|
||||
"2,Bob,Chicago\n"
|
||||
"3,Carol,San Francisco\n"
|
||||
"4,Dan Smith,Austin\n"
|
||||
"5,Eve,Boston\n"
|
||||
"6,Frank van der Berg,Denver\n"
|
||||
"7,Grace Hopper,Palo Alto\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 02 Whitespace - unicode (NBSP, narrow NBSP, ideographic space, etc.)
|
||||
# ---------------------------------------------------------------------------
|
||||
# These are the whitespace-pretenders that .strip() in Python 3 actually
|
||||
# DOES handle, but that .strip() in many naive implementations (or pandas
|
||||
# defaults) does NOT. Test that they're stripped, not preserved.
|
||||
NBSP = "\u00A0" # non-breaking space (very common from Word/Excel paste)
|
||||
NNBSP = "\u202F" # narrow no-break space
|
||||
IDEO = "\u3000" # ideographic space (CJK)
|
||||
EM_SPACE = "\u2003" # em space
|
||||
THIN_SPACE = "\u2009" # thin space
|
||||
write_text(TD / "02_whitespace_unicode.csv", (
|
||||
"id,label,note\n"
|
||||
f"1,{NBSP}Premium{NBSP},NBSP padding\n"
|
||||
f"2,{NNBSP}Discount{NNBSP},narrow NBSP\n"
|
||||
f"3,{IDEO}Standard{IDEO},ideographic space\n"
|
||||
f"4,Tier{EM_SPACE}{EM_SPACE}One,em-space internal\n"
|
||||
f"5,Cost{THIN_SPACE}Plus,thin-space internal\n"
|
||||
f"6, {NBSP} mixed {NBSP} ,ascii + NBSP combined\n"
|
||||
))
|
||||
|
||||
write_text(EX / "02_whitespace_unicode.csv", (
|
||||
"id,label,note\n"
|
||||
"1,Premium,NBSP padding\n"
|
||||
"2,Discount,narrow NBSP\n"
|
||||
"3,Standard,ideographic space\n"
|
||||
"4,Tier One,em-space internal\n"
|
||||
"5,Cost Plus,thin-space internal\n"
|
||||
"6,mixed,ascii + NBSP combined\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 03 Smart punctuation (curly quotes, em/en dash, ellipsis, primes)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This is the #1 source of pollution from data that ever passed through
|
||||
# Word, Outlook, or Excel autocorrect. ASCII-fy it.
|
||||
write_text(TD / "03_smart_punctuation.csv", (
|
||||
"id,quote,measurement\n"
|
||||
"1,\u201cHello world\u201d,5\u2032 11\u2033\n" # curly double quotes, prime/double-prime
|
||||
"2,it\u2019s working,\u2014\n" # curly apostrophe, em-dash alone
|
||||
"3,2020\u20132024,from \u2018a\u2019 to \u2018z\u2019\n" # en-dash range, curly singles
|
||||
"4,wait\u2026,3 \u00d7 4\n" # ellipsis char, multiplication sign
|
||||
"5,\u00abquoted\u00bb,5 \u00b1 0.1\n" # guillemets, plus-minus
|
||||
))
|
||||
|
||||
# Default policy: ASCII-fy where round-trip-safe.
|
||||
# Notable: \u00d7 (multiplication) and \u00b1 (plus-minus) are typographically
|
||||
# meaningful and not safely round-trippable to ASCII, so we PRESERVE them
|
||||
# (case 4 col3, case 5 col3). Document this in TEST-CASES.md.
|
||||
write_text(EX / "03_smart_punctuation.csv", (
|
||||
"id,quote,measurement\n"
|
||||
"1,\"\"\"Hello world\"\"\",\"5' 11\"\"\"\n"
|
||||
"2,it's working,-\n"
|
||||
"3,2020-2024,from 'a' to 'z'\n"
|
||||
"4,wait...,3 \u00d7 4\n"
|
||||
"5,\"\"\"quoted\"\"\",5 \u00b1 0.1\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 04 Unicode normalization forms (NFC vs NFD, ligatures, fullwidth)
|
||||
# ---------------------------------------------------------------------------
|
||||
# "café" can be either:
|
||||
# NFC: "caf\u00e9" (e-acute as single code point)
|
||||
# NFD: "cafe\u0301" (e + combining acute accent, two code points)
|
||||
# These look identical but compare unequal. Normalize to NFC.
|
||||
write_text(TD / "04_unicode_forms.csv", (
|
||||
"id,name,description\n"
|
||||
"1,caf\u00e9,NFC form (single code point)\n"
|
||||
"2,cafe\u0301,NFD form (e + combining accent)\n"
|
||||
"3,na\u00efve,NFC i-diaeresis\n"
|
||||
"4,nai\u0308ve,NFD i + combining diaeresis\n"
|
||||
"5,o\uFB03ce,fi-ligature (\uFB03)\n" # 'office' written with 'ffi' ligature
|
||||
"6,\uFF21\uFF22\uFF23,fullwidth ABC\n" # A B C
|
||||
"7,\u2168 century,roman numeral nine (single code point)\n" # Ⅸ
|
||||
))
|
||||
|
||||
# Policy: NFC by default (most compatible, smallest, what Excel emits).
|
||||
# NFKC option would also fold ligatures and fullwidth digits/letters,
|
||||
# but is destructive for some legitimate text. Default = NFC.
|
||||
# So:
|
||||
# - Cases 1 vs 2 should produce identical output after normalization
|
||||
# - Cases 3 vs 4 should produce identical output
|
||||
# - Case 5 ligature stays as ligature under NFC (would fold under NFKC)
|
||||
# - Case 6 fullwidth stays fullwidth under NFC (would fold under NFKC)
|
||||
write_text(EX / "04_unicode_forms.csv", (
|
||||
"id,name,description\n"
|
||||
"1,caf\u00e9,NFC form (single code point)\n"
|
||||
"2,caf\u00e9,NFD form (e + combining accent)\n" # same bytes as row 1 now
|
||||
"3,na\u00efve,NFC i-diaeresis\n"
|
||||
"4,na\u00efve,NFD i + combining diaeresis\n" # same as row 3 now
|
||||
"5,o\uFB03ce,fi-ligature (\uFB03)\n"
|
||||
"6,\uFF21\uFF22\uFF23,fullwidth ABC\n"
|
||||
"7,\u2168 century,roman numeral nine (single code point)\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 05 Zero-width / invisible characters
|
||||
# ---------------------------------------------------------------------------
|
||||
ZWSP = "\u200B" # zero-width space
|
||||
ZWNJ = "\u200C" # zero-width non-joiner
|
||||
ZWJ = "\u200D" # zero-width joiner
|
||||
LRM = "\u200E" # left-to-right mark
|
||||
RLM = "\u200F" # right-to-left mark
|
||||
SOFT_HYPHEN = "\u00AD"
|
||||
WORD_JOINER = "\u2060"
|
||||
write_text(TD / "05_zero_width_invisible.csv", (
|
||||
"id,value,note\n"
|
||||
f"1,Hel{ZWSP}lo,zero-width space inside word\n"
|
||||
f"2,{ZWSP}Lead{ZWSP}ing,leading + internal ZWSP\n"
|
||||
f"3,Trail{ZWSP},trailing ZWSP\n"
|
||||
f"4,a{ZWNJ}b{ZWJ}c,ZWNJ and ZWJ\n"
|
||||
f"5,{LRM}Marked{RLM},LTR + RTL marks bracketing\n"
|
||||
f"6,co{SOFT_HYPHEN}operate,soft hyphen\n"
|
||||
f"7,no{WORD_JOINER}break,word joiner\n"
|
||||
))
|
||||
|
||||
write_text(EX / "05_zero_width_invisible.csv", (
|
||||
"id,value,note\n"
|
||||
"1,Hello,zero-width space inside word\n"
|
||||
"2,Leading,leading + internal ZWSP\n"
|
||||
"3,Trail,trailing ZWSP\n"
|
||||
"4,abc,ZWNJ and ZWJ\n"
|
||||
"5,Marked,LTR + RTL marks bracketing\n"
|
||||
"6,cooperate,soft hyphen\n"
|
||||
"7,nobreak,word joiner\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 06 Control characters (non-printable, except tab/CR/LF inside quoted cells)
|
||||
# ---------------------------------------------------------------------------
|
||||
# These bytes show up in real exports from broken systems, terminals, or
|
||||
# binary data accidentally exported as text.
|
||||
# Exercised below: \x00 NUL, \x07 BEL, \x08 BS, \x0B VT, \x0C FF, \x1B ESC, \x7F DEL (\x01 SOH is in the same class but has no dedicated row).
|
||||
write_text(TD / "06_control_characters.csv", (
|
||||
"id,value,note\n"
|
||||
"1,Hello\x00World,NUL byte inside\n"
|
||||
"2,Bell\x07Sound,BEL character\n"
|
||||
"3,Back\x08space,backspace\n"
|
||||
"4,Vert\x0BTab,vertical tab\n"
|
||||
"5,Form\x0CFeed,form feed\n"
|
||||
"6,Esc\x1Bape,ESC character\n"
|
||||
"7,Del\x7Fete,DEL character\n"
|
||||
"8,Mixed\x00\x07\x1Bjunk,multiple controls in one cell\n"
|
||||
))
|
||||
|
||||
write_text(EX / "06_control_characters.csv", (
|
||||
"id,value,note\n"
|
||||
"1,HelloWorld,NUL byte inside\n"
|
||||
"2,BellSound,BEL character\n"
|
||||
"3,Backspace,backspace\n"
|
||||
"4,VertTab,vertical tab\n"
|
||||
"5,FormFeed,form feed\n"
|
||||
"6,Escape,ESC character\n"
|
||||
"7,Delete,DEL character\n"
|
||||
"8,Mixedjunk,multiple controls in one cell\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 07 BOM at start of file (UTF-8 BOM = EF BB BF)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Excel writes UTF-8 with BOM by default. pandas usually handles it but
|
||||
# leaves the BOM as part of the first column's header name if you're not
|
||||
# careful, producing a mystery column called "\ufeffid" that breaks lookups.
|
||||
bom = b"\xef\xbb\xbf"
|
||||
content = (
|
||||
"id,name,city\n"
|
||||
"1,Alice,New York\n"
|
||||
"2,Bob,Chicago\n"
|
||||
).encode("utf-8")
|
||||
write_bytes(TD / "07_bom_utf8.csv", bom + content)
|
||||
|
||||
# Expected: BOM stripped on read, output written WITHOUT BOM, header is
|
||||
# clean "id" not "\ufeffid".
|
||||
write_bytes(EX / "07_bom_utf8.csv", content)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 08 Line endings - all CRLF (Windows)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default policy: normalize to LF on output.
|
||||
write_text(TD / "08_line_endings_crlf.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
), newline="\r\n")
|
||||
|
||||
write_text(EX / "08_line_endings_crlf.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 09 Line endings - CR only (classic Mac, pre-OSX, occasionally still seen)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_text(TD / "09_line_endings_cr.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
), newline="\r")
|
||||
|
||||
write_text(EX / "09_line_endings_cr.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 10 Line endings - mixed within the same file
|
||||
# ---------------------------------------------------------------------------
|
||||
# Real-world disaster mode: file edited on multiple OSes, or concatenated
|
||||
# from sources with different conventions.
|
||||
mixed = (
|
||||
b"id,name\r\n"
|
||||
b"1,Alice\n"
|
||||
b"2,Bob\r"
|
||||
b"3,Carol\r\n"
|
||||
b"4,Dan\n"
|
||||
)
|
||||
write_bytes(TD / "10_line_endings_mixed.csv", mixed)
|
||||
|
||||
write_text(EX / "10_line_endings_mixed.csv", (
|
||||
"id,name\n"
|
||||
"1,Alice\n"
|
||||
"2,Bob\n"
|
||||
"3,Carol\n"
|
||||
"4,Dan\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 11 Embedded newlines INSIDE quoted cells (must be preserved!)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This is the trap: line-ending normalization at the FILE level must not
|
||||
# destroy intentional newlines INSIDE quoted multi-line cells (e.g., a
|
||||
# notes column or an address column).
|
||||
# But the embedded line endings should also be normalized to LF for
|
||||
# consistency.
|
||||
write_text(TD / "11_embedded_newlines.csv", (
|
||||
"id,address,notes\n"
|
||||
"1,\"123 Main St\r\nApt 4B\r\nNew York, NY\",\"line1\nline2\"\n"
|
||||
"2,\"Single line\",\"contains\rclassic mac\rinternal\"\n"
|
||||
"3,\"normal\",\"no newlines here\"\n"
|
||||
))
|
||||
|
||||
# Expected: file-level CRLF normalized to LF; embedded CRLF/CR also
|
||||
# normalized to LF; cells stay multi-line.
|
||||
write_text(EX / "11_embedded_newlines.csv", (
|
||||
"id,address,notes\n"
|
||||
"1,\"123 Main St\nApt 4B\nNew York, NY\",\"line1\nline2\"\n"
|
||||
"2,Single line,\"contains\nclassic mac\ninternal\"\n"
|
||||
"3,normal,no newlines here\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 12 Case operations (opt-in, default = preserve)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This file tests case operations IF the user requests them.
|
||||
# Default behavior: PRESERVE. So expected_default == input.
|
||||
# An expected_lower.csv shows what lower-case mode produces.
|
||||
write_text(TD / "12_case_variations.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,ALICE SMITH,Alice@Example.COM,Widget\n"
|
||||
"2,bob jones,BOB@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
|
||||
"4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
|
||||
))
|
||||
|
||||
# Default expected: identical to input (case ops are opt-in).
|
||||
write_text(EX / "12_case_variations__default.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,ALICE SMITH,Alice@Example.COM,Widget\n"
|
||||
"2,bob jones,BOB@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
|
||||
"4,DAN O'CONNOR,Dan@Example.com,gizmo\n"
|
||||
))
|
||||
|
||||
# With --case-email=lower applied to email column only:
|
||||
write_text(EX / "12_case_variations__email_lower.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,ALICE SMITH,alice@example.com,Widget\n"
|
||||
"2,bob jones,bob@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@example.com,wIdGeT\n"
|
||||
"4,DAN O'CONNOR,dan@example.com,gizmo\n"
|
||||
))
|
||||
|
||||
# With --case=title applied to name column:
|
||||
write_text(EX / "12_case_variations__name_title.csv", (
|
||||
"id,name,email,product\n"
|
||||
"1,Alice Smith,Alice@Example.COM,Widget\n"
|
||||
"2,Bob Jones,BOB@example.com,GADGET\n"
|
||||
"3,Carol Brown,carol@EXAMPLE.com,wIdGeT\n"
|
||||
"4,Dan O'Connor,Dan@Example.com,gizmo\n" # title-case must not break O'C
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 13 Non-Latin scripts and emoji (PRESERVE; do not mangle)
|
||||
# ---------------------------------------------------------------------------
|
||||
# This is a negative test: the cleaner must not damage characters that
|
||||
# look "foreign" to it. Whitespace trimming and Unicode NFC are still applied.
|
||||
write_text(TD / "13_non_latin_scripts.csv", (
|
||||
"id,name,note\n"
|
||||
"1, \u4e2d\u56fd\u5317\u4eac ,Beijing in Chinese (with leading/trailing space)\n"
|
||||
"2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
|
||||
"3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
|
||||
"4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
|
||||
"5,\U0001F389 launch \U0001F680,emoji preserved\n"
|
||||
"6,caf\u00e9 \u2615,emoji + accent combo\n"
|
||||
))
|
||||
|
||||
write_text(EX / "13_non_latin_scripts.csv", (
|
||||
"id,name,note\n"
|
||||
"1,\u4e2d\u56fd\u5317\u4eac,Beijing in Chinese (with leading/trailing space)\n"
|
||||
"2,\u30c6\u30b9\u30c8,Japanese katakana (test)\n"
|
||||
"3,\u062a\u062c\u0631\u0628\u0629,Arabic (test) - RTL\n"
|
||||
"4,\u041c\u043e\u0441\u043a\u0432\u0430,Russian (Moscow)\n"
|
||||
"5,\U0001F389 launch \U0001F680,emoji preserved\n"
|
||||
"6,caf\u00e9 \u2615,emoji + accent combo\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 14 Mojibake (UTF-8 bytes misread as cp1252, then re-saved as UTF-8)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Classic mojibake: someone took a UTF-8 file, opened it as Windows-1252,
|
||||
# saved as UTF-8 again. "café" becomes "cafÃ©", "naïve" becomes "naÃ¯ve".
|
||||
# The text cleaner CANNOT reliably auto-fix this (it's a heuristic and can
|
||||
# false-positive on legitimate strings). Default = WARN, do not auto-fix.
|
||||
# Optional --fix-mojibake flag (uses ftfy library) can attempt repair.
|
||||
write_text(TD / "14_mojibake.csv", (
|
||||
"id,name,city\n"
|
||||
"1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n" # café, München mojibaked
|
||||
"2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n" # naïve, résumé
|
||||
"3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n" # don't via cp1252-mojibake
|
||||
"4,Alice,New York\n" # clean control row
|
||||
))
|
||||
|
||||
# Expected output WITHOUT mojibake fix (default): bytes preserved, but
|
||||
# reader emits a warning to logs.
|
||||
write_text(EX / "14_mojibake__default.csv", (
|
||||
"id,name,city\n"
|
||||
"1,caf\u00c3\u00a9,M\u00c3\u00bcnchen\n"
|
||||
"2,na\u00c3\u00afve,r\u00c3\u00a9sum\u00c3\u00a9\n"
|
||||
"3,don\u00e2\u20ac\u2122t,smart-apostrophe mojibake\n"
|
||||
"4,Alice,New York\n"
|
||||
))
|
||||
|
||||
# Expected output WITH --fix-mojibake (uses ftfy or equivalent):
|
||||
write_text(EX / "14_mojibake__fixed.csv", (
|
||||
"id,name,city\n"
|
||||
"1,caf\u00e9,M\u00fcnchen\n"
|
||||
"2,na\u00efve,r\u00e9sum\u00e9\n"
|
||||
"3,don't,smart-apostrophe mojibake\n" # smart apostrophe also fixed
|
||||
"4,Alice,New York\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 15 Whitespace-only cells (boundary case with script 04)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per TECHNICAL.md Section 9.3: 02 trims first, leaving an empty string.
|
||||
# 04 then detects empty strings as disguised null. So 02's job here is
|
||||
# just to convert " " into "".
|
||||
write_text(TD / "15_whitespace_only_cells.csv", (
|
||||
"id,value\n"
|
||||
"1,real\n"
|
||||
"2, \n" # spaces only
|
||||
"3,\t\t\n" # tabs only
|
||||
"4,\u00A0\u00A0\n" # NBSP only
|
||||
"5, \t \u00A0 \n" # mixed whitespace
|
||||
"6,\n" # already empty
|
||||
"7,actual value\n"
|
||||
))
|
||||
|
||||
write_text(EX / "15_whitespace_only_cells.csv", (
|
||||
"id,value\n"
|
||||
"1,real\n"
|
||||
"2,\n" # all whitespace -> empty
|
||||
"3,\n"
|
||||
"4,\n"
|
||||
"5,\n"
|
||||
"6,\n"
|
||||
"7,actual value\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 16 Dirty headers
|
||||
# ---------------------------------------------------------------------------
|
||||
# Headers themselves have whitespace, BOM remnants, smart quotes, etc.
|
||||
# These break downstream lookups (df["email"] fails because the column
|
||||
# is actually called " Email " with NBSP padding).
|
||||
write_text(TD / "16_dirty_headers.csv", (
|
||||
" id ,\u00a0Customer Name\u00a0,\u201cEmail\u201d,Phone\u200b\n"
|
||||
"1,Alice,alice@example.com,555-1234\n"
|
||||
"2,Bob,bob@example.com,555-5678\n"
|
||||
))
|
||||
|
||||
# Expected: headers cleaned by SAME rules as data cells.
|
||||
# Note: smart quotes around "Email" become straight quotes. The header
|
||||
# "\"Email\"" with embedded quotes needs CSV-quoting in the output.
|
||||
write_text(EX / "16_dirty_headers.csv", (
|
||||
"id,Customer Name,\"\"\"Email\"\"\",Phone\n"
|
||||
"1,Alice,alice@example.com,555-1234\n"
|
||||
"2,Bob,bob@example.com,555-5678\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 17 Preserve-intended (negative tests - things 02 must NOT touch)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Numbers that LOOK like they have whitespace are tricky: " 123 " is
|
||||
# a number with padding (trim) but "1 234" might be a thousands-separator
|
||||
# locale (don't collapse). Default: trim outer whitespace, but DO NOT
|
||||
# collapse internal whitespace in cells that parse as numeric. This is a
|
||||
# judgment call; document it.
|
||||
#
|
||||
# Also: do not reformat dates, currencies, or phone numbers. That's 03.
|
||||
# Do not detect or replace null-like values. That's 04.
|
||||
write_text(TD / "17_preserve_intended.csv", (
|
||||
"id,price,european_number,date,phone,quantity\n"
|
||||
"1, 100 ,1 234,2024-01-15,(555) 123-4567,42\n"
|
||||
"2,\" $1,500.00 \",12 345,15/01/2024,555.123.4567,7\n"
|
||||
"3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0\n"
|
||||
))
|
||||
|
||||
# Expected: outer whitespace trimmed everywhere, but:
|
||||
# - "1 234" stays "1 234" (looks like European/space-thousands; don't collapse)
|
||||
# - "$1,500.00" stays unchanged (currency, that's 03's domain)
|
||||
# - "15/01/2024" stays unchanged (date, that's 03's domain)
|
||||
# - "(555) 123-4567" stays unchanged (phone, that's 03's domain)
|
||||
# - "N/A" stays "N/A" (null-like, that's 04's domain - 02 doesn't decide what's null)
|
||||
# - phone "+1 555 123 4567" - keep internal spaces (it's a phone, 03's domain)
|
||||
write_text(EX / "17_preserve_intended.csv", (
|
||||
"id,price,european_number,date,phone,quantity\n"
|
||||
"1,100,1 234,2024-01-15,(555) 123-4567,42\n"
|
||||
"2,\"$1,500.00\",12 345,15/01/2024,555.123.4567,7\n"
|
||||
"3,N/A,nan,Jan 15 2024,+1 555 123 4567,0\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 18 Empty file (zero bytes)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_bytes(TD / "18_empty_file.csv", b"")
|
||||
|
||||
# Expected: graceful handling, output is also empty (or warning emitted).
|
||||
write_bytes(EX / "18_empty_file.csv", b"")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 19 Headers only (no data rows)
|
||||
# ---------------------------------------------------------------------------
|
||||
write_text(TD / "19_headers_only.csv", (
|
||||
" id ,Name\u00a0,Email\u200b\n"
|
||||
))
|
||||
|
||||
# Expected: headers cleaned, no data rows in output.
|
||||
write_text(EX / "19_headers_only.csv", (
|
||||
"id,Name,Email\n"
|
||||
))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 20 Real-world kitchen sink (everything combined)
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simulates a typical messy export: came from Excel via cp1252 paste,
|
||||
# saved as UTF-8 with BOM, has CRLF, has smart quotes from autocorrect,
|
||||
# has NBSP from copy/paste, has trailing whitespace.
|
||||
content = (
|
||||
" id ,\u00a0Name\u00a0,\u201cEmail\u201d,Notes\u200b\n"
|
||||
"1,\u00a0Alice Smith\u00a0,Alice@Example.COM,\u201cVIP\u201d customer \u2014 contact ASAP\u2026\r\n"
|
||||
"2,\tBob\tJones\t,bob@example.com,it\u2019s 5\u20326\u2033 tall\r\n"
|
||||
"3, Carol Brown ,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\r\n"
|
||||
"4, ,empty@example.com,whitespace-only name (becomes empty)\r\n"
|
||||
)
|
||||
# The header row of `content` ends in a bare LF while the data rows end in
# CRLF. Collapse everything to LF first, then expand to CRLF, so this fixture
# is uniformly CRLF (mixed endings are fixture 10's job, not this one's).
crlf_content = content.replace("\r\n", "\n").replace("\n", "\r\n")
# UTF-8 BOM prefix + CRLF body, written as raw bytes so nothing re-translates
# the line endings on the way to disk.
data_bytes = b"\xef\xbb\xbf" + crlf_content.encode("utf-8")
write_bytes(TD / "20_kitchen_sink.csv", data_bytes)
|
||||
|
||||
# Expected: BOM gone, headers clean, smart quotes ASCII-fied, NBSP/ZWSP
|
||||
# stripped, internal multi-space collapsed, CRLF normalized to LF,
|
||||
# whitespace-only cells become empty, multiplication sign preserved,
|
||||
# em-dash and ellipsis converted, prime/double-prime converted.
|
||||
write_text(EX / "20_kitchen_sink.csv", (
|
||||
"id,Name,\"\"\"Email\"\"\",Notes\n"
|
||||
"1,Alice Smith,Alice@Example.COM,\"\"\"VIP\"\" customer - contact ASAP...\"\n"
|
||||
"2,Bob Jones,bob@example.com,\"it's 5'6\"\" tall\"\n"
|
||||
"3,Carol Brown,CAROL@EXAMPLE.COM,3 \u00d7 4 = 12 (preserve \u00d7)\n"
|
||||
"4,,empty@example.com,whitespace-only name (becomes empty)\n"
|
||||
))
|
||||
|
||||
print("All CSV test files written.")
|
||||
print(f" inputs: {TD}")
|
||||
print(f" expected: {EX}")
|
||||
74
test-cases/text-cleaner-corpus/generate_xlsx.py
Normal file
74
test-cases/text-cleaner-corpus/generate_xlsx.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
Generate the XLSX test workbook for 02_text_cleaner.
|
||||
|
||||
Excel-specific pollution patterns that don't appear in CSV:
|
||||
- Cells with leading apostrophe (Excel's force-text prefix; openpyxl
|
||||
surfaces these as plain strings but they show up in real exports)
|
||||
- Multi-line cells from Alt+Enter (carry \\n internally)
|
||||
- Smart quotes from Excel's autocorrect-as-you-type
|
||||
- NBSP padding from copy/paste from Word or web pages
|
||||
- Multiple sheets with different pollution profiles
|
||||
"""
|
||||
from pathlib import Path
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, Alignment
|
||||
|
||||
OUT = Path(__file__).parent / "test_data" / "21_excel_pollution.xlsx"
|
||||
|
||||
wb = Workbook()
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 1: Customers - whitespace + smart quotes + NBSP
|
||||
# --------------------------------------------------------------------
|
||||
ws = wb.active
|
||||
ws.title = "Customers"
|
||||
ws.append([" id ", "\u00a0Name\u00a0", "\u201cEmail\u201d", "Phone\u200b"]) # dirty headers
|
||||
ws.append([1, " Alice Smith ", "Alice@Example.COM", "555-1234"])
|
||||
ws.append([2, "\u00a0Bob\u00a0Jones\u00a0", "bob@example.com", "555-5678"])
|
||||
ws.append([3, "\tCarol\tBrown\t", "CAROL@example.com", " 555-9012 "])
|
||||
ws.append([4, "Dan O\u2019Connor", "dan@example.com", "555-3456"]) # curly apostrophe
|
||||
ws.append([5, "Eve \u2014 the Engineer", "eve@example.com", "555-7890"]) # em-dash
|
||||
ws.append([6, " ", "frank@example.com", "555-2468"]) # whitespace-only -> empty for 04
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 2: Notes - multi-line cells, embedded line breaks
|
||||
# --------------------------------------------------------------------
|
||||
ws2 = wb.create_sheet("Notes")
|
||||
ws2.append(["id", "title", "body"])
|
||||
ws2.append([1, "Welcome", "Line one\nLine two\nLine three"])
|
||||
ws2.append([2, "Address", "123 Main St\r\nApt 4B\r\nNew York"]) # mixed line endings inside
|
||||
ws2.append([3, "Quote", "She said \u201chello\u201d and left\u2026"]) # smart quotes + ellipsis
|
||||
ws2.append([4, " padded ", " multiline\n with leading whitespace per line "])
|
||||
# Mark column C ("body", min_col=3) with wrap_text so line breaks render in Excel
|
||||
for row in ws2.iter_rows(min_row=2, max_row=ws2.max_row, min_col=3, max_col=3):
|
||||
for cell in row:
|
||||
cell.alignment = Alignment(wrap_text=True)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 3: International - non-Latin scripts and emoji (preserve!)
|
||||
# --------------------------------------------------------------------
|
||||
ws3 = wb.create_sheet("International")
|
||||
ws3.append(["id", "city", "language"])
|
||||
ws3.append([1, " \u4e2d\u56fd\u5317\u4eac ", "Chinese"])
|
||||
ws3.append([2, "\u30c6\u30b9\u30c8 ", "Japanese (with trailing space)"])
|
||||
ws3.append([3, " \u041c\u043e\u0441\u043a\u0432\u0430", "Russian"])
|
||||
ws3.append([4, "\u062a\u062c\u0631\u0628\u0629", "Arabic"])
|
||||
ws3.append([5, "Caf\u00e9 \u2615", "emoji preserved"])
|
||||
ws3.append([6, "Launch \U0001F389\U0001F680", "more emoji"])
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Sheet 4: ForceText - cells originally entered with leading apostrophe
|
||||
# --------------------------------------------------------------------
|
||||
# Excel's force-text prefix '0001234 stores as string "0001234" when read
|
||||
# by openpyxl. Sometimes (broken exports) the apostrophe leaks through as
|
||||
# part of the value. Test that 02 doesn't try to "clean" leading
|
||||
# apostrophes - they may be intentional for ID columns.
|
||||
ws4 = wb.create_sheet("ForceText")
|
||||
ws4.append(["id", "sku", "zip"])
|
||||
ws4.append([1, "0001234", "08540"]) # legitimate leading-zero IDs
|
||||
ws4.append([2, " 0005678 ", "01001"]) # padded - trim outer space, keep zeros
|
||||
ws4.append([3, "'9999999", "10001"]) # leaked apostrophe - PRESERVE (judgment call)
|
||||
|
||||
# Create test_data/ on demand so the script also works on a fresh checkout;
# saving the workbook does not create missing parent directories.
OUT.parent.mkdir(parents=True, exist_ok=True)
wb.save(OUT)
print(f"Wrote {OUT}")
print(f"Sheets: {wb.sheetnames}")
|
||||
@@ -0,0 +1,8 @@
|
||||
id,name,city
|
||||
1, Alice ,New York
|
||||
2,Bob, Chicago
|
||||
3,Carol ,San Francisco
|
||||
4,Dan Smith,Austin
|
||||
5, Eve , Boston
|
||||
6,Frank van der Berg,Denver
|
||||
7, Grace Hopper , Palo Alto
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,label,note
|
||||
1, Premium ,NBSP padding
|
||||
2, Discount ,narrow NBSP
|
||||
3, Standard ,ideographic space
|
||||
4,Tier One,em-space internal
|
||||
5,Cost Plus,thin-space internal
|
||||
6, mixed ,ascii + NBSP combined
|
||||
|
@@ -0,0 +1,6 @@
|
||||
id,quote,measurement
|
||||
1,“Hello world”,5′ 11″
|
||||
2,it’s working,—
|
||||
3,2020–2024,from ‘a’ to ‘z’
|
||||
4,wait…,3 × 4
|
||||
5,«quoted»,5 ± 0.1
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,name,description
|
||||
1,café,NFC form (single code point)
|
||||
2,café,NFD form (e + combining accent)
|
||||
3,naïve,NFC i-diaeresis
|
||||
4,naïve,NFD i + combining diaeresis
|
||||
5,office,fi-ligature (ffi)
|
||||
6,ABC,fullwidth ABC
|
||||
7,Ⅸ century,roman numeral nine (single code point)
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value,note
|
||||
1,Hello,zero-width space inside word
|
||||
2,Leading,leading + internal ZWSP
|
||||
3,Trail,trailing ZWSP
|
||||
4,abc,ZWNJ and ZWJ
|
||||
5,Marked,LTR + RTL marks bracketing
|
||||
6,cooperate,soft hyphen
|
||||
7,nobreak,word joiner
|
||||
|
Binary file not shown.
|
3
test-cases/text-cleaner-corpus/test_data/07_bom_utf8.csv
Normal file
3
test-cases/text-cleaner-corpus/test_data/07_bom_utf8.csv
Normal file
@@ -0,0 +1,3 @@
|
||||
id,name,city
|
||||
1,Alice,New York
|
||||
2,Bob,Chicago
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1 @@
|
||||
id,name
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,name
|
||||
1,Alice
|
||||
2,Bob
|
||||
3,Carol
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,address,notes
|
||||
1,"123 Main St
|
||||
Apt 4B
|
||||
New York, NY","line1
|
||||
line2"
|
||||
2,"Single line","contains
|
||||
classic mac
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id,name,email,product
|
||||
1,ALICE SMITH,Alice@Example.COM,Widget
|
||||
2,bob jones,BOB@example.com,GADGET
|
||||
3,Carol Brown,carol@EXAMPLE.com,wIdGeT
|
||||
4,DAN O'CONNOR,Dan@Example.com,gizmo
|
||||
|
@@ -0,0 +1,7 @@
|
||||
id,name,note
|
||||
1, 中国北京 ,Beijing in Chinese (with leading/trailing space)
|
||||
2,テスト,Japanese katakana (test)
|
||||
3,تجربة,Arabic (test) - RTL
|
||||
4,Москва,Russian (Moscow)
|
||||
5,🎉 launch 🚀,emoji preserved
|
||||
6,café ☕,emoji + accent combo
|
||||
|
5
test-cases/text-cleaner-corpus/test_data/14_mojibake.csv
Normal file
5
test-cases/text-cleaner-corpus/test_data/14_mojibake.csv
Normal file
@@ -0,0 +1,5 @@
|
||||
id,name,city
|
||||
1,café,München
|
||||
2,naïve,résumé
|
||||
3,don’t,smart-apostrophe mojibake
|
||||
4,Alice,New York
|
||||
|
@@ -0,0 +1,8 @@
|
||||
id,value
|
||||
1,real
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,actual value
|
||||
|
@@ -0,0 +1,3 @@
|
||||
id , Customer Name ,“Email”,Phone
|
||||
1,Alice,alice@example.com,555-1234
|
||||
2,Bob,bob@example.com,555-5678
|
||||
|
@@ -0,0 +1,4 @@
|
||||
id,price,european_number,date,phone,quantity
|
||||
1, 100 ,1 234,2024-01-15,(555) 123-4567,42
|
||||
2," $1,500.00 ",12 345,15/01/2024,555.123.4567,7
|
||||
3, N/A ,nan,Jan 15 2024,+1 555 123 4567,0
|
||||
|
@@ -0,0 +1 @@
|
||||
id ,Name ,Email
|
||||
|
@@ -0,0 +1,5 @@
|
||||
id , Name ,“Email”,Notes
|
||||
1, Alice Smith ,Alice@Example.COM,“VIP” customer — contact ASAP…
|
||||
2, Bob Jones ,bob@example.com,it’s 5′6″ tall
|
||||
3, Carol Brown ,CAROL@EXAMPLE.COM,3 × 4 = 12 (preserve ×)
|
||||
4, ,empty@example.com,whitespace-only name (becomes empty)
|
||||
|
BIN
test-cases/text-cleaner-corpus/test_data/21_excel_pollution.xlsx
Normal file
BIN
test-cases/text-cleaner-corpus/test_data/21_excel_pollution.xlsx
Normal file
Binary file not shown.
319
tests/test_analyze.py
Normal file
319
tests/test_analyze.py
Normal file
@@ -0,0 +1,319 @@
|
||||
"""Tests for src.core.analyze — upload-time data quality detectors."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.analyze import (
|
||||
Finding,
|
||||
TOOL_DEDUPLICATOR,
|
||||
TOOL_MISSING_HANDLER,
|
||||
TOOL_TEXT_CLEANER,
|
||||
analyze,
|
||||
findings_by_tool,
|
||||
to_dict,
|
||||
)
|
||||
from src.core.io import RepairAction, RepairResult, repair_bytes
|
||||
|
||||
|
||||
def _ids(findings: list[Finding]) -> set[str]:
    """Collect the ``id`` attribute of every finding into a set."""
    collected: set[str] = set()
    for finding in findings:
        collected.add(finding.id)
    return collected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Smart punctuation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSmartPunctuation:
    """Checks for the ``smart_punctuation_in_data`` finding."""

    def test_finds_curly_quotes(self):
        """Two of the three cells carry curly quotes, so the count is 2."""
        frame = pd.DataFrame({"note": ["plain", "“fancy”", "it’s"]})
        results = analyze(frame)
        assert "smart_punctuation_in_data" in _ids(results)
        hit = next(
            item for item in results if item.id == "smart_punctuation_in_data"
        )
        assert hit.severity == "warn"
        assert hit.tool == TOOL_TEXT_CLEANER
        assert hit.count == 2

    def test_finds_dashes_and_ellipsis(self):
        """An em-dash and an ellipsis character are reported as well."""
        frame = pd.DataFrame({"note": ["a—b", "wait…"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" in reported

    def test_clean_data_no_finding(self):
        """Plain ASCII cells must not produce the finding."""
        frame = pd.DataFrame({"note": ["plain", "ASCII only", "no smart chars"]})
        reported = _ids(analyze(frame))
        assert "smart_punctuation_in_data" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Invisible / NBSP / dirty headers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestInvisibleChars:
    """Checks for NBSP, zero-width characters and dirty column headers.

    Invisible characters are written as Unicode escapes rather than as raw
    literals, so the fixtures stay readable and survive editors, diff viewers
    and copy/paste that silently drop or normalise them.
    """

    def test_finds_nbsp(self):
        # U+00A0 (no-break space) trails the first value only -> count == 1.
        df = pd.DataFrame({"name": ["Alice\u00a0", "Bob"]})
        findings = analyze(df)
        assert "nbsp_or_unicode_whitespace" in _ids(findings)
        f = next(f for f in findings if f.id == "nbsp_or_unicode_whitespace")
        assert f.count == 1

    def test_finds_zero_width(self):
        # U+200B (zero-width space) hidden inside the first value.
        df = pd.DataFrame({"name": ["Ali\u200bce", "Bob"]})
        findings = analyze(df)
        assert "zero_width_or_invisible" in _ids(findings)

    def test_flags_dirty_headers(self):
        # Both headers are dirty: " id " is space-padded and the second one
        # carries a trailing zero-width space.
        # NOTE(review): the invisible character in the second header could not
        # be read back from the original literal; U+200B mirrors the corpus
        # header "Phone" + ZWSP used by fixture 16 - confirm against the detector.
        df = pd.DataFrame({" id ": [1], "Email\u200b": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" in _ids(findings)
        f = next(f for f in findings if f.id == "dirty_column_headers")
        assert f.count == 2

    def test_clean_headers_no_finding(self):
        df = pd.DataFrame({"id": [1], "email": ["a@b.com"]})
        findings = analyze(df)
        assert "dirty_column_headers" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Whitespace padding
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestWhitespacePadding:
    """Checks for the ``whitespace_padding`` finding."""

    def test_finds_leading_trailing_space(self):
        # One leading and one trailing ASCII space around the first value.
        df = pd.DataFrame({"x": [" padded ", "clean"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_finds_internal_double_space(self):
        # The two consecutive spaces are escaped (\x20\x20) so the run cannot
        # be collapsed to a single space by whitespace-normalising tools,
        # which would leave this test with nothing to detect.
        df = pd.DataFrame({"x": ["double\x20\x20space", "single space"]})
        findings = analyze(df)
        assert "whitespace_padding" in _ids(findings)

    def test_no_finding_when_clean(self):
        df = pd.DataFrame({"x": ["clean", "also clean"]})
        findings = analyze(df)
        assert "whitespace_padding" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Null-like sentinels
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNullLikeSentinels:
    """Checks for the ``null_like_sentinels`` finding."""

    def test_finds_n_a_and_nan(self):
        """Four of the five cells are null-like placeholders."""
        frame = pd.DataFrame({"x": ["valid", "N/A", "nan", "None", "-"]})
        results = analyze(frame)
        hit = next(item for item in results if item.id == "null_like_sentinels")
        assert hit.count == 4
        assert hit.tool == TOOL_MISSING_HANDLER
        assert hit.severity == "info"

    def test_clean_data_no_finding(self):
        """Ordinary values must not be reported as sentinels."""
        frame = pd.DataFrame({"x": ["a", "b", "c"]})
        reported = _ids(analyze(frame))
        assert "null_like_sentinels" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mojibake
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMojibake:
    """Checks for the ``suspected_mojibake`` finding.

    All non-ASCII text is written with Unicode escapes (the same convention
    the corpus generator uses for fixture 14), so the garbled sequences cannot
    be "repaired" or re-normalised by tooling that touches this file.
    """

    def test_finds_classic_pattern(self):
        # "caf" + U+00C3 U+00A9 and "M" + U+00C3 U+00BC + "ller" are UTF-8
        # bytes that were decoded as a single-byte code page; the middle
        # value is the correctly encoded control.
        df = pd.DataFrame(
            {"name": ["caf\u00c3\u00a9", "caf\u00e9", "M\u00c3\u00bcller"]}
        )
        findings = analyze(df)
        assert "suspected_mojibake" in _ids(findings)

    def test_clean_unicode_no_finding(self):
        # Precomposed (NFC) accented letters only - nothing to flag.
        df = pd.DataFrame({"name": ["caf\u00e9", "na\u00efve", "M\u00fcnchen"]})
        findings = analyze(df)
        assert "suspected_mojibake" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mixed-case email column
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMixedCaseEmail:
    """Checks for the ``mixed_case_email_column`` finding."""

    def test_finds_mixed_case(self):
        """An ``email`` column mixing upper and lower case is reported."""
        frame = pd.DataFrame({"email": ["Alice@Example.COM", "bob@example.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" in reported

    def test_all_lower_no_finding(self):
        """All-lowercase addresses are not reported."""
        frame = pd.DataFrame({"email": ["a@b.com", "c@d.com"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported

    def test_non_email_column_ignored(self):
        """Mixed case in a ``name`` column is not reported."""
        frame = pd.DataFrame({"name": ["Alice", "bob"]})
        reported = _ids(analyze(frame))
        assert "mixed_case_email_column" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Leading-zero IDs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLeadingZeroIds:
    """Checks for the ``leading_zero_ids`` finding."""

    def test_finds_zero_padded_ids(self):
        """A column of zero-padded SKU strings is reported."""
        skus = ["0001234", "0005678", "0009999", "0001111", "0002222", "0003333"]
        frame = pd.DataFrame({"sku": skus})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" in reported

    def test_no_finding_when_no_leading_zero(self):
        """The strings "1" through "99" have no leading zero to report."""
        plain_ids = [str(number) for number in range(1, 100)]
        frame = pd.DataFrame({"id": plain_ids})
        reported = _ids(analyze(frame))
        assert "leading_zero_ids" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Near-duplicate rows
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNearDuplicates:
    """Checks for the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        """Rows differing only by case and trailing space are reported."""
        names = ["Alice", "alice ", "Bob"]
        emails = ["a@b.com", "A@B.COM", "bob@b.com"]
        frame = pd.DataFrame({"name": names, "email": emails})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Three distinct rows are not reported."""
        names = ["Alice", "Bob", "Carol"]
        emails = ["a@x.com", "b@x.com", "c@x.com"]
        frame = pd.DataFrame({"name": names, "email": emails})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame is not reported."""
        frame = pd.DataFrame({"x": ["only"]})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mixed line endings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMixedLineEndings:
    """Checks for the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file mixing CRLF and LF terminators is reported."""
        target = tmp_path / "mixed.csv"
        target.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(target))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file using LF throughout is not reported."""
        target = tmp_path / "uniform.csv"
        target.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(target))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """With a DataFrame input the finding must be absent."""
        # A DataFrame carries no raw bytes, so line endings cannot be inspected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Findings synthesized from RepairResult
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFindingsFromRepair:
    """Findings that ``analyze`` derives from a ``repair_result`` argument."""

    def test_bom_strip_surfaces(self):
        """Repairing BOM-prefixed bytes yields ``csv_bom_stripped``."""
        repaired = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame, repair_result=repaired))
        assert "csv_bom_stripped" in reported

    def test_nul_strip_surfaces(self):
        """Repairing bytes containing a NUL yields ``csv_nul_stripped``."""
        repaired = repair_bytes(b"id,name\n1,Hel\x00lo\n")
        frame = pd.DataFrame({"id": ["1"], "name": ["Hello"]})
        reported = _ids(analyze(frame, repair_result=repaired))
        assert "csv_nul_stripped" in reported

    def test_unrepairable_surfaces_as_error(self):
        """A result listing an unrepairable line is reported with severity "error"."""
        # Hand-built result: no repair actions, line 3 marked unrepairable.
        synthetic = RepairResult(
            repaired_bytes=b"id,a,b\n1,foo,bar\n",
            actions=[],
            unrepairable_lines=[3],
        )
        frame = pd.DataFrame({"id": ["1"], "a": ["foo"], "b": ["bar"]})
        results = analyze(frame, repair_result=synthetic)
        hit = next(item for item in results if item.id == "csv_unrepairable_rows")
        assert hit.severity == "error"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# End-to-end on the corpus kitchen-sink fixture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEndToEnd:
    """End-to-end checks of ``analyze`` on a corpus file and a clean frame."""

    def test_kitchen_sink_fixture_finds_pollution(self):
        # Anchor the fixture to the repository root (the parent of tests/)
        # instead of the current working directory, so the test does not
        # silently skip when pytest is launched from another directory.
        repo_root = Path(__file__).resolve().parent.parent
        path = (
            repo_root
            / "test-cases"
            / "text-cleaner-corpus"
            / "test_data"
            / "20_kitchen_sink.csv"
        )
        if not path.exists():
            pytest.skip("corpus fixture not present")
        findings = analyze(path)
        ids = _ids(findings)
        # Kitchen-sink has BOM, smart quotes, NBSP, ZWSP, and dirty headers.
        # Pre-parse repair handles the file-level smart-quote/BOM, so they
        # show up as csv_* findings; the cell-level NBSP/ZW remain as
        # data findings.
        assert "csv_bom_stripped" in ids or "csv_smart_quotes_folded" in ids
        # NBSP-padded headers should still surface - pre-parse repair only
        # touches double-quote characters.
        data_level_prefixes = ("dirty_", "nbsp", "zero_width")
        assert any(i.startswith(data_level_prefixes) for i in ids)

    def test_clean_dataframe_returns_empty_findings(self):
        # A fully clean frame must yield no findings at all.
        df = pd.DataFrame({
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        })
        findings = analyze(df)
        assert findings == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestHelpers:
    """Checks for the findings_by_tool and to_dict helpers."""

    def test_findings_by_tool_groups_correctly(self):
        frame = pd.DataFrame({
            "name": [" padded ", "“smart”"],
            "x": ["N/A", "valid"],
        })
        by_tool = findings_by_tool(analyze(frame))
        assert TOOL_TEXT_CLEANER in by_tool
        assert TOOL_MISSING_HANDLER in by_tool

    def test_findings_by_tool_skips_toolless(self):
        synthetic = RepairResult(
            repaired_bytes=b"", actions=[], unrepairable_lines=[5, 7],
        )
        frame = pd.DataFrame({"x": ["a"]})
        by_tool = findings_by_tool(analyze(frame, repair_result=synthetic))
        # csv_unrepairable_rows has tool="" and should not appear.
        assert all(tool for tool in by_tool)

    def test_to_dict_is_json_serializable(self):
        import json

        frame = pd.DataFrame({"x": [" padded "]})
        first = analyze(frame)[0]
        payload = to_dict(first)
        json.dumps(payload)  # would raise on non-serializable values
        assert payload["id"] == "whitespace_padding"
        assert "samples" in payload
|
||||
97
tests/test_cli_analyze.py
Normal file
97
tests/test_cli_analyze.py
Normal file
@@ -0,0 +1,97 @@
|
||||
"""Tests for src.cli_analyze — Typer CLI."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from src.cli_analyze import app
|
||||
|
||||
|
||||
runner = CliRunner()
|
||||
|
||||
|
||||
def _make_dirty(tmp_path: Path) -> Path:
|
||||
"""Write a small CSV with a mix of detectable issues."""
|
||||
f = tmp_path / "dirty.csv"
|
||||
f.write_bytes(
|
||||
b"\xef\xbb\xbf" # BOM
|
||||
b" id ,Name,Email\n" # padded header
|
||||
b"1,Alice,Alice@Example.COM\n"
|
||||
b"2, Bob ,bob@example.com\n"
|
||||
b"3,N/A,carol@example.com\n"
|
||||
)
|
||||
return f
|
||||
|
||||
|
||||
class TestAnalyzeCli:
    """Tests that drive the analyze Typer app through ``CliRunner``."""

    def test_clean_file_says_so(self, tmp_path):
        f = tmp_path / "clean.csv"
        f.write_text("id,name\n1,Alice\n2,Bob\n")
        result = runner.invoke(app, [str(f)])
        assert result.exit_code == 0
        assert "No issues detected" in result.stdout

    def test_dirty_file_lists_findings(self, tmp_path):
        f = _make_dirty(tmp_path)
        result = runner.invoke(app, [str(f)])
        assert result.exit_code == 0
        # The Rich table breaks lines; assert on stable substrings instead of
        # full finding ids.
        assert "Text Cleaner" in result.stdout
        assert "Missing Value" in result.stdout
        # Severity column is rendered.
        assert "warn" in result.stdout

    def test_json_output_round_trips(self, tmp_path):
        f = _make_dirty(tmp_path)
        result = runner.invoke(app, [str(f), "--json"])
        assert result.exit_code == 0
        data = json.loads(result.stdout)
        assert isinstance(data, list)
        assert len(data) > 0
        ids = {item["id"] for item in data}
        assert "dirty_column_headers" in ids or "whitespace_padding" in ids
        # Each finding has the documented shape.
        required = {"id", "severity", "tool", "count", "description", "samples"}
        for item in data:
            assert required <= set(item)

    def test_missing_file_exits_2(self, tmp_path):
        result = runner.invoke(app, [str(tmp_path / "nope.csv")])
        assert result.exit_code == 2
        # Use ``result.output`` (everything the user would see) rather than
        # poking at ``result.stderr``: on Click versions where stderr is not
        # captured separately that attribute raises ValueError, and the
        # previous check also compared stderr case-sensitively while stdout
        # was lower-cased.
        assert "not found" in result.output.lower()

    def test_strict_exits_1_on_warnings(self, tmp_path):
        f = _make_dirty(tmp_path)
        result = runner.invoke(app, [str(f), "--strict", "--json"])
        # JSON output is still printed, but exit code is 1 because warns exist.
        assert result.exit_code == 1
        data = json.loads(result.stdout)
        assert any(item["severity"] in ("warn", "error") for item in data)

    def test_strict_exits_0_on_clean(self, tmp_path):
        f = tmp_path / "clean.csv"
        f.write_text("id,name\n1,Alice\n2,Bob\n")
        result = runner.invoke(app, [str(f), "--strict"])
        assert result.exit_code == 0

    def test_sample_rows_caps_scan(self, tmp_path):
        # Build a file where ONLY rows past 100 have NBSP padding; with
        # --sample-rows 50 we should miss it.
        rows = ["id,name"]
        rows.extend(f"{i},Alice" for i in range(1, 101))
        # The NBSP is written as an escape so editors and formatters cannot
        # silently turn it into an ordinary space.
        rows.extend(f"{i},Alice\u00a0" for i in range(101, 200))
        f = tmp_path / "big.csv"
        f.write_text("\n".join(rows) + "\n", encoding="utf-8")

        capped = runner.invoke(app, [str(f), "--sample-rows", "50", "--json"])
        full = runner.invoke(app, [str(f), "--sample-rows", "200", "--json"])
        capped_ids = {x["id"] for x in json.loads(capped.stdout)}
        full_ids = {x["id"] for x in json.loads(full.stdout)}
        assert "nbsp_or_unicode_whitespace" not in capped_ids
        assert "nbsp_or_unicode_whitespace" in full_ids
|
||||
209
tests/test_corpus.py
Normal file
209
tests/test_corpus.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""Run every corpus fixture through the current text cleaner and report diffs.
|
||||
|
||||
This is an *acceptance* test against an external corpus shipped in
|
||||
``test-cases/text-cleaner-corpus/``. Cases that fail are documented gaps
|
||||
between the current implementation and the spec target in TEST-CASES.md.
|
||||
The test fails on diff — that's the point. Each failure is informative.
|
||||
|
||||
Cases 12 and 14 produce multiple expected outputs depending on flags;
|
||||
case 21 is XLSX-only and verified separately (manual / smoke).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.text_clean import CleanOptions, clean_dataframe
|
||||
|
||||
# Root of the external text-cleaner corpus, located relative to this file.
CORPUS = Path(__file__).parent.parent / "test-cases" / "text-cleaner-corpus"
# Input fixtures; tests look them up as ``<case name>.csv``.
TEST_DATA = CORPUS / "test_data"
# Expected outputs, looked up by the same base name as the input.
EXPECTED = CORPUS / "expected"


# Cases where a single default run should produce the expected file.
# Cases 12, 14, 18 and 21 are absent on purpose: they are exercised by the
# dedicated test classes further down in this module.
DEFAULT_CASES = [
    "01_whitespace_basic",
    "02_whitespace_unicode",
    "03_smart_punctuation",
    "04_unicode_forms",
    "05_zero_width_invisible",
    "06_control_characters",
    "07_bom_utf8",
    "08_line_endings_crlf",
    "09_line_endings_cr",
    "10_line_endings_mixed",
    "11_embedded_newlines",
    "13_non_latin_scripts",
    "15_whitespace_only_cells",
    "16_dirty_headers",
    "17_preserve_intended",
    "19_headers_only",
    "20_kitchen_sink",
]
|
||||
|
||||
|
||||
def _read_csv_strict(path: Path) -> pd.DataFrame:
|
||||
"""Read a corpus CSV file, treating all cells as strings.
|
||||
|
||||
NUL bytes are stripped from the raw file before parsing because the
|
||||
pandas C engine truncates fields at NUL while the python engine is
|
||||
too strict about embedded literal double quotes. Stripping NUL is
|
||||
the file-level pre-clean step the spec describes for case 06.
|
||||
"""
|
||||
raw = path.read_bytes().replace(b"\x00", b"")
|
||||
return pd.read_csv(
|
||||
io.BytesIO(raw), dtype=str, keep_default_na=False, encoding="utf-8-sig",
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DataFrame-level diff (covers cell content; ignores file-level encoding/EOL)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("name", DEFAULT_CASES)
def test_corpus_dataframe_diff(name):
    """Run clean_dataframe on the input and diff against the expected DF.

    The comparison is done in three steps -- headers, row count, then cell by
    cell -- so the failure message points at the first kind of mismatch.
    """
    inp_path = TEST_DATA / f"{name}.csv"
    exp_path = EXPECTED / f"{name}.csv"

    if inp_path.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty (file-level test)")

    df_in = _read_csv_strict(inp_path)
    df_expected = _read_csv_strict(exp_path)

    result = clean_dataframe(df_in)

    # Give both frames a fresh 0..n-1 index so rows are compared by position.
    actual = result.cleaned_df.reset_index(drop=True)
    expected = df_expected.reset_index(drop=True)

    # Frame-level diff: equal columns, equal row count, equal cell content
    assert list(actual.columns) == list(expected.columns), (
        f"{name}: header mismatch.\n"
        f"  actual:   {list(actual.columns)!r}\n"
        f"  expected: {list(expected.columns)!r}"
    )

    # zip() below stops at the shorter frame, so without this check missing
    # or extra trailing rows would go unnoticed.
    assert len(actual) == len(expected), (
        f"{name}: row count mismatch "
        f"(actual={len(actual)}, expected={len(expected)})"
    )

    diffs = [
        (i, col, repr(a), repr(e))
        for col in expected.columns
        for i, (a, e) in enumerate(
            zip(actual[col].tolist(), expected[col].tolist())
        )
        if a != e
    ]
    assert not diffs, (
        f"{name}: {len(diffs)} cell mismatch(es). First 5:\n"
        + "\n".join(f"  row {i} col {c}: actual={a} expected={e}"
                    for i, c, a, e in diffs[:5])
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Idempotency property (every case)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize("name", DEFAULT_CASES + ["12_case_variations", "14_mojibake"])
def test_corpus_idempotent(name):
    """Cleaning an already-cleaned fixture must not change it again."""
    source = TEST_DATA / f"{name}.csv"
    if source.stat().st_size == 0:
        pytest.skip(f"{name}: input is empty")

    first_pass = clean_dataframe(_read_csv_strict(source)).cleaned_df
    first_pass = first_pass.reset_index(drop=True)
    second_pass = clean_dataframe(first_pass).cleaned_df.reset_index(drop=True)
    assert first_pass.equals(second_pass), f"{name}: not idempotent"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Special cases: 12 (case ops, opt-in), 14 (mojibake), 18 (empty), 21 (xlsx)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCaseVariations:
    """Case 12: --case email=lower and --case name=title variants."""

    def test_default_is_identity_for_case(self):
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        wanted = _read_csv_strict(EXPECTED / "12_case_variations__default.csv")
        got = clean_dataframe(source).cleaned_df.reset_index(drop=True)
        # With no case options, letter case must be left alone.
        assert got.equals(wanted), (
            "12 default: cells differ (case mutated under default config)"
        )

    def test_email_lower(self):
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        wanted = _read_csv_strict(EXPECTED / "12_case_variations__email_lower.csv")
        options = CleanOptions(case_columns={"email": "lower"})
        cleaned = clean_dataframe(source, options).cleaned_df
        got = cleaned.reset_index(drop=True)
        assert got.equals(wanted), "12 email_lower variant differs"

    def test_name_title(self):
        source = _read_csv_strict(TEST_DATA / "12_case_variations.csv")
        wanted = _read_csv_strict(EXPECTED / "12_case_variations__name_title.csv")
        options = CleanOptions(case_columns={"name": "title"})
        cleaned = clean_dataframe(source, options).cleaned_df
        got = cleaned.reset_index(drop=True)
        assert got.equals(wanted), "12 name_title variant differs"
|
||||
|
||||
|
||||
class TestMojibake:
    """Case 14: mojibake input, with and without repair."""

    def test_default_no_repair(self):
        source = _read_csv_strict(TEST_DATA / "14_mojibake.csv")
        wanted = _read_csv_strict(EXPECTED / "14_mojibake__default.csv")
        got = clean_dataframe(source).cleaned_df.reset_index(drop=True)
        assert got.equals(wanted), "14 mojibake default (no repair) differs"

    def test_fixed_variant(self):
        # --fix-mojibake is Tier 2 and the cleaner does not implement it,
        # so this is reported as an expected failure.
        pytest.xfail("Mojibake auto-repair is Tier 2; not yet implemented (uses ftfy).")
|
||||
|
||||
|
||||
class TestEmptyFile:
    """Case 18: behaviour on a zero-byte input file."""

    def test_empty_no_crash(self, tmp_path):
        """Case 18: zero-byte file should not crash."""
        empty_csv = TEST_DATA / "18_empty_file.csv"
        size = empty_csv.stat().st_size
        assert size == 0
        # pandas itself raises EmptyDataError on an empty CSV. The corpus
        # asks the cleaner to handle this gracefully, which is not yet wired
        # into core, so for now only the pandas behaviour is pinned.
        with pytest.raises(pd.errors.EmptyDataError):
            pd.read_csv(empty_csv)
|
||||
|
||||
|
||||
class TestXlsxPollution:
    """Case 21: XLSX with multi-sheet pollution; smoke-test each sheet."""

    @pytest.fixture(scope="class")
    def workbook(self):
        """Open the corpus workbook once per class and close it afterwards."""
        path = TEST_DATA / "21_excel_pollution.xlsx"
        # ExcelFile holds an open file handle; the context manager makes sure
        # it is released when the class's tests are done.
        with pd.ExcelFile(path, engine="openpyxl") as book:
            yield book

    def test_sheets_present(self, workbook):
        names = set(workbook.sheet_names)
        assert {"Customers", "Notes", "International", "ForceText"}.issubset(names)

    def test_each_sheet_runs_without_error(self, workbook):
        for sheet in workbook.sheet_names:
            df = pd.read_excel(
                workbook, sheet_name=sheet, dtype=str, keep_default_na=False,
            )
            result = clean_dataframe(df)
            assert result.cleaned_df.shape[0] == df.shape[0], (
                f"sheet {sheet}: row count changed"
            )

    def test_force_text_leading_zeros_preserved(self, workbook):
        df = pd.read_excel(
            workbook, sheet_name="ForceText", dtype=str, keep_default_na=False,
        )
        # Snapshot the inputs before cleaning in case the cleaner works in place.
        before = df.iloc[:, 0].tolist()
        result = clean_dataframe(df)
        after = result.cleaned_df.iloc[:, 0].tolist()
        # First column likely an id with leading zeros -- make sure it isn't
        # numerically coerced or stripped. Compare against the input: the
        # earlier check only looked at the cleaned value and could never fail,
        # because a value made of apostrophes and digits has no spaces.
        for raw, cleaned in zip(before, after):
            if not isinstance(raw, str):
                continue
            digits = raw.strip().lstrip("'")
            # ASCII digits only, so Unicode digit folding cannot cause a
            # spurious mismatch here.
            if digits.isascii() and digits.isdigit():
                assert cleaned.lstrip("'") == digits, (
                    f"ForceText: digits changed ({raw!r} -> {cleaned!r})"
                )
|
||||
143
tests/test_e2e.py
Normal file
143
tests/test_e2e.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""End-to-end smoke tests.
|
||||
|
||||
Round-trips through the CLI binaries with real fixture inputs to catch
|
||||
glue-code breakage that pure unit tests miss: argv parsing, file I/O, log
|
||||
configuration, exit codes, and the integration between the analyzer, the
|
||||
pre-parse repair, and pandas.
|
||||
|
||||
These are intentionally lightweight — one happy path per CLI plus a
|
||||
couple of failure modes. Bigger scenarios live in ``test_corpus.py`` and
|
||||
``test_fixtures_sweep.py``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
# Tag every test in this module with the ``e2e`` marker.
pytestmark = pytest.mark.e2e

# Parent of the directory that contains this file; used as the default
# working directory for the subprocesses started below.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Kitchen-sink corpus fixture; tests skip themselves when it is missing.
CORPUS_KITCHEN_SINK = (
    PROJECT_ROOT / "test-cases" / "text-cleaner-corpus" / "test_data" / "20_kitchen_sink.csv"
)
|
||||
|
||||
|
||||
def _run(*args: str, cwd: Path | None = None, **kwargs):
|
||||
return subprocess.run(
|
||||
[sys.executable, *args],
|
||||
capture_output=True, text=True, timeout=60,
|
||||
cwd=cwd or PROJECT_ROOT,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# cli_analyze — full round-trip
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAnalyzeCliE2E:
    """Run ``src.cli_analyze`` as a subprocess against the kitchen-sink file."""

    def test_table_output_on_kitchen_sink(self):
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")
        completed = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK))
        assert completed.returncode == 0, completed.stderr
        rendered = completed.stdout
        # Rich tables wrap; assert on stable substrings.
        assert "Text Cleaner" in rendered
        assert "csv_bom_stripped" in rendered or "smart_quotes" in rendered

    def test_json_output_parses(self):
        if not CORPUS_KITCHEN_SINK.exists():
            pytest.skip("kitchen-sink fixture missing")
        completed = _run(
            "-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK), "--json",
        )
        assert completed.returncode == 0, completed.stderr
        payload = json.loads(completed.stdout)
        assert isinstance(payload, list) and len(payload) > 0
        required = {"id", "severity", "tool", "count", "description"}
        for entry in payload:
            assert required <= set(entry)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# cli_text_clean — full round-trip
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestTextCleanCliE2E:
    """Run ``src.cli_text_clean`` as a subprocess: apply mode and preview mode."""

    def test_apply_writes_cleaned_file(self, tmp_path):
        # Build a small dirty CSV: NBSP padding + smart quotes. The special
        # characters are written as escapes so they stay visible in review
        # and cannot be normalised away by an editor or formatter.
        src = tmp_path / "dirty.csv"
        src.write_text(
            "id,name,note\n"
            "1,\u00a0Alice\u00a0,\u201chello\u201d\n"
            "2,\u00a0Bob\u00a0,it\u2019s fine\n",
            encoding="utf-8",
        )
        out = tmp_path / "out.csv"
        proc = _run(
            "-m", "src.cli_text_clean", str(src),
            "--apply", "--output", str(out),
        )
        assert proc.returncode == 0, proc.stderr
        assert out.exists(), "cleaned file was not written"
        cleaned = pd.read_csv(out, dtype=str, keep_default_na=False, encoding="utf-8-sig")
        # NBSP padding stripped
        assert cleaned.iloc[0]["name"] == "Alice"
        assert cleaned.iloc[1]["name"] == "Bob"
        # Smart quotes folded
        assert cleaned.iloc[0]["note"] == '"hello"'
        assert cleaned.iloc[1]["note"] == "it's fine"

    def test_preview_does_not_write(self, tmp_path):
        src = tmp_path / "input.csv"
        src.write_text("id,name\n1,Alice\n", encoding="utf-8")
        # Without --apply, no output file should appear.
        proc = _run("-m", "src.cli_text_clean", str(src))
        # Include stderr in the failure message, as the other e2e tests do.
        assert proc.returncode == 0, proc.stderr
        # Default output path next to input -- must not exist.
        default_out = src.with_name(src.stem + "_cleaned.csv")
        assert not default_out.exists()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# cli (dedup) — full round-trip
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDedupCliE2E:
    """Run the dedup CLI (``src.cli``) as a subprocess."""

    def test_apply_removes_duplicates(self, tmp_path):
        lines = [
            "name,email\n",
            "Alice,alice@x.com\n",
            "Alice,alice@x.com\n",
            "Bob,bob@x.com\n",
        ]
        source = tmp_path / "dups.csv"
        source.write_text("".join(lines), encoding="utf-8")
        target = tmp_path / "deduped.csv"
        completed = _run(
            "-m", "src.cli", str(source),
            "--apply", "--output", str(target),
        )
        assert completed.returncode == 0, completed.stderr
        assert target.exists()
        deduped = pd.read_csv(
            target, dtype=str, keep_default_na=False, encoding="utf-8-sig",
        )
        assert len(deduped) == 2  # Alice deduped, Bob unique
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# run_tests.py self-test — sanity check the runner itself works
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRunTestsE2E:
    """Sanity checks that invoke ``run_tests.py`` itself."""

    def test_tool_filter_runs_subset(self):
        completed = _run("run_tests.py", "--tool", "config", "-v")
        assert completed.returncode == 0, completed.stderr
        # Check we limited the run via -k.
        lowered = completed.stdout.lower()
        assert "config" in lowered

    def test_unknown_tool_exits_2(self):
        completed = _run("run_tests.py", "--tool", "no_such_tool")
        assert completed.returncode == 2
|
||||
156
tests/test_fixtures_sweep.py
Normal file
156
tests/test_fixtures_sweep.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Automated sweep over every fixture in ``test-cases/``.
|
||||
|
||||
Drop a new CSV/TSV/XLSX into ``test-cases/`` and the sweep picks it up the
|
||||
next time pytest runs — no test code changes required. Each fixture goes
|
||||
through four smoke tests:

1. **Pre-parse repair runs cleanly.** Byte-level repair (BOM, NUL, smart
   quotes, rogue delimiters) must not crash, and produced bytes must be
   valid for ``pd.read_csv``.
2. **Analyzer runs cleanly.** ``analyze()`` must produce a list of
   :class:`Finding` objects without raising.
3. **Text cleaner runs cleanly and preserves schema.** Default-config
   ``clean_dataframe`` must not change row count and must return the same
   number of columns it started with.
4. **Text cleaner is idempotent.** Running ``clean_dataframe`` on an
   already-cleaned frame must leave it unchanged.
|
||||
|
||||
The sweep skips files inside ``text-cleaner-corpus/`` because that subdir
|
||||
has its own dedicated test (``test_corpus.py``) with byte-exact expected
|
||||
outputs.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.analyze import Finding, analyze
|
||||
from src.core.io import detect_delimiter, detect_encoding, repair_bytes
|
||||
from src.core.text_clean import clean_dataframe
|
||||
|
||||
|
||||
# The ``test-cases`` directory next to this file's parent directory; every
# fixture the sweep runs is discovered underneath it.
TEST_CASES_DIR = Path(__file__).resolve().parent.parent / "test-cases"

# Subdirectories in test-cases/ that are exercised by their own dedicated
# tests. The sweep ignores these so we don't double-test or fight expected
# byte-exact outputs.
_EXCLUDED_SUBDIRS = {"text-cleaner-corpus"}

# File suffixes we know how to load. Compared against the lower-cased suffix,
# so matching is case-insensitive.
_SUPPORTED_SUFFIXES = {".csv", ".tsv", ".xlsx", ".xls"}
|
||||
|
||||
|
||||
def _discover_fixtures() -> list[Path]:
    """Collect every fixture file under test-cases/ that the sweep should run.

    Files with a supported suffix directly inside test-cases/ are included.
    Each top-level subdirectory is searched recursively unless its name is in
    ``_EXCLUDED_SUBDIRS``. Returns an empty list when test-cases/ does not
    exist. Entries are visited in sorted order, so the result is stable.
    """
    if not TEST_CASES_DIR.is_dir():
        return []

    def _loadable(candidate: Path) -> bool:
        # A fixture is a regular file whose suffix we know how to read.
        return candidate.is_file() and candidate.suffix.lower() in _SUPPORTED_SUFFIXES

    found: list[Path] = []
    for entry in sorted(TEST_CASES_DIR.iterdir()):
        if not entry.is_dir():
            if _loadable(entry):
                found.append(entry)
        elif entry.name not in _EXCLUDED_SUBDIRS:
            found.extend(sub for sub in sorted(entry.rglob("*")) if _loadable(sub))
    return found
|
||||
|
||||
|
||||
_FIXTURES = _discover_fixtures()
|
||||
|
||||
|
||||
def _fixture_id(path: Path) -> str:
    """Return *path* relative to test-cases/ as a string, for use as a pytest id."""
    return str(path.relative_to(TEST_CASES_DIR))
|
||||
|
||||
|
||||
# Tag every test in this module so the sweep can be selected or deselected
# with ``-m fixture_sweep``.
#
# There is deliberately no module-level ``skipif`` on an empty fixture list:
# a module-wide skip also skips ``test_at_least_one_fixture_present``, so that
# guard could never fail in the one situation it exists to catch. With no
# fixtures, pytest already reports the parametrized sweep tests as skipped
# (empty parameter set), and the guard test fails with an explanatory message.
pytestmark = [
    pytest.mark.fixture_sweep,
]
|
||||
|
||||
|
||||
def _read_with_repair(path: Path) -> tuple[pd.DataFrame, object | None]:
    """Read *path* with the same robust pipeline analyze() uses.

    Excel files are loaded directly with every cell as a string. Delimited
    text files go through encoding and delimiter detection and byte-level
    repair before being parsed.

    Returns ``(df, repair_result)`` where repair_result is None for Excel.
    """
    suffix = path.suffix.lower()
    if suffix in (".xlsx", ".xls"):
        # openpyxl only reads the newer OOXML formats. For legacy ``.xls``
        # let pandas choose the engine from the file type instead of forcing
        # a reader that cannot open it.
        engine = "openpyxl" if suffix == ".xlsx" else None
        df = pd.read_excel(path, dtype=str, keep_default_na=False, engine=engine)
        return df, None
    enc = detect_encoding(path)
    delim = detect_delimiter(path, enc)
    raw = path.read_bytes()
    repair = repair_bytes(raw, encoding=enc, delimiter=delim)
    # NOTE(review): the repaired bytes are decoded as UTF-8 regardless of the
    # detected encoding -- presumably repair_bytes re-encodes; confirm.
    df = pd.read_csv(
        io.BytesIO(repair.repaired_bytes),
        encoding="utf-8", delimiter=delim,
        dtype=str, keep_default_na=False, on_bad_lines="warn",
    )
    return df, repair
|
||||
|
||||
|
||||
@pytest.mark.parametrize("fixture", _FIXTURES, ids=[_fixture_id(p) for p in _FIXTURES])
class TestFixtureSweep:
    """Smoke tests that every fixture in ``test-cases/`` must pass."""

    def test_repair_and_load(self, fixture: Path) -> None:
        frame, _repair = _read_with_repair(fixture)
        assert isinstance(frame, pd.DataFrame), f"{fixture.name}: did not return a DataFrame"
        assert len(frame.columns) >= 1, f"{fixture.name}: zero columns after parse"

    def test_analyze_runs(self, fixture: Path) -> None:
        frame, repair = _read_with_repair(fixture)
        findings = analyze(frame, repair_result=repair)
        assert isinstance(findings, list)
        for item in findings:
            assert isinstance(item, Finding), (
                f"{fixture.name}: analyze() returned a non-Finding ({type(item)})"
            )

    def test_text_cleaner_preserves_schema(self, fixture: Path) -> None:
        frame, _repair = _read_with_repair(fixture)
        # Record the shape before cleaning so the two can be compared.
        rows_before, cols_before = len(frame), len(frame.columns)
        cleaned = clean_dataframe(frame).cleaned_df
        assert len(cleaned) == rows_before, (
            f"{fixture.name}: row count changed "
            f"({rows_before} -> {len(cleaned)})"
        )
        assert len(cleaned.columns) == cols_before, (
            f"{fixture.name}: column count changed "
            f"({cols_before} -> {len(cleaned.columns)})"
        )

    def test_text_cleaner_idempotent(self, fixture: Path) -> None:
        frame, _repair = _read_with_repair(fixture)
        first_pass = clean_dataframe(frame).cleaned_df.reset_index(drop=True)
        second_pass = clean_dataframe(first_pass).cleaned_df.reset_index(drop=True)
        assert first_pass.equals(second_pass), (
            f"{fixture.name}: clean(clean(x)) != clean(x); cleaner is not idempotent"
        )
|
||||
|
||||
|
||||
def test_at_least_one_fixture_present() -> None:
    """Guard against an empty sweep.

    The project is expected to ship at least one fixture so the sweep is not
    silently skipped on a clean checkout. Relax this only when fixtures are
    deliberately moved elsewhere.
    """
    message = (
        "No fixtures found under test-cases/. "
        "Drop a CSV or XLSX file into the directory and re-run."
    )
    assert _FIXTURES, message
|
||||
167
tests/test_gap_coverage.py
Normal file
167
tests/test_gap_coverage.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""Tests added to close gaps surfaced by the test audit.
|
||||
|
||||
These cover edges that existing suites missed:
|
||||
|
||||
- ``CleanOptions.clean_headers=False`` toggle (added but not directly tested).
|
||||
- ``repair_bytes`` with non-comma delimiters and combined-fix scenarios.
|
||||
- ``analyze()`` over a path-based Excel file.
|
||||
- ``analyze()`` with ``sample_rows >= len(df)`` (uses copy(), not head()).
|
||||
- ``findings_by_tool`` on an empty list.
|
||||
- BOM that appears mid-cell rather than at file start.
|
||||
|
||||
The collapse-whitespace heuristic for numeric/date/phone-shaped cells (spec
|
||||
§4.17) is *not yet implemented* and is captured here as a known-gap xfail
|
||||
so it's surfaced rather than silently missing.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.analyze import analyze, findings_by_tool
|
||||
from src.core.io import RepairAction, repair_bytes
|
||||
from src.core.text_clean import CleanOptions, clean_dataframe
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# clean_headers toggle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCleanHeadersToggle:
    """Behaviour of ``CleanOptions.clean_headers`` on and off."""

    def test_default_cleans_headers(self):
        frame = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        cleaned = clean_dataframe(frame).cleaned_df
        assert list(cleaned.columns) == ["id", "Email"]

    def test_disable_preserves_dirty_headers(self):
        frame = pd.DataFrame({" id ": [1], "Email": ["a@b.com"]})
        options = CleanOptions(clean_headers=False)
        cleaned = clean_dataframe(frame, options).cleaned_df
        assert list(cleaned.columns) == [" id ", "Email"]

    def test_disable_still_cleans_data_cells(self):
        frame = pd.DataFrame({"name": [" Alice ", "Bob "]})
        options = CleanOptions(clean_headers=False)
        cleaned = clean_dataframe(frame, options).cleaned_df
        assert cleaned["name"].tolist() == ["Alice", "Bob"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# repair_bytes — non-comma delimiters and combined fixes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRepairBytesDelimiters:
    """``repair_bytes`` with tab and semicolon delimiters."""

    def test_tab_delimited_smart_quote_fold(self):
        payload = "id\tnote\n1\t“hi”\n".encode("utf-8")
        repaired = repair_bytes(payload, delimiter="\t")
        decoded = repaired.repaired_bytes.decode("utf-8")
        assert "“" not in decoded and "”" not in decoded
        assert "\t" in decoded  # delimiter preserved

    def test_semicolon_delimited_unrepairable_extras(self):
        payload = b"id;a;b\n1;foo;bar\n2;1;2;3;4\n"
        repaired = repair_bytes(payload, delimiter=";")
        # Extra-field row with no clear merge candidate is logged unrepairable.
        assert 3 in repaired.unrepairable_lines
|
||||
|
||||
|
||||
class TestRepairBytesCombinedFixes:
    """Several byte-level repairs applied to the same input."""

    def test_bom_plus_nul_plus_smart_quotes(self):
        payload = b"".join([
            b"\xef\xbb\xbf",
            b"id,note\n",
            b"1,Hel\x00lo \xe2\x80\x9cworld\xe2\x80\x9d\n",
        ])
        repaired = repair_bytes(payload)
        seen_kinds = {action.kind for action in repaired.actions}
        assert {"strip_bom", "strip_nul", "fold_smart_quote"} <= seen_kinds
        # Resulting bytes parse cleanly.
        parsed = pd.read_csv(io.BytesIO(repaired.repaired_bytes))
        assert parsed.iloc[0]["note"] == 'Hello "world"'
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# analyze() — path-based Excel and large-sample edges
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestAnalyzeXlsxPath:
    """``analyze()`` called with the path of an Excel workbook."""

    def test_excel_path_runs_without_repair(self, tmp_path):
        workbook_path = tmp_path / "small.xlsx"
        source = pd.DataFrame({
            "id": ["1", "2"],
            "name": [" Alice ", "Bob"],  # padding in xlsx
        })
        source.to_excel(workbook_path, index=False, engine="openpyxl")
        reported = {finding.id for finding in analyze(workbook_path)}
        assert "whitespace_padding" in reported
        # Excel skips csv_* findings — no pre-parse repair on xlsx.
        assert not any(name.startswith("csv_") for name in reported)
|
||||
|
||||
|
||||
class TestAnalyzeSampleRowsEdge:
    """``analyze()`` when ``sample_rows`` exceeds the frame length."""

    def test_sample_rows_larger_than_df(self):
        frame = pd.DataFrame({"x": [" pad ", "clean"]})
        # sample_rows=1000 but df has only 2 rows; must not crash.
        reported = [finding.id for finding in analyze(frame, sample_rows=1000)]
        assert "whitespace_padding" in reported
|
||||
|
||||
|
||||
class TestAnalyzeMidCellBom:
    """A U+FEFF in the middle of a cell is reported as an invisible character."""

    def test_bom_inside_cell_treated_as_zero_width(self):
        """``analyze`` emits ``zero_width_or_invisible`` for a mid-cell BOM."""
        # U+FEFF is spelled as an escape on purpose: a literal BOM is invisible
        # and is easily dropped by editors and diff viewers, which would leave
        # a plain "Hello" and nothing for the analyzer to find.
        df = pd.DataFrame({"name": ["Hel\ufefflo"]})
        findings = analyze(df)
        assert any(f.id == "zero_width_or_invisible" for f in findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# findings_by_tool — edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFindingsByToolEdges:
    """Degenerate inputs to ``findings_by_tool``."""

    def test_empty_list_returns_empty_dict(self):
        """No findings in, empty mapping out."""
        grouped = findings_by_tool([])
        assert grouped == {}

    def test_only_toolless_findings_returns_empty_dict(self):
        """A finding whose ``tool`` is empty is not grouped under any tool."""
        from src.core.analyze import Finding

        # Construct a Finding with no tool — like csv_unrepairable_rows.
        toolless = Finding(
            id="x",
            severity="info",
            tool="",
            count=1,
            description="d",
        )
        assert findings_by_tool([toolless]) == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Known gap: collapse_whitespace on numeric/date/phone-shaped cells
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestStructuredCellWhitespacePreservation:
    """Spec §4.17: ``collapse_whitespace`` skips numeric/date/phone-shaped cells."""

    # A run of two spaces, built explicitly so the run cannot be collapsed
    # unnoticed by an editor, a formatter, or an HTML diff view. With a single
    # space the "double space" tests below would have nothing to exercise.
    _DOUBLE_SPACE = " " * 2

    def test_phone_internal_double_space_preserved(self):
        """A phone-shaped cell keeps its internal two-space run."""
        phone = f"(555){self._DOUBLE_SPACE}123-4567"
        df = pd.DataFrame({"phone": [phone]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["phone"] == phone

    def test_european_thousands_sep_preserved(self):
        """A space-separated number comes back unchanged."""
        # NOTE(review): the separator here is one ordinary space; confirm
        # against the spec whether a space run or U+00A0 was intended.
        df = pd.DataFrame({"price": ["1 234"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["price"] == "1 234"

    def test_iso_date_passes_through(self):
        """An ISO date contains no whitespace and must come back unchanged."""
        df = pd.DataFrame({"date": ["2024-01-15"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "2024-01-15"

    def test_textual_date_preserves_spaces(self):
        """A textual date keeps the spaces between its parts."""
        # NOTE(review): written with single spaces; confirm whether the
        # intended input contained a longer run.
        df = pd.DataFrame({"date": ["Jan 15 2024"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["date"] == "Jan 15 2024"

    def test_free_text_double_space_still_collapsed(self):
        """Prose is not shielded: its two-space run collapses to one space."""
        # Crucially, the heuristic must NOT trigger on prose with letters.
        df = pd.DataFrame({"note": [f"hello{self._DOUBLE_SPACE}world"]})
        result = clean_dataframe(df)
        assert result.cleaned_df.iloc[0]["note"] == "hello world"
|
||||
173
tests/test_install.py
Normal file
173
tests/test_install.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""Install / dependency / entry-point sanity tests.
|
||||
|
||||
These tests answer the question: "after running ``pip install -r
|
||||
requirements.txt`` on a fresh machine, can the user actually use this
|
||||
project?" They run on every supported platform — the asserts touch only
|
||||
public APIs and CLI ``--help`` exits, never any platform-specific paths.
|
||||
|
||||
If a future dependency upgrade or refactor breaks an import that's used by
|
||||
the CLI or the GUI, these tests catch it before the rest of the suite even
|
||||
gets a chance to run.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.install
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Required dependencies
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Top-level packages that must import cleanly. If any of these fails, the
|
||||
# user's install is broken — fail loudly with the offender's name.
|
||||
_REQUIRED_DEPS = [
    "pandas", "numpy", "openpyxl", "rapidfuzz", "charset_normalizer",
    "loguru", "tqdm", "typer", "phonenumbers", "streamlit",
]


@pytest.mark.parametrize("module", _REQUIRED_DEPS)
def test_required_dependency_imports(module: str) -> None:
    """Import one required third-party package.

    Each package is its own parametrized case, so a failing import is
    reported under the name of the package that caused it.
    """
    importlib.import_module(module)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Project package imports
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_PROJECT_MODULES = [
    "src", "src.core",
    "src.core.io", "src.core.text_clean", "src.core.dedup",
    "src.core.normalizers", "src.core.analyze", "src.core.config",
    "src.cli", "src.cli_text_clean", "src.cli_analyze",
    "src.gui.components",
]


@pytest.mark.parametrize("module", _PROJECT_MODULES)
def test_project_module_imports(module: str) -> None:
    """Import one project module; any exception raised on import fails the case."""
    importlib.import_module(module)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public API surface
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_core_public_api_present() -> None:
    """Verify that ``src.core`` still exposes its expected public names.

    Guards against a symbol being renamed or dropped in
    ``src/core/__init__.py``. Every missing name is collected first so a
    single failure lists all of them.
    """
    import src.core as core

    io_names = (
        "read_file", "write_file", "list_sheets",
        "detect_encoding", "detect_delimiter", "detect_header_row",
        "read_csv_repaired", "repair_bytes",
        "RepairAction", "RepairResult",
    )
    analyzer_names = ("Finding", "analyze", "findings_by_tool", "to_dict")
    text_cleaner_names = (
        "CleanOptions", "CleanResult", "clean_dataframe", "clean_value",
        "smart_title_case", "sentence_case", "apply_case",
    )
    dedup_names = (
        "deduplicate", "build_default_strategies",
        "Algorithm", "SurvivorRule", "MatchStrategy", "MatchResult",
        "DeduplicationResult",
    )
    normalizer_names = (
        "normalize_email", "normalize_phone", "normalize_name",
        "normalize_address", "normalize_string", "get_normalizer",
        "NormalizerType",
    )

    groups = (
        io_names,
        analyzer_names,
        text_cleaner_names,
        dedup_names,
        normalizer_names,
    )
    missing = [
        name
        for group in groups
        for name in group
        if not hasattr(core, name)
    ]
    assert not missing, f"src.core is missing public symbols: {missing}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI entry points
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cli_help(module: str) -> subprocess.CompletedProcess:
|
||||
"""Run ``python -m <module> --help`` and return the CompletedProcess.
|
||||
|
||||
Captures both stdout and stderr so tests can inspect either; uses a
|
||||
short timeout so a hung CLI fails fast on CI.
|
||||
"""
|
||||
return subprocess.run(
|
||||
[sys.executable, "-m", module, "--help"],
|
||||
capture_output=True, text=True, timeout=30,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "cli_module",
    ["src.cli", "src.cli_text_clean", "src.cli_analyze"],
)
def test_cli_help_exits_zero(cli_module: str) -> None:
    """Each CLI module answers ``--help`` with exit code 0 and usage text."""
    proc = _cli_help(cli_module)

    failure_report = (
        f"{cli_module} --help exited {proc.returncode}.\n"
        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    )
    assert proc.returncode == 0, failure_report

    # The word "usage" must appear somewhere in the output, in any letter
    # case and on either stream.
    help_text = f"{proc.stdout}{proc.stderr}".lower()
    assert "usage" in help_text, (
        f"{cli_module} --help did not produce a Usage line"
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Streamlit GUI entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_streamlit_app_module_compiles() -> None:
    """Parse ``src/gui/app.py`` to prove it is syntactically valid Python.

    Only the syntax is checked here: the file is read and parsed, never
    imported or executed. Launching Streamlit for real is too heavy for the
    install layer and is left to the e2e suite.
    """
    import ast
    from pathlib import Path

    repo_root = Path(__file__).resolve().parents[1]
    app_path = repo_root.joinpath("src", "gui", "app.py")
    assert app_path.exists(), f"missing {app_path}"

    source = app_path.read_text(encoding="utf-8")
    ast.parse(source)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test runner sanity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_run_tests_help_works() -> None:
    """``python run_tests.py --help`` should describe the available flags.

    The runner script is located relative to this file and executed with the
    repository root as its working directory, so the test gives the same
    result no matter which directory pytest was started from.
    """
    from pathlib import Path

    repo_root = Path(__file__).resolve().parent.parent
    runner = repo_root / "run_tests.py"
    assert runner.exists(), f"missing {runner}"

    proc = subprocess.run(
        [sys.executable, str(runner), "--help"],
        capture_output=True,
        text=True,
        timeout=30,
        cwd=repo_root,
    )
    # Include both streams in the message so a failure is diagnosable on CI.
    assert proc.returncode == 0, (
        f"run_tests.py --help exited {proc.returncode}.\n"
        f"stdout:\n{proc.stdout}\nstderr:\n{proc.stderr}"
    )
    assert "--tool" in proc.stdout
    assert "--fixtures" in proc.stdout
|
||||
133
tests/test_io.py
133
tests/test_io.py
@@ -1,5 +1,7 @@
|
||||
"""Tests for src.core.io — file reading, encoding/delimiter detection."""
|
||||
|
||||
import io
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
@@ -11,6 +13,8 @@ from src.core.io import (
|
||||
read_file,
|
||||
write_file,
|
||||
list_sheets,
|
||||
repair_bytes,
|
||||
read_csv_repaired,
|
||||
)
|
||||
|
||||
|
||||
@@ -128,3 +132,132 @@ class TestListSheets:
|
||||
simple_df.to_excel(writer, sheet_name="Sheet2", index=False)
|
||||
sheets = list_sheets(path)
|
||||
assert sheets == ["Sheet1", "Sheet2"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Pre-parse repair
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRepairBytes:
    """Byte-level repairs that ``repair_bytes`` applies before CSV parsing."""

    def test_strips_bom(self):
        """A leading UTF-8 BOM is removed and recorded as ``strip_bom``."""
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        assert outcome.repaired_bytes == b"id,name\n1,Alice\n"
        assert any(action.kind == "strip_bom" for action in outcome.actions)

    def test_strips_nul_bytes(self):
        """Every NUL byte is dropped and the action detail mentions the count."""
        payload = b"id,name\n1,Hel\x00lo\n2,Wo\x00\x00rld\n"
        outcome = repair_bytes(payload)
        assert b"\x00" not in outcome.repaired_bytes
        strip_nul = next(
            action for action in outcome.actions if action.kind == "strip_nul"
        )
        assert "3" in strip_nul.detail  # 3 NUL bytes

    def test_folds_smart_double_quotes(self):
        """Curly double quotes and guillemets are gone from the repaired text."""
        payload = "id,note\n1,“hello”\n2,«bonjour»\n".encode("utf-8")
        outcome = repair_bytes(payload)
        decoded = outcome.repaired_bytes.decode("utf-8")
        for glyph in ("“", "”", "«", "»"):
            assert glyph not in decoded
        assert any(action.kind == "fold_smart_quote" for action in outcome.actions)

    def test_does_not_fold_curly_singles(self):
        """A curly apostrophe survives and no fold action is recorded."""
        # Single curly quotes should pass through; cell-level cleaner handles them.
        payload = "id,note\n1,it’s fine\n".encode("utf-8")
        outcome = repair_bytes(payload)
        decoded = outcome.repaired_bytes.decode("utf-8")
        assert "’" in decoded
        assert all(action.kind != "fold_smart_quote" for action in outcome.actions)

    def test_no_changes_when_clean(self):
        """Clean input comes back untouched, with no actions recorded."""
        payload = b"id,name\n1,Alice\n2,Bob\n"
        outcome = repair_bytes(payload)
        assert outcome.repaired_bytes == payload
        assert outcome.actions == []
        assert outcome.changed is False

    def test_repairs_unquoted_currency_comma(self):
        """A currency value split by its own comma is repaired on line 3."""
        payload = b"".join(
            [
                b"id,price,qty\n",
                b"1,100,5\n",
                b"2, $1,500.00 ,7\n",  # 4 fields instead of 3
                b"3,200,9\n",
            ]
        )
        outcome = repair_bytes(payload)
        # After repair, every row should have 3 fields when re-parsed.
        frame = pd.read_csv(io.BytesIO(outcome.repaired_bytes))
        assert list(frame.columns) == ["id", "price", "qty"]
        assert len(frame) == 3
        assert any(
            action.kind == "quote_unquoted_delim" and action.line == 3
            for action in outcome.actions
        )

    def test_logs_unrepairable_when_ambiguous(self):
        """A row with no clear merge is listed in ``unrepairable_lines``."""
        # Two adjacent merge candidates -> bail out, log unrepairable.
        payload = b"".join(
            [
                b"id,a,b,c\n",
                b"1,foo,bar,baz\n",
                b"2,1,2,3,4,5\n",  # way too many extras, no clear merge
            ]
        )
        outcome = repair_bytes(payload)
        assert 3 in outcome.unrepairable_lines

    def test_summary_groups_by_kind(self):
        """``summary()`` maps each action kind to the number of such actions."""
        outcome = repair_bytes(b"\xef\xbb\xbfid,name\n1,Hel\x00lo\n")
        counts = outcome.summary()
        assert counts.get("strip_bom") == 1
        assert counts.get("strip_nul") == 1
|
||||
|
||||
|
||||
class TestReadFileWithRepair:
    """``read_file(repair=True)`` (default) routes CSV through repair_bytes."""

    def test_default_strips_bom_via_repair(self, tmp_path):
        """A BOM-prefixed file yields a clean first column header."""
        csv_path = tmp_path / "bom.csv"
        csv_path.write_bytes(b"\xef\xbb\xbfid,name\n1,Alice\n")
        frame = read_file(csv_path)
        # The first header must be exactly 'id', with no U+FEFF glued to it.
        first_header = list(frame.columns)[0]
        assert first_header == "id"

    def test_default_folds_smart_double_quotes(self, tmp_path):
        """Curly double quotes inside a cell are read back as ASCII quotes."""
        # Curly quotes are *unquoted* here — outer ASCII quotes would create
        # a CSV-quoting collision once the fold runs.
        csv_path = tmp_path / "quoted.csv"
        csv_path.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
        frame = read_file(csv_path)
        assert frame.iloc[0]["note"] == 'curly "hello" world'

    def test_repair_false_preserves_smart_quotes(self, tmp_path):
        """With ``repair=False`` at least one curly quote is still present."""
        csv_path = tmp_path / "quoted.csv"
        csv_path.write_bytes("id,note\n1,curly “hello” world\n".encode("utf-8"))
        frame = read_file(csv_path, repair=False)
        note = frame.iloc[0]["note"]
        assert "“" in note or "”" in note

    def test_chunked_read_skips_repair(self, tmp_path):
        """A chunked read still returns every row of the file."""
        # Chunked reads bypass repair (memory budget). Verify they still work.
        lines = ["id,name"] + [f"{i},Alice" for i in range(1, 21)]
        csv_path = tmp_path / "chunked.csv"
        csv_path.write_text("\n".join(lines))
        row_total = 0
        for chunk in read_file(csv_path, chunk_size=5):
            row_total += len(chunk)
        assert row_total == 20
|
||||
|
||||
|
||||
class TestReadCsvRepaired:
    """``read_csv_repaired`` returns the parsed frame together with the repair result."""

    def test_recovers_malformed_currency_row(self, tmp_path):
        """A row broken by an unquoted currency comma is kept and repaired."""
        csv_path = tmp_path / "bad.csv"
        csv_path.write_bytes(
            b"".join(
                [
                    b"id,price,qty\n",
                    b"1,100,5\n",
                    b"2, $1,500.00 ,7\n",
                    b"3,200,9\n",
                ]
            )
        )
        frame, repair = read_csv_repaired(csv_path)
        assert len(frame) == 3
        repaired_price = frame.iloc[1]["price"]
        assert "1,500.00" in repaired_price
        assert repair.changed

    def test_passthrough_when_clean(self, tmp_path):
        """A well-formed file loads normally and reports no change."""
        csv_path = tmp_path / "ok.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        frame, repair = read_csv_repaired(csv_path)
        assert len(frame) == 2
        assert repair.changed is False
|
||||
|
||||
67
tox.ini
Normal file
67
tox.ini
Normal file
@@ -0,0 +1,67 @@
|
||||
; Cross-platform test automation for DataTools.
|
||||
;
|
||||
; Drives the pytest suite under multiple Python versions on Linux, macOS,
|
||||
; and Windows. Use:
|
||||
;
|
||||
; tox # all envs
|
||||
; tox -e py312 # one Python version
|
||||
; tox -e e2e # CLI smoke tests
|
||||
; tox -e install # import / dependency sanity
|
||||
; tox -e lint # static checks (mypy / ruff if installed)
|
||||
; tox -e coverage # full suite with coverage report
|
||||
;
|
||||
; Adding a new fixture: drop the CSV/XLSX into test-cases/ and re-run.
|
||||
; tests/test_fixtures_sweep.py picks new files up automatically.
|
||||
|
||||
[tox]
|
||||
envlist = py310, py311, py312, py313, install, e2e
|
||||
skip_missing_interpreters = true
|
||||
isolated_build = false
|
||||
|
||||
[testenv]
|
||||
description = Run the full pytest suite under {envname}.
|
||||
deps =
|
||||
-r requirements.txt
|
||||
-r requirements-dev.txt
|
||||
commands =
|
||||
python run_tests.py {posargs}
|
||||
passenv =
|
||||
HOME
|
||||
USER
|
||||
LANG
|
||||
LC_ALL
|
||||
PATH
|
||||
setenv =
|
||||
PYTHONIOENCODING = utf-8
|
||||
PYTHONUTF8 = 1
|
||||
|
||||
[testenv:install]
|
||||
description = Verify imports and CLI entry points work after a fresh install.
|
||||
commands =
|
||||
python run_tests.py --install -v
|
||||
|
||||
[testenv:e2e]
|
||||
description = End-to-end CLI smoke tests against real fixtures.
|
||||
commands =
|
||||
python run_tests.py --e2e -v
|
||||
|
||||
[testenv:fixtures]
|
||||
description = Sweep test-cases/ for any newly-dropped fixtures.
|
||||
commands =
|
||||
python run_tests.py --fixtures -v
|
||||
|
||||
[testenv:coverage]
|
||||
description = Full suite with coverage report.
|
||||
commands =
|
||||
python run_tests.py --coverage
|
||||
|
||||
[testenv:lint]
description = Static checks with ruff and mypy; any finding fails the env.
; Both tools are installed by the deps below, so they are invoked directly.
; The earlier `sh -c "command -v X && X ... || echo skipping"` form needed a
; POSIX shell (not available on Windows) and turned every lint failure into
; exit code 0, because the `|| echo` branch also ran when the tool failed.
deps =
    -r requirements.txt
    ruff>=0.5; python_version >= "3.10"
    mypy>=1.10; python_version >= "3.10"
commands =
    ruff check src/ tests/
    mypy src/
|
||||
Reference in New Issue
Block a user