feat: implement text cleaner (script 02) with CLI, GUI, and tests

Builds 02_text_cleaner.py from stub to working: character-level hygiene for CSV/Excel inputs covering trim, whitespace collapse, smart-character folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char strip, line-ending normalization, and per-column case conversion. Three presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult + clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape (dry-run by default, --apply writes cleaned + changes audit, JSON config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset picker, advanced toggles, preview, before/after metrics, and three download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10 in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7 entry locking the spec, CLI-REFERENCE.md gains the text cleaner section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:14:15 +00:00
parent b2ca04e6f4
commit 54f92ae47e
28 changed files with 2093 additions and 58 deletions
--- a/tests/test_cli_text_clean.py
+++ b/tests/test_cli_text_clean.py
@@ -0,0 +1,158 @@
+"""Integration tests for the text-cleaner CLI."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from typer.testing import CliRunner
+
+from src.cli_text_clean import app
+
+runner = CliRunner()
+
+
+@pytest.fixture
+def messy_csv(tmp_path):
+    """Write a small CSV with untidy text cells and return its path.
+
+    The ``name`` column holds a space-padded value and a smart-quoted value,
+    ``city`` holds one space-padded value, and ``qty`` is numeric.
+    """
+    columns = {
+        "name": ["  Alice  ", "“Bob”", "Charlie"],
+        "city": ["NYC", " LA ", "SF"],
+        "qty": [1, 2, 3],
+    }
+    csv_path = tmp_path / "messy.csv"
+    pd.DataFrame(columns).to_csv(csv_path, index=False)
+    return csv_path
+
+
+class TestPreview:
+    """Invocations without ``--apply``."""
+
+    def test_default_is_preview(self, messy_csv):
+        """A bare run exits 0 and its output mentions the preview and a change count."""
+        result = runner.invoke(app, [str(messy_csv)])
+        assert result.exit_code == 0
+        assert "preview" in result.output.lower()
+        assert "Cells changed" in result.output
+
+    def test_no_files_written_in_preview(self, messy_csv):
+        """A bare run does not create ``<stem>_cleaned.csv`` next to the input."""
+        result = runner.invoke(app, [str(messy_csv)])
+        assert result.exit_code == 0
+        assert not (messy_csv.parent / f"{messy_csv.stem}_cleaned.csv").exists()
+
+    def test_file_not_found(self, tmp_path):
+        """A path that does not exist fails with a "not found" message."""
+        # Build the missing path under pytest's per-test tmp_path rather than
+        # a hard-coded /tmp location: that directory is fresh and empty, so
+        # the file is guaranteed absent, and the test no longer depends on a
+        # POSIX-style filesystem layout.
+        missing = tmp_path / "does_not_exist_xyz.csv"
+        result = runner.invoke(app, [str(missing)])
+        assert result.exit_code != 0
+        assert "not found" in result.output.lower()
+
+
+class TestApply:
+    """Invocations with ``--apply``."""
+
+    def test_apply_writes_cleaned_file(self, messy_csv):  # E47
+        """``--apply`` creates ``<stem>_cleaned.csv`` whose first name is trimmed."""
+        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
+        assert outcome.exit_code == 0
+        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
+        assert cleaned_path.exists()
+        first_name = pd.read_csv(cleaned_path)["name"].iloc[0]
+        assert first_name == "Alice"
+
+    def test_apply_writes_changes_audit(self, messy_csv):
+        """``--apply`` on messy input creates ``<stem>_changes.csv``."""
+        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
+        assert outcome.exit_code == 0
+        audit_path = messy_csv.with_name(f"{messy_csv.stem}_changes.csv")
+        assert audit_path.exists()
+
+    def test_no_audit_when_no_changes(self, tmp_path):
+        """Input that needs no cleaning produces no ``_changes.csv`` file."""
+        source = tmp_path / "clean.csv"
+        pd.DataFrame({"a": ["x", "y"]}).to_csv(source, index=False)
+        outcome = runner.invoke(app, [str(source), "--apply"])
+        assert outcome.exit_code == 0
+        audit_path = tmp_path / "clean_changes.csv"
+        assert not audit_path.exists()
+
+    def test_custom_output_path(self, messy_csv, tmp_path):
+        """``-o`` makes the run write to the requested path."""
+        target = tmp_path / "renamed.csv"
+        cli_args = [str(messy_csv), "--apply", "-o", str(target)]
+        outcome = runner.invoke(app, cli_args)
+        assert outcome.exit_code == 0
+        assert target.exists()
+
+
+class TestPresets:
+    """Behaviour of the ``--preset`` option."""
+
+    def test_minimal_does_not_fold_smart_chars(self, messy_csv):
+        """Under ``--preset minimal`` the smart quotes around Bob remain."""
+        cli_args = [str(messy_csv), "--apply", "--preset", "minimal"]
+        outcome = runner.invoke(app, cli_args)
+        assert outcome.exit_code == 0
+        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
+        cleaned = pd.read_csv(cleaned_path)
+        # Smart quotes preserved under minimal preset
+        assert any(
+            quote in cleaned["name"].iloc[1] for quote in ("“", "”")
+        )
+
+    def test_excel_hygiene_default_folds_smart_chars(self, messy_csv):
+        """With no ``--preset`` given, smart quotes become straight quotes."""
+        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
+        assert outcome.exit_code == 0
+        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
+        second_name = pd.read_csv(cleaned_path)["name"].iloc[1]
+        assert second_name == '"Bob"'
+
+    def test_unknown_preset_errors(self, messy_csv):
+        """An unrecognised preset name fails and says "Unknown preset"."""
+        cli_args = [str(messy_csv), "--preset", "weird"]
+        outcome = runner.invoke(app, cli_args)
+        assert outcome.exit_code != 0
+        assert "Unknown preset" in outcome.output
+
+
+class TestColumnSelection:
+    """Behaviour of the ``--columns`` and ``--skip`` options."""
+
+    def test_columns_flag(self, messy_csv):
+        """``--columns name`` cleans ``name`` and leaves ``city`` as written."""
+        cli_args = [str(messy_csv), "--apply", "--columns", "name"]
+        outcome = runner.invoke(app, cli_args)
+        assert outcome.exit_code == 0
+        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
+        cleaned = pd.read_csv(cleaned_path)
+        first_name = cleaned["name"].iloc[0]
+        assert first_name == "Alice"
+        # city should be untouched (still has spaces)
+        second_city = cleaned["city"].iloc[1]
+        assert second_city == " LA "
+
+    def test_skip_flag(self, messy_csv):
+        """``--skip name`` leaves the leading space on the first name."""
+        cli_args = [str(messy_csv), "--apply", "--skip", "name"]
+        outcome = runner.invoke(app, cli_args)
+        assert outcome.exit_code == 0
+        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
+        first_name = pd.read_csv(cleaned_path)["name"].iloc[0]
+        # name should still have spaces
+        assert first_name.startswith(" ")
+
+
+class TestCaseFlag:
+    """Behaviour of the ``--case`` option."""
+
+    def test_bare_case_applies_to_all(self, tmp_path):
+        """``--case upper`` with no column list upper-cases both columns."""
+        source = tmp_path / "names.csv"
+        frame = pd.DataFrame({"a": ["alice"], "b": ["bob"]})
+        frame.to_csv(source, index=False)
+        outcome = runner.invoke(app, [str(source), "--apply", "--case", "upper"])
+        assert outcome.exit_code == 0
+        cleaned = pd.read_csv(tmp_path / "names_cleaned.csv")
+        assert cleaned["a"].iloc[0] == "ALICE"
+        assert cleaned["b"].iloc[0] == "BOB"
+
+    def test_per_column_case(self, tmp_path):
+        """``mode:column`` pairs apply a different case to each named column."""
+        source = tmp_path / "names.csv"
+        frame = pd.DataFrame({"name": ["alice"], "code": ["abc"]})
+        frame.to_csv(source, index=False)
+        cli_args = [str(source), "--apply", "--case", "title:name,upper:code"]
+        outcome = runner.invoke(app, cli_args)
+        assert outcome.exit_code == 0
+        cleaned = pd.read_csv(tmp_path / "names_cleaned.csv")
+        assert cleaned["name"].iloc[0] == "Alice"
+        assert cleaned["code"].iloc[0] == "ABC"
+
+
+class TestConfigRoundTrip:
+    """Saving options with ``--save-config`` and reusing them via ``--config``."""
+
+    def test_save_and_load(self, messy_csv, tmp_path):
+        """Options saved by one run are honoured by a later ``--config`` run."""
+        config_path = tmp_path / "opts.json"
+        save_args = [
+            str(messy_csv),
+            "--save-config",
+            str(config_path),
+            "--preset",
+            "minimal",
+            "--no-trim",
+        ]
+        saved = runner.invoke(app, save_args)
+        assert saved.exit_code == 0
+        assert config_path.exists()
+
+        # Reload and apply
+        load_args = [str(messy_csv), "--apply", "--config", str(config_path)]
+        applied = runner.invoke(app, load_args)
+        assert applied.exit_code == 0
+        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
+        first_name = pd.read_csv(cleaned_path)["name"].iloc[0]
+        # With --no-trim, leading spaces survive
+        assert first_name.startswith(" ")