feat: implement text cleaner (script 02) with CLI, GUI, and tests
Builds 02_text_cleaner.py from stub to working: character-level hygiene for CSV/Excel inputs covering trim, whitespace collapse, smart-character folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char strip, line-ending normalization, and per-column case conversion. Three presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult + clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape (dry-run by default, --apply writes cleaned + changes audit, JSON config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset picker, advanced toggles, preview, before/after metrics, and three download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10 in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7 entry locking the spec, CLI-REFERENCE.md gains the text cleaner section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
158
tests/test_cli_text_clean.py
Normal file
158
tests/test_cli_text_clean.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""Integration tests for the text-cleaner CLI."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from src.cli_text_clean import app
|
||||
|
||||
# Shared Typer test runner; every test in this module calls runner.invoke(app, ...) on it.
runner = CliRunner()
|
||||
|
||||
|
||||
@pytest.fixture
def messy_csv(tmp_path):
    """Write a small untidy CSV into ``tmp_path`` and return its path.

    The file is named ``messy.csv`` and has three rows:

    * ``name`` - one space-padded value, one wrapped in curly quotes, one plain;
    * ``city`` - one space-padded value between two plain ones;
    * ``qty``  - integers, i.e. a non-text column.
    """
    columns = {
        "name": [" Alice ", "“Bob”", "Charlie"],
        "city": ["NYC", " LA ", "SF"],
        "qty": [1, 2, 3],
    }
    csv_path = tmp_path / "messy.csv"
    pd.DataFrame(columns).to_csv(csv_path, index=False)
    return csv_path
|
||||
|
||||
|
||||
class TestPreview:
    """Default invocation (no ``--apply``): report changes, write nothing."""

    def test_default_is_preview(self, messy_csv):
        """With only the input path, the CLI exits 0 and prints a preview summary."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        assert "preview" in result.output.lower()
        assert "Cells changed" in result.output

    def test_no_files_written_in_preview(self, messy_csv):
        """Preview mode must not create any output file beside the input."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        out_dir = messy_csv.parent
        assert not (out_dir / f"{messy_csv.stem}_cleaned.csv").exists()
        # The changes audit is the other --apply artefact, so it must be absent too.
        assert not (out_dir / f"{messy_csv.stem}_changes.csv").exists()

    def test_file_not_found(self, tmp_path):
        """A non-existent input path gives a non-zero exit and a "not found" message."""
        # Build the missing path under pytest's per-test directory rather than a
        # hard-coded /tmp location: it is guaranteed absent and valid on every OS.
        missing = tmp_path / "does_not_exist_xyz.csv"
        result = runner.invoke(app, [str(missing)])
        assert result.exit_code != 0
        # Collapse whitespace so a long temp path that wraps the message onto a
        # second line cannot split the phrase being searched for.
        flattened = " ".join(result.output.lower().split())
        assert "not found" in flattened
|
||||
|
||||
|
||||
class TestApply:
    """``--apply`` mode: which files end up on disk after a run."""

    def test_apply_writes_cleaned_file(self, messy_csv):  # E47
        """``<stem>_cleaned.csv`` appears beside the input and row 0's name is trimmed."""
        expected_out = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        assert expected_out.exists()
        frame = pd.read_csv(expected_out)
        assert frame["name"].iloc[0] == "Alice"

    def test_apply_writes_changes_audit(self, messy_csv):
        """``<stem>_changes.csv`` appears beside the input when cells were changed."""
        audit_path = messy_csv.with_name(f"{messy_csv.stem}_changes.csv")
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        assert audit_path.exists()

    def test_no_audit_when_no_changes(self, tmp_path):
        """An already-clean input produces no ``_changes.csv`` file."""
        source = tmp_path / "clean.csv"
        tidy = pd.DataFrame({"a": ["x", "y"]})
        tidy.to_csv(source, index=False)
        outcome = runner.invoke(app, [str(source), "--apply"])
        assert outcome.exit_code == 0
        audit_path = tmp_path / "clean_changes.csv"
        assert not audit_path.exists()

    def test_custom_output_path(self, messy_csv, tmp_path):
        """``-o`` writes the cleaned file to the path the caller supplies."""
        destination = tmp_path / "renamed.csv"
        cli_args = [str(messy_csv), "--apply", "-o", str(destination)]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert destination.exists()
|
||||
|
||||
|
||||
class TestPresets:
    """Preset selection through ``--preset`` (and the default when it is omitted)."""

    def test_minimal_does_not_fold_smart_chars(self, messy_csv):
        """Under ``--preset minimal`` the curly quotes around Bob are left in place."""
        result = runner.invoke(app, [str(messy_csv), "--apply", "--preset", "minimal"])
        assert result.exit_code == 0
        cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv"
        df = pd.read_csv(cleaned)
        cell = df["name"].iloc[1]
        # Smart quotes preserved under minimal preset. Require BOTH the opening
        # and the closing quote: checking with `or` would still pass if only one
        # of them had been folded to a straight quote.
        assert "“" in cell and "”" in cell

    def test_excel_hygiene_default_folds_smart_chars(self, messy_csv):
        """With no ``--preset`` flag the curly quotes become straight double quotes."""
        result = runner.invoke(app, [str(messy_csv), "--apply"])
        assert result.exit_code == 0
        cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv"
        df = pd.read_csv(cleaned)
        assert df["name"].iloc[1] == '"Bob"'

    def test_unknown_preset_errors(self, messy_csv):
        """An unrecognised preset name fails with an "Unknown preset" message."""
        result = runner.invoke(app, [str(messy_csv), "--preset", "weird"])
        assert result.exit_code != 0
        assert "Unknown preset" in result.output
|
||||
|
||||
|
||||
class TestColumnSelection:
    """Restricting the cleaner to, or away from, specific columns."""

    def test_columns_flag(self, messy_csv):
        """``--columns name`` cleans ``name`` and leaves ``city`` as written."""
        cli_args = [str(messy_csv), "--apply", "--columns", "name"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        frame = pd.read_csv(messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv"))
        assert frame["name"].iloc[0] == "Alice"
        # city was not selected, so its padding must still be there
        assert frame["city"].iloc[1] == " LA "

    def test_skip_flag(self, messy_csv):
        """``--skip name`` leaves the ``name`` column as written."""
        cli_args = [str(messy_csv), "--apply", "--skip", "name"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        frame = pd.read_csv(messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv"))
        # name was skipped, so the leading space must still be there
        first_name = frame["name"].iloc[0]
        assert first_name.startswith(" ")
|
||||
|
||||
|
||||
class TestCaseFlag:
    """The ``--case`` option, in its bare and per-column forms."""

    def test_bare_case_applies_to_all(self, tmp_path):
        """A bare mode (``--case upper``) is applied to every column."""
        source = tmp_path / "names.csv"
        lower_rows = pd.DataFrame({"a": ["alice"], "b": ["bob"]})
        lower_rows.to_csv(source, index=False)
        outcome = runner.invoke(app, [str(source), "--apply", "--case", "upper"])
        assert outcome.exit_code == 0
        frame = pd.read_csv(tmp_path / "names_cleaned.csv")
        assert frame["a"].iloc[0] == "ALICE"
        assert frame["b"].iloc[0] == "BOB"

    def test_per_column_case(self, tmp_path):
        """``mode:column`` pairs apply a different case mode to each named column."""
        source = tmp_path / "names.csv"
        lower_rows = pd.DataFrame({"name": ["alice"], "code": ["abc"]})
        lower_rows.to_csv(source, index=False)
        cli_args = [str(source), "--apply", "--case", "title:name,upper:code"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        frame = pd.read_csv(tmp_path / "names_cleaned.csv")
        assert frame["name"].iloc[0] == "Alice"
        assert frame["code"].iloc[0] == "ABC"
|
||||
|
||||
|
||||
class TestConfigRoundTrip:
    """Options saved with ``--save-config`` can be replayed with ``--config``."""

    def test_save_and_load(self, messy_csv, tmp_path):
        """Save minimal + ``--no-trim`` to JSON, reload it, and check trimming stayed off."""
        config_file = tmp_path / "opts.json"
        save_args = [
            str(messy_csv),
            "--save-config", str(config_file),
            "--preset", "minimal",
            "--no-trim",
        ]
        saved = runner.invoke(app, save_args)
        assert saved.exit_code == 0
        assert config_file.exists()

        # Second run: take every option from the saved file and write output
        load_args = [str(messy_csv), "--apply", "--config", str(config_file)]
        replayed = runner.invoke(app, load_args)
        assert replayed.exit_code == 0
        frame = pd.read_csv(messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv"))
        # With --no-trim, leading spaces survive
        first_name = frame["name"].iloc[0]
        assert first_name.startswith(" ")
|
||||
Reference in New Issue
Block a user