Files
datatools-dev/tests/test_cli_text_clean.py
Michael 54f92ae47e feat: implement text cleaner (script 02) with CLI, GUI, and tests
Builds 02_text_cleaner.py from stub to working: character-level hygiene
for CSV/Excel inputs covering trim, whitespace collapse, smart-character
folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char
strip, line-ending normalization, and per-column case conversion. Three
presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult +
  clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape
  (dry-run by default, --apply writes cleaned + changes audit, JSON
  config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset
  picker, advanced toggles, preview, before/after metrics, and three
  download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests
  covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10
  in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case
  fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7
entry locking the spec, CLI-REFERENCE.md gains the text cleaner
section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md
status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-29 15:14:15 +00:00

159 lines
5.7 KiB
Python

"""Integration tests for the text-cleaner CLI."""
from __future__ import annotations
from pathlib import Path
import pandas as pd
import pytest
from typer.testing import CliRunner
from src.cli_text_clean import app
# Shared CliRunner instance; the tests in this module call
# runner.invoke(app, [...]) and inspect the returned exit code and output.
runner = CliRunner()
@pytest.fixture
def messy_csv(tmp_path):
    """Write a three-row CSV with untidy text cells and return its path.

    ``name`` holds a space-padded value and a curly-quoted value, ``city``
    holds a space-padded value, and ``qty`` is an integer column.
    """
    columns = {
        "name": [" Alice ", "“Bob”", "Charlie"],
        "city": ["NYC", " LA ", "SF"],
        "qty": [1, 2, 3],
    }
    target = tmp_path / "messy.csv"
    pd.DataFrame(columns).to_csv(target, index=False)
    return target
class TestPreview:
    """Behaviour of the CLI when ``--apply`` is not passed."""

    def test_default_is_preview(self, messy_csv):
        """Invoking with only an input path succeeds and reports a preview."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        assert "preview" in result.output.lower()
        assert "Cells changed" in result.output

    def test_no_files_written_in_preview(self, messy_csv):
        """A preview run leaves neither ``--apply`` artefact on disk."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        # Check both files that the --apply tests expect: the cleaned copy
        # and the changes audit. Checking only one would let a stray audit
        # file slip through unnoticed.
        for suffix in ("_cleaned.csv", "_changes.csv"):
            artefact = messy_csv.parent / f"{messy_csv.stem}{suffix}"
            assert not artefact.exists()

    def test_file_not_found(self, tmp_path):
        """A non-existent input path yields a non-zero exit and a message."""
        # Build the missing path under tmp_path so the test does not depend
        # on the state of a shared, hard-coded /tmp location.
        missing = tmp_path / "does_not_exist_xyz.csv"
        result = runner.invoke(app, [str(missing)])
        assert result.exit_code != 0
        assert "not found" in result.output.lower()
class TestApply:
    """Checks on the files produced when ``--apply`` is passed."""

    def test_apply_writes_cleaned_file(self, messy_csv):  # E47
        """``--apply`` creates ``<stem>_cleaned.csv`` with trimmed names."""
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        assert cleaned_path.exists()
        first_name = pd.read_csv(cleaned_path)["name"].iloc[0]
        assert first_name == "Alice"

    def test_apply_writes_changes_audit(self, messy_csv):
        """``--apply`` on messy input creates ``<stem>_changes.csv``."""
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        audit_path = messy_csv.with_name(f"{messy_csv.stem}_changes.csv")
        assert audit_path.exists()

    def test_no_audit_when_no_changes(self, tmp_path):
        """Already-clean input produces no changes audit file."""
        source = tmp_path / "clean.csv"
        frame = pd.DataFrame({"a": ["x", "y"]})
        frame.to_csv(source, index=False)
        outcome = runner.invoke(app, [str(source), "--apply"])
        assert outcome.exit_code == 0
        audit_path = tmp_path / "clean_changes.csv"
        assert not audit_path.exists()

    def test_custom_output_path(self, messy_csv, tmp_path):
        """``-o`` directs the cleaned output to the requested path."""
        destination = tmp_path / "renamed.csv"
        argv = [str(messy_csv), "--apply", "-o", str(destination)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert destination.exists()
class TestPresets:
    """Effect of ``--preset`` on smart-quote handling and validation."""

    def test_minimal_does_not_fold_smart_chars(self, messy_csv):
        """Curly quotes from the input survive the ``minimal`` preset."""
        result = runner.invoke(app, [str(messy_csv), "--apply", "--preset", "minimal"])
        assert result.exit_code == 0
        cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv"
        df = pd.read_csv(cleaned)
        value = df["name"].iloc[1]
        # Smart quotes preserved under minimal preset. The quotes are written
        # as escapes (U+201C / U+201D) so they cannot be lost in transit: a
        # check of the form `"" in value` is true for every string and would
        # make this test pass even if the quotes had been folded.
        assert "\u201c" in value or "\u201d" in value

    def test_excel_hygiene_default_folds_smart_chars(self, messy_csv):
        """With no ``--preset``, curly quotes come back as straight quotes."""
        result = runner.invoke(app, [str(messy_csv), "--apply"])
        assert result.exit_code == 0
        cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv"
        df = pd.read_csv(cleaned)
        assert df["name"].iloc[1] == '"Bob"'

    def test_unknown_preset_errors(self, messy_csv):
        """An unrecognised preset name exits non-zero with a clear message."""
        result = runner.invoke(app, [str(messy_csv), "--preset", "weird"])
        assert result.exit_code != 0
        assert "Unknown preset" in result.output
class TestColumnSelection:
    """``--columns`` and ``--skip`` restrict which columns are cleaned."""

    def test_columns_flag(self, messy_csv):
        """Only the column named by ``--columns`` is cleaned."""
        argv = [str(messy_csv), "--apply", "--columns", "name"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        frame = pd.read_csv(cleaned_path)
        assert frame["name"].iloc[0] == "Alice"
        # city should be untouched (still has spaces)
        assert frame["city"].iloc[1] == " LA "

    def test_skip_flag(self, messy_csv):
        """The column named by ``--skip`` is left as it was."""
        argv = [str(messy_csv), "--apply", "--skip", "name"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        first_name = pd.read_csv(cleaned_path)["name"].iloc[0]
        # name should still have spaces
        assert first_name.startswith(" ")
class TestCaseFlag:
    """``--case`` in its bare form and its ``mode:column`` form."""

    def test_bare_case_applies_to_all(self, tmp_path):
        """A bare ``--case upper`` upper-cases every text column."""
        source = tmp_path / "names.csv"
        pd.DataFrame({"a": ["alice"], "b": ["bob"]}).to_csv(source, index=False)
        outcome = runner.invoke(app, [str(source), "--apply", "--case", "upper"])
        assert outcome.exit_code == 0
        first_row = pd.read_csv(tmp_path / "names_cleaned.csv").iloc[0]
        assert first_row["a"] == "ALICE"
        assert first_row["b"] == "BOB"

    def test_per_column_case(self, tmp_path):
        """``mode:column`` pairs apply a different case to each column."""
        source = tmp_path / "names.csv"
        pd.DataFrame({"name": ["alice"], "code": ["abc"]}).to_csv(source, index=False)
        argv = [str(source), "--apply", "--case", "title:name,upper:code"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        first_row = pd.read_csv(tmp_path / "names_cleaned.csv").iloc[0]
        assert first_row["name"] == "Alice"
        assert first_row["code"] == "ABC"
class TestConfigRoundTrip:
    """Options saved with ``--save-config`` are honoured via ``--config``."""

    def test_save_and_load(self, messy_csv, tmp_path):
        """Save minimal/no-trim options, reload them, and apply."""
        config_path = tmp_path / "opts.json"
        save_argv = [
            str(messy_csv),
            "--save-config",
            str(config_path),
            "--preset",
            "minimal",
            "--no-trim",
        ]
        saved = runner.invoke(app, save_argv)
        assert saved.exit_code == 0
        assert config_path.exists()

        # Reload and apply
        applied = runner.invoke(
            app, [str(messy_csv), "--apply", "--config", str(config_path)]
        )
        assert applied.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        first_name = pd.read_csv(cleaned_path)["name"].iloc[0]
        # With --no-trim, leading spaces survive
        assert first_name.startswith(" ")