Builds 02_text_cleaner.py from stub to working: character-level hygiene for CSV/Excel inputs covering trim, whitespace collapse, smart-character folding, Unicode NFC/NFKC, BOM strip, zero-width strip, control-char strip, line-ending normalization, and per-column case conversion. Three presets (minimal/excel-hygiene/paranoid) keep the buyer surface small.

- src/core/text_clean.py: pure helpers + CleanOptions/CleanResult + clean_dataframe with dtype-safe column selection
- src/cli_text_clean.py: Typer CLI mirroring the dedup CLI shape (dry-run by default, --apply writes cleaned + changes audit, JSON config save/load)
- src/gui/pages/2_Text_Cleaner.py: real Streamlit page with preset picker, advanced toggles, preview, before/after metrics, and three download buttons
- tests/test_text_clean.py + test_cli_text_clean.py: 92 new tests covering edge cases E1-E50 from the spec
- samples/messy_text.csv: demo dataset surfacing UC1, UC3, UC6, UC10 in 10 rows
- test-cases/uc16-uc26 + ec05-ec09: per-use-case and per-edge-case fixtures

Docs: TECHNICAL.md §10.2 (full Tier 1/2/3 spec), DECISIONS.md v1.7 entry locking the spec, CLI-REFERENCE.md gains the text cleaner section, README.md gains a top-level Text Cleaner block, USER-GUIDE.md status row 02 promoted Skeleton -> Working.

200/200 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
159 lines
5.7 KiB
Python
159 lines
5.7 KiB
Python
"""Integration tests for the text-cleaner CLI."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
from typer.testing import CliRunner
|
|
|
|
from src.cli_text_clean import app
|
|
|
|
# Shared runner: every test in this module drives the CLI `app` via runner.invoke().
runner = CliRunner()
|
|
|
|
|
|
@pytest.fixture
def messy_csv(tmp_path):
    """Write a small "dirty" CSV into ``tmp_path`` and return its path.

    The file has two text columns (``name``, ``city``) containing padded and
    curly-quoted values, plus an integer column (``qty``).
    """
    columns = {
        "name": [" Alice ", "“Bob”", "Charlie"],
        "city": ["NYC", " LA ", "SF"],
        "qty": [1, 2, 3],
    }
    target = tmp_path / "messy.csv"
    pd.DataFrame(columns).to_csv(target, index=False)
    return target
|
|
|
|
|
|
class TestPreview:
    """Default (no ``--apply``) invocations: report only, write nothing."""

    def test_default_is_preview(self, messy_csv):
        """Without ``--apply`` the CLI exits 0 and prints a preview summary."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        assert "preview" in result.output.lower()
        assert "Cells changed" in result.output

    def test_no_files_written_in_preview(self, messy_csv):
        """A preview run must leave neither output file next to the input."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        out_dir = messy_csv.parent
        assert not (out_dir / f"{messy_csv.stem}_cleaned.csv").exists()
        # The changes audit is the other file an --apply run produces; a
        # preview must not create it either.
        assert not (out_dir / f"{messy_csv.stem}_changes.csv").exists()

    def test_file_not_found(self, tmp_path):
        """A path that does not exist yields a non-zero exit and a 'not found' message."""
        # Build the missing path under tmp_path rather than a hard-coded /tmp
        # location, so the test is hermetic and not tied to POSIX layouts.
        missing = tmp_path / "does_not_exist_xyz.csv"
        result = runner.invoke(app, [str(missing)])
        assert result.exit_code != 0
        # Collapse whitespace so a long temp path that causes the message to
        # wrap cannot split the phrase across lines.
        flattened = " ".join(result.output.lower().split())
        assert "not found" in flattened
|
|
|
|
|
|
class TestApply:
    """``--apply`` runs: which output files appear and what they contain."""

    @staticmethod
    def _apply(source, *extra):
        """Run the CLI on *source* with ``--apply`` plus *extra* args; require exit code 0."""
        outcome = runner.invoke(app, [str(source), "--apply", *extra])
        assert outcome.exit_code == 0
        return outcome

    def test_apply_writes_cleaned_file(self, messy_csv):  # E47
        """The ``<stem>_cleaned.csv`` file is created and the first name is trimmed."""
        self._apply(messy_csv)
        target = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        assert target.exists()
        first_name = pd.read_csv(target)["name"].iloc[0]
        assert first_name == "Alice"

    def test_apply_writes_changes_audit(self, messy_csv):
        """The ``<stem>_changes.csv`` audit file is created alongside the input."""
        self._apply(messy_csv)
        audit = messy_csv.with_name(f"{messy_csv.stem}_changes.csv")
        assert audit.exists()

    def test_no_audit_when_no_changes(self, tmp_path):
        """An already-clean input produces no changes audit file."""
        source = tmp_path / "clean.csv"
        pd.DataFrame({"a": ["x", "y"]}).to_csv(source, index=False)
        self._apply(source)
        audit = tmp_path / "clean_changes.csv"
        assert not audit.exists()

    def test_custom_output_path(self, messy_csv, tmp_path):
        """``-o`` directs the cleaned output to the given path."""
        destination = tmp_path / "renamed.csv"
        self._apply(messy_csv, "-o", str(destination))
        assert destination.exists()
|
|
|
|
|
|
class TestPresets:
    """``--preset`` handling: minimal, the default, and an unknown name."""

    @staticmethod
    def _cleaned_frame(source):
        """Load the ``<stem>_cleaned.csv`` file that sits next to *source*."""
        return pd.read_csv(source.with_name(f"{source.stem}_cleaned.csv"))

    def test_minimal_does_not_fold_smart_chars(self, messy_csv):
        """Under ``--preset minimal`` the curly quotes around Bob are kept."""
        argv = [str(messy_csv), "--apply", "--preset", "minimal"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        bob = self._cleaned_frame(messy_csv)["name"].iloc[1]
        # Smart quotes preserved under minimal preset
        assert any(quote in bob for quote in ("“", "”"))

    def test_excel_hygiene_default_folds_smart_chars(self, messy_csv):
        """With no ``--preset`` given, Bob's curly quotes come back as straight quotes."""
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        bob = self._cleaned_frame(messy_csv)["name"].iloc[1]
        assert bob == '"Bob"'

    def test_unknown_preset_errors(self, messy_csv):
        """An unrecognised preset name fails with an 'Unknown preset' message."""
        argv = [str(messy_csv), "--preset", "weird"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
        assert "Unknown preset" in outcome.output
|
|
|
|
|
|
class TestColumnSelection:
    """``--columns`` / ``--skip``: restrict which columns get cleaned."""

    @staticmethod
    def _apply_and_load(source, *extra):
        """Run ``--apply`` with *extra* args, require exit 0, and load the cleaned CSV."""
        outcome = runner.invoke(app, [str(source), "--apply", *extra])
        assert outcome.exit_code == 0
        return pd.read_csv(source.with_name(f"{source.stem}_cleaned.csv"))

    def test_columns_flag(self, messy_csv):
        """``--columns name`` cleans ``name`` and leaves ``city`` as written."""
        frame = self._apply_and_load(messy_csv, "--columns", "name")
        assert frame["name"].iloc[0] == "Alice"
        # city should be untouched (still has spaces)
        assert frame["city"].iloc[1] == " LA "

    def test_skip_flag(self, messy_csv):
        """``--skip name`` leaves the padded ``name`` value alone."""
        frame = self._apply_and_load(messy_csv, "--skip", "name")
        # name should still have spaces
        first_name = frame["name"].iloc[0]
        assert first_name.startswith(" ")
|
|
|
|
|
|
class TestCaseFlag:
    """``--case``: a bare mode for every column, or ``mode:column`` pairs."""

    @staticmethod
    def _run_case(tmp_path, data, case_spec):
        """Write *data* to ``names.csv``, apply ``--case case_spec``, return the cleaned frame."""
        source = tmp_path / "names.csv"
        pd.DataFrame(data).to_csv(source, index=False)
        outcome = runner.invoke(app, [str(source), "--apply", "--case", case_spec])
        assert outcome.exit_code == 0
        return pd.read_csv(tmp_path / "names_cleaned.csv")

    def test_bare_case_applies_to_all(self, tmp_path):
        """``--case upper`` upper-cases both columns."""
        frame = self._run_case(tmp_path, {"a": ["alice"], "b": ["bob"]}, "upper")
        assert frame["a"].iloc[0] == "ALICE"
        assert frame["b"].iloc[0] == "BOB"

    def test_per_column_case(self, tmp_path):
        """``title:name,upper:code`` applies a different case mode to each column."""
        frame = self._run_case(
            tmp_path,
            {"name": ["alice"], "code": ["abc"]},
            "title:name,upper:code",
        )
        assert frame["name"].iloc[0] == "Alice"
        assert frame["code"].iloc[0] == "ABC"
|
|
|
|
|
|
class TestConfigRoundTrip:
    """Options saved with ``--save-config`` take effect when reloaded via ``--config``."""

    def test_save_and_load(self, messy_csv, tmp_path):
        """Save minimal + ``--no-trim`` to JSON, then apply from that file and check padding survives."""
        config_file = tmp_path / "opts.json"
        save_argv = [
            str(messy_csv),
            "--save-config",
            str(config_file),
            "--preset",
            "minimal",
            "--no-trim",
        ]
        saved = runner.invoke(app, save_argv)
        assert saved.exit_code == 0
        assert config_file.exists()

        # Reload and apply
        load_argv = [str(messy_csv), "--apply", "--config", str(config_file)]
        applied = runner.invoke(app, load_argv)
        assert applied.exit_code == 0

        frame = pd.read_csv(messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv"))
        # With --no-trim, leading spaces survive
        first_name = frame["name"].iloc[0]
        assert first_name.startswith(" ")
|