"""Integration tests for the text-cleaner CLI.""" from __future__ import annotations from pathlib import Path import pandas as pd import pytest from typer.testing import CliRunner from src.cli_text_clean import app runner = CliRunner() @pytest.fixture def messy_csv(tmp_path): df = pd.DataFrame({ "name": [" Alice ", "“Bob”", "Charlie"], "city": ["NYC", " LA ", "SF"], "qty": [1, 2, 3], }) path = tmp_path / "messy.csv" df.to_csv(path, index=False) return path class TestPreview: def test_default_is_preview(self, messy_csv): result = runner.invoke(app, [str(messy_csv)]) assert result.exit_code == 0 assert "preview" in result.output.lower() assert "Cells changed" in result.output def test_no_files_written_in_preview(self, messy_csv): result = runner.invoke(app, [str(messy_csv)]) assert result.exit_code == 0 assert not (messy_csv.parent / f"{messy_csv.stem}_cleaned.csv").exists() def test_file_not_found(self): result = runner.invoke(app, ["/tmp/does_not_exist_xyz.csv"]) assert result.exit_code != 0 assert "not found" in result.output.lower() class TestApply: def test_apply_writes_cleaned_file(self, messy_csv): # E47 result = runner.invoke(app, [str(messy_csv), "--apply"]) assert result.exit_code == 0 cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv" assert cleaned.exists() df = pd.read_csv(cleaned) assert df["name"].iloc[0] == "Alice" def test_apply_writes_changes_audit(self, messy_csv): result = runner.invoke(app, [str(messy_csv), "--apply"]) assert result.exit_code == 0 changes = messy_csv.parent / f"{messy_csv.stem}_changes.csv" assert changes.exists() def test_no_audit_when_no_changes(self, tmp_path): clean = tmp_path / "clean.csv" pd.DataFrame({"a": ["x", "y"]}).to_csv(clean, index=False) result = runner.invoke(app, [str(clean), "--apply"]) assert result.exit_code == 0 assert not (tmp_path / "clean_changes.csv").exists() def test_custom_output_path(self, messy_csv, tmp_path): out = tmp_path / "renamed.csv" result = runner.invoke(app, [str(messy_csv), "--apply", "-o", str(out)]) assert result.exit_code == 0 assert out.exists() class TestPresets: def test_minimal_does_not_fold_smart_chars(self, messy_csv): result = runner.invoke(app, [str(messy_csv), "--apply", "--preset", "minimal"]) assert result.exit_code == 0 cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv" df = pd.read_csv(cleaned) # Smart quotes preserved under minimal preset assert "“" in df["name"].iloc[1] or "”" in df["name"].iloc[1] def test_excel_hygiene_default_folds_smart_chars(self, messy_csv): result = runner.invoke(app, [str(messy_csv), "--apply"]) assert result.exit_code == 0 cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv" df = pd.read_csv(cleaned) assert df["name"].iloc[1] == '"Bob"' def test_unknown_preset_errors(self, messy_csv): result = runner.invoke(app, [str(messy_csv), "--preset", "weird"]) assert result.exit_code != 0 assert "Unknown preset" in result.output class TestColumnSelection: def test_columns_flag(self, messy_csv): result = runner.invoke( app, [str(messy_csv), "--apply", "--columns", "name"], ) assert result.exit_code == 0 cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv" df = pd.read_csv(cleaned) assert df["name"].iloc[0] == "Alice" # city should be untouched (still has spaces) assert df["city"].iloc[1] == " LA " def test_skip_flag(self, messy_csv): result = runner.invoke( app, [str(messy_csv), "--apply", "--skip", "name"], ) assert result.exit_code == 0 cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv" df = pd.read_csv(cleaned) # name should still have spaces assert df["name"].iloc[0].startswith(" ") class TestCaseFlag: def test_bare_case_applies_to_all(self, tmp_path): path = tmp_path / "names.csv" pd.DataFrame({"a": ["alice"], "b": ["bob"]}).to_csv(path, index=False) result = runner.invoke(app, [str(path), "--apply", "--case", "upper"]) assert result.exit_code == 0 df = pd.read_csv(tmp_path / "names_cleaned.csv") assert df["a"].iloc[0] == "ALICE" assert df["b"].iloc[0] == "BOB" def test_per_column_case(self, tmp_path): path = tmp_path / "names.csv" pd.DataFrame({"name": ["alice"], "code": ["abc"]}).to_csv(path, index=False) result = runner.invoke( app, [str(path), "--apply", "--case", "title:name,upper:code"], ) assert result.exit_code == 0 df = pd.read_csv(tmp_path / "names_cleaned.csv") assert df["name"].iloc[0] == "Alice" assert df["code"].iloc[0] == "ABC" class TestConfigRoundTrip: def test_save_and_load(self, messy_csv, tmp_path): cfg = tmp_path / "opts.json" result1 = runner.invoke( app, [str(messy_csv), "--save-config", str(cfg), "--preset", "minimal", "--no-trim"], ) assert result1.exit_code == 0 assert cfg.exists() # Reload and apply result2 = runner.invoke(app, [str(messy_csv), "--apply", "--config", str(cfg)]) assert result2.exit_code == 0 cleaned = messy_csv.parent / f"{messy_csv.stem}_cleaned.csv" df = pd.read_csv(cleaned) # With --no-trim, leading spaces survive assert df["name"].iloc[0].startswith(" ")