# datatools-dev/tests/test_cli_text_clean.py

"""Integration tests for the text-cleaner CLI."""

from __future__ import annotations

from pathlib import Path

import pandas as pd
import pytest
from typer.testing import CliRunner

from src.cli_text_clean import app

# Shared Typer CliRunner; the tests in this module invoke ``app`` through it.
runner = CliRunner()


@pytest.fixture
def messy_csv(tmp_path: Path) -> Path:
    """Write a small CSV with untidy text cells and return its path.

    The ``name`` column holds a whitespace-padded value and a value wrapped
    in curly quotes, ``city`` holds one padded value, and ``qty`` is numeric.
    The file is created as ``messy.csv`` inside pytest's ``tmp_path``.
    """
    columns = {
        "name": ["  Alice  ", "“Bob”", "Charlie"],
        "city": ["NYC", " LA ", "SF"],
        "qty": [1, 2, 3],
    }
    target = tmp_path / "messy.csv"
    pd.DataFrame(columns).to_csv(target, index=False)
    return target


class TestPreview:
    """Checks on CLI runs that do not pass ``--apply``."""

    def test_default_is_preview(self, messy_csv):
        """A bare invocation exits 0 and mentions a preview and a change count."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        assert "preview" in result.output.lower()
        assert "Cells changed" in result.output

    def test_no_files_written_in_preview(self, messy_csv):
        """A bare invocation must not create the ``<stem>_cleaned.csv`` file."""
        result = runner.invoke(app, [str(messy_csv)])
        assert result.exit_code == 0
        assert not (messy_csv.parent / f"{messy_csv.stem}_cleaned.csv").exists()

    def test_file_not_found(self, tmp_path):
        """A path that does not exist yields a non-zero exit and a 'not found' message."""
        # Build the missing path inside pytest's per-test temp directory rather
        # than hard-coding /tmp: the directory is freshly created for this
        # test, so the file is guaranteed absent on every platform.
        missing = tmp_path / "does_not_exist_xyz.csv"
        result = runner.invoke(app, [str(missing)])
        assert result.exit_code != 0
        assert "not found" in result.output.lower()


class TestApply:
    """Checks on the files present after running with ``--apply``."""

    def test_apply_writes_cleaned_file(self, messy_csv):
        """``--apply`` creates ``<stem>_cleaned.csv`` with the first name trimmed."""
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        assert cleaned_path.exists()
        frame = pd.read_csv(cleaned_path)
        first_name = frame["name"].iloc[0]
        assert first_name == "Alice"

    def test_apply_writes_changes_audit(self, messy_csv):
        """``--apply`` on the messy input creates ``<stem>_changes.csv``."""
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        audit_path = messy_csv.with_name(f"{messy_csv.stem}_changes.csv")
        assert audit_path.exists()

    def test_no_audit_when_no_changes(self, tmp_path):
        """An already-tidy input must not produce a ``clean_changes.csv`` file."""
        source = tmp_path / "clean.csv"
        tidy = pd.DataFrame({"a": ["x", "y"]})
        tidy.to_csv(source, index=False)
        outcome = runner.invoke(app, [str(source), "--apply"])
        assert outcome.exit_code == 0
        audit_path = tmp_path / "clean_changes.csv"
        assert not audit_path.exists()

    def test_custom_output_path(self, messy_csv, tmp_path):
        """``-o PATH`` together with ``--apply`` creates a file at that path."""
        destination = tmp_path / "renamed.csv"
        cli_args = [str(messy_csv), "--apply", "-o", str(destination)]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert destination.exists()


class TestPresets:
    """Checks on the ``--preset`` option."""

    def test_minimal_does_not_fold_smart_chars(self, messy_csv):
        """Under ``--preset minimal`` the curly quotes around Bob must survive."""
        cli_args = [str(messy_csv), "--apply", "--preset", "minimal"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        frame = pd.read_csv(cleaned_path)
        bob_cell = frame["name"].iloc[1]
        # At least one of the two smart quotes has to remain in the cell.
        assert any(quote in bob_cell for quote in ("“", "”"))

    def test_excel_hygiene_default_folds_smart_chars(self, messy_csv):
        """Without ``--preset`` the curly quotes come back as straight quotes."""
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        frame = pd.read_csv(cleaned_path)
        bob_cell = frame["name"].iloc[1]
        assert bob_cell == '"Bob"'

    def test_unknown_preset_errors(self, messy_csv):
        """An unrecognised preset name exits non-zero and reports 'Unknown preset'."""
        cli_args = [str(messy_csv), "--preset", "weird"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code != 0
        assert "Unknown preset" in outcome.output


class TestColumnSelection:
    """Checks on the ``--columns`` and ``--skip`` options."""

    def test_columns_flag(self, messy_csv):
        """``--columns name`` trims the name cell and leaves the city cell padded."""
        cli_args = [str(messy_csv), "--apply", "--columns", "name"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        frame = pd.read_csv(cleaned_path)
        assert frame["name"].iloc[0] == "Alice"
        # The city column was not selected, so its padding must be intact.
        assert frame["city"].iloc[1] == " LA "

    def test_skip_flag(self, messy_csv):
        """``--skip name`` leaves the leading whitespace on the first name."""
        cli_args = [str(messy_csv), "--apply", "--skip", "name"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        frame = pd.read_csv(cleaned_path)
        first_name = frame["name"].iloc[0]
        # The skipped column keeps its original leading space.
        assert first_name.startswith(" ")


class TestCaseFlag:
    """Checks on the ``--case`` option in its bare and per-column forms."""

    def test_bare_case_applies_to_all(self, tmp_path):
        """``--case upper`` upper-cases the values of both columns."""
        source = tmp_path / "names.csv"
        lowercase = pd.DataFrame({"a": ["alice"], "b": ["bob"]})
        lowercase.to_csv(source, index=False)
        cli_args = [str(source), "--apply", "--case", "upper"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        frame = pd.read_csv(tmp_path / "names_cleaned.csv")
        assert frame["a"].iloc[0] == "ALICE"
        assert frame["b"].iloc[0] == "BOB"

    def test_per_column_case(self, tmp_path):
        """``--case title:name,upper:code`` applies a different case per column."""
        source = tmp_path / "names.csv"
        lowercase = pd.DataFrame({"name": ["alice"], "code": ["abc"]})
        lowercase.to_csv(source, index=False)
        cli_args = [str(source), "--apply", "--case", "title:name,upper:code"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        frame = pd.read_csv(tmp_path / "names_cleaned.csv")
        assert frame["name"].iloc[0] == "Alice"
        assert frame["code"].iloc[0] == "ABC"


class TestConfigRoundTrip:
    """Checks that options written by ``--save-config`` can be reused via ``--config``."""

    def test_save_and_load(self, messy_csv, tmp_path):
        """Save minimal/no-trim options, reload them, and confirm padding survives."""
        config_path = tmp_path / "opts.json"

        # First run: write the options file only.
        save_args = [
            str(messy_csv),
            "--save-config",
            str(config_path),
            "--preset",
            "minimal",
            "--no-trim",
        ]
        saved = runner.invoke(app, save_args)
        assert saved.exit_code == 0
        assert config_path.exists()

        # Second run: apply using the saved options.
        load_args = [str(messy_csv), "--apply", "--config", str(config_path)]
        applied = runner.invoke(app, load_args)
        assert applied.exit_code == 0

        cleaned_path = messy_csv.with_name(f"{messy_csv.stem}_cleaned.csv")
        frame = pd.read_csv(cleaned_path)
        first_name = frame["name"].iloc[0]
        # The saved --no-trim setting means the leading space is still there.
        assert first_name.startswith(" ")