# datatools-dev/tests/test_text_clean.py

"""Tests for src/core/text_clean.py.

Covers edge cases E1-E50 from TECHNICAL.md Section 10.2 plan.
"""

from __future__ import annotations

import json

import numpy as np
import pandas as pd
import pytest

from src.core.text_clean import (
    CleanOptions,
    PRESETS,
    apply_case,
    clean_dataframe,
    clean_value,
    collapse_whitespace,
    fold_smart_chars,
    normalize_line_endings,
    sentence_case,
    smart_title_case,
    strip_bom,
    strip_control,
    strip_zero_width,
    to_nfc,
    to_nfkc,
    trim,
)


# ---------------------------------------------------------------------------
# Per-string helpers
# ---------------------------------------------------------------------------

class TestTrim:
    """Checks for ``trim`` on padded, empty and already-clean input."""

    def test_strips_leading_and_trailing(self):
        """Padding on both edges is removed."""
        padded = "  hello  "
        assert trim(padded) == "hello"

    def test_preserves_internal_spaces(self):
        """Interior runs of spaces are left alone."""
        trimmed = trim("  a  b  ")
        assert trimmed == "a  b"

    def test_empty_string(self):
        """An empty string comes back empty."""
        blank = ""
        assert trim(blank) == ""

    def test_idempotent(self):
        """A second pass over trimmed output is a no-op."""
        once = trim("  x  ")
        assert trim(once) == once


class TestCollapseWhitespace:
    """Checks that ``collapse_whitespace`` squeezes whitespace runs to one space."""

    def test_multiple_spaces(self):
        """A run of spaces becomes a single space."""
        squeezed = collapse_whitespace("a   b")
        assert squeezed == "a b"

    def test_tab_inside_cell(self):  # E2
        """A lone tab between tokens becomes a space."""
        squeezed = collapse_whitespace("a\tb")
        assert squeezed == "a b"

    def test_mixed_tabs_and_spaces(self):  # E3
        """An interleaved tab/space run becomes a single space."""
        squeezed = collapse_whitespace("a \t \t b")
        assert squeezed == "a b"

    def test_idempotent(self):
        """A second pass over collapsed output is a no-op."""
        once = collapse_whitespace("a  b")
        assert collapse_whitespace(once) == once


class TestNFC:
    """Checks that ``to_nfc`` composes decomposed sequences.

    The fixtures use explicit Unicode escapes: a decomposed and a
    precomposed "e-acute" render identically, so literal characters would
    make it impossible to see (or preserve) which form is under test.
    """

    def test_combining_acute(self):  # E6
        """``e`` + U+0301 composes to the single code point U+00E9."""
        decomposed = "e\u0301"  # LATIN SMALL LETTER E + COMBINING ACUTE ACCENT
        composed = "\u00e9"     # LATIN SMALL LETTER E WITH ACUTE
        assert decomposed != composed  # guard: the fixture really is decomposed
        assert to_nfc(decomposed) == composed

    def test_idempotent(self):
        """Normalizing already-normalized text changes nothing."""
        s = "cafe\u0301"  # decomposed, so the first pass actually does work
        once = to_nfc(s)
        assert to_nfc(once) == once


class TestNFKC:
    """Checks that ``to_nfkc`` folds compatibility characters to plain forms.

    Fixtures are written as escapes because the ligature is visually
    indistinguishable from the two ASCII letters "fi" in many fonts.
    """

    def test_circled_digit(self):  # E7
        """U+2460 CIRCLED DIGIT ONE folds to ASCII ``1``."""
        assert to_nfkc("\u2460") == "1"

    def test_ligature(self):  # E7
        """U+FB01 LATIN SMALL LIGATURE FI folds to the two letters ``fi``."""
        ligature = "\ufb01"
        assert ligature != "fi"  # guard: the fixture really is the ligature
        assert to_nfkc(ligature) == "fi"

    def test_idempotent(self):
        """A second pass over folded output is a no-op."""
        once = to_nfkc("\u2460\ufb01")
        assert to_nfkc(once) == once


class TestSmartChars:
    """Checks that ``fold_smart_chars`` maps typographic characters to ASCII.

    All non-ASCII fixtures are spelled as escapes; the no-break space in
    particular looks exactly like an ordinary space when typed literally.
    """

    def test_curly_quotes(self):  # E11
        """Curly single and double quotes become straight quotes."""
        # U+2018 / U+2019: left / right single quotation mark
        assert fold_smart_chars("\u2018hi\u2019") == "'hi'"
        # U+201C / U+201D: left / right double quotation mark
        assert fold_smart_chars("\u201chi\u201d") == '"hi"'

    def test_dashes(self):  # E12
        """Em and en dashes become a hyphen-minus."""
        assert fold_smart_chars("a\u2014b") == "a-b"  # U+2014 EM DASH
        assert fold_smart_chars("a\u2013b") == "a-b"  # U+2013 EN DASH

    def test_ellipsis(self):  # E13
        """The single-character ellipsis expands to three dots."""
        assert fold_smart_chars("wait\u2026") == "wait..."  # U+2026

    def test_nbsp(self):  # E14
        """A no-break space becomes an ordinary space."""
        assert fold_smart_chars("a\u00a0b") == "a b"  # U+00A0 NO-BREAK SPACE

    def test_idempotent(self):
        """A second pass over folded output is a no-op."""
        s = "\u201chi\u201d \u2014 a\u00a0b"
        once = fold_smart_chars(s)
        assert fold_smart_chars(once) == once


class TestZeroWidth:
    """Checks that ``strip_zero_width`` removes invisible formatting characters.

    Every fixture uses an explicit escape: the characters under test have no
    visible glyph, so a literal would read as if input and expected output
    were the same string.
    """

    def test_zwsp_midword(self):  # E16
        """A zero-width space inside a word is dropped."""
        assert strip_zero_width("foo\u200bbar") == "foobar"  # U+200B ZWSP

    def test_bidi_marks_stripped(self):  # E17
        """Left-to-right and right-to-left marks are dropped."""
        # U+200E LEFT-TO-RIGHT MARK, U+200F RIGHT-TO-LEFT MARK
        assert strip_zero_width("a\u200eb\u200fc") == "abc"

    def test_word_joiner(self):  # E18
        """A word joiner is dropped."""
        assert strip_zero_width("a\u2060b") == "ab"  # U+2060 WORD JOINER

    def test_mid_string_feff(self):  # E22
        """U+FEFF in the middle of a string is dropped."""
        assert strip_zero_width("foo\ufeffbar") == "foobar"


class TestStripBOM:
    """Checks that ``strip_bom`` removes a leading byte-order mark.

    The BOM (U+FEFF) is written as an escape; typed literally it is invisible
    and the "leading BOM" case would read the same as the "no BOM" case.
    """

    def test_leading_bom(self):
        """A BOM at position 0 is removed."""
        assert strip_bom("\ufeffhello") == "hello"

    def test_no_bom(self):
        """Text without a BOM is returned unchanged."""
        assert strip_bom("hello") == "hello"

    def test_idempotent(self):
        """A second pass over stripped output is a no-op."""
        once = strip_bom("\ufeffx")
        assert strip_bom(once) == once


class TestStripControl:
    """Checks which control characters ``strip_control`` removes and keeps."""

    def test_null_byte(self):  # E20
        """An embedded NUL is removed."""
        with_nul = "a\x00b"
        assert strip_control(with_nul) == "ab"

    def test_preserves_tab_newline_cr(self):  # E19
        """Tab, newline and carriage return pass through untouched."""
        layout = "a\tb\nc\rd"
        assert strip_control(layout) == layout

    def test_strips_other_control(self):
        # Of the C0 range only \t, \n and \r are expected to be kept; this
        # samples SOH (0x01), BEL (0x07) and US (0x1f) from the rest.
        noisy = "a\x01b\x07c\x1fd"
        assert strip_control(noisy) == "abcd"

    def test_strips_del(self):
        """DEL (0x7f) is removed."""
        with_del = "a\x7fb"
        assert strip_control(with_del) == "ab"


class TestLineEndings:
    """Checks that ``normalize_line_endings`` converts CRLF and CR to LF."""

    def test_crlf(self):  # E23
        """A CRLF pair becomes a single LF."""
        windows_text = "a\r\nb"
        assert normalize_line_endings(windows_text) == "a\nb"

    def test_bare_cr(self):  # E24
        """A lone CR becomes LF."""
        old_mac_text = "a\rb"
        assert normalize_line_endings(old_mac_text) == "a\nb"

    def test_idempotent(self):
        """A second pass over normalized output is a no-op."""
        once = normalize_line_endings("a\r\nb\rc")
        assert normalize_line_endings(once) == once


class TestSmartTitleCase:
    """Checks ``smart_title_case`` handling of acronyms, particles and apostrophes."""

    def test_preserves_acronym(self):  # E26
        """All-caps words stay as typed; lowercase ones are merely capitalized."""
        expectations = [
            ("USA report", "USA Report"),
            ("nasa launch", "Nasa Launch"),  # already lower
            ("NASA launch", "NASA Launch"),
        ]
        for raw, wanted in expectations:
            assert smart_title_case(raw) == wanted

    def test_lowercases_particles_midstring(self):  # E27
        """Small words in the middle of a title stay lowercase."""
        expectations = [
            ("the lord of the rings", "The Lord of the Rings"),
            ("a tale of two cities", "A Tale of Two Cities"),
        ]
        for raw, wanted in expectations:
            assert smart_title_case(raw) == wanted

    def test_keeps_first_and_last_capitalized(self):
        # A particle in final position ("of") is still capitalized.
        titled = smart_title_case("kingdom of")
        assert titled == "Kingdom Of"

    def test_apostrophe(self):
        """Only the first letter of an apostrophe-joined word is raised."""
        titled = smart_title_case("o'neil")
        assert titled == "O'neil"


class TestSentenceCase:
    """Checks ``sentence_case`` across sentence-ending punctuation."""

    def test_basic(self):  # E28
        """Each sentence after ``.``, ``?`` or ``!`` starts with a capital."""
        recased = sentence_case("hello. how are you? fine!")
        assert recased == "Hello. How are you? Fine!"

    def test_preserves_punctuation(self):
        """Shouted input is lowered after the first letter; punctuation stays."""
        recased = sentence_case("WHAT? OK.")
        assert recased == "What? Ok."


class TestApplyCase:
    """Checks ``apply_case`` dispatch for each mode and for an invalid mode."""

    def test_modes(self):
        """Each supported mode yields the expected casing."""
        table = [
            ("Hello World", "upper", "HELLO WORLD"),
            ("Hello World", "lower", "hello world"),
            ("hello world", "title", "Hello World"),
            ("hello. world.", "sentence", "Hello. World."),
        ]
        for text, mode, wanted in table:
            assert apply_case(text, mode) == wanted

    def test_unknown_mode_raises(self):
        """A mode outside the supported set raises ``ValueError``."""
        with pytest.raises(ValueError):
            apply_case("x", "weird")  # type: ignore[arg-type]


# ---------------------------------------------------------------------------
# clean_value composition
# ---------------------------------------------------------------------------

class TestCleanValue:
    """Checks ``clean_value``: the composed per-cell pipeline and its op log.

    ``clean_value`` returns ``(cleaned, ops)``. Non-ASCII fixtures are
    written as escapes so that invisible characters (notably the no-break
    space) are visible in the source and survive editing.
    """

    def test_default_excel_hygiene(self):
        """Default options fold curly quotes and trim, and log both ops."""
        opts = CleanOptions()
        # U+201C / U+201D curly double quotes, followed by trailing spaces.
        out, ops = clean_value("\u201cHello world\u201d  ", opts)
        assert out == '"Hello world"'
        assert "fold_smart_chars" in ops
        assert "trim" in ops

    def test_pure_whitespace_to_empty(self):  # E1
        """A cell holding only spaces cleans to the empty string."""
        opts = CleanOptions()
        out, _ = clean_value("   ", opts)
        assert out == ""

    def test_nbsp_only_cell(self):  # E5
        """A cell holding only a no-break space cleans to the empty string."""
        opts = CleanOptions()
        out, _ = clean_value("\u00a0", opts)  # U+00A0 NO-BREAK SPACE
        assert out == ""

    def test_non_string_passthrough(self):  # E32
        """Non-string values are returned as-is with an empty op log."""
        opts = CleanOptions()
        for val in (None, 42, 3.14, True, np.nan):
            out, ops = clean_value(val, opts)
            # NaN compares unequal to itself; check pd.isna for that case
            if isinstance(val, float) and pd.isna(val):
                assert pd.isna(out)
            else:
                assert out == val
            assert ops == []

    def test_empty_string(self):
        """An empty string stays empty and logs no ops."""
        opts = CleanOptions()
        out, ops = clean_value("", opts)
        assert out == ""
        assert ops == []

    def test_only_unchanged_ops_not_logged(self):
        """Enabled ops that leave the value unchanged do not appear in the log."""
        opts = CleanOptions(trim=True, collapse_whitespace=True, nfc=False, nfkc=False,
                            fold_smart_chars=False, strip_zero_width=False,
                            strip_bom=False, strip_control=False,
                            normalize_line_endings=False)
        out, ops = clean_value("hello", opts)
        assert out == "hello"
        assert ops == []


class TestIdempotency:
    """E40 — applying the pipeline twice yields the same result as once.

    Fixtures containing invisible or look-alike characters are written as
    escapes so each case demonstrably exercises what its label says.
    """

    @pytest.mark.parametrize("preset", list(PRESETS.keys()))
    def test_preset_idempotent(self, preset):
        """For every preset, cleaning cleaned output must be a no-op."""
        opts = CleanOptions.from_preset(preset)
        cases = [
            "\u201cHello world\u201d  ",       # curly quotes + trailing spaces
            "  \t multi   space  \r\n  ",
            "cafe\u0301",                      # decomposed e-acute
            "\u00e9clair",                     # precomposed e-acute
            "\ufeffleading-bom",               # BOM at position 0
            "USA and the Rings",
            "a\x00b\x01c",
            "",
            " ",                               # plain space only
            "\u00a0",                          # no-break space only
        ]
        for s in cases:
            once, _ = clean_value(s, opts)
            twice, _ = clean_value(once, opts)
            assert once == twice, f"not idempotent on {s!r} (preset {preset})"


# ---------------------------------------------------------------------------
# clean_dataframe
# ---------------------------------------------------------------------------

class TestCleanDataframe:
    """Checks ``clean_dataframe``: column selection, passthrough and audit data."""

    def test_only_string_columns_touched(self):  # E31, E33, E35
        """String columns are cleaned; int and bool columns keep their values."""
        frame = pd.DataFrame({
            "name": ["  Alice  ", "Bob"],
            "age": [30, 25],
            "joined": pd.to_datetime(["2024-01-01", "2024-02-01"]),
            "active": [True, False],
        })
        outcome = clean_dataframe(frame)
        cleaned = outcome.cleaned_df
        assert cleaned["name"].tolist() == ["Alice", "Bob"]
        assert cleaned["age"].tolist() == [30, 25]
        assert cleaned["active"].tolist() == [True, False]
        processed = outcome.columns_processed
        assert "name" in processed
        assert "age" not in processed

    def test_explicit_columns(self):  # E41
        """With ``columns`` set, only the listed columns are cleaned."""
        frame = pd.DataFrame({"a": ["  x  "], "b": ["  y  "]})
        only_a = CleanOptions(columns=["a"])
        outcome = clean_dataframe(frame, only_a)
        cleaned = outcome.cleaned_df
        assert cleaned["a"].iloc[0] == "x"
        assert cleaned["b"].iloc[0] == "  y  "
        assert outcome.columns_processed == ["a"]

    def test_skip_columns(self):  # E42
        """Columns named in ``skip_columns`` are left untouched."""
        frame = pd.DataFrame({"name": ["  A  "], "notes": ["  free text  "]})
        skip_notes = CleanOptions(skip_columns=["notes"])
        cleaned = clean_dataframe(frame, skip_notes).cleaned_df
        assert cleaned["name"].iloc[0] == "A"
        assert cleaned["notes"].iloc[0] == "  free text  "

    def test_unknown_column_raises(self):
        """Requesting a column that is absent raises ``ValueError``."""
        frame = pd.DataFrame({"a": ["x"]})
        bad_opts = CleanOptions(columns=["missing"])
        with pytest.raises(ValueError):
            clean_dataframe(frame, bad_opts)

    def test_empty_dataframe(self):  # E43
        """An empty frame yields zero counts and an empty result."""
        outcome = clean_dataframe(pd.DataFrame())
        assert outcome.cells_changed == 0
        assert outcome.cells_total == 0
        assert outcome.cleaned_df.empty

    def test_single_column_file(self):  # E44
        """A one-column frame is cleaned like any other."""
        frame = pd.DataFrame({"only": ["  hello  "]})
        cleaned = clean_dataframe(frame).cleaned_df
        assert cleaned["only"].iloc[0] == "hello"

    def test_all_numeric_no_op(self):  # E45
        """A frame with only numeric columns processes nothing."""
        frame = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
        outcome = clean_dataframe(frame)
        assert outcome.columns_processed == []
        assert outcome.cells_changed == 0

    def test_mixed_object_column_strings_only(self):  # E34
        """In a mixed object column only the string cells are cleaned."""
        frame = pd.DataFrame({"mix": ["  hello  ", 42, None]})
        mixed = clean_dataframe(frame).cleaned_df["mix"]
        assert mixed.iloc[0] == "hello"
        assert mixed.iloc[1] == 42
        assert mixed.iloc[2] is None

    def test_nan_preserved(self):  # E32
        """NaN cells remain missing after cleaning."""
        frame = pd.DataFrame({"a": ["  x  ", np.nan]})
        column = clean_dataframe(frame).cleaned_df["a"]
        assert column.iloc[0] == "x"
        assert pd.isna(column.iloc[1])

    def test_changes_audit_count(self):  # E48
        """The audit table has one row per changed cell, keyed by row index."""
        frame = pd.DataFrame({"a": ["  x  ", "y", "  z"]})
        outcome = clean_dataframe(frame)
        assert outcome.cells_changed == 2
        assert len(outcome.changes) == 2
        changed_rows = set(outcome.changes["row"].tolist())
        assert changed_rows == {0, 2}

    def test_does_not_mutate_input(self):
        """The caller's frame is identical before and after the call."""
        frame = pd.DataFrame({"a": ["  x  "]})
        snapshot = frame.copy()
        clean_dataframe(frame)
        assert frame.equals(snapshot)

    def test_per_column_case_via_case_columns(self):
        """``case_columns`` recases only the columns it names."""
        frame = pd.DataFrame({"name": ["alice"], "code": ["abc"]})
        per_column = CleanOptions(case_columns={"code": "upper"})
        cleaned = clean_dataframe(frame, per_column).cleaned_df
        assert cleaned["name"].iloc[0] == "alice"
        assert cleaned["code"].iloc[0] == "ABC"

    def test_global_case_applied_to_selected_only(self):
        """A global ``case`` affects only the columns selected via ``columns``."""
        frame = pd.DataFrame({"name": ["alice"], "notes": ["bob"]})
        upper_name_only = CleanOptions(columns=["name"], case="upper")
        cleaned = clean_dataframe(frame, upper_name_only).cleaned_df
        assert cleaned["name"].iloc[0] == "ALICE"
        assert cleaned["notes"].iloc[0] == "bob"


# ---------------------------------------------------------------------------
# Presets and config round-trip
# ---------------------------------------------------------------------------

class TestPresets:
    """Checks the option flags produced by ``CleanOptions.from_preset``."""

    def test_minimal_only_trim_collapse(self):
        """``minimal`` enables trim/collapse and leaves NFC and smart-char folding off."""
        minimal = CleanOptions.from_preset("minimal")
        assert minimal.trim is True
        assert minimal.collapse_whitespace is True
        assert minimal.nfc is False
        assert minimal.fold_smart_chars is False

    def test_excel_hygiene_smart_chars_on_nfkc_off(self):
        """``excel-hygiene`` folds smart chars and applies NFC but not NFKC."""
        hygiene = CleanOptions.from_preset("excel-hygiene")
        assert hygiene.fold_smart_chars is True
        assert hygiene.nfc is True
        assert hygiene.nfkc is False

    def test_paranoid_includes_nfkc(self):
        """``paranoid`` turns NFKC on."""
        paranoid = CleanOptions.from_preset("paranoid")
        assert paranoid.nfkc is True

    def test_unknown_preset_raises(self):
        """An unrecognized preset name raises ``ValueError``."""
        bogus_name = "does-not-exist"
        with pytest.raises(ValueError):
            CleanOptions.from_preset(bogus_name)


class TestConfigRoundTrip:
    """Checks that ``CleanOptions`` survives dict and file serialization."""

    def test_dict_roundtrip(self):  # E49
        """``to_dict`` followed by ``from_dict`` reproduces an equal object."""
        original = CleanOptions(
            trim=False, nfc=True, columns=["a", "b"], skip_columns=["c"],
            case="upper",
        )
        as_dict = original.to_dict()
        assert CleanOptions.from_dict(as_dict) == original

    def test_file_roundtrip(self, tmp_path):
        """``to_file`` followed by ``from_file`` reproduces an equal object."""
        target = tmp_path / "opts.json"
        original = CleanOptions(case_columns={"code": "upper"}, fold_smart_chars=False)
        original.to_file(target)
        assert CleanOptions.from_file(target) == original

    def test_unknown_keys_ignored(self):  # E50
        """Keys that are not option fields do not break ``from_dict``."""
        payload = {"trim": True, "totally_made_up_key": 42}
        parsed = CleanOptions.from_dict(payload)
        assert parsed.trim is True


# ---------------------------------------------------------------------------
# Use-case smoke tests (whole-pipeline)
# ---------------------------------------------------------------------------

class TestUseCases:
    """Whole-pipeline smoke tests modelled on real-world export problems.

    Invisible characters in the fixtures (BOM, zero-width space, no-break
    space) are written as escapes. Typed literally they cannot be seen, and
    if an editor drops them the tests either stop exercising anything or,
    for the ``not in`` checks, can never pass.
    """

    def test_excel_save_as_csv_utf8_bom(self):
        # UC3: BOM at start of first cell
        df = pd.DataFrame({"name": ["\ufeffAlice", "Bob"], "city": ["NYC", "LA"]})
        result = clean_dataframe(df)
        assert result.cleaned_df["name"].iloc[0] == "Alice"

    def test_word_smart_quotes_in_product_titles(self):
        # UC2: curly quotes (U+201C/U+201D) and an em dash (U+2014)
        df = pd.DataFrame({
            "title": ["\u201cBest Dog Collar\u201d", "Cat Toy \u2014 Red"],
        })
        result = clean_dataframe(df)
        assert result.cleaned_df["title"].iloc[0] == '"Best Dog Collar"'
        assert result.cleaned_df["title"].iloc[1] == "Cat Toy - Red"

    def test_nbsp_in_email_field(self):
        # UC10: invisible Unicode hiding in emails
        zwsp = "\u200b"  # ZERO WIDTH SPACE
        nbsp = "\u00a0"  # NO-BREAK SPACE
        df = pd.DataFrame({
            "email": [f"alice{zwsp}@test.com", f"bob{nbsp}@test.com"],
        })
        result = clean_dataframe(df)
        # ZWSP stripped; NBSP folded to space then collapsed but trim won't remove
        # internal space. So "bob @test.com" remains. That's correct: the cleaner
        # doesn't know that's an email — script 03 owns email format. Just confirm
        # the invisible char is gone.
        assert zwsp not in result.cleaned_df["email"].iloc[0]
        assert nbsp not in result.cleaned_df["email"].iloc[1]

    def test_quickbooks_trailing_spaces(self):
        # UC6: VLOOKUP fails because of trailing spaces
        df = pd.DataFrame({"vendor": ["ACME Corp ", "ACME Corp"]})
        result = clean_dataframe(df)
        assert result.cleaned_df["vendor"].iloc[0] == result.cleaned_df["vendor"].iloc[1]

    def test_bank_export_crlf_in_memo(self):
        # UC5: \r\n inside multi-line memo cells
        df = pd.DataFrame({"memo": ["line one\r\nline two\r\nline three"]})
        result = clean_dataframe(df)
        memo = result.cleaned_df["memo"].iloc[0]
        assert "\r" not in memo
        assert memo.count("\n") == 2


# ---------------------------------------------------------------------------
# Reporting / dtype edge cases
# ---------------------------------------------------------------------------

class TestReporting:
    """Checks the shape and counters of the result returned by ``clean_dataframe``."""

    def test_changes_columns_present(self):
        """The audit table exposes the expected columns in order."""
        frame = pd.DataFrame({"a": ["  x  "]})
        audit = clean_dataframe(frame).changes
        expected_header = ["row", "column", "old", "new", "ops_applied"]
        assert list(audit.columns) == expected_header

    def test_changes_empty_when_no_changes(self):
        """Already-clean input yields a zero count and an empty audit table."""
        frame = pd.DataFrame({"a": ["x", "y"]})
        outcome = clean_dataframe(frame)
        assert outcome.cells_changed == 0
        assert outcome.changes.empty

    def test_cells_total_counts_only_processed_columns(self):
        """``cells_total`` counts cells of processed columns only."""
        frame = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]})
        outcome = clean_dataframe(frame)
        assert outcome.cells_total == 3  # only "a" is processed