"""Tests for src/core/text_clean.py. Covers edge cases E1-E50 from TECHNICAL.md Section 10.2 plan. """ from __future__ import annotations import json import numpy as np import pandas as pd import pytest from src.core.text_clean import ( CleanOptions, PRESETS, apply_case, clean_dataframe, clean_value, collapse_whitespace, fold_smart_chars, normalize_line_endings, sentence_case, smart_title_case, strip_bom, strip_control, strip_zero_width, to_nfc, to_nfkc, trim, ) # --------------------------------------------------------------------------- # Per-string helpers # --------------------------------------------------------------------------- class TestTrim: def test_strips_leading_and_trailing(self): assert trim(" hello ") == "hello" def test_preserves_internal_spaces(self): assert trim(" a b ") == "a b" def test_empty_string(self): assert trim("") == "" def test_idempotent(self): assert trim(trim(" x ")) == trim(" x ") class TestCollapseWhitespace: def test_multiple_spaces(self): assert collapse_whitespace("a b") == "a b" def test_tab_inside_cell(self): # E2 assert collapse_whitespace("a\tb") == "a b" def test_mixed_tabs_and_spaces(self): # E3 assert collapse_whitespace("a \t \t b") == "a b" def test_idempotent(self): assert collapse_whitespace(collapse_whitespace("a b")) == collapse_whitespace("a b") class TestNFC: def test_combining_acute(self): # E6 decomposed = "é" # e + combining acute composed = "é" # é assert to_nfc(decomposed) == composed def test_idempotent(self): s = "café" assert to_nfc(to_nfc(s)) == to_nfc(s) class TestNFKC: def test_circled_digit(self): # E7 assert to_nfkc("①") == "1" def test_ligature(self): # E7 assert to_nfkc("fi") == "fi" def test_idempotent(self): assert to_nfkc(to_nfkc("①fi")) == to_nfkc("①fi") class TestSmartChars: def test_curly_quotes(self): # E11 assert fold_smart_chars("‘hi’") == "'hi'" assert fold_smart_chars("“hi”") == '"hi"' def test_dashes(self): # E12 assert fold_smart_chars("a—b") == "a-b" assert fold_smart_chars("a–b") == "a-b" def test_ellipsis(self): # E13 assert fold_smart_chars("wait…") == "wait..." def test_nbsp(self): # E14 assert fold_smart_chars("a b") == "a b" def test_idempotent(self): s = "“hi” — a b" assert fold_smart_chars(fold_smart_chars(s)) == fold_smart_chars(s) class TestZeroWidth: def test_zwsp_midword(self): # E16 assert strip_zero_width("foo​bar") == "foobar" def test_bidi_marks_stripped(self): # E17 assert strip_zero_width("a‎b‏c") == "abc" def test_word_joiner(self): # E18 assert strip_zero_width("a⁠b") == "ab" def test_mid_string_feff(self): # E22 assert strip_zero_width("foobar") == "foobar" class TestStripBOM: def test_leading_bom(self): assert strip_bom("hello") == "hello" def test_no_bom(self): assert strip_bom("hello") == "hello" def test_idempotent(self): assert strip_bom(strip_bom("x")) == strip_bom("x") class TestStripControl: def test_null_byte(self): # E20 assert strip_control("a\x00b") == "ab" def test_preserves_tab_newline_cr(self): # E19 assert strip_control("a\tb\nc\rd") == "a\tb\nc\rd" def test_strips_other_control(self): # 0x01..0x1F minus tab/newline/CR/VT/FF? we keep \t \n \r only. assert strip_control("a\x01b\x07c\x1fd") == "abcd" def test_strips_del(self): assert strip_control("a\x7fb") == "ab" class TestLineEndings: def test_crlf(self): # E23 assert normalize_line_endings("a\r\nb") == "a\nb" def test_bare_cr(self): # E24 assert normalize_line_endings("a\rb") == "a\nb" def test_idempotent(self): assert ( normalize_line_endings(normalize_line_endings("a\r\nb\rc")) == normalize_line_endings("a\r\nb\rc") ) class TestSmartTitleCase: def test_preserves_acronym(self): # E26 assert smart_title_case("USA report") == "USA Report" assert smart_title_case("nasa launch") == "Nasa Launch" # already lower assert smart_title_case("NASA launch") == "NASA Launch" def test_lowercases_particles_midstring(self): # E27 assert smart_title_case("the lord of the rings") == "The Lord of the Rings" assert smart_title_case("a tale of two cities") == "A Tale of Two Cities" def test_keeps_first_and_last_capitalized(self): # "of" at the end stays capitalized result = smart_title_case("kingdom of") assert result == "Kingdom Of" def test_apostrophe(self): assert smart_title_case("o'neil") == "O'neil" class TestSentenceCase: def test_basic(self): # E28 assert sentence_case("hello. how are you? fine!") == "Hello. How are you? Fine!" def test_preserves_punctuation(self): assert sentence_case("WHAT? OK.") == "What? Ok." class TestApplyCase: def test_modes(self): assert apply_case("Hello World", "upper") == "HELLO WORLD" assert apply_case("Hello World", "lower") == "hello world" assert apply_case("hello world", "title") == "Hello World" assert apply_case("hello. world.", "sentence") == "Hello. World." def test_unknown_mode_raises(self): with pytest.raises(ValueError): apply_case("x", "weird") # type: ignore[arg-type] # --------------------------------------------------------------------------- # clean_value composition # --------------------------------------------------------------------------- class TestCleanValue: def test_default_excel_hygiene(self): opts = CleanOptions() out, ops = clean_value("“Hello world” ", opts) assert out == '"Hello world"' assert "fold_smart_chars" in ops assert "trim" in ops def test_pure_whitespace_to_empty(self): # E1 opts = CleanOptions() out, ops = clean_value(" ", opts) assert out == "" def test_nbsp_only_cell(self): # E5 opts = CleanOptions() out, _ = clean_value(" ", opts) assert out == "" def test_non_string_passthrough(self): # E32 opts = CleanOptions() for val in (None, 42, 3.14, True, np.nan): out, ops = clean_value(val, opts) # NaN compares unequal to itself; check pd.isna for that case if isinstance(val, float) and pd.isna(val): assert pd.isna(out) else: assert out == val assert ops == [] def test_empty_string(self): opts = CleanOptions() out, ops = clean_value("", opts) assert out == "" assert ops == [] def test_only_unchanged_ops_not_logged(self): opts = CleanOptions(trim=True, collapse_whitespace=True, nfc=False, nfkc=False, fold_smart_chars=False, strip_zero_width=False, strip_bom=False, strip_control=False, normalize_line_endings=False) out, ops = clean_value("hello", opts) assert out == "hello" assert ops == [] class TestIdempotency: """E40 — applying the pipeline twice yields the same result as once.""" @pytest.mark.parametrize("preset", list(PRESETS.keys())) def test_preset_idempotent(self, preset): opts = CleanOptions.from_preset(preset) cases = [ "“Hello world” ", " \t​ multi space \r\n ", "café", "éclair", "leading-bom", "USA and the Rings", "a\x00b\x01c", "", " ", ] for s in cases: once, _ = clean_value(s, opts) twice, _ = clean_value(once, opts) assert once == twice, f"not idempotent on {s!r} (preset {preset})" # --------------------------------------------------------------------------- # clean_dataframe # --------------------------------------------------------------------------- class TestCleanDataframe: def test_only_string_columns_touched(self): # E31, E33, E35 df = pd.DataFrame({ "name": [" Alice ", "Bob"], "age": [30, 25], "joined": pd.to_datetime(["2024-01-01", "2024-02-01"]), "active": [True, False], }) result = clean_dataframe(df) assert result.cleaned_df["name"].tolist() == ["Alice", "Bob"] assert result.cleaned_df["age"].tolist() == [30, 25] assert result.cleaned_df["active"].tolist() == [True, False] assert "name" in result.columns_processed assert "age" not in result.columns_processed def test_explicit_columns(self): # E41 df = pd.DataFrame({"a": [" x "], "b": [" y "]}) result = clean_dataframe(df, CleanOptions(columns=["a"])) assert result.cleaned_df["a"].iloc[0] == "x" assert result.cleaned_df["b"].iloc[0] == " y " assert result.columns_processed == ["a"] def test_skip_columns(self): # E42 df = pd.DataFrame({"name": [" A "], "notes": [" free text "]}) result = clean_dataframe(df, CleanOptions(skip_columns=["notes"])) assert result.cleaned_df["name"].iloc[0] == "A" assert result.cleaned_df["notes"].iloc[0] == " free text " def test_unknown_column_raises(self): df = pd.DataFrame({"a": ["x"]}) with pytest.raises(ValueError): clean_dataframe(df, CleanOptions(columns=["missing"])) def test_empty_dataframe(self): # E43 df = pd.DataFrame() result = clean_dataframe(df) assert result.cells_changed == 0 assert result.cells_total == 0 assert result.cleaned_df.empty def test_single_column_file(self): # E44 df = pd.DataFrame({"only": [" hello "]}) result = clean_dataframe(df) assert result.cleaned_df["only"].iloc[0] == "hello" def test_all_numeric_no_op(self): # E45 df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) result = clean_dataframe(df) assert result.columns_processed == [] assert result.cells_changed == 0 def test_mixed_object_column_strings_only(self): # E34 df = pd.DataFrame({"mix": [" hello ", 42, None]}) result = clean_dataframe(df) assert result.cleaned_df["mix"].iloc[0] == "hello" assert result.cleaned_df["mix"].iloc[1] == 42 assert result.cleaned_df["mix"].iloc[2] is None def test_nan_preserved(self): # E32 df = pd.DataFrame({"a": [" x ", np.nan]}) result = clean_dataframe(df) assert result.cleaned_df["a"].iloc[0] == "x" assert pd.isna(result.cleaned_df["a"].iloc[1]) def test_changes_audit_count(self): # E48 df = pd.DataFrame({"a": [" x ", "y", " z"]}) result = clean_dataframe(df) assert result.cells_changed == 2 assert len(result.changes) == 2 assert set(result.changes["row"].tolist()) == {0, 2} def test_does_not_mutate_input(self): df = pd.DataFrame({"a": [" x "]}) original = df.copy() clean_dataframe(df) assert df.equals(original) def test_per_column_case_via_case_columns(self): df = pd.DataFrame({"name": ["alice"], "code": ["abc"]}) result = clean_dataframe(df, CleanOptions(case_columns={"code": "upper"})) assert result.cleaned_df["name"].iloc[0] == "alice" assert result.cleaned_df["code"].iloc[0] == "ABC" def test_global_case_applied_to_selected_only(self): df = pd.DataFrame({"name": ["alice"], "notes": ["bob"]}) result = clean_dataframe( df, CleanOptions(columns=["name"], case="upper"), ) assert result.cleaned_df["name"].iloc[0] == "ALICE" assert result.cleaned_df["notes"].iloc[0] == "bob" # --------------------------------------------------------------------------- # Presets and config round-trip # --------------------------------------------------------------------------- class TestPresets: def test_minimal_only_trim_collapse(self): opts = CleanOptions.from_preset("minimal") assert opts.trim is True assert opts.collapse_whitespace is True assert opts.nfc is False assert opts.fold_smart_chars is False def test_excel_hygiene_smart_chars_on_nfkc_off(self): opts = CleanOptions.from_preset("excel-hygiene") assert opts.fold_smart_chars is True assert opts.nfc is True assert opts.nfkc is False def test_paranoid_includes_nfkc(self): opts = CleanOptions.from_preset("paranoid") assert opts.nfkc is True def test_unknown_preset_raises(self): with pytest.raises(ValueError): CleanOptions.from_preset("does-not-exist") class TestConfigRoundTrip: def test_dict_roundtrip(self): # E49 opts = CleanOptions( trim=False, nfc=True, columns=["a", "b"], skip_columns=["c"], case="upper", ) recovered = CleanOptions.from_dict(opts.to_dict()) assert recovered == opts def test_file_roundtrip(self, tmp_path): path = tmp_path / "opts.json" opts = CleanOptions(case_columns={"code": "upper"}, fold_smart_chars=False) opts.to_file(path) loaded = CleanOptions.from_file(path) assert loaded == opts def test_unknown_keys_ignored(self): # E50 data = {"trim": True, "totally_made_up_key": 42} opts = CleanOptions.from_dict(data) assert opts.trim is True # --------------------------------------------------------------------------- # Use-case smoke tests (whole-pipeline) # --------------------------------------------------------------------------- class TestUseCases: def test_excel_save_as_csv_utf8_bom(self): # UC3: BOM at start of first cell df = pd.DataFrame({"name": ["Alice", "Bob"], "city": ["NYC", "LA"]}) result = clean_dataframe(df) assert result.cleaned_df["name"].iloc[0] == "Alice" def test_word_smart_quotes_in_product_titles(self): # UC2 df = pd.DataFrame({"title": ["“Best Dog Collar”", "Cat Toy — Red"]}) result = clean_dataframe(df) assert result.cleaned_df["title"].iloc[0] == '"Best Dog Collar"' assert result.cleaned_df["title"].iloc[1] == "Cat Toy - Red" def test_nbsp_in_email_field(self): # UC10: invisible Unicode hiding in emails df = pd.DataFrame({"email": ["alice@test.com​", "bob @test.com"]}) result = clean_dataframe(df) # ZWSP stripped; NBSP folded to space then collapsed but trim won't remove # internal space. So "bob @test.com" remains. That's correct: the cleaner # doesn't know that's an email — script 03 owns email format. Just confirm # the invisible char is gone. assert "​" not in result.cleaned_df["email"].iloc[0] assert " " not in result.cleaned_df["email"].iloc[1] def test_quickbooks_trailing_spaces(self): # UC6: VLOOKUP fails because of trailing spaces df = pd.DataFrame({"vendor": ["ACME Corp ", "ACME Corp"]}) result = clean_dataframe(df) assert result.cleaned_df["vendor"].iloc[0] == result.cleaned_df["vendor"].iloc[1] def test_bank_export_crlf_in_memo(self): # UC5: \r\n inside multi-line memo cells df = pd.DataFrame({"memo": ["line one\r\nline two\r\nline three"]}) result = clean_dataframe(df) assert "\r" not in result.cleaned_df["memo"].iloc[0] assert result.cleaned_df["memo"].iloc[0].count("\n") == 2 # --------------------------------------------------------------------------- # Reporting / dtype edge cases # --------------------------------------------------------------------------- class TestReporting: def test_changes_columns_present(self): df = pd.DataFrame({"a": [" x "]}) result = clean_dataframe(df) assert list(result.changes.columns) == [ "row", "column", "old", "new", "ops_applied", ] def test_changes_empty_when_no_changes(self): df = pd.DataFrame({"a": ["x", "y"]}) result = clean_dataframe(df) assert result.cells_changed == 0 assert result.changes.empty def test_cells_total_counts_only_processed_columns(self): df = pd.DataFrame({"a": ["x", "y", "z"], "n": [1, 2, 3]}) result = clean_dataframe(df) assert result.cells_total == 3 # only "a" is processed class TestVisualizeHidden: """``visualize_hidden_*`` makes invisible characters visible to the user.""" def test_text_passes_ascii_through(self): from src.core.text_clean import visualize_hidden_text assert visualize_hidden_text("hello") == "hello" def test_text_labels_nbsp(self): from src.core.text_clean import visualize_hidden_text out = visualize_hidden_text("Hi\u00a0there") assert "[NBSP]" in out def test_text_labels_zwsp(self): from src.core.text_clean import visualize_hidden_text out = visualize_hidden_text("a\u200bb") assert "[ZWSP]" in out def test_text_labels_tab_and_newline(self): from src.core.text_clean import visualize_hidden_text out = visualize_hidden_text("a\tb\nc") assert "[TAB]" in out assert "[LF]" in out def test_text_labels_smart_quotes(self): from src.core.text_clean import visualize_hidden_text out = visualize_hidden_text("“hi”") assert "[L DQUOTE]" in out and "[R DQUOTE]" in out def test_text_labels_unmapped_control_with_codepoint(self): from src.core.text_clean import visualize_hidden_text out = visualize_hidden_text("a\x07b") # BEL assert "[U+0007]" in out def test_html_wraps_invisibles_in_span(self): from src.core.text_clean import visualize_hidden_html out = visualize_hidden_html("Hi\u00a0\u200bthere") assert '") assert "<" in out and "&" in out and ">" in out def test_html_passes_normal_text_through(self): from src.core.text_clean import visualize_hidden_html assert visualize_hidden_html("plain") == "plain" def test_css_returns_a_style_block(self): from src.core.text_clean import hidden_char_css css = hidden_char_css() assert "