"""Tests for src.core.format_standardize."""

import pandas as pd
import pytest

from src.core.format_standardize import (
    PRESETS,
    FieldType,
    StandardizeOptions,
    detect_currency_code,
    standardize_address,
    standardize_boolean,
    standardize_currency,
    standardize_dataframe,
    standardize_date,
    standardize_name,
    standardize_phone,
)


class TestStandardizeDate:
    """``standardize_date`` returns ``(text, changed)``; default output is ISO."""

    def test_iso_passthrough(self):
        out, changed = standardize_date("2024-01-15")
        assert out == "2024-01-15"
        assert changed is False

    def test_us_slash(self):
        out, changed = standardize_date("01/15/2024")
        assert (out, changed) == ("2024-01-15", True)

    def test_us_dash(self):
        out, _ = standardize_date("1-15-2024")
        assert out == "2024-01-15"

    def test_two_digit_year(self):
        out, _ = standardize_date("01/15/24")
        assert out == "2024-01-15"

    def test_long_month_name(self):
        out, _ = standardize_date("January 15, 2024")
        assert out == "2024-01-15"

    def test_short_month_name(self):
        out, _ = standardize_date("Jan 15 2024")
        assert out == "2024-01-15"

    def test_dmy_order(self):
        out, _ = standardize_date("15/01/2024", date_order="DMY")
        assert out == "2024-01-15"

    def test_strip_time_tail(self):
        out, _ = standardize_date("2024-01-15 13:45:00")
        assert out == "2024-01-15"

    def test_iso_with_t_separator(self):
        out, _ = standardize_date("2024-01-15T08:30:00Z")
        assert out == "2024-01-15"

    def test_compact(self):
        out, _ = standardize_date("20240115")
        assert out == "2024-01-15"

    def test_custom_output(self):
        out, _ = standardize_date("01/15/2024", output_format="%d %b %Y")
        assert out == "15 Jan 2024"

    def test_unparseable_passthrough(self):
        out, changed = standardize_date("hello")
        assert (out, changed) == ("hello", False)

    def test_empty(self):
        assert standardize_date("") == ("", False)
        assert standardize_date(None) == ("", False)

    def test_idempotent(self):
        # Feeding the canonical output back in must be a no-op.
        out, _ = standardize_date("01/15/2024")
        out2, changed2 = standardize_date(out)
        assert out2 == out
        assert changed2 is False


class TestStandardizePhone:
    """``standardize_phone`` returns ``(text, changed)``; default output is E.164."""

    def test_e164_default(self):
        out, _ = standardize_phone("(555) 123-4567")
        assert out == "+15551234567"

    def test_national(self):
        out, _ = standardize_phone("5551234567", output_format="NATIONAL")
        assert out == "(555) 123-4567"

    def test_international(self):
        out, _ = standardize_phone("5551234567", output_format="INTERNATIONAL")
        assert out == "+1 555-123-4567"

    def test_digits_only(self):
        out, changed = standardize_phone("(555) 123-4567", output_format="DIGITS")
        assert out == "5551234567"
        assert changed is True

    def test_invalid_passthrough(self):
        out, changed = standardize_phone("call me maybe")
        assert (out, changed) == ("call me maybe", False)

    def test_empty(self):
        assert standardize_phone("") == ("", False)
        assert standardize_phone(None) == ("", False)

    def test_idempotent(self):
        # Feeding the canonical output back in must be a no-op.
        out, _ = standardize_phone("(555) 123-4567")
        out2, changed2 = standardize_phone(out)
        assert out2 == out
        assert changed2 is False


class TestStandardizeCurrency:
    """``standardize_currency`` returns ``(text, changed)`` as a plain decimal."""

    def test_dollar_with_cents(self):
        out, _ = standardize_currency("$1,234.56")
        assert out == "1234.56"

    def test_no_decimals_arg(self):
        out, _ = standardize_currency("$1,234.56", decimals=None)
        assert out == "1234.56"

    def test_round_to_two(self):
        out, _ = standardize_currency("$1,234.567", decimals=2)
        assert out == "1234.57"

    def test_integer_input(self):
        out, _ = standardize_currency("$1,000", decimals=None)
        assert out == "1000"

    def test_negative_parens(self):
        out, _ = standardize_currency("($50.00)", decimals=2)
        assert out == "-50.00"

    def test_negative_sign(self):
        out, _ = standardize_currency("-$50.00", decimals=2)
        assert out == "-50.00"

    def test_iso_code_prefix(self):
        out, _ = standardize_currency("USD 1,234.56")
        assert out == "1234.56"

    def test_iso_code_suffix(self):
        out, _ = standardize_currency("1234.56 EUR")
        assert out == "1234.56"

    def test_european_decimal(self):
        out, _ = standardize_currency("1.234,56 €", decimal="comma")
        assert out == "1234.56"

    def test_unparseable_passthrough(self):
        out, changed = standardize_currency("free!")
        assert (out, changed) == ("free!", False)

    def test_ambiguous_short_comma_rejected(self):
        # "1,5" under dot-decimal mode would be a comma decimal — reject.
        out, changed = standardize_currency("1,5")
        assert changed is False
        assert out == "1,5"

    def test_thousands_grouped_no_decimal(self):
        out, _ = standardize_currency("1,234", decimals=None)
        assert out == "1234"

    def test_empty(self):
        assert standardize_currency("") == ("", False)
        assert standardize_currency(None) == ("", False)

    def test_idempotent(self):
        # Feeding the canonical output back in must be a no-op.
        out, _ = standardize_currency("$1,234.56", decimals=2)
        out2, changed2 = standardize_currency(out, decimals=2)
        assert out2 == out
        assert changed2 is False


class TestStandardizeName:
    """``standardize_name`` returns ``(text, changed)``; default case is title."""

    def test_shouting_to_title(self):
        out, _ = standardize_name("JOHN DOE")
        assert out == "John Doe"

    def test_lowercase_to_title(self):
        out, _ = standardize_name("john doe")
        assert out == "John Doe"

    def test_already_title(self):
        out, changed = standardize_name("Jane Smith")
        assert out == "Jane Smith"
        assert changed is False

    def test_apostrophe_inner_cap(self):
        # Surnames with O'/D' apostrophe prefixes get the inner letter
        # capitalized regardless of input case (corpus § 7.3 Irish names).
        out, _ = standardize_name("o'Connor")
        assert out == "O'Connor"
        out2, _ = standardize_name("o'connor")
        assert out2 == "O'Connor"

    def test_acronym_preserved(self):
        out, _ = standardize_name("Mary USA Smith")
        assert out == "Mary USA Smith"

    def test_upper_mode(self):
        out, _ = standardize_name("john doe", case="upper")
        assert out == "JOHN DOE"

    def test_lower_mode(self):
        out, _ = standardize_name("JOHN DOE", case="lower")
        assert out == "john doe"

    def test_empty(self):
        assert standardize_name("") == ("", False)
        assert standardize_name(None) == ("", False)

    def test_idempotent(self):
        # Feeding the canonical output back in must be a no-op.
        out, _ = standardize_name("JOHN DOE")
        out2, changed2 = standardize_name(out)
        assert out2 == out
        assert changed2 is False


class TestStandardizeAddress:
    """``standardize_address`` returns ``(text, changed)`` with abbreviations expanded."""

    def test_street(self):
        out, _ = standardize_address("123 Main St")
        assert out == "123 Main Street"

    def test_avenue_with_period(self):
        out, _ = standardize_address("456 Oak Ave.")
        assert out == "456 Oak Avenue"

    def test_apartment(self):
        out, _ = standardize_address("123 Main St Apt 4")
        assert out == "123 Main Street Apartment 4"

    def test_direction(self):
        out, _ = standardize_address("100 N Main St")
        assert out == "100 North Main Street"

    def test_combined(self):
        out, _ = standardize_address("789 pine blvd ste 200")
        assert out == "789 Pine Boulevard Suite 200"

    def test_already_expanded(self):
        out, changed = standardize_address("123 Main Street")
        assert out == "123 Main Street"
        assert changed is False

    def test_empty(self):
        assert standardize_address("") == ("", False)
        assert standardize_address(None) == ("", False)

    def test_idempotent(self):
        # Feeding the canonical output back in must be a no-op.
        out, _ = standardize_address("123 main st apt 4")
        out2, changed2 = standardize_address(out)
        assert out2 == out
        assert changed2 is False


class TestStandardizeBoolean:
    """``standardize_boolean`` returns ``(text, changed)``; default style is True/False."""

    @pytest.mark.parametrize("inp", ["yes", "Yes", "YES", "y", "Y", "true", "1", "on"])
    def test_truthy(self, inp):
        out, changed = standardize_boolean(inp)
        assert out == "True"
        assert changed is True

    @pytest.mark.parametrize("inp", ["no", "No", "NO", "n", "N", "false", "0", "off"])
    def test_falsy(self, inp):
        out, changed = standardize_boolean(inp)
        assert out == "False"
        assert changed is True

    def test_already_canonical(self):
        out, changed = standardize_boolean("True")
        assert out == "True"
        assert changed is False

    def test_python_bool(self):
        # Native bools are reported as changed: the output is a string.
        assert standardize_boolean(True) == ("True", True)
        assert standardize_boolean(False) == ("False", True)

    def test_int_zero_one(self):
        assert standardize_boolean(1) == ("True", True)
        assert standardize_boolean(0) == ("False", True)

    def test_yes_no_style(self):
        assert standardize_boolean("y", style="Yes/No") == ("Yes", True)
        assert standardize_boolean("0", style="Yes/No") == ("No", True)

    def test_unrecognized_passthrough(self):
        out, changed = standardize_boolean("maybe")
        assert (out, changed) == ("maybe", False)

    def test_empty(self):
        assert standardize_boolean("") == ("", False)
        assert standardize_boolean(None) == ("", False)

    def test_idempotent(self):
        # Feeding the canonical output back in must be a no-op.
        out, _ = standardize_boolean("yes")
        out2, changed2 = standardize_boolean(out)
        assert out2 == out
        assert changed2 is False


# ---------------------------------------------------------------------------
# DataFrame entry point
# ---------------------------------------------------------------------------


class TestStandardizeDataframe:
    """``standardize_dataframe`` applies per-column field types and reports an audit."""

    def test_mixed_columns(self):
        df = pd.DataFrame({
            "name": ["JOHN SMITH", "alice jones"],
            "phone": ["(555) 123-4567", "555.987.6543"],
            "amount": ["$1,234.56", "$50"],
            "joined": ["01/15/2024", "March 5 2023"],
            "active": ["yes", "0"],
            "address": ["123 Main St", "456 Oak Ave"],
            "skip_me": ["leave", "alone"],
        })
        opts = StandardizeOptions(
            column_types={
                "name": FieldType.NAME,
                "phone": FieldType.PHONE,
                "amount": FieldType.CURRENCY,
                "joined": FieldType.DATE,
                "active": FieldType.BOOLEAN,
                "address": FieldType.ADDRESS,
            },
        )
        result = standardize_dataframe(df, opts)
        out = result.standardized_df
        assert out.loc[0, "name"] == "John Smith"
        assert out.loc[1, "name"] == "Alice Jones"
        assert out.loc[0, "phone"] == "+15551234567"
        assert out.loc[1, "phone"] == "+15559876543"
        assert out.loc[0, "amount"] == "1234.56"
        assert out.loc[1, "amount"] == "50.00"
        assert out.loc[0, "joined"] == "2024-01-15"
        assert out.loc[1, "joined"] == "2023-03-05"
        assert out.loc[0, "active"] == "True"
        assert out.loc[1, "active"] == "False"
        assert out.loc[0, "address"] == "123 Main Street"
        assert out.loc[1, "address"] == "456 Oak Avenue"
        # Untouched column passes through verbatim.
        assert list(out["skip_me"]) == ["leave", "alone"]

    def test_changes_audit(self):
        df = pd.DataFrame({"d": ["01/15/2024", "2023-03-05"]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(df, opts)
        # Only the first row changed; the second was already canonical.
        assert result.cells_changed == 1
        assert len(result.changes) == 1
        assert result.changes.iloc[0]["row"] == 0
        assert result.changes.iloc[0]["column"] == "d"
        assert result.changes.iloc[0]["old"] == "01/15/2024"
        assert result.changes.iloc[0]["new"] == "2024-01-15"

    def test_unparseable_count(self):
        df = pd.DataFrame({"d": ["01/15/2024", "not a date", "2024-01-15"]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(df, opts)
        assert result.cells_unparseable == 1
        assert result.cells_total == 3

    def test_unknown_column_raises(self):
        df = pd.DataFrame({"a": ["1"]})
        opts = StandardizeOptions(column_types={"missing": FieldType.DATE})
        with pytest.raises(ValueError, match="not found"):
            standardize_dataframe(df, opts)

    def test_input_not_mutated(self):
        df = pd.DataFrame({"d": ["01/15/2024"]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        standardize_dataframe(df, opts)
        assert df.loc[0, "d"] == "01/15/2024"

    def test_options_serialization_roundtrip(self, tmp_path):
        opts = StandardizeOptions(
            column_types={"a": FieldType.DATE, "b": FieldType.PHONE},
            date_output_format="%d-%b-%Y",
            phone_format="NATIONAL",
        )
        path = tmp_path / "opts.json"
        opts.to_file(path)
        loaded = StandardizeOptions.from_file(path)
        assert loaded.column_types == {"a": FieldType.DATE, "b": FieldType.PHONE}
        assert loaded.date_output_format == "%d-%b-%Y"
        assert loaded.phone_format == "NATIONAL"

    def test_nan_passthrough(self):
        df = pd.DataFrame({"d": ["01/15/2024", None]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(df, opts)
        assert result.standardized_df.loc[0, "d"] == "2024-01-15"
        # The missing cell must stay missing.  Checked with pd.isna rather
        # than ``is None`` because the marker read back from a DataFrame may
        # be None, NaN or pd.NA depending on pandas version and column dtype.
        assert pd.isna(result.standardized_df.loc[1, "d"])


# ---------------------------------------------------------------------------
# Preset bundles
# ---------------------------------------------------------------------------


class TestPresets:
    """``StandardizeOptions.from_preset`` builds option bundles by preset name."""

    def test_us_default_iso_dates(self):
        opts = StandardizeOptions.from_preset("us-default")
        assert opts.date_output_format == "%Y-%m-%d"
        assert opts.date_order == "MDY"
        assert opts.phone_format == "E164"
        assert opts.boolean_style == "True/False"

    def test_european_dmy_comma(self):
        opts = StandardizeOptions.from_preset("european")
        assert opts.date_order == "DMY"
        assert opts.currency_decimal == "comma"
        assert opts.currency_preserve_code is True

    def test_uk_ddmmyyyy_yes_no(self):
        opts = StandardizeOptions.from_preset("uk")
        assert opts.date_output_format == "%d/%m/%Y"
        assert opts.phone_region == "GB"
        assert opts.boolean_style == "Yes/No"

    def test_iso_strict_lowercase_bools_no_rounding(self):
        opts = StandardizeOptions.from_preset("iso-strict")
        assert opts.boolean_style == "true/false"
        assert opts.currency_decimals is None
        assert opts.currency_preserve_code is True

    def test_legacy_us_national_phones(self):
        opts = StandardizeOptions.from_preset("legacy-us")
        assert opts.date_output_format == "%m/%d/%Y"
        assert opts.phone_format == "NATIONAL"
        assert opts.boolean_style == "Yes/No"

    def test_overrides_layer_on_top(self):
        opts = StandardizeOptions.from_preset(
            "uk",
            column_types={"name": FieldType.NAME},
            currency_decimals=4,
        )
        assert opts.column_types == {"name": FieldType.NAME}
        assert opts.currency_decimals == 4
        # UK-specific defaults survive what we didn't override.
        assert opts.phone_region == "GB"

    def test_unknown_preset_raises(self):
        with pytest.raises(ValueError, match="Unknown preset"):
            StandardizeOptions.from_preset("not-a-real-preset")

    def test_all_presets_loadable(self):
        # Smoke test: every advertised preset constructs cleanly.
        for name in PRESETS:
            opts = StandardizeOptions.from_preset(name)
            assert isinstance(opts, StandardizeOptions)

    def test_preset_drives_dataframe_pipeline(self):
        df = pd.DataFrame({
            "joined": ["15/01/2024"],
            "active": ["yes"],
            "amount": ["1.234,56 €"],
        })
        opts = StandardizeOptions.from_preset(
            "european",
            column_types={
                "joined": FieldType.DATE,
                "active": FieldType.BOOLEAN,
                "amount": FieldType.CURRENCY,
            },
        )
        result = standardize_dataframe(df, opts)
        out = result.standardized_df
        assert out.loc[0, "joined"] == "2024-01-15"  # ISO output for european
        assert out.loc[0, "active"] == "True"
        assert out.loc[0, "amount"] == "EUR 1234.56"  # preserve_code on


# ---------------------------------------------------------------------------
# Currency code detection / preservation
# ---------------------------------------------------------------------------


class TestCurrencyCodeDetection:
    """``detect_currency_code`` maps a symbol or ISO code in the text to an ISO code."""

    @pytest.mark.parametrize("inp,code", [
        ("$1,234.56", "USD"),
        ("€1.234,56", "EUR"),
        ("£99.00", "GBP"),
        ("¥5000", "JPY"),
        ("₹500", "INR"),
        ("USD 1234", "USD"),
        ("1234 EUR", "EUR"),
        ("eur 50", "EUR"),
    ])
    def test_detects(self, inp, code):
        assert detect_currency_code(inp) == code

    def test_no_marker_returns_none(self):
        assert detect_currency_code("1234.56") is None

    def test_non_string_returns_none(self):
        assert detect_currency_code(None) is None  # type: ignore[arg-type]
        assert detect_currency_code(1234) is None  # type: ignore[arg-type]


class TestCurrencyPreserveCode:
    """``preserve_code=True`` prefixes the amount with the detected ISO code."""

    def test_dollar_preserved(self):
        out, changed = standardize_currency("$1,234.56", decimals=2, preserve_code=True)
        assert out == "USD 1234.56"
        assert changed is True

    def test_euro_preserved_comma_decimal(self):
        out, _ = standardize_currency(
            "1.234,56 €",
            decimal="comma",
            decimals=2,
            preserve_code=True,
        )
        assert out == "EUR 1234.56"

    def test_iso_code_input_preserved(self):
        out, _ = standardize_currency("USD 1234.56", decimals=2, preserve_code=True)
        assert out == "USD 1234.56"

    def test_no_marker_no_prefix(self):
        out, _ = standardize_currency("1234.56", decimals=2, preserve_code=True)
        assert out == "1234.56"

    def test_off_by_default(self):
        out, _ = standardize_currency("$1,234.56", decimals=2)
        assert out == "1234.56"

    def test_pipeline_preserve_code(self):
        df = pd.DataFrame({"price": ["$50.00", "€30,00", "100", "USD 12.34"]})
        opts = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_decimals=2,
            currency_preserve_code=True,
            currency_decimal="dot",  # mixed input — euro will need its own
        )
        # Note: comma-decimal euro won't parse under dot mode; treat that
        # as a known limitation — this test exercises the dot-input path,
        # so row 1 is deliberately left unasserted.
        result = standardize_dataframe(df, opts)
        out = result.standardized_df
        assert out.loc[0, "price"] == "USD 50.00"
        assert out.loc[2, "price"] == "100.00"
        assert out.loc[3, "price"] == "USD 12.34"

    def test_canonical_check_recognizes_code_prefix(self):
        # "USD 50.00" should pass through unchanged when preserve_code is on
        # — and NOT count as unparseable.
        df = pd.DataFrame({"price": ["USD 50.00", "garbage"]})
        opts = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_decimals=2,
            currency_preserve_code=True,
        )
        result = standardize_dataframe(df, opts)
        assert result.cells_changed == 0
        # Only "garbage" counts as unparseable.
        assert result.cells_unparseable == 1


# ---------------------------------------------------------------------------
# User-editable abbreviations
# ---------------------------------------------------------------------------


class TestExtraAbbreviations:
    """``extra_abbreviations`` adds to or overrides the address expansion table."""

    def test_extra_expansion(self):
        out, _ = standardize_address(
            "Bahnhofstrasse 12",
            extra_abbreviations={"strasse": "Straße"},
        )
        # smart_title_case will Title-case the result; "Bahnhofstrasse" is
        # already a single token (no embedded space) so it doesn't hit the
        # abbreviation lookup.  The separated form is covered by
        # test_extra_expansion_separated_token.
        assert "Bahnhofstrasse" in out  # not split → not expanded

    def test_extra_expansion_separated_token(self):
        out, _ = standardize_address(
            "Haupt strasse 12",
            extra_abbreviations={"strasse": "Straße"},
        )
        assert "Straße" in out

    def test_override_existing_entry(self):
        # Override "ave" to emit Spanish-language "Avenida".
        out, _ = standardize_address(
            "456 Oak Ave",
            extra_abbreviations={"ave": "Avenida"},
        )
        assert "Avenida" in out
        assert "Avenue" not in out

    def test_period_form_works(self):
        # Lookup is casefold + period-stripped, so ``Ave.`` still matches.
        out, _ = standardize_address(
            "456 Oak Ave.",
            extra_abbreviations={"ave": "Avenida"},
        )
        assert "Avenida" in out

    def test_empty_value_skipped(self):
        # Empty values in the user table don't blow up; they're ignored.
        out, _ = standardize_address(
            "456 Oak Ave",
            extra_abbreviations={"ave": "", " ": "Drive"},
        )
        # Built-in expansion still applies.
        assert "Avenue" in out

    def test_no_extras_unchanged_behavior(self):
        out_a, _ = standardize_address("123 Main St")
        out_b, _ = standardize_address("123 Main St", extra_abbreviations={})
        out_c, _ = standardize_address("123 Main St", extra_abbreviations=None)
        assert out_a == out_b == out_c == "123 Main Street"

    def test_pipeline_uses_extras(self):
        df = pd.DataFrame({"addr": ["456 Oak Ave"]})
        opts = StandardizeOptions(
            column_types={"addr": FieldType.ADDRESS},
            extra_abbreviations={"ave": "Avenida"},
        )
        result = standardize_dataframe(df, opts)
        assert "Avenida" in result.standardized_df.loc[0, "addr"]

    def test_serialization_roundtrip_with_extras(self, tmp_path):
        opts = StandardizeOptions(
            column_types={"addr": FieldType.ADDRESS},
            extra_abbreviations={"strasse": "Straße", "platz": "Platz"},
            currency_preserve_code=True,
        )
        path = tmp_path / "opts.json"
        opts.to_file(path)
        loaded = StandardizeOptions.from_file(path)
        assert loaded.extra_abbreviations == {"strasse": "Straße", "platz": "Platz"}
        assert loaded.currency_preserve_code is True