Adds src/core/format_standardize.py — a per-cell standardizer for dates,
phones, emails, addresses, names, currencies, booleans — wired through
StandardizeOptions / standardize_dataframe with FieldType registry.
Includes:
- Date parser handles ISO/US/EU/longform/excel-serial/unix-timestamp/
partial-precision/quarter notation; opt-in French/German/Spanish month
dictionaries via month_locales.
- Phone via libphonenumber with extension preservation (;ext=N), 001
international prefix handling, error sentinels for placeholders /
multi-number cells.
- Email lowercase/trim/mailto/angle-bracket strip with optional
--gmail-canonical mode.
- Address USPS abbreviation expansion or compression (expand=False per
corpus § 6.3), state-name → 2-letter conversion, multi-line collapse,
PO Box normalization, state-code preservation regardless of input case.
- Name handler: Mc/Mac/O'/D' inner caps, hyphen segments, particle
lowercasing (von/van/de/da), comma-format reversal, period stripping
for titles/suffixes/initials, PhD/MD acronym preservation, conservative
mode for mixed-case input.
- Currency: auto-detect EU vs US separators, space-thousands, Swiss
apostrophe, accounting parens, optional ISO code preservation, error
sentinels for percentages/ranges/word-values/ambiguous separators.
- Per-domain error_policy ("passthrough" | "sentinel") for surfacing
malformed values as <error: reason> per corpus § 0.3.
Test corpus from Business/DataTools/test-cases-format-cleaner copied to
test-cases/format-cleaner-corpus/ — 7 fixtures plus FORMATS-CASES.md.
tests/test_format_standardize_corpus.py drives all 199 rows through the
per-cell standardizers; 0 xfailed.
Wires the GUI page (3_Format_Standardizer.py) to "Ready" status.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
631 lines
22 KiB
Python
631 lines
22 KiB
Python
"""Tests for src.core.format_standardize."""
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.format_standardize import (
|
|
PRESETS,
|
|
FieldType,
|
|
StandardizeOptions,
|
|
detect_currency_code,
|
|
standardize_address,
|
|
standardize_boolean,
|
|
standardize_currency,
|
|
standardize_dataframe,
|
|
standardize_date,
|
|
standardize_name,
|
|
standardize_phone,
|
|
)
|
|
|
|
|
|
class TestStandardizeDate:
    """Per-cell checks for ``standardize_date``.

    Each case feeds one raw value and compares the returned
    ``(text, changed)`` pair (or just the text) with the expected result.
    """

    def test_iso_passthrough(self):
        # Input that is already ISO comes back verbatim and flagged unchanged.
        text, was_changed = standardize_date("2024-01-15")
        assert text == "2024-01-15"
        assert was_changed is False

    def test_us_slash(self):
        text, was_changed = standardize_date("01/15/2024")
        expected = ("2024-01-15", True)
        assert (text, was_changed) == expected

    def test_us_dash(self):
        text, _flag = standardize_date("1-15-2024")
        assert text == "2024-01-15"

    def test_two_digit_year(self):
        text, _flag = standardize_date("01/15/24")
        assert text == "2024-01-15"

    def test_long_month_name(self):
        text, _flag = standardize_date("January 15, 2024")
        assert text == "2024-01-15"

    def test_short_month_name(self):
        text, _flag = standardize_date("Jan 15 2024")
        assert text == "2024-01-15"

    def test_dmy_order(self):
        # Day-first input is only used here together with date_order="DMY".
        text, _flag = standardize_date("15/01/2024", date_order="DMY")
        assert text == "2024-01-15"

    def test_strip_time_tail(self):
        text, _flag = standardize_date("2024-01-15 13:45:00")
        assert text == "2024-01-15"

    def test_iso_with_t_separator(self):
        text, _flag = standardize_date("2024-01-15T08:30:00Z")
        assert text == "2024-01-15"

    def test_compact(self):
        text, _flag = standardize_date("20240115")
        assert text == "2024-01-15"

    def test_custom_output(self):
        text, _flag = standardize_date("01/15/2024", output_format="%d %b %Y")
        assert text == "15 Jan 2024"

    def test_unparseable_passthrough(self):
        # A value that is not a date is expected back untouched, unchanged.
        text, was_changed = standardize_date("hello")
        expected = ("hello", False)
        assert (text, was_changed) == expected

    def test_empty(self):
        # Both the empty string and None are expected to yield ("", False).
        blank = ("", False)
        assert standardize_date("") == blank
        assert standardize_date(None) == blank

    def test_idempotent(self):
        # Feeding the output back in must give the same text, unchanged.
        first_pass, _flag = standardize_date("01/15/2024")
        second_pass, changed_again = standardize_date(first_pass)
        assert second_pass == first_pass
        assert changed_again is False
|
|
|
|
|
|
class TestStandardizePhone:
    """Per-cell checks for ``standardize_phone`` and its ``output_format`` values."""

    def test_e164_default(self):
        # No output_format is passed; the expected text is the +1... form.
        number, _flag = standardize_phone("(555) 123-4567")
        assert number == "+15551234567"

    def test_national(self):
        number, _flag = standardize_phone("5551234567", output_format="NATIONAL")
        assert number == "(555) 123-4567"

    def test_international(self):
        number, _flag = standardize_phone(
            "5551234567", output_format="INTERNATIONAL"
        )
        assert number == "+1 555-123-4567"

    def test_digits_only(self):
        number, was_changed = standardize_phone(
            "(555) 123-4567", output_format="DIGITS"
        )
        assert number == "5551234567"
        assert was_changed is True

    def test_invalid_passthrough(self):
        # Text that is not a phone number is expected back untouched.
        number, was_changed = standardize_phone("call me maybe")
        expected = ("call me maybe", False)
        assert (number, was_changed) == expected

    def test_empty(self):
        blank = ("", False)
        assert standardize_phone("") == blank
        assert standardize_phone(None) == blank

    def test_idempotent(self):
        # A second pass over the first pass's output must be a no-op.
        first_pass, _flag = standardize_phone("(555) 123-4567")
        second_pass, changed_again = standardize_phone(first_pass)
        assert second_pass == first_pass
        assert changed_again is False
|
|
|
|
|
|
class TestStandardizeCurrency:
    """Per-cell checks for ``standardize_currency``.

    Covers symbols, ISO codes, negative notations, the ``decimals`` and
    ``decimal`` arguments, and values that are expected to pass through.
    """

    def test_dollar_with_cents(self):
        amount, _flag = standardize_currency("$1,234.56")
        assert amount == "1234.56"

    def test_no_decimals_arg(self):
        amount, _flag = standardize_currency("$1,234.56", decimals=None)
        assert amount == "1234.56"

    def test_round_to_two(self):
        amount, _flag = standardize_currency("$1,234.567", decimals=2)
        assert amount == "1234.57"

    def test_integer_input(self):
        amount, _flag = standardize_currency("$1,000", decimals=None)
        assert amount == "1000"

    def test_negative_parens(self):
        # Accounting-style parentheses are expected to become a minus sign.
        amount, _flag = standardize_currency("($50.00)", decimals=2)
        assert amount == "-50.00"

    def test_negative_sign(self):
        amount, _flag = standardize_currency("-$50.00", decimals=2)
        assert amount == "-50.00"

    def test_iso_code_prefix(self):
        amount, _flag = standardize_currency("USD 1,234.56")
        assert amount == "1234.56"

    def test_iso_code_suffix(self):
        amount, _flag = standardize_currency("1234.56 EUR")
        assert amount == "1234.56"

    def test_european_decimal(self):
        amount, _flag = standardize_currency("1.234,56 €", decimal="comma")
        assert amount == "1234.56"

    def test_unparseable_passthrough(self):
        amount, was_changed = standardize_currency("free!")
        expected = ("free!", False)
        assert (amount, was_changed) == expected

    def test_ambiguous_short_comma_rejected(self):
        # With default arguments, "1,5" reads like a comma decimal, so the
        # value is expected to be left alone rather than reinterpreted.
        amount, was_changed = standardize_currency("1,5")
        assert was_changed is False
        assert amount == "1,5"

    def test_thousands_grouped_no_decimal(self):
        amount, _flag = standardize_currency("1,234", decimals=None)
        assert amount == "1234"

    def test_empty(self):
        blank = ("", False)
        assert standardize_currency("") == blank
        assert standardize_currency(None) == blank

    def test_idempotent(self):
        # A second pass with the same decimals setting must be a no-op.
        first_pass, _flag = standardize_currency("$1,234.56", decimals=2)
        second_pass, changed_again = standardize_currency(first_pass, decimals=2)
        assert second_pass == first_pass
        assert changed_again is False
|
|
|
|
|
|
class TestStandardizeName:
    """Per-cell checks for ``standardize_name`` and its ``case`` modes."""

    def test_shouting_to_title(self):
        name, _flag = standardize_name("JOHN DOE")
        assert name == "John Doe"

    def test_lowercase_to_title(self):
        name, _flag = standardize_name("john doe")
        assert name == "John Doe"

    def test_already_title(self):
        name, was_changed = standardize_name("Jane Smith")
        assert name == "Jane Smith"
        assert was_changed is False

    def test_apostrophe_inner_cap(self):
        # O'/D' apostrophe-prefixed surnames should get the inner letter
        # capitalized whatever the input case (corpus § 7.3 Irish names).
        mixed_case, _flag = standardize_name("o'Connor")
        assert mixed_case == "O'Connor"
        all_lower, _flag = standardize_name("o'connor")
        assert all_lower == "O'Connor"

    def test_acronym_preserved(self):
        name, _flag = standardize_name("Mary USA Smith")
        assert name == "Mary USA Smith"

    def test_upper_mode(self):
        name, _flag = standardize_name("john doe", case="upper")
        assert name == "JOHN DOE"

    def test_lower_mode(self):
        name, _flag = standardize_name("JOHN DOE", case="lower")
        assert name == "john doe"

    def test_empty(self):
        blank = ("", False)
        assert standardize_name("") == blank
        assert standardize_name(None) == blank

    def test_idempotent(self):
        # Re-running on the first result must return it unchanged.
        first_pass, _flag = standardize_name("JOHN DOE")
        second_pass, changed_again = standardize_name(first_pass)
        assert second_pass == first_pass
        assert changed_again is False
|
|
|
|
|
|
class TestStandardizeAddress:
    """Per-cell checks for ``standardize_address`` abbreviation expansion."""

    def test_street(self):
        address, _flag = standardize_address("123 Main St")
        assert address == "123 Main Street"

    def test_avenue_with_period(self):
        # The trailing period on the abbreviation is expected to disappear.
        address, _flag = standardize_address("456 Oak Ave.")
        assert address == "456 Oak Avenue"

    def test_apartment(self):
        address, _flag = standardize_address("123 Main St Apt 4")
        assert address == "123 Main Street Apartment 4"

    def test_direction(self):
        address, _flag = standardize_address("100 N Main St")
        assert address == "100 North Main Street"

    def test_combined(self):
        # Lower-case input: casing and two abbreviations are fixed together.
        address, _flag = standardize_address("789 pine blvd ste 200")
        assert address == "789 Pine Boulevard Suite 200"

    def test_already_expanded(self):
        address, was_changed = standardize_address("123 Main Street")
        assert address == "123 Main Street"
        assert was_changed is False

    def test_empty(self):
        blank = ("", False)
        assert standardize_address("") == blank
        assert standardize_address(None) == blank

    def test_idempotent(self):
        # Re-running on the first result must return it unchanged.
        first_pass, _flag = standardize_address("123 main st apt 4")
        second_pass, changed_again = standardize_address(first_pass)
        assert second_pass == first_pass
        assert changed_again is False
|
|
|
|
|
|
class TestStandardizeBoolean:
    """Per-cell checks for ``standardize_boolean`` and its ``style`` argument."""

    @pytest.mark.parametrize("inp", ["yes", "Yes", "YES", "y", "Y", "true", "1", "on"])
    def test_truthy(self, inp):
        # Every spelling in the list should map to "True" and count as a change.
        text, was_changed = standardize_boolean(inp)
        assert text == "True"
        assert was_changed is True

    @pytest.mark.parametrize("inp", ["no", "No", "NO", "n", "N", "false", "0", "off"])
    def test_falsy(self, inp):
        # Every spelling in the list should map to "False" and count as a change.
        text, was_changed = standardize_boolean(inp)
        assert text == "False"
        assert was_changed is True

    def test_already_canonical(self):
        text, was_changed = standardize_boolean("True")
        assert text == "True"
        assert was_changed is False

    def test_python_bool(self):
        # Real bool objects are expected to be reported as changed.
        assert standardize_boolean(True) == ("True", True)
        assert standardize_boolean(False) == ("False", True)

    def test_int_zero_one(self):
        assert standardize_boolean(1) == ("True", True)
        assert standardize_boolean(0) == ("False", True)

    def test_yes_no_style(self):
        yes_no = "Yes/No"
        assert standardize_boolean("y", style=yes_no) == ("Yes", True)
        assert standardize_boolean("0", style=yes_no) == ("No", True)

    def test_unrecognized_passthrough(self):
        text, was_changed = standardize_boolean("maybe")
        expected = ("maybe", False)
        assert (text, was_changed) == expected

    def test_empty(self):
        blank = ("", False)
        assert standardize_boolean("") == blank
        assert standardize_boolean(None) == blank

    def test_idempotent(self):
        # Re-running on the first result must return it unchanged.
        first_pass, _flag = standardize_boolean("yes")
        second_pass, changed_again = standardize_boolean(first_pass)
        assert second_pass == first_pass
        assert changed_again is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DataFrame entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestStandardizeDataframe:
    """Checks for ``standardize_dataframe`` driven by ``StandardizeOptions``."""

    def test_mixed_columns(self):
        # One column per field type, plus one column with no declared type.
        frame = pd.DataFrame({
            "name": ["JOHN SMITH", "alice jones"],
            "phone": ["(555) 123-4567", "555.987.6543"],
            "amount": ["$1,234.56", "$50"],
            "joined": ["01/15/2024", "March 5 2023"],
            "active": ["yes", "0"],
            "address": ["123 Main St", "456 Oak Ave"],
            "skip_me": ["leave", "alone"],
        })
        options = StandardizeOptions(
            column_types={
                "name": FieldType.NAME,
                "phone": FieldType.PHONE,
                "amount": FieldType.CURRENCY,
                "joined": FieldType.DATE,
                "active": FieldType.BOOLEAN,
                "address": FieldType.ADDRESS,
            },
        )
        cleaned = standardize_dataframe(frame, options).standardized_df

        # Expected cell values, listed per column in row order.
        expected_cells = {
            "name": ["John Smith", "Alice Jones"],
            "phone": ["+15551234567", "+15559876543"],
            "amount": ["1234.56", "50.00"],
            "joined": ["2024-01-15", "2023-03-05"],
            "active": ["True", "False"],
            "address": ["123 Main Street", "456 Oak Avenue"],
        }
        for column, wanted in expected_cells.items():
            for row, value in enumerate(wanted):
                assert cleaned.loc[row, column] == value

        # The column without a declared type must come through verbatim.
        assert list(cleaned["skip_me"]) == ["leave", "alone"]

    def test_changes_audit(self):
        frame = pd.DataFrame({"d": ["01/15/2024", "2023-03-05"]})
        options = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(frame, options)

        # Row 0 is rewritten; row 1 is already canonical, so one change total.
        assert result.cells_changed == 1
        assert len(result.changes) == 1
        entry = result.changes.iloc[0]
        assert entry["row"] == 0
        assert entry["column"] == "d"
        assert entry["old"] == "01/15/2024"
        assert entry["new"] == "2024-01-15"

    def test_unparseable_count(self):
        frame = pd.DataFrame({"d": ["01/15/2024", "not a date", "2024-01-15"]})
        options = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(frame, options)
        assert result.cells_unparseable == 1
        assert result.cells_total == 3

    def test_unknown_column_raises(self):
        # Declaring a type for a column the frame lacks must raise ValueError.
        frame = pd.DataFrame({"a": ["1"]})
        options = StandardizeOptions(column_types={"missing": FieldType.DATE})
        with pytest.raises(ValueError, match="not found"):
            standardize_dataframe(frame, options)

    def test_input_not_mutated(self):
        frame = pd.DataFrame({"d": ["01/15/2024"]})
        options = StandardizeOptions(column_types={"d": FieldType.DATE})
        standardize_dataframe(frame, options)
        # The caller's frame still holds the raw value afterwards.
        assert frame.loc[0, "d"] == "01/15/2024"

    def test_options_serialization_roundtrip(self, tmp_path):
        original = StandardizeOptions(
            column_types={"a": FieldType.DATE, "b": FieldType.PHONE},
            date_output_format="%d-%b-%Y",
            phone_format="NATIONAL",
        )
        target = tmp_path / "opts.json"
        original.to_file(target)
        restored = StandardizeOptions.from_file(target)
        assert restored.column_types == {"a": FieldType.DATE, "b": FieldType.PHONE}
        assert restored.date_output_format == "%d-%b-%Y"
        assert restored.phone_format == "NATIONAL"

    def test_nan_passthrough(self):
        frame = pd.DataFrame({"d": ["01/15/2024", None]})
        options = StandardizeOptions(column_types={"d": FieldType.DATE})
        cleaned = standardize_dataframe(frame, options).standardized_df
        assert cleaned.loc[0, "d"] == "2024-01-15"
        # The missing cell is expected to remain the None object itself.
        assert cleaned.loc[1, "d"] is None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Preset bundles
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestPresets:
    """Checks for ``StandardizeOptions.from_preset`` and the ``PRESETS`` table."""

    def test_us_default_iso_dates(self):
        preset = StandardizeOptions.from_preset("us-default")
        assert preset.date_output_format == "%Y-%m-%d"
        assert preset.date_order == "MDY"
        assert preset.phone_format == "E164"
        assert preset.boolean_style == "True/False"

    def test_european_dmy_comma(self):
        preset = StandardizeOptions.from_preset("european")
        assert preset.date_order == "DMY"
        assert preset.currency_decimal == "comma"
        assert preset.currency_preserve_code is True

    def test_uk_ddmmyyyy_yes_no(self):
        preset = StandardizeOptions.from_preset("uk")
        assert preset.date_output_format == "%d/%m/%Y"
        assert preset.phone_region == "GB"
        assert preset.boolean_style == "Yes/No"

    def test_iso_strict_lowercase_bools_no_rounding(self):
        preset = StandardizeOptions.from_preset("iso-strict")
        assert preset.boolean_style == "true/false"
        assert preset.currency_decimals is None
        assert preset.currency_preserve_code is True

    def test_legacy_us_national_phones(self):
        preset = StandardizeOptions.from_preset("legacy-us")
        assert preset.date_output_format == "%m/%d/%Y"
        assert preset.phone_format == "NATIONAL"
        assert preset.boolean_style == "Yes/No"

    def test_overrides_layer_on_top(self):
        preset = StandardizeOptions.from_preset(
            "uk",
            column_types={"name": FieldType.NAME},
            currency_decimals=4,
        )
        assert preset.column_types == {"name": FieldType.NAME}
        assert preset.currency_decimals == 4
        # Settings that were not overridden keep their UK preset values.
        assert preset.phone_region == "GB"

    def test_unknown_preset_raises(self):
        with pytest.raises(ValueError, match="Unknown preset"):
            StandardizeOptions.from_preset("not-a-real-preset")

    def test_all_presets_loadable(self):
        # Smoke check: each name listed in PRESETS builds an options object.
        for preset_name in PRESETS:
            built = StandardizeOptions.from_preset(preset_name)
            assert isinstance(built, StandardizeOptions)

    def test_preset_drives_dataframe_pipeline(self):
        frame = pd.DataFrame({
            "joined": ["15/01/2024"],
            "active": ["yes"],
            "amount": ["1.234,56 €"],
        })
        options = StandardizeOptions.from_preset(
            "european",
            column_types={
                "joined": FieldType.DATE,
                "active": FieldType.BOOLEAN,
                "amount": FieldType.CURRENCY,
            },
        )
        cleaned = standardize_dataframe(frame, options).standardized_df
        # The european preset is expected to emit ISO dates.
        assert cleaned.loc[0, "joined"] == "2024-01-15"
        assert cleaned.loc[0, "active"] == "True"
        # The preset has currency_preserve_code on, hence the "EUR " prefix.
        assert cleaned.loc[0, "amount"] == "EUR 1234.56"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Currency code detection / preservation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCurrencyCodeDetection:
    """Checks for ``detect_currency_code`` on symbols, ISO codes and non-matches."""

    @pytest.mark.parametrize("inp,code", [
        # Currency symbols.
        ("$1,234.56", "USD"),
        ("€1.234,56", "EUR"),
        ("£99.00", "GBP"),
        ("¥5000", "JPY"),
        ("₹500", "INR"),
        # ISO codes as prefix, as suffix, and in lower case.
        ("USD 1234", "USD"),
        ("1234 EUR", "EUR"),
        ("eur 50", "EUR"),
    ])
    def test_detects(self, inp, code):
        detected = detect_currency_code(inp)
        assert detected == code

    def test_no_marker_returns_none(self):
        # A bare number carries no symbol or code to detect.
        detected = detect_currency_code("1234.56")
        assert detected is None

    def test_non_string_returns_none(self):
        # Non-string arguments are expected to give None, not an exception.
        assert detect_currency_code(None) is None  # type: ignore[arg-type]
        assert detect_currency_code(1234) is None  # type: ignore[arg-type]
|
|
|
|
|
|
class TestCurrencyPreserveCode:
    """Checks for the ``preserve_code`` option, per cell and through the pipeline."""

    def test_dollar_preserved(self):
        amount, was_changed = standardize_currency(
            "$1,234.56", decimals=2, preserve_code=True
        )
        assert amount == "USD 1234.56"
        assert was_changed is True

    def test_euro_preserved_comma_decimal(self):
        amount, _flag = standardize_currency(
            "1.234,56 €",
            decimal="comma",
            decimals=2,
            preserve_code=True,
        )
        assert amount == "EUR 1234.56"

    def test_iso_code_input_preserved(self):
        amount, _flag = standardize_currency(
            "USD 1234.56", decimals=2, preserve_code=True
        )
        assert amount == "USD 1234.56"

    def test_no_marker_no_prefix(self):
        # With no symbol or code in the input, no prefix is expected.
        amount, _flag = standardize_currency(
            "1234.56", decimals=2, preserve_code=True
        )
        assert amount == "1234.56"

    def test_off_by_default(self):
        # preserve_code is not passed, so the output is the bare number.
        amount, _flag = standardize_currency("$1,234.56", decimals=2)
        assert amount == "1234.56"

    def test_pipeline_preserve_code(self):
        frame = pd.DataFrame({"price": ["$50.00", "€30,00", "100", "USD 12.34"]})
        options = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_decimals=2,
            currency_preserve_code=True,
            currency_decimal="dot",  # mixed input — euro will need its own
        )
        # Known limitation: the comma-decimal euro cell (row 1) is not
        # expected to parse in dot mode, so only the dot-input rows are
        # asserted here.
        cleaned = standardize_dataframe(frame, options).standardized_df
        assert cleaned.loc[0, "price"] == "USD 50.00"
        assert cleaned.loc[2, "price"] == "100.00"
        assert cleaned.loc[3, "price"] == "USD 12.34"

    def test_canonical_check_recognizes_code_prefix(self):
        # With preserve_code on, "USD 50.00" is already canonical: it must
        # be neither a change nor an unparseable cell.
        frame = pd.DataFrame({"price": ["USD 50.00", "garbage"]})
        options = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_decimals=2,
            currency_preserve_code=True,
        )
        result = standardize_dataframe(frame, options)
        assert result.cells_changed == 0
        # "garbage" is the single unparseable cell.
        assert result.cells_unparseable == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# User-editable abbreviations
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestExtraAbbreviations:
    """Checks for user-supplied ``extra_abbreviations`` in address handling."""

    def test_extra_expansion(self):
        address, _flag = standardize_address(
            "Bahnhofstrasse 12",
            extra_abbreviations={"strasse": "Straße"},
        )
        # "Bahnhofstrasse" is one token with no embedded space, so the
        # "strasse" entry is not expected to match inside it. The separated
        # form is covered by test_extra_expansion_separated_token.
        assert "Bahnhofstrasse" in address  # not split → not expanded

    def test_extra_expansion_separated_token(self):
        address, _flag = standardize_address(
            "Haupt strasse 12",
            extra_abbreviations={"strasse": "Straße"},
        )
        assert "Straße" in address

    def test_override_existing_entry(self):
        # A user entry for "ave" replaces the built-in expansion with the
        # Spanish-language "Avenida".
        address, _flag = standardize_address(
            "456 Oak Ave",
            extra_abbreviations={"ave": "Avenida"},
        )
        assert "Avenida" in address
        assert "Avenue" not in address

    def test_period_form_works(self):
        # Lookup is casefold + period-stripped, so ``Ave.`` matches "ave".
        address, _flag = standardize_address(
            "456 Oak Ave.",
            extra_abbreviations={"ave": "Avenida"},
        )
        assert "Avenida" in address

    def test_empty_value_skipped(self):
        # Blank keys or values in the user table are ignored, not errors.
        address, _flag = standardize_address(
            "456 Oak Ave",
            extra_abbreviations={"ave": "", " ": "Drive"},
        )
        # So the built-in "Avenue" expansion still applies.
        assert "Avenue" in address

    def test_no_extras_unchanged_behavior(self):
        # Omitted, empty and None tables must all give the built-in result.
        baseline, _flag = standardize_address("123 Main St")
        with_empty, _flag = standardize_address(
            "123 Main St", extra_abbreviations={}
        )
        with_none, _flag = standardize_address(
            "123 Main St", extra_abbreviations=None
        )
        assert (
            baseline == with_empty
            and with_empty == with_none
            and with_none == "123 Main Street"
        )

    def test_pipeline_uses_extras(self):
        frame = pd.DataFrame({"addr": ["456 Oak Ave"]})
        options = StandardizeOptions(
            column_types={"addr": FieldType.ADDRESS},
            extra_abbreviations={"ave": "Avenida"},
        )
        cleaned = standardize_dataframe(frame, options).standardized_df
        assert "Avenida" in cleaned.loc[0, "addr"]

    def test_serialization_roundtrip_with_extras(self, tmp_path):
        original = StandardizeOptions(
            column_types={"addr": FieldType.ADDRESS},
            extra_abbreviations={"strasse": "Straße", "platz": "Platz"},
            currency_preserve_code=True,
        )
        target = tmp_path / "opts.json"
        original.to_file(target)
        restored = StandardizeOptions.from_file(target)
        assert restored.extra_abbreviations == {"strasse": "Straße", "platz": "Platz"}
        assert restored.currency_preserve_code is True
|