"""Acceptance corpus for international format standardization. Stresses the rework's three pillars on a single mixed-locale fixture: * Per-row country column drives phone parsing. * ``currency_decimal="auto"`` resolves comma-decimal locales. * Streaming entry point handles the same content unchanged. """ from __future__ import annotations from pathlib import Path import pandas as pd import pytest from src.core.format_standardize import ( FieldType, StandardizeOptions, standardize_dataframe, standardize_file, ) CORPUS = Path(__file__).resolve().parents[1] / "test-cases" / "format-cleaner-corpus" / "international" FIXTURE = CORPUS / "intl_phones_addresses.csv" @pytest.fixture(scope="module") def df(): return pd.read_csv(FIXTURE, dtype=str, keep_default_na=False) @pytest.fixture(scope="module") def options(): return StandardizeOptions( column_types={ "name": FieldType.NAME, "phone": FieldType.PHONE, "price": FieldType.CURRENCY, }, phone_country_column="country", currency_preserve_code=True, currency_decimal="auto", ) class TestPhonesByRegion: def test_every_row_lands_on_correct_e164_prefix(self, df, options): # Each row's country column drives the per-row region used by # phonenumbers.parse — the correct + prefix is the acceptance bar. res = standardize_dataframe(df, options) out = res.standardized_df # ISO-2 → expected E.164 country code prefix prefix_for_country = { "US": "+1", "GB": "+44", "RU": "+7", "ES": "+34", "FR": "+33", "JP": "+81", "DE": "+49", "IT": "+39", "CN": "+86", "IN": "+91", "EG": "+20", "AU": "+61", "BR": "+55", "MX": "+52", "KR": "+82", "TR": "+90", "IL": "+972", "PL": "+48", "DK": "+45", "SE": "+46", } bad: list[tuple[str, str, str]] = [] for _, row in out.iterrows(): want = prefix_for_country[row["country"]] got = row["phone"] if not got.startswith(want): bad.append((row["country"], want, got)) assert not bad, f"phone prefix mismatches: {bad}" class TestCurrencyByLocale: def test_eu_decimal_comma_resolves_under_auto(self, df, options): res = standardize_dataframe(df, options) # Spain, France, Germany, Italy, Brazil, Sweden all use decimal # comma. Verify a clean numeric result post-standardization. eu_idx = df.index[df["country"].isin( ["ES", "FR", "DE", "IT", "BR", "SE"] )] for i in eu_idx: val = res.standardized_df.loc[i, "price"] # Either ``CODE NNN.NN`` or bare ``NNN.NN`` — but the comma # in the source must have become a dot in the output. assert "," not in val, ( f"row {i} ({df.loc[i, 'country']}): comma persisted in {val!r}" ) def test_brl_real_prefix_recognised(self, df, options): res = standardize_dataframe(df, options) br_row = res.standardized_df[res.standardized_df["country"] == "BR"].iloc[0] assert "BRL" in br_row["price"] class TestStreamingMatchesInMemory: def test_same_output_via_streaming(self, tmp_path, df, options): # Streaming the same fixture through standardize_file should # produce a CSV byte-equivalent to the in-memory path. in_mem = standardize_dataframe(df, options).standardized_df out = tmp_path / "out.csv" # Use a chunk size that splits the 20-row fixture mid-way. res = standardize_file(FIXTURE, out, options, chunk_size=7) assert res.rows_processed == len(df) streamed = pd.read_csv(out, dtype=str, keep_default_na=False) # Compare typed columns only — others pass through. for col in options.column_types: assert streamed[col].tolist() == in_mem[col].astype(str).tolist(), ( f"column {col} differs between in-memory and streaming" )