"""Tests for src.core.normalizers."""

import pytest

from src.core.normalizers import (
    NormalizerType,
    get_normalizer,
    normalize_email,
    normalize_phone,
    normalize_name,
    normalize_address,
    normalize_string,
)


class TestNormalizeEmail:
    """Expected matching keys produced by ``normalize_email``."""

    def test_basic_lowercase(self):
        assert normalize_email("John@Example.COM") == "john@example.com"

    def test_strip_whitespace(self):
        assert normalize_email(" alice@test.com ") == "alice@test.com"

    def test_strip_gmail_dots(self):
        assert normalize_email("j.o.h.n@gmail.com") == "john@gmail.com"

    def test_strip_plus_tag(self):
        # The plus-tag is removed even for a non-Gmail domain.
        assert normalize_email("alice+promo@test.com") == "alice@test.com"

    def test_gmail_dots_and_plus(self):
        assert normalize_email("j.smith+tag@gmail.com") == "jsmith@gmail.com"

    def test_non_gmail_keeps_dots(self):
        assert normalize_email("j.smith@company.com") == "j.smith@company.com"

    def test_empty(self):
        # Both the empty string and None map to the empty key.
        assert normalize_email("") == ""
        assert normalize_email(None) == ""

    def test_no_at_sign(self):
        assert normalize_email("not-an-email") == "not-an-email"

    def test_idempotent(self):
        result = normalize_email("J.Smith+tag@Gmail.com")
        assert normalize_email(result) == result


class TestNormalizePhone:
    """Expected matching keys produced by ``normalize_phone``."""

    def test_us_formatted(self):
        assert normalize_phone("(555) 123-4567") == "+15551234567"

    def test_dashes(self):
        assert normalize_phone("555-123-4567") == "+15551234567"

    def test_dots(self):
        assert normalize_phone("555.123.4567") == "+15551234567"

    def test_with_country_code(self):
        assert normalize_phone("+1 555-123-4567") == "+15551234567"

    def test_digits_only_input(self):
        assert normalize_phone("5551234567") == "+15551234567"

    def test_empty(self):
        assert normalize_phone("") == ""
        assert normalize_phone(None) == ""

    def test_invalid_fallback_digits(self):
        # Very short number that phonenumbers rejects
        result = normalize_phone("123")
        assert result == "123"

    def test_idempotent(self):
        result = normalize_phone("(555) 123-4567")
        assert normalize_phone(result) == result


class TestNormalizeName:
    """Expected matching keys produced by ``normalize_name``."""

    def test_strip_mr(self):
        assert normalize_name("Mr. John Smith") == "john smith"

    def test_strip_dr(self):
        assert normalize_name("Dr. Jane Doe") == "jane doe"

    def test_strip_suffix(self):
        assert normalize_name("Robert Brown Jr.") == "robert brown"

    def test_strip_numeral_suffix(self):
        assert normalize_name("James Wilson III") == "james wilson"

    def test_title_and_suffix(self):
        assert normalize_name("Dr. Michael Williams III") == "michael williams"

    def test_collapse_whitespace(self):
        # The input needs runs of several spaces (leading, interior and
        # trailing); with single spaces only, this test would pass even if
        # nothing were collapsed.
        assert normalize_name("  John   Smith  ") == "john smith"

    def test_case_fold(self):
        assert normalize_name("JOHN SMITH") == "john smith"

    def test_empty(self):
        assert normalize_name("") == ""
        assert normalize_name(None) == ""

    def test_idempotent(self):
        result = normalize_name("Mr. John Smith Jr.")
        assert normalize_name(result) == result


class TestNormalizeAddress:
    """Expected matching keys produced by ``normalize_address``."""

    def test_street_abbreviation(self):
        assert normalize_address("123 Main Street") == "123 main st"

    def test_avenue_abbreviation(self):
        assert normalize_address("456 Oak Avenue") == "456 oak ave"

    def test_boulevard_abbreviation(self):
        assert normalize_address("789 Pine Boulevard") == "789 pine blvd"

    def test_apartment(self):
        assert normalize_address("123 Main St Apartment 4") == "123 main st apt 4"

    def test_direction(self):
        assert normalize_address("111 First Street North") == "111 first st n"

    def test_collapse_whitespace(self):
        # Multi-space runs so the collapse itself is exercised, not just
        # the outer trim.
        assert normalize_address("  123   Main   Street  ") == "123 main st"

    def test_empty(self):
        assert normalize_address("") == ""
        assert normalize_address(None) == ""

    def test_idempotent(self):
        result = normalize_address("123 Main Street Apartment 4")
        assert normalize_address(result) == result


class TestNormalizeString:
    """Expected output of the generic ``normalize_string`` fallback."""

    def test_trim_and_casefold(self):
        assert normalize_string(" Hello World ") == "hello world"

    def test_collapse_whitespace(self):
        # Interior runs of spaces must shrink to a single space.
        assert normalize_string("a   b   c") == "a b c"

    def test_empty(self):
        assert normalize_string("") == ""
        assert normalize_string(None) == ""


class TestGetNormalizer:
    """Lookup of normalizer callables via ``get_normalizer``."""

    def test_get_by_enum(self):
        fn = get_normalizer(NormalizerType.EMAIL)
        assert fn("TEST@Gmail.com") == "test@gmail.com"

    def test_get_by_string(self):
        fn = get_normalizer("phone")
        assert fn("(555) 123-4567") == "+15551234567"

    def test_unknown_raises(self):
        with pytest.raises(ValueError):
            get_normalizer("unknown_type")


# ---------------------------------------------------------------------------
# Alignment with format_standardize: extension preservation, state codes,
# particle handling. See audit GAPs 15/16/17.
# ---------------------------------------------------------------------------


class TestNormalizerAudit:
    """Regression tests for phone extensions, state codes and name particles."""

    def test_phone_extension_preserved(self):
        # Two records with different extensions must NOT normalize to
        # the same key — they're different people at the same business.
        a = normalize_phone("+15551234567 ext 100")
        b = normalize_phone("+15551234567 ext 200")
        assert a != b
        assert a == "+15551234567;ext=100"

    def test_phone_no_extension_unchanged(self):
        assert normalize_phone("+15551234567") == "+15551234567"

    def test_address_state_name_to_code(self):
        # "California" and "CA" produce the same matching key.
        a = normalize_address("123 Main St, Los Angeles, California 90001")
        b = normalize_address("123 Main St, Los Angeles, CA 90001")
        assert a == b

    def test_address_multiword_state_name(self):
        a = normalize_address("100 Beacon St, Boston, Massachusetts 02101")
        b = normalize_address("100 Beacon St, Boston, MA 02101")
        assert a == b

    def test_address_does_not_butcher_city_named_after_state(self):
        # "New York" appearing as a city should still fold to "ny" —
        # this is intentional for matching keys (we want ``New York, NY``
        # and ``NY, NY`` to be the same record) even though the
        # standardizer (display) would preserve the city name.
        # NOTE(review): the lower-cased state code alone would satisfy this
        # assertion, so it does not prove the city was folded; consider
        # comparing against the key for "123 Main St, NY, NY 10001" once
        # that behavior is confirmed.
        out = normalize_address("123 Main St, New York, NY 10001")
        assert "ny" in out

    def test_name_particle_dropped(self):
        # "Charles de Gaulle" and "Charles Gaulle" produce the same key.
        assert normalize_name("Charles de Gaulle") == normalize_name("Charles Gaulle")

    def test_name_van_dropped(self):
        assert normalize_name("Vincent van Gogh") == normalize_name("Vincent Gogh")

    def test_name_particle_idempotent(self):
        out = normalize_name("Vincent van Gogh")
        assert normalize_name(out) == out