# datatools-dev/tests/test_normalizers.py

"""Tests for src.core.normalizers."""

import pytest
from src.core.normalizers import (
    NormalizerType,
    get_normalizer,
    normalize_email,
    normalize_phone,
    normalize_name,
    normalize_address,
    normalize_string,
)


class TestNormalizeEmail:
    """Expected outputs of ``normalize_email`` for representative inputs."""

    def test_basic_lowercase(self):
        """Upper-case characters in either part come back lower-cased."""
        raw = "John@Example.COM"
        assert normalize_email(raw) == "john@example.com"

    def test_strip_whitespace(self):
        """Leading and trailing spaces are removed."""
        padded = "  alice@test.com  "
        assert normalize_email(padded) == "alice@test.com"

    def test_strip_gmail_dots(self):
        """Dots in the local part of a gmail.com address are removed."""
        dotted = "j.o.h.n@gmail.com"
        assert normalize_email(dotted) == "john@gmail.com"

    def test_strip_plus_tag(self):
        """A ``+tag`` suffix on the local part is dropped."""
        tagged = "alice+promo@test.com"
        assert normalize_email(tagged) == "alice@test.com"

    def test_gmail_dots_and_plus(self):
        """Dot removal and tag removal both apply to one gmail.com address."""
        dotted_and_tagged = "j.smith+tag@gmail.com"
        assert normalize_email(dotted_and_tagged) == "jsmith@gmail.com"

    def test_non_gmail_keeps_dots(self):
        """Dots in the local part survive for a non-Gmail domain."""
        address = "j.smith@company.com"
        assert normalize_email(address) == "j.smith@company.com"

    def test_empty(self):
        """Both the empty string and ``None`` map to the empty string."""
        for blank in ("", None):
            assert normalize_email(blank) == ""

    def test_no_at_sign(self):
        """A value without ``@`` is returned as given in this case."""
        not_an_address = "not-an-email"
        assert normalize_email(not_an_address) == "not-an-email"

    def test_idempotent(self):
        """Normalizing an already-normalized address changes nothing."""
        once = normalize_email("J.Smith+tag@Gmail.com")
        twice = normalize_email(once)
        assert twice == once


class TestNormalizePhone:
    """Expected outputs of ``normalize_phone`` for representative inputs."""

    def test_us_formatted(self):
        """Parenthesised area code with a dash yields ``+1`` plus ten digits."""
        formatted = "(555) 123-4567"
        assert normalize_phone(formatted) == "+15551234567"

    def test_dashes(self):
        """Dash-separated groups yield the same ``+1``-prefixed value."""
        dashed = "555-123-4567"
        assert normalize_phone(dashed) == "+15551234567"

    def test_dots(self):
        """Dot-separated groups yield the same ``+1``-prefixed value."""
        dotted = "555.123.4567"
        assert normalize_phone(dotted) == "+15551234567"

    def test_with_country_code(self):
        """An explicit ``+1`` prefix in the input is kept, separators removed."""
        with_prefix = "+1 555-123-4567"
        assert normalize_phone(with_prefix) == "+15551234567"

    def test_digits_only_input(self):
        """Ten bare digits gain the ``+1`` prefix."""
        bare_digits = "5551234567"
        assert normalize_phone(bare_digits) == "+15551234567"

    def test_empty(self):
        """Both the empty string and ``None`` map to the empty string."""
        for blank in ("", None):
            assert normalize_phone(blank) == ""

    def test_invalid_fallback_digits(self):
        """A three-digit input comes back as the same three digits."""
        # Per the original note: a very short number that phonenumbers
        # rejects, so the expected output is just the digits.
        too_short = "123"
        normalized = normalize_phone(too_short)
        assert normalized == "123"

    def test_idempotent(self):
        """Normalizing an already-normalized number changes nothing."""
        once = normalize_phone("(555) 123-4567")
        twice = normalize_phone(once)
        assert twice == once


class TestNormalizeName:
    """Expected outputs of ``normalize_name`` for representative inputs."""

    def test_strip_mr(self):
        """A leading ``Mr.`` is removed and the rest is lower-cased."""
        titled = "Mr. John Smith"
        assert normalize_name(titled) == "john smith"

    def test_strip_dr(self):
        """A leading ``Dr.`` is removed and the rest is lower-cased."""
        titled = "Dr. Jane Doe"
        assert normalize_name(titled) == "jane doe"

    def test_strip_suffix(self):
        """A trailing ``Jr.`` is removed."""
        suffixed = "Robert Brown Jr."
        assert normalize_name(suffixed) == "robert brown"

    def test_strip_numeral_suffix(self):
        """A trailing ``III`` is removed."""
        suffixed = "James Wilson III"
        assert normalize_name(suffixed) == "james wilson"

    def test_title_and_suffix(self):
        """A title and a numeral suffix on the same name are both removed."""
        decorated = "Dr. Michael Williams III"
        assert normalize_name(decorated) == "michael williams"

    def test_collapse_whitespace(self):
        """Outer spaces are trimmed and inner runs become a single space."""
        spaced = "  John   Smith  "
        assert normalize_name(spaced) == "john smith"

    def test_case_fold(self):
        """An all-caps name comes back lower-cased."""
        shouted = "JOHN SMITH"
        assert normalize_name(shouted) == "john smith"

    def test_empty(self):
        """Both the empty string and ``None`` map to the empty string."""
        for blank in ("", None):
            assert normalize_name(blank) == ""

    def test_idempotent(self):
        """Normalizing an already-normalized name changes nothing."""
        once = normalize_name("Mr. John Smith Jr.")
        twice = normalize_name(once)
        assert twice == once


class TestNormalizeAddress:
    """Expected outputs of ``normalize_address`` for representative inputs."""

    def test_street_abbreviation(self):
        """``Street`` is abbreviated to ``st`` and the text is lower-cased."""
        full = "123 Main Street"
        assert normalize_address(full) == "123 main st"

    def test_avenue_abbreviation(self):
        """``Avenue`` is abbreviated to ``ave``."""
        full = "456 Oak Avenue"
        assert normalize_address(full) == "456 oak ave"

    def test_boulevard_abbreviation(self):
        """``Boulevard`` is abbreviated to ``blvd``."""
        full = "789 Pine Boulevard"
        assert normalize_address(full) == "789 pine blvd"

    def test_apartment(self):
        """``Apartment`` is abbreviated to ``apt``; the unit number is kept."""
        with_unit = "123 Main St Apartment 4"
        assert normalize_address(with_unit) == "123 main st apt 4"

    def test_direction(self):
        """A trailing ``North`` is abbreviated to ``n``."""
        with_direction = "111 First Street North"
        assert normalize_address(with_direction) == "111 first st n"

    def test_collapse_whitespace(self):
        """Outer spaces are trimmed and inner runs become a single space."""
        spaced = "  123   Main   Street  "
        assert normalize_address(spaced) == "123 main st"

    def test_empty(self):
        """Both the empty string and ``None`` map to the empty string."""
        for blank in ("", None):
            assert normalize_address(blank) == ""

    def test_idempotent(self):
        """Normalizing an already-normalized address changes nothing."""
        once = normalize_address("123 Main Street Apartment 4")
        twice = normalize_address(once)
        assert twice == once


class TestNormalizeString:
    """Expected outputs of ``normalize_string`` for representative inputs."""

    def test_trim_and_casefold(self):
        """Outer spaces are trimmed and the text is lower-cased."""
        padded = "  Hello World  "
        assert normalize_string(padded) == "hello world"

    def test_collapse_whitespace(self):
        """Runs of inner spaces become a single space."""
        spaced = "a   b   c"
        assert normalize_string(spaced) == "a b c"

    def test_empty(self):
        """Both the empty string and ``None`` map to the empty string."""
        for blank in ("", None):
            assert normalize_string(blank) == ""


class TestGetNormalizer:
    """Lookup behaviour of ``get_normalizer`` by enum member and by string."""

    def test_get_by_enum(self):
        """``NormalizerType.EMAIL`` yields a callable that lower-cases an email."""
        email_fn = get_normalizer(NormalizerType.EMAIL)
        normalized = email_fn("TEST@Gmail.com")
        assert normalized == "test@gmail.com"

    def test_get_by_string(self):
        """The string ``"phone"`` yields a callable that normalizes a phone."""
        phone_fn = get_normalizer("phone")
        normalized = phone_fn("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_unknown_raises(self):
        """An unrecognised type name raises ``ValueError``."""
        bogus_name = "unknown_type"
        with pytest.raises(ValueError):
            get_normalizer(bogus_name)


# ---------------------------------------------------------------------------
# Alignment with format_standardize: extension preservation, state codes,
# particle handling. See audit GAPs 15/16/17.
# ---------------------------------------------------------------------------

class TestNormalizerAudit:
    """Matching-key checks for phone extensions, state names and name particles."""

    def test_phone_extension_preserved(self):
        """Different extensions on one base number give different keys."""
        # Records that differ only by extension must NOT collapse to one
        # key: they are different people at the same business.
        first = normalize_phone("+15551234567 ext 100")
        second = normalize_phone("+15551234567 ext 200")
        assert first != second
        assert first == "+15551234567;ext=100"

    def test_phone_no_extension_unchanged(self):
        """A number without an extension comes back exactly as given here."""
        plain = "+15551234567"
        assert normalize_phone(plain) == "+15551234567"

    def test_address_state_name_to_code(self):
        """``California`` and ``CA`` produce the same matching key."""
        spelled_out = normalize_address(
            "123 Main St, Los Angeles, California 90001"
        )
        abbreviated = normalize_address("123 Main St, Los Angeles, CA 90001")
        assert spelled_out == abbreviated

    def test_address_multiword_state_name(self):
        """``Massachusetts`` and ``MA`` produce the same matching key."""
        spelled_out = normalize_address(
            "100 Beacon St, Boston, Massachusetts 02101"
        )
        abbreviated = normalize_address("100 Beacon St, Boston, MA 02101")
        assert spelled_out == abbreviated

    def test_address_does_not_butcher_city_named_after_state(self):
        """The key for a New York, NY address contains ``ny``."""
        # Folding the city "New York" to "ny" is intentional for matching
        # keys (``New York, NY`` and ``NY, NY`` should be the same record),
        # even though the standardizer (display) would keep the city name.
        # NOTE(review): the input already carries the state code "NY", so
        # this substring check may pass even if the city is not folded.
        # Consider a stricter assertion once the exact key is confirmed.
        key = normalize_address("123 Main St, New York, NY 10001")
        assert "ny" in key

    def test_name_particle_dropped(self):
        """``Charles de Gaulle`` and ``Charles Gaulle`` share one key."""
        with_particle = normalize_name("Charles de Gaulle")
        without_particle = normalize_name("Charles Gaulle")
        assert with_particle == without_particle

    def test_name_van_dropped(self):
        """``Vincent van Gogh`` and ``Vincent Gogh`` share one key."""
        with_particle = normalize_name("Vincent van Gogh")
        without_particle = normalize_name("Vincent Gogh")
        assert with_particle == without_particle

    def test_name_particle_idempotent(self):
        """Normalizing a particle-stripped name again changes nothing."""
        once = normalize_name("Vincent van Gogh")
        twice = normalize_name(once)
        assert twice == once