Files
datatools-dev/tests/test_normalizers.py
Michael b871ab24fc feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00

159 lines
4.8 KiB
Python

"""Tests for src.core.normalizers."""
import pytest
from src.core.normalizers import (
NormalizerType,
get_normalizer,
normalize_email,
normalize_phone,
normalize_name,
normalize_address,
normalize_string,
)
class TestNormalizeEmail:
    """Checks for ``normalize_email``: casing, trimming, Gmail dots, plus-tags, edge inputs."""

    def test_basic_lowercase(self):
        """A mixed-case address comes back fully lower-cased."""
        normalized = normalize_email("John@Example.COM")
        assert normalized == "john@example.com"

    def test_strip_whitespace(self):
        """Leading and trailing spaces are removed."""
        normalized = normalize_email(" alice@test.com ")
        assert normalized == "alice@test.com"

    def test_strip_gmail_dots(self):
        """Dots in the local part of a gmail.com address are dropped."""
        normalized = normalize_email("j.o.h.n@gmail.com")
        assert normalized == "john@gmail.com"

    def test_strip_plus_tag(self):
        """A ``+tag`` suffix on the local part is removed."""
        normalized = normalize_email("alice+promo@test.com")
        assert normalized == "alice@test.com"

    def test_gmail_dots_and_plus(self):
        """Dot removal and plus-tag removal both apply to one gmail.com address."""
        normalized = normalize_email("j.smith+tag@gmail.com")
        assert normalized == "jsmith@gmail.com"

    def test_non_gmail_keeps_dots(self):
        """Dots in the local part survive for a non-Gmail domain."""
        address = "j.smith@company.com"
        assert normalize_email(address) == "j.smith@company.com"

    def test_empty(self):
        """Both the empty string and ``None`` normalize to the empty string."""
        for blank in ("", None):
            assert normalize_email(blank) == ""

    def test_no_at_sign(self):
        """Input without an ``@`` is returned unchanged for this already-lowercase value."""
        normalized = normalize_email("not-an-email")
        assert normalized == "not-an-email"

    def test_idempotent(self):
        """Normalizing an already-normalized address yields the same value."""
        once = normalize_email("J.Smith+tag@Gmail.com")
        twice = normalize_email(once)
        assert twice == once
class TestNormalizePhone:
    """Checks for ``normalize_phone``: several US input formats map to one ``+1...`` string."""

    def test_us_formatted(self):
        """Parenthesised area code with space and dash."""
        normalized = normalize_phone("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_dashes(self):
        """Dash-separated digits."""
        normalized = normalize_phone("555-123-4567")
        assert normalized == "+15551234567"

    def test_dots(self):
        """Dot-separated digits."""
        normalized = normalize_phone("555.123.4567")
        assert normalized == "+15551234567"

    def test_with_country_code(self):
        """An explicit ``+1`` prefix is accepted and not duplicated."""
        normalized = normalize_phone("+1 555-123-4567")
        assert normalized == "+15551234567"

    def test_digits_only_input(self):
        """Ten bare digits gain the ``+1`` prefix."""
        normalized = normalize_phone("5551234567")
        assert normalized == "+15551234567"

    def test_empty(self):
        """Both the empty string and ``None`` normalize to the empty string."""
        for blank in ("", None):
            assert normalize_phone(blank) == ""

    def test_invalid_fallback_digits(self):
        """A number too short to be valid is expected back as its plain digits."""
        # "123" is a very short number that phonenumbers rejects.
        fallback = normalize_phone("123")
        assert fallback == "123"

    def test_idempotent(self):
        """Normalizing an already-normalized number yields the same value."""
        once = normalize_phone("(555) 123-4567")
        twice = normalize_phone(once)
        assert twice == once
class TestNormalizeName:
    """Checks for ``normalize_name``: titles, suffixes, whitespace and case handling."""

    def test_strip_mr(self):
        """A leading ``Mr.`` title is removed and the rest lower-cased."""
        assert normalize_name("Mr. John Smith") == "john smith"

    def test_strip_dr(self):
        """A leading ``Dr.`` title is removed and the rest lower-cased."""
        assert normalize_name("Dr. Jane Doe") == "jane doe"

    def test_strip_suffix(self):
        """A trailing ``Jr.`` suffix is removed."""
        assert normalize_name("Robert Brown Jr.") == "robert brown"

    def test_strip_numeral_suffix(self):
        """A trailing roman-numeral suffix (``III``) is removed."""
        assert normalize_name("James Wilson III") == "james wilson"

    def test_title_and_suffix(self):
        """Title and suffix are both removed from the same name."""
        assert normalize_name("Dr. Michael Williams III") == "michael williams"

    def test_collapse_whitespace(self):
        """Runs of spaces inside and around the name reduce to single spaces."""
        # The input must contain *runs* of spaces: a single-spaced value would
        # pass on trimming alone and never exercise the collapsing step.
        assert normalize_name("  John   Smith  ") == "john smith"

    def test_case_fold(self):
        """An all-caps name comes back lower-cased."""
        assert normalize_name("JOHN SMITH") == "john smith"

    def test_empty(self):
        """Both the empty string and ``None`` normalize to the empty string."""
        assert normalize_name("") == ""
        assert normalize_name(None) == ""

    def test_idempotent(self):
        """Normalizing an already-normalized name yields the same value."""
        result = normalize_name("Mr. John Smith Jr.")
        assert normalize_name(result) == result
class TestNormalizeAddress:
    """Checks for ``normalize_address``: street-type, unit and direction abbreviations."""

    def test_street_abbreviation(self):
        """``Street`` is abbreviated to ``st`` and the result lower-cased."""
        assert normalize_address("123 Main Street") == "123 main st"

    def test_avenue_abbreviation(self):
        """``Avenue`` is abbreviated to ``ave``."""
        assert normalize_address("456 Oak Avenue") == "456 oak ave"

    def test_boulevard_abbreviation(self):
        """``Boulevard`` is abbreviated to ``blvd``."""
        assert normalize_address("789 Pine Boulevard") == "789 pine blvd"

    def test_apartment(self):
        """``Apartment`` is abbreviated to ``apt``; the unit number is kept."""
        assert normalize_address("123 Main St Apartment 4") == "123 main st apt 4"

    def test_direction(self):
        """A trailing compass direction (``North``) is abbreviated to ``n``."""
        assert normalize_address("111 First Street North") == "111 first st n"

    def test_collapse_whitespace(self):
        """Runs of spaces inside and around the address reduce to single spaces."""
        # The input must contain *runs* of spaces: a single-spaced value would
        # pass on trimming alone and never exercise the collapsing step.
        assert normalize_address("  123   Main   Street  ") == "123 main st"

    def test_empty(self):
        """Both the empty string and ``None`` normalize to the empty string."""
        assert normalize_address("") == ""
        assert normalize_address(None) == ""

    def test_idempotent(self):
        """Normalizing an already-normalized address yields the same value."""
        result = normalize_address("123 Main Street Apartment 4")
        assert normalize_address(result) == result
class TestNormalizeString:
    """Checks for ``normalize_string``: trimming, case folding and whitespace collapsing."""

    def test_trim_and_casefold(self):
        """Surrounding spaces are removed and the text is lower-cased."""
        assert normalize_string(" Hello World ") == "hello world"

    def test_collapse_whitespace(self):
        """Runs of spaces between words reduce to a single space."""
        # With single-spaced input this assertion would hold even for an
        # identity function, so the input uses multi-space runs on purpose.
        assert normalize_string("a   b   c") == "a b c"

    def test_empty(self):
        """Both the empty string and ``None`` normalize to the empty string."""
        assert normalize_string("") == ""
        assert normalize_string(None) == ""
class TestGetNormalizer:
    """Checks for ``get_normalizer``: lookup by enum member, by string, and failure mode."""

    def test_get_by_enum(self):
        """Looking up ``NormalizerType.EMAIL`` returns a callable that lower-cases an address."""
        email_normalizer = get_normalizer(NormalizerType.EMAIL)
        normalized = email_normalizer("TEST@Gmail.com")
        assert normalized == "test@gmail.com"

    def test_get_by_string(self):
        """Looking up the string ``"phone"`` returns a callable that formats a US number."""
        phone_normalizer = get_normalizer("phone")
        normalized = phone_normalizer("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_unknown_raises(self):
        """An unrecognised normalizer name raises ``ValueError``."""
        with pytest.raises(ValueError):
            get_normalizer("unknown_type")