feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
158
tests/test_normalizers.py
Normal file
158
tests/test_normalizers.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""Tests for src.core.normalizers."""
|
||||
|
||||
import pytest
|
||||
from src.core.normalizers import (
|
||||
NormalizerType,
|
||||
get_normalizer,
|
||||
normalize_email,
|
||||
normalize_phone,
|
||||
normalize_name,
|
||||
normalize_address,
|
||||
normalize_string,
|
||||
)
|
||||
|
||||
|
||||
class TestNormalizeEmail:
    """Expectations for ``normalize_email``."""

    def test_basic_lowercase(self):
        """A mixed-case address is expected back in lowercase."""
        raw = "John@Example.COM"
        assert normalize_email(raw) == "john@example.com"

    def test_strip_whitespace(self):
        """Surrounding spaces are expected to be removed."""
        raw = " alice@test.com "
        assert normalize_email(raw) == "alice@test.com"

    def test_strip_gmail_dots(self):
        """Dots in the local part of a gmail.com address are expected to go."""
        raw = "j.o.h.n@gmail.com"
        assert normalize_email(raw) == "john@gmail.com"

    def test_strip_plus_tag(self):
        """A ``+tag`` suffix on the local part is expected to be dropped."""
        raw = "alice+promo@test.com"
        assert normalize_email(raw) == "alice@test.com"

    def test_gmail_dots_and_plus(self):
        """Dot removal and plus-tag removal are expected to combine."""
        raw = "j.smith+tag@gmail.com"
        assert normalize_email(raw) == "jsmith@gmail.com"

    def test_non_gmail_keeps_dots(self):
        """Dots are expected to survive for a non-gmail domain."""
        raw = "j.smith@company.com"
        assert normalize_email(raw) == raw

    def test_empty(self):
        """Both the empty string and ``None`` are expected to yield ``""``."""
        for blank in ("", None):
            assert normalize_email(blank) == ""

    def test_no_at_sign(self):
        """Input without an ``@`` is expected to come back unchanged here."""
        raw = "not-an-email"
        assert normalize_email(raw) == raw

    def test_idempotent(self):
        """Normalizing an already-normalized address must change nothing."""
        once = normalize_email("J.Smith+tag@Gmail.com")
        twice = normalize_email(once)
        assert twice == once
|
||||
|
||||
|
||||
class TestNormalizePhone:
    """Expectations for ``normalize_phone``."""

    # Every well-formed variant below is expected to map to this one value.
    CANONICAL = "+15551234567"

    def test_us_formatted(self):
        """Parenthesized area code with a dash separator."""
        assert normalize_phone("(555) 123-4567") == self.CANONICAL

    def test_dashes(self):
        """Dash-separated groups."""
        assert normalize_phone("555-123-4567") == self.CANONICAL

    def test_dots(self):
        """Dot-separated groups."""
        assert normalize_phone("555.123.4567") == self.CANONICAL

    def test_with_country_code(self):
        """An explicit ``+1`` prefix must not be duplicated."""
        assert normalize_phone("+1 555-123-4567") == self.CANONICAL

    def test_digits_only_input(self):
        """Ten bare digits with no separators."""
        assert normalize_phone("5551234567") == self.CANONICAL

    def test_empty(self):
        """Both the empty string and ``None`` are expected to yield ``""``."""
        for blank in ("", None):
            assert normalize_phone(blank) == ""

    def test_invalid_fallback_digits(self):
        """A number too short to be valid is expected back as its digits."""
        # Very short number that phonenumbers rejects
        too_short = "123"
        assert normalize_phone(too_short) == "123"

    def test_idempotent(self):
        """Normalizing an already-normalized number must change nothing."""
        once = normalize_phone("(555) 123-4567")
        twice = normalize_phone(once)
        assert twice == once
|
||||
|
||||
|
||||
class TestNormalizeName:
    """Expectations for ``normalize_name``."""

    def test_strip_mr(self):
        """A leading ``Mr.`` title is expected to be removed."""
        assert normalize_name("Mr. John Smith") == "john smith"

    def test_strip_dr(self):
        """A leading ``Dr.`` title is expected to be removed."""
        assert normalize_name("Dr. Jane Doe") == "jane doe"

    def test_strip_suffix(self):
        """A trailing ``Jr.`` suffix is expected to be removed."""
        assert normalize_name("Robert Brown Jr.") == "robert brown"

    def test_strip_numeral_suffix(self):
        """A trailing roman-numeral suffix is expected to be removed."""
        assert normalize_name("James Wilson III") == "james wilson"

    def test_title_and_suffix(self):
        """Title and suffix on the same name are both expected to go."""
        assert normalize_name("Dr. Michael Williams III") == "michael williams"

    def test_collapse_whitespace(self):
        """Leading, trailing, and repeated internal spaces are all squeezed."""
        assert normalize_name(" John Smith ") == "john smith"
        # The input above has only single spaces, so on its own it would
        # also pass with a plain strip(); this one needs real collapsing.
        assert normalize_name("  John    Smith  ") == "john smith"

    def test_case_fold(self):
        """An all-caps name is expected back in lowercase."""
        assert normalize_name("JOHN SMITH") == "john smith"

    def test_empty(self):
        """Both the empty string and ``None`` are expected to yield ``""``."""
        assert normalize_name("") == ""
        assert normalize_name(None) == ""

    def test_idempotent(self):
        """Normalizing an already-normalized name must change nothing."""
        result = normalize_name("Mr. John Smith Jr.")
        assert normalize_name(result) == result
|
||||
|
||||
|
||||
class TestNormalizeAddress:
    """Expectations for ``normalize_address``."""

    def test_street_abbreviation(self):
        """``Street`` is expected to be abbreviated to ``st``."""
        assert normalize_address("123 Main Street") == "123 main st"

    def test_avenue_abbreviation(self):
        """``Avenue`` is expected to be abbreviated to ``ave``."""
        assert normalize_address("456 Oak Avenue") == "456 oak ave"

    def test_boulevard_abbreviation(self):
        """``Boulevard`` is expected to be abbreviated to ``blvd``."""
        assert normalize_address("789 Pine Boulevard") == "789 pine blvd"

    def test_apartment(self):
        """``Apartment`` is expected to be abbreviated to ``apt``."""
        assert normalize_address("123 Main St Apartment 4") == "123 main st apt 4"

    def test_direction(self):
        """A trailing compass direction is expected as a single letter."""
        assert normalize_address("111 First Street North") == "111 first st n"

    def test_collapse_whitespace(self):
        """Leading, trailing, and repeated internal spaces are all squeezed."""
        assert normalize_address(" 123 Main Street ") == "123 main st"
        # The input above has only single spaces, so on its own it would
        # also pass with a plain strip(); this one needs real collapsing.
        assert normalize_address("  123   Main    Street  ") == "123 main st"

    def test_empty(self):
        """Both the empty string and ``None`` are expected to yield ``""``."""
        assert normalize_address("") == ""
        assert normalize_address(None) == ""

    def test_idempotent(self):
        """Normalizing an already-normalized address must change nothing."""
        result = normalize_address("123 Main Street Apartment 4")
        assert normalize_address(result) == result
|
||||
|
||||
|
||||
class TestNormalizeString:
    """Expectations for ``normalize_string``."""

    def test_trim_and_casefold(self):
        """Surrounding spaces are removed and the text is lowercased."""
        assert normalize_string(" Hello World ") == "hello world"

    def test_collapse_whitespace(self):
        """Runs of spaces between words are squeezed to a single space."""
        assert normalize_string("a b c") == "a b c"
        # The input above is already normalized, so that assertion passes
        # even if nothing is collapsed; this one fails without collapsing.
        assert normalize_string("a   b    c") == "a b c"

    def test_empty(self):
        """Both the empty string and ``None`` are expected to yield ``""``."""
        assert normalize_string("") == ""
        assert normalize_string(None) == ""
|
||||
|
||||
|
||||
class TestGetNormalizer:
    """Expectations for the ``get_normalizer`` lookup."""

    def test_get_by_enum(self):
        """A ``NormalizerType`` member resolves to a working callable."""
        email_fn = get_normalizer(NormalizerType.EMAIL)
        normalized = email_fn("TEST@Gmail.com")
        assert normalized == "test@gmail.com"

    def test_get_by_string(self):
        """A plain string key resolves to a working callable."""
        phone_fn = get_normalizer("phone")
        normalized = phone_fn("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_unknown_raises(self):
        """An unrecognized key is expected to raise ``ValueError``."""
        bogus_key = "unknown_type"
        with pytest.raises(ValueError):
            get_normalizer(bogus_key)
|
||||
Reference in New Issue
Block a user