feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

0
tests/__init__.py Normal file
View File

47
tests/conftest.py Normal file
View File

@@ -0,0 +1,47 @@
"""Shared test fixtures."""
import pandas as pd
import pytest
from pathlib import Path
# The ``samples`` directory located one level above the directory containing this file.
SAMPLES_DIR = Path(__file__).parent.parent / "samples"
@pytest.fixture
def sample_csv_path():
    """Return the path of the ``messy_sales.csv`` file inside ``SAMPLES_DIR``."""
    csv_name = "messy_sales.csv"
    return SAMPLES_DIR / csv_name
@pytest.fixture
def sample_df(sample_csv_path):
    """Load the sample CSV with every column read as ``str``.

    ``keep_default_na=False`` stops pandas from turning empty cells into NaN.
    """
    read_options = {"dtype": str, "keep_default_na": False}
    return pd.read_csv(sample_csv_path, **read_options)
@pytest.fixture
def simple_df():
    """Five-row frame (name, email, phone) in which three rows share one email and phone."""
    names = ["Alice", "alice", "Bob", "Charlie", "ALICE"]
    emails = [
        "alice@test.com",
        "alice@test.com",
        "bob@test.com",
        "charlie@test.com",
        "alice@test.com",
    ]
    phones = ["555-1234", "555-1234", "555-5678", "555-9012", "555-1234"]
    return pd.DataFrame({"name": names, "email": emails, "phone": phones})
@pytest.fixture
def merge_df():
    """Three-row frame whose two "John Doe" rows each leave a different column empty."""
    names = ["John Doe", "John Doe", "Jane Smith"]
    emails = ["john@test.com", "john@test.com", "jane@test.com"]
    phones = ["555-1111", "", "555-3333"]
    addresses = ["", "123 Main St", "456 Oak Ave"]
    return pd.DataFrame(
        {
            "name": names,
            "email": emails,
            "phone": phones,
            "address": addresses,
        }
    )
@pytest.fixture
def tmp_csv(tmp_path, simple_df):
    """Dump ``simple_df`` (without its index) to ``test_input.csv`` under ``tmp_path``.

    Returns the path of the written file.
    """
    csv_file = tmp_path / "test_input.csv"
    simple_df.to_csv(csv_file, index=False)
    return csv_file

147
tests/test_cli.py Normal file
View File

@@ -0,0 +1,147 @@
"""Integration tests for the CLI via Typer's CliRunner."""
import pytest
from pathlib import Path
from typer.testing import CliRunner
from src.cli import app
# Module-level CliRunner shared by the tests below to invoke ``app``.
runner = CliRunner()
class TestCliPreview:
    """Invoke the CLI with only an input path (no ``--apply``)."""

    def test_preview_default(self, tmp_csv):
        """A bare invocation exits cleanly and prints preview-style output."""
        result = runner.invoke(app, [str(tmp_csv)])
        assert result.exit_code == 0
        assert "preview" in result.output.lower() or "Rows in" in result.output

    def test_preview_shows_row_counts(self, tmp_csv):
        """Both the input and output row counts are reported."""
        result = runner.invoke(app, [str(tmp_csv)])
        assert result.exit_code == 0
        assert "Rows in" in result.output
        assert "Rows out" in result.output

    def test_file_not_found(self, tmp_path):
        """A missing input file yields a non-zero exit code and a "not found" message."""
        # Build the missing path under pytest's per-test temp directory so the
        # test is hermetic and does not depend on a POSIX-style /tmp existing.
        missing = tmp_path / "nonexistent_xyz_abc.csv"
        result = runner.invoke(app, [str(missing)])
        assert result.exit_code != 0
        assert "not found" in result.output.lower()
class TestCliApply:
    """Invoke the CLI with ``--apply`` and check the files it writes."""

    def test_apply_writes_output(self, tmp_csv, tmp_path):
        """``-o`` selects the output file, which must exist afterwards."""
        out = tmp_path / "output.csv"
        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
        assert result.exit_code == 0
        assert out.exists()

    def test_apply_default_output_name(self, tmp_csv):
        """Without ``-o`` the output lands next to the input as ``<stem>_deduplicated.csv``."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
        assert expected.exists()

    def test_apply_creates_removed_file(self, tmp_csv):
        """If a ``<stem>_removed.csv`` file is produced, it must not be empty."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
        # The file may or may not exist depending on whether duplicates were
        # found with default auto-detect on simple_df, so only validate it
        # when it is actually present.
        if removed.exists():
            assert removed.stat().st_size > 0
class TestCliFuzzy:
    """Column-selection flags: ``--fuzzy`` with ``--threshold``, and ``--subset``."""

    def test_fuzzy_flag(self, tmp_csv):
        argv = [str(tmp_csv), "--fuzzy", "name", "--threshold", "80"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_subset_flag(self, tmp_csv):
        argv = [str(tmp_csv), "--subset", "email"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_bad_column_error(self, tmp_csv):
        # A column that is not in the input must fail with a "not found" message.
        argv = [str(tmp_csv), "--subset", "nonexistent_column"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
class TestCliConfig:
    """Round trip of ``--save-config`` followed by ``--config``."""

    def test_save_and_load_config(self, tmp_csv, tmp_path):
        cfg_path = tmp_path / "my_config.json"
        input_arg = str(tmp_csv)
        config_arg = str(cfg_path)

        # Step 1: save the configuration to disk.
        save_argv = [input_arg, "--subset", "email", "--save-config", config_arg]
        saved = runner.invoke(app, save_argv)
        assert saved.exit_code == 0
        assert cfg_path.exists()

        # Step 2: load it back and apply.
        load_argv = [input_arg, "--config", config_arg, "--apply"]
        applied = runner.invoke(app, load_argv)
        assert applied.exit_code == 0
class TestCliSurvivor:
    """Accepted and rejected values of the ``--survivor`` option."""

    def test_survivor_last(self, tmp_csv):
        argv = [str(tmp_csv), "--survivor", "last"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_survivor_most_complete(self, tmp_csv):
        argv = [str(tmp_csv), "--survivor", "most-complete"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0

    def test_invalid_survivor(self, tmp_csv):
        # An unknown rule name must produce a non-zero exit code.
        argv = [str(tmp_csv), "--survivor", "bogus"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
class TestCliMerge:
    """The ``--merge`` flag combined with ``--apply``."""

    def test_merge_flag(self, tmp_csv):
        argv = [str(tmp_csv), "--merge", "--apply"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
class TestCliSampleData:
    """End-to-end CLI runs against the bundled sample CSV."""

    def test_sample_preview(self, sample_csv_path):
        outcome = runner.invoke(app, [str(sample_csv_path)])
        assert outcome.exit_code == 0
        printed = outcome.output
        assert "Rows in: 50" in printed
        # The sample is expected to contain duplicates.
        assert "Removed:" in printed

    def test_sample_apply(self, sample_csv_path, tmp_path):
        import pandas as pd

        out = tmp_path / "deduped.csv"
        argv = [str(sample_csv_path), "--apply", "-o", str(out)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert out.exists()

        written = pd.read_csv(out, encoding="utf-8-sig")
        # Deduplication must leave fewer than the 50 input rows.
        assert len(written) < 50

    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
        out = tmp_path / "fuzzy_merged.csv"
        argv = [str(sample_csv_path)]
        argv += ["--fuzzy", "customer_name"]
        argv += ["--threshold", "80"]
        argv += ["--merge", "--apply"]
        argv += ["-o", str(out)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert out.exists()
class TestCliHelp:
    """The ``--help`` output."""

    def test_help(self):
        outcome = runner.invoke(app, ["--help"])
        assert outcome.exit_code == 0
        # The help text must mention the --apply flag.
        assert "--apply" in outcome.output

102
tests/test_config.py Normal file
View File

@@ -0,0 +1,102 @@
"""Tests for src.core.config — save/load configuration profiles."""
import json
import pytest
from pathlib import Path
from src.core.config import (
DeduplicationConfig,
StrategyConfig,
ColumnStrategyConfig,
)
from src.core.dedup import Algorithm, SurvivorRule
from src.core.normalizers import NormalizerType
class TestDeduplicationConfig:
    """Defaults, dict/file round trips and conversions of ``DeduplicationConfig``."""

    def test_default(self):
        defaults = DeduplicationConfig.default()
        assert defaults.survivor_rule == "first"
        assert defaults.merge is False
        assert defaults.strategies == []

    def test_to_dict_roundtrip(self):
        email_column = ColumnStrategyConfig(
            column="email",
            algorithm="exact",
            threshold=100.0,
            normalizer="email",
        )
        original = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[email_column])],
            survivor_rule="most_complete",
            merge=True,
        )

        as_dict = original.to_dict()
        restored = DeduplicationConfig.from_dict(as_dict)

        assert restored.survivor_rule == "most_complete"
        assert restored.merge is True
        assert len(restored.strategies) == 1
        assert restored.strategies[0].columns[0].column == "email"

    def test_to_file_from_file(self, tmp_path):
        name_column = ColumnStrategyConfig(
            column="name",
            algorithm="jaro_winkler",
            threshold=85.0,
            normalizer="name",
        )
        original = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[name_column])],
            survivor_rule="last",
        )

        target = tmp_path / "test_config.json"
        original.to_file(target)
        reloaded = DeduplicationConfig.from_file(target)

        assert reloaded.survivor_rule == "last"
        assert len(reloaded.strategies) == 1
        assert reloaded.strategies[0].columns[0].algorithm == "jaro_winkler"

    def test_to_strategies(self):
        email_column = ColumnStrategyConfig(
            column="email", algorithm="exact", threshold=100.0, normalizer="email"
        )
        phone_column = ColumnStrategyConfig(
            column="phone", algorithm="exact", threshold=100.0, normalizer="phone"
        )
        cfg = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[email_column, phone_column])],
        )

        converted = cfg.to_strategies()

        assert converted is not None
        assert len(converted) == 1
        column_strategies = converted[0].column_strategies
        assert len(column_strategies) == 2
        # String identifiers are expected to come back as enum members.
        assert converted[0].column_strategies[0].algorithm == Algorithm.EXACT
        assert converted[0].column_strategies[0].normalizer == NormalizerType.EMAIL

    def test_to_strategies_empty(self):
        defaults = DeduplicationConfig.default()
        assert defaults.to_strategies() is None

    def test_to_survivor_rule(self):
        cfg = DeduplicationConfig(survivor_rule="most_complete")
        rule = cfg.to_survivor_rule()
        assert rule == SurvivorRule.KEEP_MOST_COMPLETE

    def test_json_is_valid(self, tmp_path):
        only_column = ColumnStrategyConfig(column="x", algorithm="exact")
        cfg = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[only_column])],
            normalize_map={"email": "email"},
        )

        target = tmp_path / "valid.json"
        cfg.to_file(target)

        # json.loads raises if the file content is not valid JSON.
        payload = json.loads(target.read_text())
        assert "strategies" in payload
        assert "normalize_map" in payload

258
tests/test_dedup.py Normal file
View File

@@ -0,0 +1,258 @@
"""Tests for src.core.dedup — matching engine."""
import pandas as pd
import pytest
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
MatchStrategy,
SurvivorRule,
_compute_similarity,
_compare_pair,
_UnionFind,
build_default_strategies,
deduplicate,
)
from src.core.normalizers import NormalizerType
class TestComputeSimilarity:
    """Scores returned by ``_compute_similarity`` for each algorithm."""

    def test_exact_match(self):
        score = _compute_similarity("hello", "hello", Algorithm.EXACT)
        assert score == 100.0

    def test_exact_mismatch(self):
        score = _compute_similarity("hello", "world", Algorithm.EXACT)
        assert score == 0.0

    def test_levenshtein_similar(self):
        similarity = _compute_similarity("kitten", "sitting", Algorithm.LEVENSHTEIN)
        assert 50 < similarity < 80

    def test_jaro_winkler_similar(self):
        similarity = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER)
        assert similarity > 80

    def test_token_set_ratio(self):
        # Same tokens in a different order must score as a perfect match.
        left = "123 main street apt 4"
        right = "apt 4 123 main street"
        similarity = _compute_similarity(left, right, Algorithm.TOKEN_SET_RATIO)
        assert similarity == 100.0
class TestUnionFind:
    """Union, find and grouping behaviour of ``_UnionFind``."""

    def test_basic_union(self):
        forest = _UnionFind(5)
        forest.union(0, 1)
        forest.union(1, 2)
        # 0-1 and 1-2 were joined, so 0 and 2 must share a root (transitive).
        assert forest.find(0) == forest.find(2)

    def test_separate_groups(self):
        forest = _UnionFind(5)
        forest.union(0, 1)
        forest.union(3, 4)
        assert forest.find(0) != forest.find(3)

    def test_groups(self):
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2), (3, 4)):
            forest.union(left, right)

        grouped = forest.groups()
        assert len(grouped) == 2
        member_counts = sorted(len(members) for members in grouped.values())
        assert member_counts == [2, 3]
class TestComparePair:
    """Row-to-row comparison through ``_compare_pair``."""

    def test_exact_match(self):
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])
        left = pd.Series({"email": "test@example.com"})
        right = pd.Series({"email": "test@example.com"})

        matched, confidence, matched_columns = _compare_pair(left, right, strategy)

        assert matched
        assert confidence == 100.0
        assert matched_columns == ["email"]

    def test_exact_mismatch(self):
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        strategy = MatchStrategy(column_strategies=[email_rule])
        left = pd.Series({"email": "a@test.com"})
        right = pd.Series({"email": "b@test.com"})

        matched, confidence, matched_columns = _compare_pair(left, right, strategy)

        assert not matched

    def test_fuzzy_match(self):
        name_rule = ColumnMatchStrategy(
            column="name", algorithm=Algorithm.JARO_WINKLER, threshold=80
        )
        strategy = MatchStrategy(column_strategies=[name_rule])
        left = pd.Series({"name": "john smith"})
        right = pd.Series({"name": "jon smith"})

        matched, confidence, matched_columns = _compare_pair(left, right, strategy)

        assert matched
        assert confidence > 80

    def test_and_logic_both_must_match(self):
        rules = [
            ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT),
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ]
        strategy = MatchStrategy(column_strategies=rules)
        # The names agree but the emails differ.
        left = pd.Series({"name": "alice", "email": "a@test.com"})
        right = pd.Series({"name": "alice", "email": "b@test.com"})

        matched, confidence, matched_columns = _compare_pair(left, right, strategy)

        assert not matched
class TestBuildDefaultStrategies:
    """Strategies produced by ``build_default_strategies`` from column names."""

    def test_detects_email(self):
        frame = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
        strategies = build_default_strategies(frame)

        # email (strong, standalone) + name AND email (weak paired with strong) = 2
        assert len(strategies) == 2

        found_email = False
        for strategy in strategies:
            for col_strategy in strategy.column_strategies:
                if (
                    col_strategy.column == "email"
                    and col_strategy.normalizer == NormalizerType.EMAIL
                ):
                    found_email = True
                    break
            if found_email:
                break
        assert found_email

        # Name should only appear paired with email, not standalone.
        for strategy in strategies:
            uses_name = any(cs.column == "name" for cs in strategy.column_strategies)
            if not uses_name:
                continue
            assert len(strategy.column_strategies) >= 2, "Name should be paired with a strong key"

    def test_fallback_all_columns(self):
        frame = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
        strategies = build_default_strategies(frame)
        assert len(strategies) == 1
        only_strategy = strategies[0]
        assert len(only_strategy.column_strategies) == 3
class TestDeduplicate:
    """End-to-end behaviour of ``deduplicate``."""

    @staticmethod
    def _exact_email_strategy():
        """Build a fresh strategy that matches rows on an exact ``email`` value."""
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        return MatchStrategy(column_strategies=[email_rule])

    def test_exact_duplicates(self, simple_df):
        # Alice appears 3 times with the same email.
        by_email = self._exact_email_strategy()
        outcome = deduplicate(simple_df, strategies=[by_email])
        # 3 Alices -> 1, Bob stays, Charlie stays = 3 rows
        assert len(outcome.deduplicated_df) == 3
        assert outcome.original_row_count == 5
        assert len(outcome.match_groups) == 1

    def test_fuzzy_name_match(self):
        people = pd.DataFrame(
            {
                "name": ["John Smith", "Jon Smith", "Jane Doe"],
                "email": ["a@test.com", "b@test.com", "c@test.com"],
            }
        )
        name_rule = ColumnMatchStrategy(
            column="name",
            algorithm=Algorithm.JARO_WINKLER,
            threshold=85,
            normalizer=NormalizerType.NAME,
        )
        by_name = MatchStrategy(column_strategies=[name_rule])

        outcome = deduplicate(people, strategies=[by_name])

        assert len(outcome.deduplicated_df) == 2
        assert len(outcome.match_groups) == 1

    def test_survivor_keep_last(self, simple_df):
        by_email = self._exact_email_strategy()
        outcome = deduplicate(
            simple_df,
            strategies=[by_email],
            survivor_rule=SurvivorRule.KEEP_LAST,
        )
        # The last Alice (index 4) should survive.
        assert len(outcome.match_groups) == 1
        only_group = outcome.match_groups[0]
        assert only_group.survivor_index == 4

    def test_survivor_most_complete(self, merge_df):
        by_email = self._exact_email_strategy()
        outcome = deduplicate(
            merge_df,
            strategies=[by_email],
            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE,
        )
        # Row 0 has phone but no address (1 empty)
        # Row 1 has address but no phone (1 empty)
        # Both have 1 empty, so keep_first among ties
        assert len(outcome.deduplicated_df) == 2

    def test_merge_mode(self, merge_df):
        by_email = self._exact_email_strategy()
        outcome = deduplicate(merge_df, strategies=[by_email], merge=True)
        # The survivor should have both phone and address filled.
        is_john = outcome.deduplicated_df["name"] == "John Doe"
        john = outcome.deduplicated_df[is_john].iloc[0]
        assert john["phone"] == "555-1111"
        assert john["address"] == "123 Main St"

    def test_multi_strategy_or(self):
        people = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "Alice B."],
                "email": ["a@test.com", "a@test.com", "c@test.com"],
            }
        )
        # Strategy 1: match on email
        by_email = self._exact_email_strategy()
        # Strategy 2: match on name (fuzzy)
        name_rule = ColumnMatchStrategy(
            column="name", algorithm=Algorithm.JARO_WINKLER, threshold=70
        )
        by_name = MatchStrategy(column_strategies=[name_rule])

        outcome = deduplicate(people, strategies=[by_email, by_name])

        # All three should end up in one group via transitive closure:
        # Alice~Bob (email), Alice~Alice B. (name)
        assert len(outcome.deduplicated_df) == 1

    def test_confidence_score(self, simple_df):
        by_email = self._exact_email_strategy()
        outcome = deduplicate(simple_df, strategies=[by_email])
        for match_group in outcome.match_groups:
            assert 0 <= match_group.confidence <= 100

    def test_preview_flag(self, simple_df):
        previewed = deduplicate(simple_df, preview=True)
        assert previewed.is_preview is True
        applied = deduplicate(simple_df, preview=False)
        assert applied.is_preview is False

    def test_auto_detect_strategies(self, sample_df):
        outcome = deduplicate(sample_df)
        # The sample data should contain duplicates.
        assert len(outcome.match_groups) > 0
        assert len(outcome.deduplicated_df) < len(sample_df)

    def test_idempotent(self, sample_df):
        """Running dedup twice with same config produces same output."""
        first_pass = deduplicate(sample_df)
        second_pass = deduplicate(first_pass.deduplicated_df)
        # The second pass should find no new duplicates.
        assert len(second_pass.match_groups) == 0
        assert len(second_pass.deduplicated_df) == len(first_pass.deduplicated_df)

    def test_review_callback(self):
        emails = pd.DataFrame(
            {
                "email": ["a@test.com", "a@test.com", "b@test.com"],
            }
        )
        by_email = self._exact_email_strategy()

        def reject_all(g, d):
            return False

        def accept_all(g, d):
            return True

        # Reject all matches
        outcome = deduplicate(emails, strategies=[by_email], review_callback=reject_all)
        assert len(outcome.deduplicated_df) == 3  # nothing removed

        # Accept all matches
        outcome = deduplicate(emails, strategies=[by_email], review_callback=accept_all)
        assert len(outcome.deduplicated_df) == 2

130
tests/test_io.py Normal file
View File

@@ -0,0 +1,130 @@
"""Tests for src.core.io — file reading, encoding/delimiter detection."""
import pandas as pd
import pytest
from pathlib import Path
from src.core.io import (
detect_encoding,
detect_delimiter,
detect_header_row,
read_file,
write_file,
list_sheets,
)
class TestDetectEncoding:
    """Encoding names reported by ``detect_encoding``."""

    def test_utf8_file(self, sample_csv_path):
        detected = detect_encoding(sample_csv_path)
        # Compare case-insensitively and without dashes.
        canonical = detected.lower().replace("-", "")
        assert canonical in ("utf8", "ascii", "utf8sig")

    def test_empty_file(self, tmp_path):
        empty = tmp_path / "empty.csv"
        empty.write_bytes(b"")
        assert detect_encoding(empty) == "utf-8"

    def test_bom_file(self, tmp_path):
        with_bom = tmp_path / "bom.csv"
        # The payload starts with the three-byte UTF-8 byte-order mark.
        with_bom.write_bytes(b"\xef\xbb\xbfname,email\nAlice,a@b.com\n")
        assert detect_encoding(with_bom) == "utf-8-sig"

    def test_latin1_file(self, tmp_path):
        latin = tmp_path / "latin.csv"
        payload = "name,city\nJosé,São Paulo\n".encode("latin-1")
        latin.write_bytes(payload)
        detected = detect_encoding(latin)
        # Should detect something compatible with the latin-1 family.
        acceptable = (
            "iso-8859-1",
            "latin-1",
            "windows-1252",
            "cp1252",
            "iso-8859-9",
            "cp1250",
            "iso-8859-15",
            "utf-8",
        )
        assert detected in acceptable
class TestDetectDelimiter:
    """Delimiters reported by ``detect_delimiter`` for comma, tab, semicolon and pipe files."""

    def test_comma(self, sample_csv_path):
        delimiter = detect_delimiter(sample_csv_path)
        assert delimiter == ","

    def test_tab(self, tmp_path):
        tsv = tmp_path / "tabs.tsv"
        tsv.write_text("name\temail\nAlice\ta@b.com\n")
        delimiter = detect_delimiter(tsv)
        assert delimiter == "\t"

    def test_semicolon(self, tmp_path):
        semi = tmp_path / "semi.csv"
        semi.write_text("name;email;phone\nAlice;a@b.com;555\n")
        delimiter = detect_delimiter(semi)
        assert delimiter == ";"

    def test_pipe(self, tmp_path):
        piped = tmp_path / "pipe.csv"
        piped.write_text("name|email|phone\nAlice|a@b.com|555\n")
        delimiter = detect_delimiter(piped)
        assert delimiter == "|"
class TestDetectHeaderRow:
    """Header-row index reported by ``detect_header_row``."""

    def test_standard_csv(self, sample_csv_path):
        header_index = detect_header_row(sample_csv_path)
        assert header_index == 0

    def test_with_junk_rows(self, tmp_path):
        junk = tmp_path / "junk.csv"
        junk.write_text("Report generated 2024-01-01\n\nname,email,phone\nAlice,a@b.com,555\n")
        # Row 0 is "Report generated...", a single non-numeric string.
        # Row 2 is "name,email,phone", which looks like headers.
        # The heuristic checks all cells, so row 0 may match if it is a single cell;
        # which one wins depends on delimiter detection, hence both are accepted.
        header_index = detect_header_row(junk)
        assert header_index in (0, 2)
class TestReadFile:
    """Reading files through ``read_file``: whole-file, missing-file, override and chunked."""

    def test_read_csv(self, sample_csv_path):
        """The sample CSV loads as a 50-row DataFrame with a ``customer_name`` column."""
        df = read_file(sample_csv_path)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 50
        assert "customer_name" in df.columns

    def test_read_nonexistent(self, tmp_path):
        """A missing path raises ``FileNotFoundError``."""
        # Build the missing path under pytest's per-test temp directory so the
        # test is hermetic and does not depend on a POSIX-style /tmp existing.
        # It is still passed as a str, as in the original call.
        missing = tmp_path / "nonexistent_file_xyz.csv"
        with pytest.raises(FileNotFoundError):
            read_file(str(missing))

    def test_read_with_encoding_override(self, sample_csv_path):
        """An explicit ``encoding`` argument still reads all 50 rows."""
        df = read_file(sample_csv_path, encoding="utf-8")
        assert len(df) == 50

    def test_chunked_reading(self, sample_csv_path):
        """``chunk_size=10`` yields five chunks totalling 50 rows."""
        chunks = read_file(sample_csv_path, chunk_size=10)
        # Materialise the iterable so the chunks can be counted.
        all_chunks = list(chunks)
        assert len(all_chunks) == 5
        total_rows = sum(len(c) for c in all_chunks)
        assert total_rows == 50
class TestWriteFile:
    """Files produced by ``write_file`` for CSV and XLSX targets."""

    def test_write_csv(self, tmp_path, simple_df):
        target = tmp_path / "output.csv"
        write_file(simple_df, target)
        assert target.exists()
        # Read the file back and compare row counts.
        reloaded = pd.read_csv(target, encoding="utf-8-sig")
        assert len(reloaded) == len(simple_df)

    def test_write_xlsx(self, tmp_path, simple_df):
        target = tmp_path / "output.xlsx"
        write_file(simple_df, target)
        assert target.exists()
        reloaded = pd.read_excel(target)
        assert len(reloaded) == len(simple_df)

    def test_utf8_bom_default(self, tmp_path, simple_df):
        target = tmp_path / "bom.csv"
        write_file(simple_df, target)
        # By default the CSV must start with the UTF-8 byte-order mark.
        leading_bytes = target.read_bytes()[:3]
        assert leading_bytes == b"\xef\xbb\xbf"
class TestListSheets:
    """Sheet names reported by ``list_sheets``."""

    def test_list_sheets(self, tmp_path, simple_df):
        workbook = tmp_path / "multi.xlsx"
        with pd.ExcelWriter(workbook, engine="openpyxl") as writer:
            # Write the same frame to two sheets, in this order.
            for sheet in ("Sheet1", "Sheet2"):
                simple_df.to_excel(writer, sheet_name=sheet, index=False)

        names = list_sheets(workbook)
        assert names == ["Sheet1", "Sheet2"]

158
tests/test_normalizers.py Normal file
View File

@@ -0,0 +1,158 @@
"""Tests for src.core.normalizers."""
import pytest
from src.core.normalizers import (
NormalizerType,
get_normalizer,
normalize_email,
normalize_phone,
normalize_name,
normalize_address,
normalize_string,
)
class TestNormalizeEmail:
    """Expected outputs of ``normalize_email``."""

    def test_basic_lowercase(self):
        normalized = normalize_email("John@Example.COM")
        assert normalized == "john@example.com"

    def test_strip_whitespace(self):
        normalized = normalize_email(" alice@test.com ")
        assert normalized == "alice@test.com"

    def test_strip_gmail_dots(self):
        normalized = normalize_email("j.o.h.n@gmail.com")
        assert normalized == "john@gmail.com"

    def test_strip_plus_tag(self):
        normalized = normalize_email("alice+promo@test.com")
        assert normalized == "alice@test.com"

    def test_gmail_dots_and_plus(self):
        normalized = normalize_email("j.smith+tag@gmail.com")
        assert normalized == "jsmith@gmail.com"

    def test_non_gmail_keeps_dots(self):
        normalized = normalize_email("j.smith@company.com")
        assert normalized == "j.smith@company.com"

    def test_empty(self):
        # Both the empty string and None normalize to "".
        assert normalize_email("") == ""
        assert normalize_email(None) == ""

    def test_no_at_sign(self):
        normalized = normalize_email("not-an-email")
        assert normalized == "not-an-email"

    def test_idempotent(self):
        once = normalize_email("J.Smith+tag@Gmail.com")
        twice = normalize_email(once)
        assert twice == once
class TestNormalizePhone:
    """Expected outputs of ``normalize_phone``."""

    def test_us_formatted(self):
        normalized = normalize_phone("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_dashes(self):
        normalized = normalize_phone("555-123-4567")
        assert normalized == "+15551234567"

    def test_dots(self):
        normalized = normalize_phone("555.123.4567")
        assert normalized == "+15551234567"

    def test_with_country_code(self):
        normalized = normalize_phone("+1 555-123-4567")
        assert normalized == "+15551234567"

    def test_digits_only_input(self):
        normalized = normalize_phone("5551234567")
        assert normalized == "+15551234567"

    def test_empty(self):
        # Both the empty string and None normalize to "".
        assert normalize_phone("") == ""
        assert normalize_phone(None) == ""

    def test_invalid_fallback_digits(self):
        # Very short number that phonenumbers rejects
        normalized = normalize_phone("123")
        assert normalized == "123"

    def test_idempotent(self):
        once = normalize_phone("(555) 123-4567")
        twice = normalize_phone(once)
        assert twice == once
class TestNormalizeName:
    """Expected outputs of ``normalize_name``."""

    def test_strip_mr(self):
        assert normalize_name("Mr. John Smith") == "john smith"

    def test_strip_dr(self):
        assert normalize_name("Dr. Jane Doe") == "jane doe"

    def test_strip_suffix(self):
        assert normalize_name("Robert Brown Jr.") == "robert brown"

    def test_strip_numeral_suffix(self):
        assert normalize_name("James Wilson III") == "james wilson"

    def test_title_and_suffix(self):
        assert normalize_name("Dr. Michael Williams III") == "michael williams"

    def test_collapse_whitespace(self):
        # Use runs of spaces (leading, internal and trailing): with only single
        # spaces the test could not detect a whitespace-collapse regression.
        assert normalize_name("  John   Smith  ") == "john smith"

    def test_case_fold(self):
        assert normalize_name("JOHN SMITH") == "john smith"

    def test_empty(self):
        # Both the empty string and None normalize to "".
        assert normalize_name("") == ""
        assert normalize_name(None) == ""

    def test_idempotent(self):
        result = normalize_name("Mr. John Smith Jr.")
        assert normalize_name(result) == result
class TestNormalizeAddress:
    """Expected outputs of ``normalize_address``."""

    def test_street_abbreviation(self):
        assert normalize_address("123 Main Street") == "123 main st"

    def test_avenue_abbreviation(self):
        assert normalize_address("456 Oak Avenue") == "456 oak ave"

    def test_boulevard_abbreviation(self):
        assert normalize_address("789 Pine Boulevard") == "789 pine blvd"

    def test_apartment(self):
        assert normalize_address("123 Main St Apartment 4") == "123 main st apt 4"

    def test_direction(self):
        assert normalize_address("111 First Street North") == "111 first st n"

    def test_collapse_whitespace(self):
        # Use runs of spaces (leading, internal and trailing): with only single
        # spaces the test could not detect a whitespace-collapse regression.
        assert normalize_address("  123   Main   Street  ") == "123 main st"

    def test_empty(self):
        # Both the empty string and None normalize to "".
        assert normalize_address("") == ""
        assert normalize_address(None) == ""

    def test_idempotent(self):
        result = normalize_address("123 Main Street Apartment 4")
        assert normalize_address(result) == result
class TestNormalizeString:
    """Expected outputs of ``normalize_string``."""

    def test_trim_and_casefold(self):
        assert normalize_string(" Hello World ") == "hello world"

    def test_collapse_whitespace(self):
        # The input must contain runs of spaces; with "a b c" the input already
        # equals the expected output and the test would pass without any
        # collapsing being done.
        assert normalize_string("a   b   c") == "a b c"

    def test_empty(self):
        # Both the empty string and None normalize to "".
        assert normalize_string("") == ""
        assert normalize_string(None) == ""
class TestGetNormalizer:
    """Lookup of normalizer callables through ``get_normalizer``."""

    def test_get_by_enum(self):
        email_normalizer = get_normalizer(NormalizerType.EMAIL)
        normalized = email_normalizer("TEST@Gmail.com")
        assert normalized == "test@gmail.com"

    def test_get_by_string(self):
        phone_normalizer = get_normalizer("phone")
        normalized = phone_normalizer("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_unknown_raises(self):
        # An unrecognised identifier must raise ValueError.
        with pytest.raises(ValueError):
            get_normalizer("unknown_type")