feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
47
tests/conftest.py
Normal file
47
tests/conftest.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Shared test fixtures."""
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
# Directory named "samples" that sits next to this file's parent directory.
SAMPLES_DIR = Path(__file__).parent.parent.joinpath("samples")
|
||||
|
||||
|
||||
@pytest.fixture
def sample_csv_path():
    """Return the path of ``messy_sales.csv`` inside ``SAMPLES_DIR``."""
    csv_name = "messy_sales.csv"
    return SAMPLES_DIR / csv_name
|
||||
|
||||
|
||||
@pytest.fixture
def sample_df(sample_csv_path):
    """Load the sample CSV with every column read as ``str``.

    ``keep_default_na=False`` stops pandas from applying its default
    NaN markers while parsing.
    """
    read_options = {"dtype": str, "keep_default_na": False}
    return pd.read_csv(sample_csv_path, **read_options)
|
||||
|
||||
|
||||
@pytest.fixture
def simple_df():
    """Five-row frame; rows 0, 1 and 4 share the same email and phone."""
    names = ["Alice", "alice", "Bob", "Charlie", "ALICE"]
    emails = [
        "alice@test.com",
        "alice@test.com",
        "bob@test.com",
        "charlie@test.com",
        "alice@test.com",
    ]
    phones = ["555-1234", "555-1234", "555-5678", "555-9012", "555-1234"]
    return pd.DataFrame({"name": names, "email": emails, "phone": phones})
|
||||
|
||||
|
||||
@pytest.fixture
def merge_df():
    """Three-row frame where the two "John Doe" rows each miss one field."""
    columns = {}
    columns["name"] = ["John Doe", "John Doe", "Jane Smith"]
    columns["email"] = ["john@test.com", "john@test.com", "jane@test.com"]
    # First John has a phone but no address; second John is the reverse.
    columns["phone"] = ["555-1111", "", "555-3333"]
    columns["address"] = ["", "123 Main St", "456 Oak Ave"]
    return pd.DataFrame(columns)
|
||||
|
||||
|
||||
@pytest.fixture
def tmp_csv(tmp_path, simple_df):
    """Save ``simple_df`` as ``test_input.csv`` under ``tmp_path``; return the path."""
    csv_file = tmp_path / "test_input.csv"
    # index=False keeps the row index out of the written file.
    simple_df.to_csv(csv_file, index=False)
    return csv_file
|
||||
147
tests/test_cli.py
Normal file
147
tests/test_cli.py
Normal file
@@ -0,0 +1,147 @@
|
||||
"""Integration tests for the CLI via Typer's CliRunner."""
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from src.cli import app
|
||||
|
||||
# Module-level Typer test runner; the tests below call runner.invoke(app, [...]).
runner = CliRunner()
|
||||
|
||||
|
||||
class TestCliPreview:
    """CLI invoked with only an input path (no ``--apply``)."""

    def test_preview_default(self, tmp_csv):
        """Exit code is 0 and the output mentions the preview or row counts."""
        outcome = runner.invoke(app, [str(tmp_csv)])
        assert outcome.exit_code == 0
        mentions_preview = "preview" in outcome.output.lower()
        assert mentions_preview or "Rows in" in outcome.output

    def test_preview_shows_row_counts(self, tmp_csv):
        """Both the "Rows in" and "Rows out" labels are printed."""
        outcome = runner.invoke(app, [str(tmp_csv)])
        assert outcome.exit_code == 0
        for label in ("Rows in", "Rows out"):
            assert label in outcome.output

    def test_file_not_found(self):
        """A missing input file gives a non-zero exit and a "not found" message."""
        missing_path = "/tmp/nonexistent_xyz_abc.csv"
        outcome = runner.invoke(app, [missing_path])
        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
|
||||
|
||||
|
||||
class TestCliApply:
    """CLI invoked with ``--apply``, which writes result files to disk."""

    def test_apply_writes_output(self, tmp_csv, tmp_path):
        """``-o`` selects the output file, which must exist afterwards."""
        out = tmp_path / "output.csv"
        result = runner.invoke(app, [str(tmp_csv), "--apply", "-o", str(out)])
        assert result.exit_code == 0
        assert out.exists()

    def test_apply_default_output_name(self, tmp_csv):
        """Without ``-o`` the output is ``<stem>_deduplicated.csv`` beside the input."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        expected = tmp_csv.parent / f"{tmp_csv.stem}_deduplicated.csv"
        assert expected.exists()

    def test_apply_creates_removed_file(self, tmp_csv):
        """If a ``<stem>_removed.csv`` file is produced, it must not be empty."""
        result = runner.invoke(app, [str(tmp_csv), "--apply"])
        assert result.exit_code == 0
        removed = tmp_csv.parent / f"{tmp_csv.stem}_removed.csv"
        # The file may or may not exist depending on whether duplicates were
        # found with default auto-detect on simple_df, so its presence is not
        # required. When it is written it must at least carry some content.
        if removed.exists():
            assert removed.stat().st_size > 0
|
||||
|
||||
|
||||
class TestCliFuzzy:
    """Column-selection flags: ``--fuzzy``, ``--threshold`` and ``--subset``."""

    def test_fuzzy_flag(self, tmp_csv):
        """Fuzzy matching on ``name`` with a threshold of 80 exits cleanly."""
        cli_args = [str(tmp_csv), "--fuzzy", "name", "--threshold", "80"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0

    def test_subset_flag(self, tmp_csv):
        """Restricting matching to the ``email`` column exits cleanly."""
        cli_args = [str(tmp_csv), "--subset", "email"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0

    def test_bad_column_error(self, tmp_csv):
        """An unknown ``--subset`` column fails with a "not found" message."""
        cli_args = [str(tmp_csv), "--subset", "nonexistent_column"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code != 0
        assert "not found" in outcome.output.lower()
|
||||
|
||||
|
||||
class TestCliConfig:
    """Round trip of ``--save-config`` followed by ``--config``."""

    def test_save_and_load_config(self, tmp_csv, tmp_path):
        """A config saved by one run can be loaded and applied by the next."""
        config_file = tmp_path / "my_config.json"
        input_arg = str(tmp_csv)
        config_arg = str(config_file)

        # First run: write the configuration profile.
        save_outcome = runner.invoke(
            app, [input_arg, "--subset", "email", "--save-config", config_arg]
        )
        assert save_outcome.exit_code == 0
        assert config_file.exists()

        # Second run: load that profile and apply it.
        load_outcome = runner.invoke(
            app, [input_arg, "--config", config_arg, "--apply"]
        )
        assert load_outcome.exit_code == 0
|
||||
|
||||
|
||||
class TestCliSurvivor:
    """Accepted and rejected values of the ``--survivor`` option."""

    def test_survivor_last(self, tmp_csv):
        """``--survivor last`` is accepted."""
        outcome = runner.invoke(app, [str(tmp_csv), "--survivor", "last"])
        assert outcome.exit_code == 0

    def test_survivor_most_complete(self, tmp_csv):
        """``--survivor most-complete`` is accepted."""
        cli_args = [str(tmp_csv), "--survivor", "most-complete"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0

    def test_invalid_survivor(self, tmp_csv):
        """An unrecognised survivor value produces a non-zero exit code."""
        outcome = runner.invoke(app, [str(tmp_csv), "--survivor", "bogus"])
        assert outcome.exit_code != 0
|
||||
|
||||
|
||||
class TestCliMerge:
    """The ``--merge`` flag combined with ``--apply``."""

    def test_merge_flag(self, tmp_csv):
        """Running with both flags exits cleanly."""
        cli_args = [str(tmp_csv), "--merge", "--apply"]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
|
||||
|
||||
|
||||
class TestCliSampleData:
    """End-to-end CLI runs against the bundled sample CSV."""

    def test_sample_preview(self, sample_csv_path):
        """The preview reports 50 input rows and a "Removed:" line."""
        outcome = runner.invoke(app, [str(sample_csv_path)])
        assert outcome.exit_code == 0
        assert "Rows in: 50" in outcome.output
        # The sample is expected to contain duplicates.
        assert "Removed:" in outcome.output

    def test_sample_apply(self, sample_csv_path, tmp_path):
        """Applying writes an output file with fewer than 50 rows."""
        import pandas as pd

        target = tmp_path / "deduped.csv"
        cli_args = [str(sample_csv_path), "--apply", "-o", str(target)]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert target.exists()

        written = pd.read_csv(target, encoding="utf-8-sig")
        # Fewer rows than the 50 in the input.
        assert len(written) < 50

    def test_sample_fuzzy_with_merge(self, sample_csv_path, tmp_path):
        """Fuzzy matching on ``customer_name`` plus merge writes the output file."""
        target = tmp_path / "fuzzy_merged.csv"
        cli_args = [str(sample_csv_path)]
        cli_args += ["--fuzzy", "customer_name"]
        cli_args += ["--threshold", "80"]
        cli_args += ["--merge", "--apply"]
        cli_args += ["-o", str(target)]
        outcome = runner.invoke(app, cli_args)
        assert outcome.exit_code == 0
        assert target.exists()
|
||||
|
||||
|
||||
class TestCliHelp:
    """Output of ``--help``."""

    def test_help(self):
        """The help text is printed successfully and lists ``--apply``."""
        outcome = runner.invoke(app, ["--help"])
        assert outcome.exit_code == 0
        help_text = outcome.output
        assert "--apply" in help_text
|
||||
102
tests/test_config.py
Normal file
102
tests/test_config.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Tests for src.core.config — save/load configuration profiles."""
|
||||
|
||||
import json
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from src.core.config import (
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
ColumnStrategyConfig,
|
||||
)
|
||||
from src.core.dedup import Algorithm, SurvivorRule
|
||||
from src.core.normalizers import NormalizerType
|
||||
|
||||
|
||||
class TestDeduplicationConfig:
    """Construction, serialisation and conversion of ``DeduplicationConfig``."""

    def test_default(self):
        """The default profile keeps the first row, does not merge, has no strategies."""
        default_cfg = DeduplicationConfig.default()
        assert default_cfg.survivor_rule == "first"
        assert default_cfg.merge is False
        assert default_cfg.strategies == []

    def test_to_dict_roundtrip(self):
        """``to_dict`` followed by ``from_dict`` preserves the checked fields."""
        email_column = ColumnStrategyConfig(
            column="email",
            algorithm="exact",
            threshold=100.0,
            normalizer="email",
        )
        original = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[email_column])],
            survivor_rule="most_complete",
            merge=True,
        )
        restored = DeduplicationConfig.from_dict(original.to_dict())
        assert restored.survivor_rule == "most_complete"
        assert restored.merge is True
        assert len(restored.strategies) == 1
        assert restored.strategies[0].columns[0].column == "email"

    def test_to_file_from_file(self, tmp_path):
        """A profile written with ``to_file`` is read back by ``from_file``."""
        name_column = ColumnStrategyConfig(
            column="name",
            algorithm="jaro_winkler",
            threshold=85.0,
            normalizer="name",
        )
        original = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[name_column])],
            survivor_rule="last",
        )
        config_file = tmp_path / "test_config.json"
        original.to_file(config_file)

        reloaded = DeduplicationConfig.from_file(config_file)
        assert reloaded.survivor_rule == "last"
        assert len(reloaded.strategies) == 1
        assert reloaded.strategies[0].columns[0].algorithm == "jaro_winkler"

    def test_to_strategies(self):
        """String settings are converted to engine strategies with enum values."""
        column_configs = [
            ColumnStrategyConfig(
                column="email",
                algorithm="exact",
                threshold=100.0,
                normalizer="email",
            ),
            ColumnStrategyConfig(
                column="phone",
                algorithm="exact",
                threshold=100.0,
                normalizer="phone",
            ),
        ]
        profile = DeduplicationConfig(
            strategies=[StrategyConfig(columns=column_configs)],
        )
        converted = profile.to_strategies()
        assert converted is not None
        assert len(converted) == 1
        assert len(converted[0].column_strategies) == 2
        first_column = converted[0].column_strategies[0]
        assert first_column.algorithm == Algorithm.EXACT
        assert first_column.normalizer == NormalizerType.EMAIL

    def test_to_strategies_empty(self):
        """With no strategies configured, ``to_strategies`` returns ``None``."""
        default_cfg = DeduplicationConfig.default()
        assert default_cfg.to_strategies() is None

    def test_to_survivor_rule(self):
        """The string "most_complete" maps to ``SurvivorRule.KEEP_MOST_COMPLETE``."""
        profile = DeduplicationConfig(survivor_rule="most_complete")
        assert profile.to_survivor_rule() == SurvivorRule.KEEP_MOST_COMPLETE

    def test_json_is_valid(self, tmp_path):
        """The written file parses as JSON and contains the expected top-level keys."""
        profile = DeduplicationConfig(
            strategies=[
                StrategyConfig(
                    columns=[ColumnStrategyConfig(column="x", algorithm="exact")],
                ),
            ],
            normalize_map={"email": "email"},
        )
        config_file = tmp_path / "valid.json"
        profile.to_file(config_file)
        parsed = json.loads(config_file.read_text())
        for key in ("strategies", "normalize_map"):
            assert key in parsed
|
||||
258
tests/test_dedup.py
Normal file
258
tests/test_dedup.py
Normal file
@@ -0,0 +1,258 @@
|
||||
"""Tests for src.core.dedup — matching engine."""
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
|
||||
from src.core.dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
_compute_similarity,
|
||||
_compare_pair,
|
||||
_UnionFind,
|
||||
build_default_strategies,
|
||||
deduplicate,
|
||||
)
|
||||
from src.core.normalizers import NormalizerType
|
||||
|
||||
|
||||
class TestComputeSimilarity:
    """Scores returned by ``_compute_similarity`` for each algorithm."""

    def test_exact_match(self):
        """Identical strings score 100 under EXACT."""
        score = _compute_similarity("hello", "hello", Algorithm.EXACT)
        assert score == 100.0

    def test_exact_mismatch(self):
        """Different strings score 0 under EXACT."""
        score = _compute_similarity("hello", "world", Algorithm.EXACT)
        assert score == 0.0

    def test_levenshtein_similar(self):
        """"kitten" vs "sitting" scores strictly between 50 and 80."""
        score = _compute_similarity("kitten", "sitting", Algorithm.LEVENSHTEIN)
        assert 50 < score < 80

    def test_jaro_winkler_similar(self):
        """"john" vs "jon" scores above 80."""
        score = _compute_similarity("john", "jon", Algorithm.JARO_WINKLER)
        assert score > 80

    def test_token_set_ratio(self):
        """The same tokens in a different order score 100."""
        left = "123 main street apt 4"
        right = "apt 4 123 main street"
        score = _compute_similarity(left, right, Algorithm.TOKEN_SET_RATIO)
        assert score == 100.0
|
||||
|
||||
|
||||
class TestUnionFind:
    """Behaviour of the ``_UnionFind`` helper."""

    def test_basic_union(self):
        """Unions are transitive: 0-1 and 1-2 put 0 and 2 under one root."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2)):
            forest.union(left, right)
        assert forest.find(0) == forest.find(2)

    def test_separate_groups(self):
        """Elements that were never joined keep distinct roots."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (3, 4)):
            forest.union(left, right)
        assert forest.find(0) != forest.find(3)

    def test_groups(self):
        """``groups()`` yields two groups, of sizes 2 and 3."""
        forest = _UnionFind(5)
        for left, right in ((0, 1), (1, 2), (3, 4)):
            forest.union(left, right)
        grouped = forest.groups()
        assert len(grouped) == 2
        member_counts = [len(members) for members in grouped.values()]
        member_counts.sort()
        assert member_counts == [2, 3]
|
||||
|
||||
|
||||
class TestComparePair:
    """Results of ``_compare_pair`` for two rows under one strategy."""

    def test_exact_match(self):
        """Equal emails match with confidence 100 on the ``email`` column."""
        email_only = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        left = pd.Series({"email": "test@example.com"})
        right = pd.Series({"email": "test@example.com"})
        matched, confidence, matched_columns = _compare_pair(left, right, email_only)
        assert matched
        assert confidence == 100.0
        assert matched_columns == ["email"]

    def test_exact_mismatch(self):
        """Different emails do not match under EXACT."""
        email_only = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        left = pd.Series({"email": "a@test.com"})
        right = pd.Series({"email": "b@test.com"})
        matched, _confidence, _columns = _compare_pair(left, right, email_only)
        assert not matched

    def test_fuzzy_match(self):
        """Near-identical names match under Jaro-Winkler with threshold 80."""
        fuzzy_name = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(
                column="name",
                algorithm=Algorithm.JARO_WINKLER,
                threshold=80,
            ),
        ])
        left = pd.Series({"name": "john smith"})
        right = pd.Series({"name": "jon smith"})
        matched, confidence, _columns = _compare_pair(left, right, fuzzy_name)
        assert matched
        assert confidence > 80

    def test_and_logic_both_must_match(self):
        """With two column strategies, one mismatching column rejects the pair."""
        name_and_email = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT),
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])
        # Same name, different email.
        left = pd.Series({"name": "alice", "email": "a@test.com"})
        right = pd.Series({"name": "alice", "email": "b@test.com"})
        matched, _confidence, _columns = _compare_pair(left, right, name_and_email)
        assert not matched
|
||||
|
||||
|
||||
class TestBuildDefaultStrategies:
    """Strategies produced by ``build_default_strategies`` for a given frame."""

    def test_detects_email(self):
        """An ``email`` column yields an email strategy; ``name`` never stands alone."""
        frame = pd.DataFrame({"email": ["a@b.com"], "name": ["Alice"]})
        strategies = build_default_strategies(frame)
        # One standalone email strategy plus one name-AND-email strategy.
        assert len(strategies) == 2

        email_seen = False
        for strategy in strategies:
            for column_strategy in strategy.column_strategies:
                if (
                    column_strategy.column == "email"
                    and column_strategy.normalizer == NormalizerType.EMAIL
                ):
                    email_seen = True
        assert email_seen

        # Every strategy that uses "name" must also carry another column.
        for strategy in strategies:
            uses_name = any(
                column_strategy.column == "name"
                for column_strategy in strategy.column_strategies
            )
            if uses_name:
                assert len(strategy.column_strategies) >= 2, "Name should be paired with a strong key"

    def test_fallback_all_columns(self):
        """Columns with unrecognised names give one strategy covering all three."""
        frame = pd.DataFrame({"x": [1], "y": [2], "z": [3]})
        strategies = build_default_strategies(frame)
        assert len(strategies) == 1
        only_strategy = strategies[0]
        assert len(only_strategy.column_strategies) == 3
|
||||
|
||||
|
||||
class TestDeduplicate:
    """End-to-end behaviour of ``deduplicate`` on small frames and the sample data."""

    @staticmethod
    def _exact_email_strategy():
        """Build a strategy that matches rows on exactly equal ``email`` values."""
        return MatchStrategy(column_strategies=[
            ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT),
        ])

    def test_exact_duplicates(self, simple_df):
        """Three rows sharing one email collapse into a single match group."""
        outcome = deduplicate(simple_df, strategies=[self._exact_email_strategy()])
        # Three Alice rows become one; Bob and Charlie remain: 3 rows.
        assert len(outcome.deduplicated_df) == 3
        assert outcome.original_row_count == 5
        assert len(outcome.match_groups) == 1

    def test_fuzzy_name_match(self):
        """"John Smith" and "Jon Smith" are grouped by fuzzy name matching."""
        people = pd.DataFrame({
            "name": ["John Smith", "Jon Smith", "Jane Doe"],
            "email": ["a@test.com", "b@test.com", "c@test.com"],
        })
        fuzzy_name = ColumnMatchStrategy(
            column="name",
            algorithm=Algorithm.JARO_WINKLER,
            threshold=85,
            normalizer=NormalizerType.NAME,
        )
        by_name = MatchStrategy(column_strategies=[fuzzy_name])
        outcome = deduplicate(people, strategies=[by_name])
        assert len(outcome.deduplicated_df) == 2
        assert len(outcome.match_groups) == 1

    def test_survivor_keep_last(self, simple_df):
        """With KEEP_LAST the survivor of the Alice group is index 4."""
        outcome = deduplicate(
            simple_df,
            strategies=[self._exact_email_strategy()],
            survivor_rule=SurvivorRule.KEEP_LAST,
        )
        assert len(outcome.match_groups) == 1
        only_group = outcome.match_groups[0]
        assert only_group.survivor_index == 4

    def test_survivor_most_complete(self, merge_df):
        """KEEP_MOST_COMPLETE leaves two rows for the merge fixture."""
        outcome = deduplicate(
            merge_df,
            strategies=[self._exact_email_strategy()],
            survivor_rule=SurvivorRule.KEEP_MOST_COMPLETE,
        )
        # Rows 0 and 1 each have exactly one empty field (address and phone
        # respectively), so they tie on completeness.
        assert len(outcome.deduplicated_df) == 2

    def test_merge_mode(self, merge_df):
        """With ``merge=True`` the surviving John row has phone and address filled."""
        outcome = deduplicate(
            merge_df,
            strategies=[self._exact_email_strategy()],
            merge=True,
        )
        kept = outcome.deduplicated_df
        john = kept[kept["name"] == "John Doe"].iloc[0]
        assert john["phone"] == "555-1111"
        assert john["address"] == "123 Main St"

    def test_multi_strategy_or(self):
        """Matches from different strategies chain into a single group."""
        people = pd.DataFrame({
            "name": ["Alice", "Bob", "Alice B."],
            "email": ["a@test.com", "a@test.com", "c@test.com"],
        })
        by_email = self._exact_email_strategy()
        by_fuzzy_name = MatchStrategy(column_strategies=[
            ColumnMatchStrategy(
                column="name",
                algorithm=Algorithm.JARO_WINKLER,
                threshold=70,
            ),
        ])
        outcome = deduplicate(people, strategies=[by_email, by_fuzzy_name])
        # Alice~Bob via email and Alice~Alice B. via name, joined transitively.
        assert len(outcome.deduplicated_df) == 1

    def test_confidence_score(self, simple_df):
        """Every match group reports a confidence within [0, 100]."""
        outcome = deduplicate(simple_df, strategies=[self._exact_email_strategy()])
        for match_group in outcome.match_groups:
            assert 0 <= match_group.confidence <= 100

    def test_preview_flag(self, simple_df):
        """``is_preview`` on the result mirrors the ``preview`` argument."""
        previewed = deduplicate(simple_df, preview=True)
        assert previewed.is_preview is True
        applied = deduplicate(simple_df, preview=False)
        assert applied.is_preview is False

    def test_auto_detect_strategies(self, sample_df):
        """Without explicit strategies the sample data still yields duplicates."""
        outcome = deduplicate(sample_df)
        assert len(outcome.match_groups) > 0
        assert len(outcome.deduplicated_df) < len(sample_df)

    def test_idempotent(self, sample_df):
        """A second pass over already-deduplicated data removes nothing more."""
        first_pass = deduplicate(sample_df)
        second_pass = deduplicate(first_pass.deduplicated_df)
        # No new groups and an unchanged row count.
        assert len(second_pass.match_groups) == 0
        assert len(second_pass.deduplicated_df) == len(first_pass.deduplicated_df)

    def test_review_callback(self):
        """The review callback's return value decides whether a group is removed."""
        emails = pd.DataFrame({
            "email": ["a@test.com", "a@test.com", "b@test.com"],
        })
        by_email = self._exact_email_strategy()

        # Callback returns False for every group: all three rows stay.
        rejected = deduplicate(
            emails,
            strategies=[by_email],
            review_callback=lambda g, d: False,
        )
        assert len(rejected.deduplicated_df) == 3

        # Callback returns True for every group: the duplicate is dropped.
        accepted = deduplicate(
            emails,
            strategies=[by_email],
            review_callback=lambda g, d: True,
        )
        assert len(accepted.deduplicated_df) == 2
|
||||
130
tests/test_io.py
Normal file
130
tests/test_io.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""Tests for src.core.io — file reading, encoding/delimiter detection."""
|
||||
|
||||
import pandas as pd
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from src.core.io import (
|
||||
detect_encoding,
|
||||
detect_delimiter,
|
||||
detect_header_row,
|
||||
read_file,
|
||||
write_file,
|
||||
list_sheets,
|
||||
)
|
||||
|
||||
|
||||
class TestDetectEncoding:
    """Encoding names reported by ``detect_encoding`` for various byte contents."""

    def test_utf8_file(self, sample_csv_path):
        """The sample CSV is reported as UTF-8 (optionally with BOM) or ASCII."""
        enc = detect_encoding(sample_csv_path)
        assert enc.lower().replace("-", "") in ("utf8", "ascii", "utf8sig")

    def test_empty_file(self, tmp_path):
        """A zero-byte file is reported as "utf-8"."""
        f = tmp_path / "empty.csv"
        f.write_bytes(b"")
        assert detect_encoding(f) == "utf-8"

    def test_bom_file(self, tmp_path):
        """A file starting with the UTF-8 BOM is reported as "utf-8-sig"."""
        f = tmp_path / "bom.csv"
        f.write_bytes(b"\xef\xbb\xbfname,email\nAlice,a@b.com\n")
        assert detect_encoding(f) == "utf-8-sig"

    def test_latin1_file(self, tmp_path):
        """Latin-1 bytes are reported as one of the accepted encodings."""
        f = tmp_path / "latin.csv"
        content = "name,city\nJosé,São Paulo\n".encode("latin-1")
        f.write_bytes(content)
        enc = detect_encoding(f)
        # Should detect something compatible with the latin-1 family. Compare
        # case-insensitively, as test_utf8_file does, so an upper-case name
        # such as "ISO-8859-1" is accepted too.
        assert enc.lower() in ("iso-8859-1", "latin-1", "windows-1252", "cp1252",
                               "iso-8859-9", "cp1250", "iso-8859-15", "utf-8")
|
||||
|
||||
|
||||
class TestDetectDelimiter:
    """Delimiter characters reported by ``detect_delimiter``."""

    def test_comma(self, sample_csv_path):
        """The sample CSV is comma-delimited."""
        found = detect_delimiter(sample_csv_path)
        assert found == ","

    def test_tab(self, tmp_path):
        """Tab-separated content is reported as a tab."""
        tsv_file = tmp_path / "tabs.tsv"
        tsv_file.write_text("name\temail\nAlice\ta@b.com\n")
        found = detect_delimiter(tsv_file)
        assert found == "\t"

    def test_semicolon(self, tmp_path):
        """Semicolon-separated content is reported as ";"."""
        semi_file = tmp_path / "semi.csv"
        semi_file.write_text("name;email;phone\nAlice;a@b.com;555\n")
        found = detect_delimiter(semi_file)
        assert found == ";"

    def test_pipe(self, tmp_path):
        """Pipe-separated content is reported as "|"."""
        pipe_file = tmp_path / "pipe.csv"
        pipe_file.write_text("name|email|phone\nAlice|a@b.com|555\n")
        found = detect_delimiter(pipe_file)
        assert found == "|"
|
||||
|
||||
|
||||
class TestDetectHeaderRow:
    """Header row index reported by ``detect_header_row``."""

    def test_standard_csv(self, sample_csv_path):
        """The sample CSV has its header on row 0."""
        header_index = detect_header_row(sample_csv_path)
        assert header_index == 0

    def test_with_junk_rows(self, tmp_path):
        """A title line and a blank line before the header give row 0 or row 2."""
        junk_file = tmp_path / "junk.csv"
        lines = "Report generated 2024-01-01\n\nname,email,phone\nAlice,a@b.com,555\n"
        junk_file.write_text(lines)
        # Row 0 is a single non-numeric string and row 2 is the real header.
        # Either index is accepted because the answer depends on delimiter
        # detection.
        header_index = detect_header_row(junk_file)
        assert header_index in (0, 2)
|
||||
|
||||
|
||||
class TestReadFile:
    """Behaviour of ``read_file`` on the sample CSV and on a missing path."""

    def test_read_csv(self, sample_csv_path):
        """The sample CSV loads as a 50-row DataFrame with a ``customer_name`` column."""
        df = read_file(sample_csv_path)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 50
        assert "customer_name" in df.columns

    def test_read_nonexistent(self, tmp_path):
        """A path that does not exist raises ``FileNotFoundError``."""
        # tmp_path is a fresh per-test directory, so this file cannot exist;
        # a fixed path under /tmp could be created by anything on the machine.
        missing = tmp_path / "nonexistent_file_xyz.csv"
        with pytest.raises(FileNotFoundError):
            read_file(str(missing))

    def test_read_with_encoding_override(self, sample_csv_path):
        """Passing ``encoding="utf-8"`` explicitly still reads all 50 rows."""
        df = read_file(sample_csv_path, encoding="utf-8")
        assert len(df) == 50

    def test_chunked_reading(self, sample_csv_path):
        """``chunk_size=10`` yields five chunks that total 50 rows."""
        chunks = read_file(sample_csv_path, chunk_size=10)
        # The result is iterated once and collected into a list.
        all_chunks = list(chunks)
        assert len(all_chunks) == 5
        total_rows = sum(len(c) for c in all_chunks)
        assert total_rows == 50
|
||||
|
||||
|
||||
class TestWriteFile:
    """Files produced by ``write_file`` for CSV and XLSX targets."""

    def test_write_csv(self, tmp_path, simple_df):
        """A written CSV exists and reads back with the same number of rows."""
        target = tmp_path / "output.csv"
        write_file(simple_df, target)
        assert target.exists()
        round_tripped = pd.read_csv(target, encoding="utf-8-sig")
        assert len(round_tripped) == len(simple_df)

    def test_write_xlsx(self, tmp_path, simple_df):
        """A written workbook exists and reads back with the same number of rows."""
        target = tmp_path / "output.xlsx"
        write_file(simple_df, target)
        assert target.exists()
        round_tripped = pd.read_excel(target)
        assert len(round_tripped) == len(simple_df)

    def test_utf8_bom_default(self, tmp_path, simple_df):
        """By default the CSV starts with the three-byte UTF-8 BOM."""
        target = tmp_path / "bom.csv"
        write_file(simple_df, target)
        leading_bytes = target.read_bytes()[:3]
        assert leading_bytes == b"\xef\xbb\xbf"
|
||||
|
||||
|
||||
class TestListSheets:
    """Sheet names reported by ``list_sheets``."""

    def test_list_sheets(self, tmp_path, simple_df):
        """A two-sheet workbook lists both sheet names in the order written."""
        workbook = tmp_path / "multi.xlsx"
        with pd.ExcelWriter(workbook, engine="openpyxl") as writer:
            for sheet_name in ("Sheet1", "Sheet2"):
                simple_df.to_excel(writer, sheet_name=sheet_name, index=False)
        found = list_sheets(workbook)
        assert found == ["Sheet1", "Sheet2"]
|
||||
158
tests/test_normalizers.py
Normal file
158
tests/test_normalizers.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""Tests for src.core.normalizers."""
|
||||
|
||||
import pytest
|
||||
from src.core.normalizers import (
|
||||
NormalizerType,
|
||||
get_normalizer,
|
||||
normalize_email,
|
||||
normalize_phone,
|
||||
normalize_name,
|
||||
normalize_address,
|
||||
normalize_string,
|
||||
)
|
||||
|
||||
|
||||
class TestNormalizeEmail:
    """Expected outputs of ``normalize_email``."""

    def test_basic_lowercase(self):
        """Mixed-case input is lower-cased."""
        normalized = normalize_email("John@Example.COM")
        assert normalized == "john@example.com"

    def test_strip_whitespace(self):
        """Surrounding whitespace is removed."""
        normalized = normalize_email(" alice@test.com ")
        assert normalized == "alice@test.com"

    def test_strip_gmail_dots(self):
        """Dots in the local part of a gmail.com address are dropped."""
        normalized = normalize_email("j.o.h.n@gmail.com")
        assert normalized == "john@gmail.com"

    def test_strip_plus_tag(self):
        """A "+tag" suffix in the local part is dropped."""
        normalized = normalize_email("alice+promo@test.com")
        assert normalized == "alice@test.com"

    def test_gmail_dots_and_plus(self):
        """Dots and a "+tag" are both removed from a gmail.com address."""
        normalized = normalize_email("j.smith+tag@gmail.com")
        assert normalized == "jsmith@gmail.com"

    def test_non_gmail_keeps_dots(self):
        """Dots are kept for a non-Gmail domain."""
        normalized = normalize_email("j.smith@company.com")
        assert normalized == "j.smith@company.com"

    def test_empty(self):
        """Both the empty string and ``None`` normalise to ""."""
        for blank in ("", None):
            assert normalize_email(blank) == ""

    def test_no_at_sign(self):
        """Input without "@" is returned as given here."""
        normalized = normalize_email("not-an-email")
        assert normalized == "not-an-email"

    def test_idempotent(self):
        """Normalising an already-normalised value changes nothing."""
        once = normalize_email("J.Smith+tag@Gmail.com")
        twice = normalize_email(once)
        assert twice == once
|
||||
|
||||
|
||||
class TestNormalizePhone:
    """Expected outputs of ``normalize_phone``."""

    def test_us_formatted(self):
        """Parenthesised area code with dash."""
        normalized = normalize_phone("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_dashes(self):
        """Dash-separated digits."""
        normalized = normalize_phone("555-123-4567")
        assert normalized == "+15551234567"

    def test_dots(self):
        """Dot-separated digits."""
        normalized = normalize_phone("555.123.4567")
        assert normalized == "+15551234567"

    def test_with_country_code(self):
        """Input that already carries "+1"."""
        normalized = normalize_phone("+1 555-123-4567")
        assert normalized == "+15551234567"

    def test_digits_only_input(self):
        """Ten bare digits."""
        normalized = normalize_phone("5551234567")
        assert normalized == "+15551234567"

    def test_empty(self):
        """Both the empty string and ``None`` normalise to ""."""
        for blank in ("", None):
            assert normalize_phone(blank) == ""

    def test_invalid_fallback_digits(self):
        """A very short number comes back as its digits."""
        too_short = "123"
        normalized = normalize_phone(too_short)
        assert normalized == "123"

    def test_idempotent(self):
        """Normalising an already-normalised value changes nothing."""
        once = normalize_phone("(555) 123-4567")
        twice = normalize_phone(once)
        assert twice == once
|
||||
|
||||
|
||||
class TestNormalizeName:
    """Expected outputs of ``normalize_name``."""

    def test_strip_mr(self):
        """The "Mr." title is removed."""
        assert normalize_name("Mr. John Smith") == "john smith"

    def test_strip_dr(self):
        """The "Dr." title is removed."""
        assert normalize_name("Dr. Jane Doe") == "jane doe"

    def test_strip_suffix(self):
        """The "Jr." suffix is removed."""
        assert normalize_name("Robert Brown Jr.") == "robert brown"

    def test_strip_numeral_suffix(self):
        """The "III" suffix is removed."""
        assert normalize_name("James Wilson III") == "james wilson"

    def test_title_and_suffix(self):
        """A title and a suffix on the same name are both removed."""
        assert normalize_name("Dr. Michael Williams III") == "michael williams"

    def test_collapse_whitespace(self):
        """Runs of spaces inside and around the name become single spaces."""
        # The input needs consecutive spaces; with single spaces only, this
        # test would pass on trimming alone and never exercise collapsing.
        assert normalize_name("  John   Smith  ") == "john smith"

    def test_case_fold(self):
        """Upper-case input is lower-cased."""
        assert normalize_name("JOHN SMITH") == "john smith"

    def test_empty(self):
        """Both the empty string and ``None`` normalise to ""."""
        assert normalize_name("") == ""
        assert normalize_name(None) == ""

    def test_idempotent(self):
        """Normalising an already-normalised value changes nothing."""
        result = normalize_name("Mr. John Smith Jr.")
        assert normalize_name(result) == result
|
||||
|
||||
|
||||
class TestNormalizeAddress:
    """Expected outputs of ``normalize_address``."""

    def test_street_abbreviation(self):
        """"Street" becomes "st"."""
        assert normalize_address("123 Main Street") == "123 main st"

    def test_avenue_abbreviation(self):
        """"Avenue" becomes "ave"."""
        assert normalize_address("456 Oak Avenue") == "456 oak ave"

    def test_boulevard_abbreviation(self):
        """"Boulevard" becomes "blvd"."""
        assert normalize_address("789 Pine Boulevard") == "789 pine blvd"

    def test_apartment(self):
        """"Apartment" becomes "apt"."""
        assert normalize_address("123 Main St Apartment 4") == "123 main st apt 4"

    def test_direction(self):
        """"North" becomes "n"."""
        assert normalize_address("111 First Street North") == "111 first st n"

    def test_collapse_whitespace(self):
        """Runs of spaces inside and around the address become single spaces."""
        # The input needs consecutive spaces; with single spaces only, this
        # test would pass on trimming alone and never exercise collapsing.
        assert normalize_address("  123   Main   Street  ") == "123 main st"

    def test_empty(self):
        """Both the empty string and ``None`` normalise to ""."""
        assert normalize_address("") == ""
        assert normalize_address(None) == ""

    def test_idempotent(self):
        """Normalising an already-normalised value changes nothing."""
        result = normalize_address("123 Main Street Apartment 4")
        assert normalize_address(result) == result
|
||||
|
||||
|
||||
class TestNormalizeString:
    """Expected outputs of ``normalize_string``."""

    def test_trim_and_casefold(self):
        """Surrounding whitespace is removed and letters are lower-cased."""
        assert normalize_string(" Hello World ") == "hello world"

    def test_collapse_whitespace(self):
        """Runs of spaces between words become single spaces."""
        # The input needs consecutive spaces; "a b c" is already collapsed
        # and would pass even if the function did nothing to it.
        assert normalize_string("a   b    c") == "a b c"

    def test_empty(self):
        """Both the empty string and ``None`` normalise to ""."""
        assert normalize_string("") == ""
        assert normalize_string(None) == ""
|
||||
|
||||
|
||||
class TestGetNormalizer:
    """Lookup of normaliser callables through ``get_normalizer``."""

    def test_get_by_enum(self):
        """An enum member returns a callable that normalises emails."""
        email_normalizer = get_normalizer(NormalizerType.EMAIL)
        normalized = email_normalizer("TEST@Gmail.com")
        assert normalized == "test@gmail.com"

    def test_get_by_string(self):
        """The string "phone" returns a callable that normalises phone numbers."""
        phone_normalizer = get_normalizer("phone")
        normalized = phone_normalizer("(555) 123-4567")
        assert normalized == "+15551234567"

    def test_unknown_raises(self):
        """An unrecognised type name raises ``ValueError``."""
        unknown_name = "unknown_type"
        with pytest.raises(ValueError):
            get_normalizer(unknown_name)
|
||||
Reference in New Issue
Block a user