- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
103 lines
3.5 KiB
Python
103 lines
3.5 KiB
Python
"""Tests for src.core.config — save/load configuration profiles."""
|
|
|
|
import json
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from src.core.config import (
|
|
DeduplicationConfig,
|
|
StrategyConfig,
|
|
ColumnStrategyConfig,
|
|
)
|
|
from src.core.dedup import Algorithm, SurvivorRule
|
|
from src.core.normalizers import NormalizerType
|
|
|
|
|
|
class TestDeduplicationConfig:
    """Checks for DeduplicationConfig: defaults, dict/file round trips, and conversions."""

    def test_default(self):
        """The default profile reports "first", no merge, and an empty strategy list."""
        config = DeduplicationConfig.default()

        assert config.survivor_rule == "first"
        assert config.merge is False
        assert config.strategies == []

    def test_to_dict_roundtrip(self):
        """A config rebuilt from its own to_dict() output keeps the asserted fields."""
        email_column = ColumnStrategyConfig(
            column="email",
            algorithm="exact",
            threshold=100.0,
            normalizer="email",
        )
        original = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[email_column])],
            survivor_rule="most_complete",
            merge=True,
        )

        restored = DeduplicationConfig.from_dict(original.to_dict())

        assert restored.survivor_rule == "most_complete"
        assert restored.merge is True
        assert len(restored.strategies) == 1
        assert restored.strategies[0].columns[0].column == "email"

    def test_to_file_from_file(self, tmp_path):
        """A config written with to_file() is read back by from_file() with the same values."""
        name_column = ColumnStrategyConfig(
            column="name",
            algorithm="jaro_winkler",
            threshold=85.0,
            normalizer="name",
        )
        original = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[name_column])],
            survivor_rule="last",
        )
        target = tmp_path / "test_config.json"

        original.to_file(target)
        restored = DeduplicationConfig.from_file(target)

        assert restored.survivor_rule == "last"
        assert len(restored.strategies) == 1
        assert restored.strategies[0].columns[0].algorithm == "jaro_winkler"

    def test_to_strategies(self):
        """to_strategies() yields one strategy whose first column uses the enum values."""
        # Both columns use the column name as the normalizer name.
        exact_columns = [
            ColumnStrategyConfig(
                column=field_name,
                algorithm="exact",
                threshold=100.0,
                normalizer=field_name,
            )
            for field_name in ("email", "phone")
        ]
        config = DeduplicationConfig(
            strategies=[StrategyConfig(columns=exact_columns)],
        )

        converted = config.to_strategies()

        assert converted is not None
        assert len(converted) == 1
        column_strategies = converted[0].column_strategies
        assert len(column_strategies) == 2
        assert column_strategies[0].algorithm == Algorithm.EXACT
        assert column_strategies[0].normalizer == NormalizerType.EMAIL

    def test_to_strategies_empty(self):
        """With the default config, to_strategies() returns None."""
        assert DeduplicationConfig.default().to_strategies() is None

    def test_to_survivor_rule(self):
        """The "most_complete" rule string converts to SurvivorRule.KEEP_MOST_COMPLETE."""
        config = DeduplicationConfig(survivor_rule="most_complete")

        assert config.to_survivor_rule() == SurvivorRule.KEEP_MOST_COMPLETE

    def test_json_is_valid(self, tmp_path):
        """The file written by to_file() parses as JSON and has the expected top-level keys."""
        config = DeduplicationConfig(
            strategies=[
                StrategyConfig(
                    columns=[ColumnStrategyConfig(column="x", algorithm="exact")],
                ),
            ],
            normalize_map={"email": "email"},
        )
        target = tmp_path / "valid.json"

        config.to_file(target)
        payload = json.loads(target.read_text())

        assert "strategies" in payload
        assert "normalize_map" in payload
|