- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
94 lines
1.9 KiB
Python
94 lines
1.9 KiB
Python
"""DataTools deduplication engine.

Public API
----------
Core:
    deduplicate(df, ...) -> DeduplicationResult
    build_default_strategies(df) -> list[MatchStrategy]

Types:
    Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
    MatchResult, DeduplicationResult

Normalizers:
    get_normalizer(type) -> Callable
    NormalizerType
    normalize_email, normalize_phone, normalize_name,
    normalize_address, normalize_string

I/O:
    read_file(path, ...) -> DataFrame
    write_file(df, path, ...)
    list_sheets(path) -> list[str]
    detect_encoding, detect_delimiter, detect_header_row

Configuration:
    DeduplicationConfig.from_file(path) -> DeduplicationConfig
    DeduplicationConfig.to_file(path)
    StrategyConfig, ColumnStrategyConfig
"""

# Re-export the package's public names from its submodules.
# The statement order (.dedup, .normalizers, .io, .config) is kept as in the
# original file in case any submodule relies on import order.

# Core entry points and the types they use.
from .dedup import (
    Algorithm,
    ColumnMatchStrategy,
    DeduplicationResult,
    MatchResult,
    MatchStrategy,
    SurvivorRule,
    build_default_strategies,
    deduplicate,
)

# Normalizers.
from .normalizers import (
    NormalizerType,
    get_normalizer,
    normalize_address,
    normalize_email,
    normalize_name,
    normalize_phone,
    normalize_string,
)

# I/O helpers.
from .io import (
    detect_delimiter,
    detect_encoding,
    detect_header_row,
    list_sheets,
    read_file,
    write_file,
)

# Configuration objects.
from .config import (
    ColumnStrategyConfig,
    DeduplicationConfig,
    StrategyConfig,
)

# Names exported by ``from <package> import *``; every entry corresponds to a
# name imported from a submodule of this package.
__all__ = [
    # Core
    "deduplicate",
    "build_default_strategies",
    # Types
    "Algorithm",
    "SurvivorRule",
    "ColumnMatchStrategy",
    "MatchStrategy",
    "MatchResult",
    "DeduplicationResult",
    # Normalizers
    "NormalizerType",
    "get_normalizer",
    "normalize_email",
    "normalize_phone",
    "normalize_name",
    "normalize_address",
    "normalize_string",
    # I/O
    "read_file",
    "write_file",
    "list_sheets",
    "detect_encoding",
    "detect_delimiter",
    "detect_header_row",
    # Config
    "DeduplicationConfig",
    "StrategyConfig",
    "ColumnStrategyConfig",
]