Files
datatools-dev/src/core/__init__.py
Michael b871ab24fc feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-28 23:06:39 +00:00

94 lines
1.9 KiB
Python

"""DataTools deduplication engine.
Public API
----------
Core:
deduplicate(df, ...) -> DeduplicationResult
build_default_strategies(df) -> list[MatchStrategy]
Types:
Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
MatchResult, DeduplicationResult
Normalizers:
get_normalizer(type) -> Callable
NormalizerType
normalize_email, normalize_phone, normalize_name,
normalize_address, normalize_string
I/O:
read_file(path, ...) -> DataFrame
write_file(df, path, ...)
list_sheets(path) -> list[str]
detect_encoding, detect_delimiter, detect_header_row
Configuration:
DeduplicationConfig.from_file(path) -> DeduplicationConfig
DeduplicationConfig.to_file(path)
"""
from .dedup import (
Algorithm,
ColumnMatchStrategy,
DeduplicationResult,
MatchResult,
MatchStrategy,
SurvivorRule,
build_default_strategies,
deduplicate,
)
from .normalizers import (
NormalizerType,
get_normalizer,
normalize_address,
normalize_email,
normalize_name,
normalize_phone,
normalize_string,
)
from .io import (
detect_delimiter,
detect_encoding,
detect_header_row,
list_sheets,
read_file,
write_file,
)
from .config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
# Explicit public surface of the package: these are the names bound by
# ``from <package> import *``. Entries are grouped by the submodule that
# provides them (dedup, normalizers, io, config).
__all__ = [
    # Core entry points (from .dedup)
    "deduplicate",
    "build_default_strategies",
    # Strategy / result types (from .dedup)
    "Algorithm",
    "SurvivorRule",
    "ColumnMatchStrategy",
    "MatchStrategy",
    "MatchResult",
    "DeduplicationResult",
    # Normalizers (from .normalizers)
    "NormalizerType",
    "get_normalizer",
    "normalize_email",
    "normalize_phone",
    "normalize_name",
    "normalize_address",
    "normalize_string",
    # File I/O helpers (from .io)
    "read_file",
    "write_file",
    "list_sheets",
    "detect_encoding",
    "detect_delimiter",
    "detect_header_row",
    # Configuration objects (from .config)
    "DeduplicationConfig",
    "StrategyConfig",
    "ColumnStrategyConfig",
]