feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
93
src/core/__init__.py
Normal file
93
src/core/__init__.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""DataTools deduplication engine.
|
||||
|
||||
Public API
|
||||
----------
|
||||
Core:
|
||||
deduplicate(df, ...) -> DeduplicationResult
|
||||
build_default_strategies(df) -> list[MatchStrategy]
|
||||
|
||||
Types:
|
||||
Algorithm, SurvivorRule, ColumnMatchStrategy, MatchStrategy
|
||||
MatchResult, DeduplicationResult
|
||||
|
||||
Normalizers:
|
||||
get_normalizer(type) -> Callable
|
||||
NormalizerType
|
||||
normalize_email, normalize_phone, normalize_name,
|
||||
normalize_address, normalize_string
|
||||
|
||||
I/O:
|
||||
read_file(path, ...) -> DataFrame
|
||||
write_file(df, path, ...)
|
||||
list_sheets(path) -> list[str]
|
||||
detect_encoding, detect_delimiter, detect_header_row
|
||||
|
||||
Configuration:
|
||||
DeduplicationConfig, StrategyConfig, ColumnStrategyConfig
DeduplicationConfig.from_file(path) -> DeduplicationConfig
|
||||
DeduplicationConfig.to_file(path)
|
||||
"""
|
||||
|
||||
from .dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
DeduplicationResult,
|
||||
MatchResult,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
build_default_strategies,
|
||||
deduplicate,
|
||||
)
|
||||
from .normalizers import (
|
||||
NormalizerType,
|
||||
get_normalizer,
|
||||
normalize_address,
|
||||
normalize_email,
|
||||
normalize_name,
|
||||
normalize_phone,
|
||||
normalize_string,
|
||||
)
|
||||
from .io import (
|
||||
detect_delimiter,
|
||||
detect_encoding,
|
||||
detect_header_row,
|
||||
list_sheets,
|
||||
read_file,
|
||||
write_file,
|
||||
)
|
||||
from .config import (
|
||||
ColumnStrategyConfig,
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Core
|
||||
"deduplicate",
|
||||
"build_default_strategies",
|
||||
# Types
|
||||
"Algorithm",
|
||||
"SurvivorRule",
|
||||
"ColumnMatchStrategy",
|
||||
"MatchStrategy",
|
||||
"MatchResult",
|
||||
"DeduplicationResult",
|
||||
# Normalizers
|
||||
"NormalizerType",
|
||||
"get_normalizer",
|
||||
"normalize_email",
|
||||
"normalize_phone",
|
||||
"normalize_name",
|
||||
"normalize_address",
|
||||
"normalize_string",
|
||||
# I/O
|
||||
"read_file",
|
||||
"write_file",
|
||||
"list_sheets",
|
||||
"detect_encoding",
|
||||
"detect_delimiter",
|
||||
"detect_header_row",
|
||||
# Config
|
||||
"DeduplicationConfig",
|
||||
"StrategyConfig",
|
||||
"ColumnStrategyConfig",
|
||||
]
|
||||
Reference in New Issue
Block a user