feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

413
src/gui/components.py Normal file
View File

@@ -0,0 +1,413 @@
"""Reusable Streamlit widgets for the deduplicator GUI."""
from __future__ import annotations
import io
from typing import Optional
import pandas as pd
import streamlit as st
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
DeduplicationResult,
MatchResult,
MatchStrategy,
SurvivorRule,
)
from src.core.config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
from src.core.normalizers import NormalizerType
# ---------------------------------------------------------------------------
# Config panel (advanced options)
# ---------------------------------------------------------------------------
def config_panel(df: pd.DataFrame) -> dict:
"""Render the Advanced Options expander. Returns a settings dict.
Keys returned:
strategies: list[MatchStrategy] | None
survivor_rule: SurvivorRule
date_column: str | None
merge: bool
"""
columns = list(df.columns)
with st.expander("Advanced Options"):
col_left, col_right = st.columns(2)
with col_left:
subset_cols = st.multiselect(
"Match on columns",
columns,
default=[],
help="Leave empty to auto-detect based on column names.",
)
key_cols = st.multiselect(
"Strong keys",
columns,
default=[],
help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
)
fuzzy_cols = st.multiselect(
"Fuzzy columns",
columns,
default=[],
help="Columns to fuzzy-match. Others use exact matching.",
)
with col_right:
algorithm = st.selectbox(
"Fuzzy algorithm",
["jaro_winkler", "levenshtein", "token_set_ratio"],
index=0,
help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
)
threshold = st.slider(
"Similarity threshold",
min_value=50,
max_value=100,
value=85,
help="Lower = more matches but more false positives.",
)
survivor = st.selectbox(
"Survivor rule",
["first", "last", "most-complete", "most-recent"],
index=0,
help="Which row to keep when duplicates are found.",
)
# Second row of options
col_a, col_b = st.columns(2)
with col_a:
normalize_options = {c: "auto" for c in columns}
normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]
normalize_map: dict[str, str] = {}
if fuzzy_cols or subset_cols:
target_cols = fuzzy_cols or subset_cols
st.markdown("**Per-column normalizers**")
for col_name in target_cols:
norm = st.selectbox(
f"Normalizer for '{col_name}'",
normalizer_types,
index=0,
key=f"norm_{col_name}",
)
if norm not in ("auto", "none"):
normalize_map[col_name] = norm
with col_b:
merge = st.checkbox(
"Merge mode",
value=False,
help="Fill missing fields in the surviving row from removed duplicates.",
)
date_column: Optional[str] = None
if survivor == "most-recent":
date_column = st.selectbox(
"Date column",
columns,
help="Required for most-recent survivor rule.",
)
# Config save/load
st.divider()
cfg_left, cfg_right = st.columns(2)
with cfg_left:
config_file = st.file_uploader(
"Load config profile",
type=["json"],
help="Load previously saved settings.",
key="config_upload",
)
if config_file is not None:
import json
try:
data = json.loads(config_file.read())
loaded = DeduplicationConfig.from_dict(data)
st.session_state["loaded_config"] = loaded
st.success("Config loaded.")
except Exception as e:
st.error(f"Failed to load config: {e}")
with cfg_right:
if st.button("Save current settings"):
cfg = _build_config(
subset_cols, key_cols, fuzzy_cols,
algorithm, threshold, normalize_map,
survivor, date_column, merge,
)
cfg_json = cfg.to_dict()
import json
st.download_button(
"Download config JSON",
data=json.dumps(cfg_json, indent=2),
file_name="dedup_config.json",
mime="application/json",
)
# Build strategies from selections
strategies = _build_strategies(
subset_cols, key_cols, fuzzy_cols,
algorithm, threshold, normalize_map,
)
# Survivor rule mapping
survivor_map = {
"first": SurvivorRule.KEEP_FIRST,
"last": SurvivorRule.KEEP_LAST,
"most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
"most-recent": SurvivorRule.KEEP_MOST_RECENT,
}
return {
"strategies": strategies,
"survivor_rule": survivor_map[survivor],
"date_column": date_column,
"merge": merge,
}
def _build_strategies(
subset_cols: list[str],
key_cols: list[str],
fuzzy_cols: list[str],
algorithm: str,
threshold: int,
normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
"""Build MatchStrategy list from GUI selections. Returns None for auto-detect."""
strategies: list[MatchStrategy] = []
# If user selected columns explicitly, build from those
if subset_cols or fuzzy_cols:
target_cols = subset_cols if subset_cols else fuzzy_cols
fuzzy_set = set(fuzzy_cols)
col_strats: list[ColumnMatchStrategy] = []
for col in target_cols:
norm = None
if col in normalize_map:
norm = NormalizerType(normalize_map[col])
if col in fuzzy_set:
algo = Algorithm(algorithm)
thresh = float(threshold)
else:
algo = Algorithm.EXACT
thresh = 100.0
col_strats.append(ColumnMatchStrategy(
column=col, algorithm=algo, threshold=thresh, normalizer=norm,
))
strategies.append(MatchStrategy(column_strategies=col_strats))
# Add strong key strategies
if key_cols:
for col in key_cols:
strategies.append(MatchStrategy(column_strategies=[
ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
]))
return strategies if strategies else None
def _build_config(
subset_cols, key_cols, fuzzy_cols,
algorithm, threshold, normalize_map,
survivor, date_column, merge,
) -> DeduplicationConfig:
"""Build a DeduplicationConfig from GUI state."""
cfg = DeduplicationConfig(
survivor_rule=survivor.replace("-", "_"),
date_column=date_column,
merge=merge,
subset_columns=subset_cols or None,
fuzzy_columns=fuzzy_cols or None,
default_algorithm=algorithm,
default_threshold=float(threshold),
normalize_map=normalize_map or None,
)
strategies = _build_strategies(
subset_cols, key_cols, fuzzy_cols,
algorithm, threshold, normalize_map,
)
if strategies:
cfg.strategies = [
StrategyConfig(columns=[
ColumnStrategyConfig(
column=cs.column,
algorithm=cs.algorithm.value,
threshold=cs.threshold,
normalizer=cs.normalizer.value if cs.normalizer else None,
)
for cs in s.column_strategies
])
for s in strategies
]
return cfg
# ---------------------------------------------------------------------------
# Match group review card
# ---------------------------------------------------------------------------
def match_group_card(
group: MatchResult,
df: pd.DataFrame,
group_num: int,
) -> Optional[bool]:
"""Render an expandable match group card with side-by-side diff.
Returns:
True — user clicked Merge (accept match)
False — user clicked Keep Both (reject match)
None — no decision yet
"""
confidence = group.confidence
auto_expand = confidence < 95.0
matched_on = ", ".join(group.matched_on)
n_rows = len(group.row_indices)
label = (
f"Group {group_num}: {n_rows} rows "
f"(confidence: {confidence:.0f}%) "
f"[{matched_on}]"
)
with st.expander(label, expanded=auto_expand):
# Build comparison DataFrame
display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
rows_data = []
for idx in group.row_indices:
row = {"_row": idx + 1}
for col in display_cols:
row[col] = df.iloc[idx].get(col, "")
rows_data.append(row)
compare_df = pd.DataFrame(rows_data)
compare_df = compare_df.set_index("_row")
# Highlight differences
def _highlight_diffs(s: pd.Series) -> list[str]:
"""Highlight cells that differ from the first row."""
styles = []
first_val = str(s.iloc[0]).strip() if len(s) > 0 else ""
for val in s:
val_str = str(val).strip()
if val_str != first_val and val_str and first_val:
styles.append("background-color: rgba(245, 166, 35, 0.2)")
elif not val_str and first_val:
styles.append("background-color: rgba(240, 82, 82, 0.1)")
else:
styles.append("")
return styles
styled = compare_df.style.apply(_highlight_diffs, axis=0)
st.dataframe(styled, use_container_width=True)
# Action buttons
btn_left, btn_mid, btn_right = st.columns(3)
merge_key = f"merge_{group.group_id}"
keep_key = f"keep_{group.group_id}"
with btn_left:
if st.button("Merge", key=merge_key, type="primary"):
return True
with btn_mid:
if st.button("Keep Both", key=keep_key):
return False
# Check session state for previous decisions
decisions = st.session_state.get("review_decisions", {})
if group.group_id in decisions:
decision = decisions[group.group_id]
if decision is True:
st.success("Decision: Merge")
elif decision is False:
st.info("Decision: Keep Both")
return None
# ---------------------------------------------------------------------------
# Results summary + downloads
# ---------------------------------------------------------------------------
def results_summary(
result: DeduplicationResult,
original_df: pd.DataFrame,
) -> None:
"""Render summary stats and download buttons."""
removed = result.original_row_count - len(result.deduplicated_df)
# Summary metrics
col1, col2, col3, col4 = st.columns(4)
col1.metric("Rows In", result.original_row_count)
col2.metric("Rows Out", len(result.deduplicated_df))
col3.metric("Removed", removed)
col4.metric("Groups", len(result.match_groups))
st.divider()
# Download buttons
dl_left, dl_mid, dl_right = st.columns(3)
with dl_left:
csv_bytes = result.deduplicated_df.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download Deduplicated CSV",
data=csv_bytes,
file_name="deduplicated.csv",
mime="text/csv",
)
with dl_mid:
if not result.removed_df.empty:
removed_bytes = result.removed_df.to_csv(index=False).encode("utf-8-sig")
st.download_button(
"Download Removed Rows",
data=removed_bytes,
file_name="removed_rows.csv",
mime="text/csv",
)
with dl_right:
if result.match_groups:
groups_data = _build_match_groups_csv(result, original_df)
st.download_button(
"Download Match Groups Report",
data=groups_data,
file_name="match_groups.csv",
mime="text/csv",
)
def _build_match_groups_csv(
result: DeduplicationResult,
original_df: pd.DataFrame,
) -> bytes:
"""Build the match groups audit CSV as bytes."""
rows = []
for g in result.match_groups:
for idx in g.row_indices:
row_data = {
"_group_id": g.group_id + 1,
"_is_survivor": idx == g.survivor_index,
"_confidence": g.confidence,
"_matched_on": ", ".join(g.matched_on),
"_original_row": idx + 1,
}
for col in original_df.columns:
if not str(col).startswith("_norm_"):
row_data[col] = original_df.iloc[idx].get(col, "") if idx < len(original_df) else ""
rows.append(row_data)
groups_df = pd.DataFrame(rows)
return groups_df.to_csv(index=False).encode("utf-8-sig")