feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
413
src/gui/components.py
Normal file
413
src/gui/components.py
Normal file
@@ -0,0 +1,413 @@
|
||||
"""Reusable Streamlit widgets for the deduplicator GUI."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from src.core.dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
DeduplicationResult,
|
||||
MatchResult,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
)
|
||||
from src.core.config import (
|
||||
ColumnStrategyConfig,
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
)
|
||||
from src.core.normalizers import NormalizerType
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config panel (advanced options)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def config_panel(df: pd.DataFrame) -> dict:
    """Render the Advanced Options expander and return the chosen settings.

    Args:
        df: The uploaded data; only its column names are used, to populate
            the column pickers.

    Returns:
        A dict with these keys:
            strategies: list[MatchStrategy] | None  (None means auto-detect)
            survivor_rule: SurvivorRule
            date_column: str | None
            merge: bool
    """
    import json  # function-local: only the config save/load widgets need it

    columns = list(df.columns)

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)

        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )

        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)

        with col_a:
            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]

            normalize_map: dict[str, str] = {}
            # Offer a normalizer for exactly the columns _build_strategies will
            # match on (the subset if given, otherwise the fuzzy columns), so
            # every picker shown has an effect and no matched column lacks one.
            target_cols = subset_cols if subset_cols else fuzzy_cols
            if target_cols:
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    # "auto" and "none" are both left out of the map, so the
                    # column's strategy is built without an explicit normalizer.
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm

        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)

        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                # UI boundary: any parse/validation failure is reported to the
                # user instead of crashing the app.
                try:
                    data = json.loads(config_file.read())
                    loaded = DeduplicationConfig.from_dict(data)
                    # NOTE(review): the loaded config is only stored here; this
                    # function never applies it to the widgets above.
                    # Presumably the caller reads it back - confirm.
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")
                except Exception as e:
                    st.error(f"Failed to load config: {e}")

        with cfg_right:
            # NOTE(review): the download button is nested under st.button, so
            # it is rendered only on the rerun triggered by that click - confirm
            # this two-step flow is intended.
            if st.button("Save current settings"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                cfg_json = cfg.to_dict()
                st.download_button(
                    "Download config JSON",
                    data=json.dumps(cfg_json, indent=2),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )

    # Survivor rule mapping
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }

    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }
|
||||
|
||||
|
||||
def _build_strategies(
    subset_cols: list[str],
    key_cols: list[str],
    fuzzy_cols: list[str],
    algorithm: str,
    threshold: int,
    normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
    """Translate the GUI selections into match strategies.

    One combined strategy is built over ``subset_cols`` (or over
    ``fuzzy_cols`` when no subset was chosen). Within it, columns listed in
    ``fuzzy_cols`` use ``algorithm`` at ``threshold``; all others use exact
    matching at 100. Fuzzy columns that are not part of a non-empty subset
    are ignored. Each entry of ``key_cols`` then adds its own single-column
    exact-match strategy.

    Returns:
        The list of strategies, or None when nothing was selected
        (auto-detect).
    """
    built: list[MatchStrategy] = []

    # Explicit column selection: the subset wins, fuzzy columns are the fallback.
    chosen = subset_cols or fuzzy_cols
    if chosen:
        fuzzy_lookup = set(fuzzy_cols)
        per_column: list[ColumnMatchStrategy] = []
        for name in chosen:
            normalizer = (
                NormalizerType(normalize_map[name]) if name in normalize_map else None
            )
            if name in fuzzy_lookup:
                algo, cutoff = Algorithm(algorithm), float(threshold)
            else:
                algo, cutoff = Algorithm.EXACT, 100.0
            per_column.append(
                ColumnMatchStrategy(
                    column=name, algorithm=algo, threshold=cutoff, normalizer=normalizer,
                )
            )
        built.append(MatchStrategy(column_strategies=per_column))

    # Strong keys: one independent exact-match strategy per key column.
    for key in key_cols or ():
        exact_key = ColumnMatchStrategy(
            column=key, algorithm=Algorithm.EXACT, threshold=100.0
        )
        built.append(MatchStrategy(column_strategies=[exact_key]))

    return built or None
|
||||
|
||||
|
||||
def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Assemble a DeduplicationConfig mirroring the current GUI selections.

    The survivor label has its hyphens replaced by underscores, empty
    selections are stored as None, and any strategies produced by
    ``_build_strategies`` are converted into their config counterparts.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    built = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not built:
        # Nothing selected: leave config.strategies as the constructor set it.
        return config

    serialised = []
    for strategy in built:
        column_cfgs = []
        for cs in strategy.column_strategies:
            normalizer_name = cs.normalizer.value if cs.normalizer else None
            column_cfgs.append(
                ColumnStrategyConfig(
                    column=cs.column,
                    algorithm=cs.algorithm.value,
                    threshold=cs.threshold,
                    normalizer=normalizer_name,
                )
            )
        serialised.append(StrategyConfig(columns=column_cfgs))

    config.strategies = serialised
    return config
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Match group review card
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _cell_text(value: object) -> str:
    """Return a cell's stripped text, mapping missing scalars (None/NaN/NA) to ""."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value).strip()


def _highlight_diffs(s: pd.Series) -> list[str]:
    """Return one CSS style per cell, comparing each cell to the column's first row.

    A non-empty cell that differs from a non-empty first row gets the amber
    "different" style; an empty or missing cell under a non-empty first row
    gets the red "missing" style; everything else is left unstyled.
    """
    if len(s) == 0:
        return []
    first_val = _cell_text(s.iloc[0])
    styles = []
    for val in s:
        val_str = _cell_text(val)
        if val_str and first_val and val_str != first_val:
            styles.append("background-color: rgba(245, 166, 35, 0.2)")
        elif first_val and not val_str:
            styles.append("background-color: rgba(240, 82, 82, 0.1)")
        else:
            styles.append("")
    return styles


def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> Optional[bool]:
    """Render an expandable match group card with side-by-side diff.

    Args:
        group: The match group to show; its ``confidence``, ``matched_on``,
            ``row_indices`` and ``group_id`` attributes are read.
        df: The data the group's positional row indices refer to.
        group_num: Number shown in the card title.

    Returns:
        True — user clicked Merge (accept match)
        False — user clicked Keep Both (reject match)
        None — no decision yet
    """
    confidence = group.confidence
    matched_on = ", ".join(group.matched_on)
    row_positions = list(group.row_indices)

    label = (
        f"Group {group_num}: {len(row_positions)} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )

    # Cards below 95% confidence start expanded so they get reviewed first.
    with st.expander(label, expanded=confidence < 95.0):
        # Build the comparison frame with one positional slice, hiding the
        # "_norm_"-prefixed columns. Unlike building it row by row, this also
        # works for an empty group.
        visible = [
            pos for pos, col in enumerate(df.columns)
            if not str(col).startswith("_norm_")
        ]
        compare_df = df.iloc[row_positions, visible].copy()
        # Show 1-based row numbers as the index.
        compare_df.index = pd.Index([pos + 1 for pos in row_positions], name="_row")

        # Highlight differences
        styled = compare_df.style.apply(_highlight_diffs, axis=0)
        st.dataframe(styled, use_container_width=True)

        # Action buttons (third column is intentionally left empty)
        btn_left, btn_mid, _ = st.columns(3)
        merge_key = f"merge_{group.group_id}"
        keep_key = f"keep_{group.group_id}"

        with btn_left:
            if st.button("Merge", key=merge_key, type="primary"):
                return True
        with btn_mid:
            if st.button("Keep Both", key=keep_key):
                return False

        # Check session state for previous decisions
        decisions = st.session_state.get("review_decisions", {})
        if group.group_id in decisions:
            decision = decisions[group.group_id]
            if decision is True:
                st.success("Decision: Merge")
            elif decision is False:
                st.info("Decision: Keep Both")

    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results summary + downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show the headline metrics for a run, followed by the CSV download buttons.

    The "Removed Rows" button appears only when ``result.removed_df`` is
    non-empty, and the "Match Groups Report" button only when the result has
    match groups.
    """
    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)

    # Summary metrics, one per column, left to right.
    metric_values = [
        ("Rows In", rows_in),
        ("Rows Out", rows_out),
        ("Removed", rows_in - rows_out),
        ("Groups", len(result.match_groups)),
    ]
    for slot, (title, value) in zip(st.columns(4), metric_values):
        slot.metric(title, value)

    st.divider()

    # Download buttons
    dl_left, dl_mid, dl_right = st.columns(3)

    with dl_left:
        deduped_csv = result.deduplicated_df.to_csv(index=False)
        st.download_button(
            "Download Deduplicated CSV",
            data=deduped_csv.encode("utf-8-sig"),
            file_name="deduplicated.csv",
            mime="text/csv",
        )

    with dl_mid:
        if not result.removed_df.empty:
            removed_csv = result.removed_df.to_csv(index=False)
            st.download_button(
                "Download Removed Rows",
                data=removed_csv.encode("utf-8-sig"),
                file_name="removed_rows.csv",
                mime="text/csv",
            )

    with dl_right:
        if result.match_groups:
            st.download_button(
                "Download Match Groups Report",
                data=_build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )
|
||||
|
||||
|
||||
def _build_match_groups_csv(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> bytes:
    """Build the match groups audit CSV as bytes.

    Emits one line per row of every match group: the group's 1-based id,
    whether the row is the survivor, the group's confidence and matched
    columns, the 1-based original row number, then the row's values from
    ``original_df`` (columns prefixed ``_norm_`` are skipped). A row index
    at or beyond the end of ``original_df`` yields empty strings for the
    data columns.

    Returns:
        The CSV text encoded as ``utf-8-sig``.
    """
    # Loop invariants: which columns to export and how many rows exist.
    visible_cols = [
        col for col in original_df.columns if not str(col).startswith("_norm_")
    ]
    total_rows = len(original_df)

    rows = []
    for g in result.match_groups:
        matched_on = ", ".join(g.matched_on)
        for idx in g.row_indices:
            # Fetch the source row once per output line, not once per column.
            record = original_df.iloc[idx] if idx < total_rows else None
            row_data = {
                "_group_id": g.group_id + 1,
                "_is_survivor": idx == g.survivor_index,
                "_confidence": g.confidence,
                "_matched_on": matched_on,
                "_original_row": idx + 1,
            }
            for col in visible_cols:
                row_data[col] = record.get(col, "") if record is not None else ""
            rows.append(row_data)

    groups_df = pd.DataFrame(rows)
    return groups_df.to_csv(index=False).encode("utf-8-sig")
|
||||
Reference in New Issue
Block a user