feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

1
src/gui/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Streamlit GUI for the DataTools Deduplicator."""

8
src/gui/__main__.py Normal file
View File

@@ -0,0 +1,8 @@
"""Allow running as ``python -m src.gui``."""
import subprocess
import sys
from pathlib import Path
# Path of the Streamlit script that lives next to this file.
app_path = Path(__file__).parent / "app.py"


def main() -> int:
    """Launch the Streamlit app in a child process and return its exit code.

    The running interpreter (``sys.executable``) is used so that Streamlit is
    resolved from the same environment this module was started from.
    """
    completed = subprocess.run(
        [sys.executable, "-m", "streamlit", "run", str(app_path)],
        check=False,
    )
    return completed.returncode


if __name__ == "__main__":
    # Propagate Streamlit's exit status instead of always exiting with 0,
    # and avoid spawning a server when this module is merely imported.
    sys.exit(main())

287
src/gui/app.py Normal file
View File

@@ -0,0 +1,287 @@
"""DataTools Deduplicator — Streamlit GUI.
Launch:
streamlit run src/gui/app.py
"""
from __future__ import annotations
import io
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
# Ensure project root is on sys.path so `src.core` imports work
_project_root = Path(__file__).resolve().parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
from src.core.io import read_file, list_sheets
from src.core.config import DeduplicationConfig
from src.gui.components import config_panel, match_group_card, results_summary
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------
# Browser-tab title/icon and a full-width layout for the app.
_PAGE_SETTINGS = {
    "page_title": "DataTools Deduplicator",
    "page_icon": "🔍",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)
# ---------------------------------------------------------------------------
# Session state defaults
# ---------------------------------------------------------------------------
# Keys this script expects to find in st.session_state, with initial values.
_DEFAULTS = {
    "df": None,              # DataFrame parsed from the current upload
    "result": None,          # result of the most recent deduplicate() call
    "review_decisions": {},  # group_id -> True (merge) / False (keep both)
    "config": None,
    "file_name": "",         # name of the upload that "df" was parsed from
    "sheet_names": [],       # sheet names when the upload is an Excel file
}
# Seed only the keys that are missing so existing values are left untouched.
for _name in _DEFAULTS:
    if _name in st.session_state:
        continue
    st.session_state[_name] = _DEFAULTS[_name]
# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("DataTools Deduplicator")
st.caption("Find and remove duplicate rows in CSV and Excel files.")
# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------
# File extensions the uploader widget will accept.
_ACCEPTED_EXTENSIONS = ["csv", "tsv", "xlsx", "xls"]
_UPLOAD_HELP = (
    "Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected."
)
uploaded = st.file_uploader(
    "Upload CSV or Excel file",
    type=_ACCEPTED_EXTENSIONS,
    help=_UPLOAD_HELP,
)
import tempfile


def _load_upload(upload, sheet_name=None, *, want_sheets=False):
    """Parse an uploaded file through a temporary on-disk copy.

    ``read_file`` and ``list_sheets`` are called with a path, so the upload's
    bytes are first written to a named temporary file. The temporary file is
    closed before it is read and is always deleted afterwards, including when
    parsing raises.

    Args:
        upload: Uploaded-file object; must expose ``name`` and ``getvalue()``.
        sheet_name: Sheet to read. When ``None``, ``read_file`` is called
            without a ``sheet_name`` argument.
        want_sheets: When true and the upload has an ``.xlsx``/``.xls``
            suffix, also return the workbook's sheet names.

    Returns:
        A ``(dataframe, sheet_names)`` tuple. ``sheet_names`` is an empty list
        unless it was requested for an Excel upload.
    """
    suffix = Path(upload.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(upload.getvalue())
    tmp_path = Path(tmp.name)
    try:
        sheets = []
        if want_sheets and suffix.lower() in (".xlsx", ".xls"):
            sheets = list_sheets(tmp_path)
        if sheet_name is None:
            data = read_file(tmp_path)
        else:
            data = read_file(tmp_path, sheet_name=sheet_name)
        if not isinstance(data, pd.DataFrame):
            # Anything that is not a DataFrame is treated as an iterable of
            # frames and stitched into a single one.
            data = pd.concat(list(data), ignore_index=True)
        return data, sheets
    finally:
        tmp_path.unlink(missing_ok=True)


if uploaded is not None:
    # A different file name means a new upload: drop stale state and re-parse.
    if uploaded.name != st.session_state["file_name"]:
        st.session_state["file_name"] = uploaded.name
        st.session_state["result"] = None
        st.session_state["review_decisions"] = {}
        # Forget the previous file's sheet so the selector below re-reads the
        # sheet it is showing for the new file.
        st.session_state["_current_sheet"] = None
        try:
            loaded_df, loaded_sheets = _load_upload(uploaded, want_sheets=True)
        except Exception as e:  # UI boundary: surface any read failure
            st.error(f"Failed to read file: {e}")
            st.session_state["df"] = None
            st.session_state["sheet_names"] = []
            # Clear the remembered name so the next rerun retries (and shows
            # the error again) instead of rendering an empty page.
            st.session_state["file_name"] = ""
        else:
            st.session_state["sheet_names"] = loaded_sheets
            st.session_state["df"] = loaded_df

    df = st.session_state["df"]
    if df is not None:
        # Sheet selector for Excel files with more than one sheet
        sheet_names = st.session_state["sheet_names"]
        if len(sheet_names) > 1:
            sheet = st.selectbox("Select sheet", sheet_names)
            if sheet != st.session_state.get("_current_sheet"):
                try:
                    df, _ = _load_upload(uploaded, sheet_name=sheet)
                except Exception as e:  # keep showing the previous sheet
                    st.error(f"Failed to read file: {e}")
                else:
                    # Record the sheet only after a successful read so a
                    # failed read is retried on the next rerun.
                    st.session_state["_current_sheet"] = sheet
                    st.session_state["df"] = df
                    st.session_state["result"] = None
                    st.session_state["review_decisions"] = {}

        # Preview
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)

        # Advanced options
        settings = config_panel(df)

        # Apply loaded config if present
        loaded_cfg = st.session_state.get("loaded_config")
        if loaded_cfg is not None:
            settings["strategies"] = loaded_cfg.to_strategies()
            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
            settings["date_column"] = loaded_cfg.date_column
            settings["merge"] = loaded_cfg.merge
            # Clear so it doesn't override on every rerun
            del st.session_state["loaded_config"]

        # -------------------------------------------------------------------
        # Find Duplicates button
        # -------------------------------------------------------------------
        st.divider()
        if st.button("Find Duplicates", type="primary", use_container_width=True):
            progress_bar = st.progress(0, text="Comparing rows...")

            def _gui_progress(current: int, total: int) -> None:
                """Mirror deduplicate() progress onto the progress bar."""
                if total > 0:
                    # Clamp so an overshooting callback cannot exceed 100%.
                    pct = min(current / total, 1.0)
                    progress_bar.progress(
                        pct, text=f"Comparing rows... {current:,}/{total:,}"
                    )

            with st.spinner("Running deduplication..."):
                result = deduplicate(
                    df,
                    strategies=settings["strategies"],
                    survivor_rule=settings["survivor_rule"],
                    date_column=settings["date_column"],
                    merge=settings["merge"],
                    preview=False,
                    progress_callback=_gui_progress,
                )
            progress_bar.empty()
            st.session_state["result"] = result
            st.session_state["review_decisions"] = {}

        # -------------------------------------------------------------------
        # Results
        # -------------------------------------------------------------------
        result: DeduplicationResult | None = st.session_state["result"]
        if result is not None:
            st.divider()
            st.subheader("Results")
            # Summary + download buttons
            results_summary(result, df)

            # Match group review
            if result.match_groups:
                st.divider()
                st.subheader("Match Groups")

                # Batch actions
                action_left, action_mid, action_right = st.columns(3)
                with action_left:
                    if st.button("Accept All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = True
                        st.rerun()
                with action_mid:
                    if st.button("Reject All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = False
                        st.rerun()
                with action_right:
                    if st.button("Clear Decisions"):
                        st.session_state["review_decisions"] = {}
                        st.rerun()

                # Individual group cards; a click on any card stores the
                # decision and reruns so the card can show it.
                decisions = st.session_state["review_decisions"]
                for group_num, group in enumerate(result.match_groups, start=1):
                    decision = match_group_card(group, df, group_num=group_num)
                    if decision is not None:
                        decisions[group.group_id] = decision
                        st.session_state["review_decisions"] = decisions
                        st.rerun()

                # Show decision summary
                if decisions:
                    st.divider()
                    accepted = sum(1 for v in decisions.values() if v is True)
                    rejected = sum(1 for v in decisions.values() if v is False)
                    # Count undecided groups directly so decisions recorded for
                    # groups no longer in the result cannot make this negative.
                    pending = sum(
                        1 for g in result.match_groups if g.group_id not in decisions
                    )
                    st.caption(
                        f"Decisions: {accepted} merged, {rejected} kept both, "
                        f"{pending} pending"
                    )

                    # Re-run dedup with review decisions applied
                    if st.button(
                        "Apply Review Decisions & Download",
                        type="primary",
                        use_container_width=True,
                    ):

                        def _review_callback(group, _df):
                            """Return the stored decision; undecided groups are accepted."""
                            return decisions.get(group.group_id, True)

                        reviewed_result = deduplicate(
                            df,
                            strategies=settings["strategies"],
                            survivor_rule=settings["survivor_rule"],
                            date_column=settings["date_column"],
                            merge=settings["merge"],
                            preview=False,
                            review_callback=_review_callback,
                        )
                        # Update result and show downloads
                        st.session_state["result"] = reviewed_result
                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
                            index=False
                        ).encode("utf-8-sig")
                        st.download_button(
                            "Download Reviewed & Deduplicated CSV",
                            data=csv_bytes,
                            file_name="deduplicated_reviewed.csv",
                            mime="text/csv",
                            key="reviewed_download",
                        )

            # Log entries
            if result.log_entries:
                with st.expander("Processing Log"):
                    st.code("\n".join(result.log_entries))
else:
    # No file uploaded — show placeholder
    st.info("Upload a CSV or Excel file to get started.")
# ---------------------------------------------------------------------------
# Footer
# ---------------------------------------------------------------------------
st.divider()
# Single-line footer shown at the bottom of the page.
_FOOTER_TEXT = (
    "Runs locally. Your data never leaves this computer. | DataTools Deduplicator v1.0"
)
st.caption(_FOOTER_TEXT)

413
src/gui/components.py Normal file
View File

@@ -0,0 +1,413 @@
"""Reusable Streamlit widgets for the deduplicator GUI."""
from __future__ import annotations
import io
from typing import Optional
import pandas as pd
import streamlit as st
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
DeduplicationResult,
MatchResult,
MatchStrategy,
SurvivorRule,
)
from src.core.config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
from src.core.normalizers import NormalizerType
# ---------------------------------------------------------------------------
# Config panel (advanced options)
# ---------------------------------------------------------------------------
def config_panel(df: pd.DataFrame) -> dict:
    """Render the "Advanced Options" expander and return the chosen settings.

    Side effect: when a config profile is uploaded and parses successfully,
    the resulting ``DeduplicationConfig`` is stored in
    ``st.session_state["loaded_config"]``.

    Args:
        df: The loaded data; only its column names are used to populate the
            column pickers.

    Returns:
        A dict with these keys:
            strategies: list[MatchStrategy] | None (None when nothing was
                selected, as returned by ``_build_strategies``)
            survivor_rule: SurvivorRule
            date_column: str | None
            merge: bool
    """
    import json  # local import: only this panel (de)serialises config JSON

    columns = list(df.columns)
    normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]
    normalize_map: dict[str, str] = {}

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)
        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )
        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)
        with col_a:
            # Offer normalizers for exactly the columns _build_strategies
            # matches on (the explicit subset first, fuzzy columns otherwise),
            # so every picker shown here actually affects the match.
            target_cols = subset_cols or fuzzy_cols
            if target_cols:
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    # "auto" and "none" mean "no explicit normalizer".
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm
        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)
        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                try:
                    # getvalue() returns the full payload regardless of the
                    # buffer's read position, unlike read().
                    data = json.loads(config_file.getvalue())
                    loaded = DeduplicationConfig.from_dict(data)
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")
                except Exception as e:  # UI boundary: report, don't crash
                    st.error(f"Failed to load config: {e}")
        with cfg_right:
            if st.button("Save current settings"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                st.download_button(
                    "Download config JSON",
                    data=json.dumps(cfg.to_dict(), indent=2),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    # Map the UI label to the SurvivorRule member passed to deduplicate().
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }
    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }
def _build_strategies(
subset_cols: list[str],
key_cols: list[str],
fuzzy_cols: list[str],
algorithm: str,
threshold: int,
normalize_map: dict[str, str],
) -> Optional[list[MatchStrategy]]:
"""Build MatchStrategy list from GUI selections. Returns None for auto-detect."""
strategies: list[MatchStrategy] = []
# If user selected columns explicitly, build from those
if subset_cols or fuzzy_cols:
target_cols = subset_cols if subset_cols else fuzzy_cols
fuzzy_set = set(fuzzy_cols)
col_strats: list[ColumnMatchStrategy] = []
for col in target_cols:
norm = None
if col in normalize_map:
norm = NormalizerType(normalize_map[col])
if col in fuzzy_set:
algo = Algorithm(algorithm)
thresh = float(threshold)
else:
algo = Algorithm.EXACT
thresh = 100.0
col_strats.append(ColumnMatchStrategy(
column=col, algorithm=algo, threshold=thresh, normalizer=norm,
))
strategies.append(MatchStrategy(column_strategies=col_strats))
# Add strong key strategies
if key_cols:
for col in key_cols:
strategies.append(MatchStrategy(column_strategies=[
ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
]))
return strategies if strategies else None
def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Assemble a ``DeduplicationConfig`` that mirrors the current GUI state.

    The survivor label is stored with underscores instead of hyphens, empty
    selections are stored as ``None``, and any strategies produced by
    ``_build_strategies`` are converted into their config counterparts.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    strategy_objs = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not strategy_objs:
        # Nothing selected: leave config.strategies as the constructor set it.
        return config

    serialised = []
    for strat in strategy_objs:
        column_cfgs = []
        for cs in strat.column_strategies:
            # Enum members are stored by their .value in the config.
            normalizer_name = cs.normalizer.value if cs.normalizer else None
            column_cfgs.append(
                ColumnStrategyConfig(
                    column=cs.column,
                    algorithm=cs.algorithm.value,
                    threshold=cs.threshold,
                    normalizer=normalizer_name,
                )
            )
        serialised.append(StrategyConfig(columns=column_cfgs))
    config.strategies = serialised
    return config
# ---------------------------------------------------------------------------
# Match group review card
# ---------------------------------------------------------------------------
def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> Optional[bool]:
    """Render an expandable match group card with a side-by-side comparison.

    The card lists the group's rows (1-based row numbers as the index,
    ``_norm_``-prefixed columns hidden), highlights cells that differ from the
    group's first row, and offers "Merge" / "Keep Both" buttons. Groups with
    confidence below 95 start expanded.

    Args:
        group: The match group to display.
        df: DataFrame the group's ``row_indices`` are positions into.
        group_num: Number shown in the card's label.

    Returns:
        True — user clicked Merge (accept match)
        False — user clicked Keep Both (reject match)
        None — no button was clicked on this run
    """
    confidence = group.confidence
    auto_expand = confidence < 95.0
    matched_on = ", ".join(group.matched_on)
    n_rows = len(group.row_indices)
    label = (
        f"Group {group_num}: {n_rows} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )
    with st.expander(label, expanded=auto_expand):
        # Build comparison DataFrame
        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
        rows_data = []
        for idx in group.row_indices:
            # Fetch the row once; looking it up per column rebuilt the same
            # Series for every cell.
            record = df.iloc[idx]
            row = {"_row": idx + 1}
            for col in display_cols:
                row[col] = record.get(col, "")
            rows_data.append(row)
        compare_df = pd.DataFrame(rows_data)
        compare_df = compare_df.set_index("_row")

        def _highlight_diffs(s: pd.Series) -> list[str]:
            """Return one CSS style per cell of a column, relative to its first cell.

            Orange: both values non-empty and different. Red: this cell is
            empty while the first is not. Otherwise no style.
            """
            styles = []
            first_val = str(s.iloc[0]).strip() if len(s) > 0 else ""
            for val in s:
                val_str = str(val).strip()
                if val_str != first_val and val_str and first_val:
                    styles.append("background-color: rgba(245, 166, 35, 0.2)")
                elif not val_str and first_val:
                    styles.append("background-color: rgba(240, 82, 82, 0.1)")
                else:
                    styles.append("")
            return styles

        # axis=0 applies the highlighter column by column.
        styled = compare_df.style.apply(_highlight_diffs, axis=0)
        st.dataframe(styled, use_container_width=True)

        # Action buttons (third column is left empty as spacing)
        btn_left, btn_mid, _ = st.columns(3)
        merge_key = f"merge_{group.group_id}"
        keep_key = f"keep_{group.group_id}"
        with btn_left:
            if st.button("Merge", key=merge_key, type="primary"):
                return True
        with btn_mid:
            if st.button("Keep Both", key=keep_key):
                return False

        # Show a decision recorded on an earlier run, if any
        decisions = st.session_state.get("review_decisions", {})
        if group.group_id in decisions:
            decision = decisions[group.group_id]
            if decision is True:
                st.success("Decision: Merge")
            elif decision is False:
                st.info("Decision: Keep Both")
    return None
# ---------------------------------------------------------------------------
# Results summary + downloads
# ---------------------------------------------------------------------------
def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show the headline counts for a run and offer the result files as CSV downloads.

    Renders four metrics (rows in, rows out, removed, groups) followed by up to
    three download buttons: the deduplicated data, the removed rows (only when
    there are any) and the match-groups report (only when groups exist).
    """

    def _as_csv_bytes(frame: pd.DataFrame) -> bytes:
        # Index-free CSV encoded as UTF-8 with a BOM ("utf-8-sig").
        return frame.to_csv(index=False).encode("utf-8-sig")

    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)

    # Summary metrics, one per column
    metric_slots = st.columns(4)
    metric_values = (
        ("Rows In", rows_in),
        ("Rows Out", rows_out),
        ("Removed", rows_in - rows_out),
        ("Groups", len(result.match_groups)),
    )
    for slot, (title, value) in zip(metric_slots, metric_values):
        slot.metric(title, value)

    st.divider()

    # Download buttons
    dl_left, dl_mid, dl_right = st.columns(3)
    with dl_left:
        st.download_button(
            "Download Deduplicated CSV",
            data=_as_csv_bytes(result.deduplicated_df),
            file_name="deduplicated.csv",
            mime="text/csv",
        )
    with dl_mid:
        if not result.removed_df.empty:
            st.download_button(
                "Download Removed Rows",
                data=_as_csv_bytes(result.removed_df),
                file_name="removed_rows.csv",
                mime="text/csv",
            )
    with dl_right:
        if result.match_groups:
            st.download_button(
                "Download Match Groups Report",
                data=_build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )
def _build_match_groups_csv(
result: DeduplicationResult,
original_df: pd.DataFrame,
) -> bytes:
"""Build the match groups audit CSV as bytes."""
rows = []
for g in result.match_groups:
for idx in g.row_indices:
row_data = {
"_group_id": g.group_id + 1,
"_is_survivor": idx == g.survivor_index,
"_confidence": g.confidence,
"_matched_on": ", ".join(g.matched_on),
"_original_row": idx + 1,
}
for col in original_df.columns:
if not str(col).startswith("_norm_"):
row_data[col] = original_df.iloc[idx].get(col, "") if idx < len(original_df) else ""
rows.append(row_data)
groups_df = pd.DataFrame(rows)
return groups_df.to_csv(index=False).encode("utf-8-sig")