feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

287
src/gui/app.py Normal file
View File

@@ -0,0 +1,287 @@
"""DataTools Deduplicator — Streamlit GUI.
Launch:
streamlit run src/gui/app.py
"""
from __future__ import annotations
import io
import sys
from pathlib import Path
import pandas as pd
import streamlit as st
# Ensure project root is on sys.path so `src.core` imports work
_project_root = Path(__file__).resolve().parent.parent.parent
if str(_project_root) not in sys.path:
sys.path.insert(0, str(_project_root))
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
from src.core.io import read_file, list_sheets
from src.core.config import DeduplicationConfig
from src.gui.components import config_panel, match_group_card, results_summary
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------
st.set_page_config(page_title="DataTools Deduplicator", page_icon="🔍", layout="wide")

# ---------------------------------------------------------------------------
# Session state defaults
# ---------------------------------------------------------------------------
# Keys this script keeps in ``st.session_state`` and the value each one starts
# with. Only keys that are not present yet are seeded, so values written by an
# earlier run of the script are left untouched.
_DEFAULTS = dict(
    df=None,
    result=None,
    review_decisions={},
    config=None,
    file_name="",
    sheet_names=[],
)
for key, default in _DEFAULTS.items():
    if key in st.session_state:
        continue
    st.session_state[key] = default

# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------
st.title("DataTools Deduplicator")
st.caption("Find and remove duplicate rows in CSV and Excel files.")
# ---------------------------------------------------------------------------
# File upload
# ---------------------------------------------------------------------------
def _spill_upload_to_temp(upload) -> Path:
    """Copy an uploaded file's bytes to a named temporary file and return its path.

    ``read_file()`` and ``list_sheets()`` are called with a filesystem path,
    so the in-memory upload is written to disk first. The upload's original
    extension is kept as the temp file's suffix. The file is created with
    ``delete=False``; the caller is responsible for removing it.
    """
    import tempfile

    suffix = Path(upload.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(upload.getvalue())
    return Path(tmp.name)


def _read_frame(path: Path, sheet_name=None) -> pd.DataFrame:
    """Load *path* through ``read_file`` and return a single DataFrame.

    ``sheet_name`` is forwarded to ``read_file`` only when it is given. If
    ``read_file`` returns something other than a DataFrame, the result is
    treated as an iterable of frames and concatenated with a fresh index.
    """
    if sheet_name is None:
        data = read_file(path)
    else:
        data = read_file(path, sheet_name=sheet_name)
    if not isinstance(data, pd.DataFrame):
        data = pd.concat(list(data), ignore_index=True)
    return data


uploaded = st.file_uploader(
    "Upload CSV or Excel file",
    type=["csv", "tsv", "xlsx", "xls"],
    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
)

if uploaded is not None:
    # Streamlit re-executes this script on every interaction. The file is
    # parsed only when the upload's name changes; re-parsing on every run
    # would overwrite the sheet-specific frame stored further down.
    if uploaded.name != st.session_state["file_name"]:
        st.session_state["file_name"] = uploaded.name
        st.session_state["result"] = None
        st.session_state["review_decisions"] = {}
        # Forget the sheet remembered for the previous file so the sheet
        # selector below reloads whatever it shows for this one.
        st.session_state["_current_sheet"] = None

        try:
            tmp_path = _spill_upload_to_temp(uploaded)
            try:
                # Only Excel uploads are asked for their sheet names.
                if Path(uploaded.name).suffix.lower() in (".xlsx", ".xls"):
                    st.session_state["sheet_names"] = list_sheets(tmp_path)
                else:
                    st.session_state["sheet_names"] = []
                st.session_state["df"] = _read_frame(tmp_path)
            finally:
                # Remove the temp copy whether or not parsing succeeded.
                tmp_path.unlink(missing_ok=True)
        except Exception as e:
            st.error(f"Failed to read file: {e}")
            st.session_state["df"] = None
            # Clear the remembered name so the next run retries (and reports
            # the failure again) instead of silently showing nothing.
            st.session_state["file_name"] = ""

    df = st.session_state["df"]

    if df is not None:
        # Sheet selector for Excel files with more than one sheet
        sheet_names = st.session_state["sheet_names"]
        if sheet_names and len(sheet_names) > 1:
            sheet = st.selectbox("Select sheet", sheet_names)
            if sheet != st.session_state.get("_current_sheet"):
                tmp_path = _spill_upload_to_temp(uploaded)
                try:
                    df = _read_frame(tmp_path, sheet_name=sheet)
                finally:
                    tmp_path.unlink(missing_ok=True)
                st.session_state["df"] = df
                # Results and decisions computed for another sheet no longer apply.
                st.session_state["result"] = None
                st.session_state["review_decisions"] = {}
                # Recorded only after a successful read, so a failed load is
                # retried on the next run.
                st.session_state["_current_sheet"] = sheet

        # Preview
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)

        # Advanced options
        settings = config_panel(df)

        # Apply loaded config if present
        loaded_cfg = st.session_state.get("loaded_config")
        if loaded_cfg is not None:
            settings["strategies"] = loaded_cfg.to_strategies()
            settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
            settings["date_column"] = loaded_cfg.date_column
            settings["merge"] = loaded_cfg.merge
            # Clear so it doesn't override on every rerun
            del st.session_state["loaded_config"]

        # -------------------------------------------------------------------
        # Find Duplicates button
        # -------------------------------------------------------------------
        st.divider()
        if st.button("Find Duplicates", type="primary", use_container_width=True):
            progress_bar = st.progress(0, text="Comparing rows...")

            def _gui_progress(current: int, total: int) -> None:
                """Show ``current``/``total`` on the progress bar, capped at 100%."""
                if total > 0:
                    pct = min(current / total, 1.0)
                    progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")

            try:
                with st.spinner("Running deduplication..."):
                    result = deduplicate(
                        df,
                        strategies=settings["strategies"],
                        survivor_rule=settings["survivor_rule"],
                        date_column=settings["date_column"],
                        merge=settings["merge"],
                        preview=False,
                        progress_callback=_gui_progress,
                    )
            finally:
                # Remove the bar even when deduplication raises.
                progress_bar.empty()
            st.session_state["result"] = result
            st.session_state["review_decisions"] = {}

        # -------------------------------------------------------------------
        # Results
        # -------------------------------------------------------------------
        result: DeduplicationResult | None = st.session_state["result"]
        if result is not None:
            st.divider()
            st.subheader("Results")

            # Summary + download buttons
            results_summary(result, df)

            # Match group review
            if result.match_groups:
                st.divider()
                st.subheader("Match Groups")

                # Batch actions: each one rewrites the decision map and reruns
                # so the cards below are drawn from the updated state.
                action_left, action_mid, action_right = st.columns(3)
                with action_left:
                    if st.button("Accept All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = True
                        st.rerun()
                with action_mid:
                    if st.button("Reject All"):
                        for g in result.match_groups:
                            st.session_state["review_decisions"][g.group_id] = False
                        st.rerun()
                with action_right:
                    if st.button("Clear Decisions"):
                        st.session_state["review_decisions"] = {}
                        st.rerun()

                # Individual group cards. A card returns None when no decision
                # was made on this run; any other value is stored under the
                # group's id and the script reruns immediately.
                decisions = st.session_state["review_decisions"]
                for i, group in enumerate(result.match_groups):
                    decision = match_group_card(group, df, group_num=i + 1)
                    if decision is not None:
                        decisions[group.group_id] = decision
                        st.session_state["review_decisions"] = decisions
                        st.rerun()

                # Show decision summary
                if decisions:
                    st.divider()
                    accepted = sum(1 for v in decisions.values() if v is True)
                    rejected = sum(1 for v in decisions.values() if v is False)
                    pending = len(result.match_groups) - len(decisions)
                    st.caption(
                        f"Decisions: {accepted} merged, {rejected} kept both, "
                        f"{pending} pending"
                    )

                    # Re-run dedup with review decisions applied
                    if st.button(
                        "Apply Review Decisions & Download",
                        type="primary",
                        use_container_width=True,
                    ):

                        def _review_callback(group, _df):
                            """Return the stored decision for *group*; undecided groups are accepted."""
                            gid = group.group_id
                            if gid in decisions:
                                return decisions[gid]
                            return True  # default: accept

                        reviewed_result = deduplicate(
                            df,
                            strategies=settings["strategies"],
                            survivor_rule=settings["survivor_rule"],
                            date_column=settings["date_column"],
                            merge=settings["merge"],
                            preview=False,
                            review_callback=_review_callback,
                        )

                        # Update result and show downloads
                        st.session_state["result"] = reviewed_result
                        csv_bytes = reviewed_result.deduplicated_df.to_csv(
                            index=False
                        ).encode("utf-8-sig")
                        st.download_button(
                            "Download Reviewed & Deduplicated CSV",
                            data=csv_bytes,
                            file_name="deduplicated_reviewed.csv",
                            mime="text/csv",
                            key="reviewed_download",
                        )

            # Log entries
            if result.log_entries:
                with st.expander("Processing Log"):
                    st.code("\n".join(result.log_entries))
else:
    # No file uploaded — show placeholder
    st.info("Upload a CSV or Excel file to get started.")
# ---------------------------------------------------------------------------
# Footer
# ---------------------------------------------------------------------------
st.divider()
# Footer caption: privacy note and version label, separated by " | ".
st.caption(
    " | ".join(
        [
            "Runs locally. Your data never leaves this computer.",
            "DataTools Deduplicator v1.0",
        ]
    )
)