feat: add documentation, Streamlit GUI, and full source tree
- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
1
src/gui/__init__.py
Normal file
1
src/gui/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Streamlit GUI for the DataTools Deduplicator."""
|
||||
8
src/gui/__main__.py
Normal file
8
src/gui/__main__.py
Normal file
@@ -0,0 +1,8 @@
|
||||
"""Allow running as ``python -m src.gui``."""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Path of the Streamlit script that sits next to this module.
app_path = Path(__file__).parent / "app.py"

if __name__ == "__main__":
    # Launch Streamlit with the same interpreter that is running this module,
    # and hand its exit status back to the caller so a failed launch is not
    # reported as success.
    completed = subprocess.run(
        [sys.executable, "-m", "streamlit", "run", str(app_path)]
    )
    sys.exit(completed.returncode)
|
||||
287
src/gui/app.py
Normal file
287
src/gui/app.py
Normal file
@@ -0,0 +1,287 @@
|
||||
"""DataTools Deduplicator — Streamlit GUI.
|
||||
|
||||
Launch:
|
||||
streamlit run src/gui/app.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
# Ensure project root is on sys.path so `src.core` imports work
|
||||
# Three levels up from this file (src/gui/app.py) is the project root; it must
# be importable so the `src.core` / `src.gui` imports below resolve when the
# script is started via `streamlit run`.
_project_root = Path(__file__).resolve().parents[2]
if not any(entry == str(_project_root) for entry in sys.path):
    sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
||||
from src.core.io import read_file, list_sheets
|
||||
from src.core.config import DeduplicationConfig
|
||||
from src.gui.components import config_panel, match_group_card, results_summary
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.set_page_config(page_title="DataTools Deduplicator", page_icon="🔍", layout="wide")

# ---------------------------------------------------------------------------
# Session state defaults
# ---------------------------------------------------------------------------

# Keys the rest of the script reads unconditionally, with their initial values.
_DEFAULTS = dict(
    df=None,
    result=None,
    review_decisions={},
    config=None,
    file_name="",
    sheet_names=[],
)

# Seed only the keys that are missing so values survive Streamlit reruns.
for key, default in _DEFAULTS.items():
    if key in st.session_state:
        continue
    st.session_state[key] = default


# ---------------------------------------------------------------------------
# Header
# ---------------------------------------------------------------------------

st.title("DataTools Deduplicator")
st.caption("Find and remove duplicate rows in CSV and Excel files.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
    "Upload CSV or Excel file",
    type=["csv", "tsv", "xlsx", "xls"],
    help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
)


def _read_upload(uploaded_file, sheet_name=None):
    """Parse an uploaded file into a DataFrame via a temporary file.

    ``read_file()`` / ``list_sheets()`` take a filesystem path, so the upload
    is spooled to a temp file first. The temp file is always removed, even
    when parsing fails.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``.
        sheet_name: Sheet to read. When ``None`` the reader's default is used
            and, for ``.xlsx`` / ``.xls`` uploads, the sheet names are listed.

    Returns:
        ``(frame, sheet_names)``; ``sheet_names`` is empty unless this was a
        default read (``sheet_name is None``) of an Excel upload.
    """
    import tempfile

    suffix = Path(uploaded_file.name).suffix
    # delete=False: the file is closed before the readers reopen it by path.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = Path(tmp.name)
    try:
        with tmp:
            tmp.write(uploaded_file.getvalue())

        names = []
        if sheet_name is None:
            if suffix.lower() in (".xlsx", ".xls"):
                names = list_sheets(tmp_path)
            frame = read_file(tmp_path)
        else:
            frame = read_file(tmp_path, sheet_name=sheet_name)

        # read_file() may hand back an iterable of chunks instead of a frame.
        if not isinstance(frame, pd.DataFrame):
            frame = pd.concat(list(frame), ignore_index=True)
    finally:
        tmp_path.unlink(missing_ok=True)
    return frame, names


if uploaded is None:
    # The uploader was cleared: drop everything derived from the previous file
    # so the page shows the placeholder instead of dereferencing `uploaded`.
    st.session_state["df"] = None
    st.session_state["result"] = None
    st.session_state["review_decisions"] = {}
    st.session_state["file_name"] = ""
    st.session_state["sheet_names"] = []
    st.session_state["_file_sig"] = None
    st.session_state["_current_sheet"] = None
else:
    # Identify the upload by more than its name so a different file that
    # happens to share a name is still treated as new.
    file_sig = (uploaded.name, uploaded.size, getattr(uploaded, "file_id", None))
    if file_sig != st.session_state.get("_file_sig"):
        st.session_state["_file_sig"] = file_sig
        st.session_state["file_name"] = uploaded.name
        st.session_state["df"] = None
        st.session_state["result"] = None
        st.session_state["review_decisions"] = {}
        st.session_state["_current_sheet"] = None

    # Parse only when there is no cached frame. Re-reading on every rerun
    # would overwrite the sheet the user selected with the default sheet.
    if st.session_state["df"] is None:
        try:
            loaded_df, loaded_sheets = _read_upload(uploaded)
        except Exception as e:
            st.error(f"Failed to read file: {e}")
            st.session_state["df"] = None
            st.session_state["sheet_names"] = []
        else:
            st.session_state["df"] = loaded_df
            st.session_state["sheet_names"] = loaded_sheets

df = st.session_state["df"]

if df is not None:
    # Sheet selector for Excel files
    sheet_names = st.session_state["sheet_names"]
    if len(sheet_names) > 1:
        sheet = st.selectbox(
            "Select sheet",
            sheet_names,
        )
        if sheet != st.session_state.get("_current_sheet"):
            df, _ = _read_upload(uploaded, sheet_name=sheet)
            # Record the sheet only after a successful read so a failure is
            # retried on the next rerun.
            st.session_state["_current_sheet"] = sheet
            st.session_state["df"] = df
            st.session_state["result"] = None
            st.session_state["review_decisions"] = {}

    # Preview
    st.subheader(f"Preview: {uploaded.name}")
    st.caption(f"{len(df)} rows, {len(df.columns)} columns")
    st.dataframe(df.head(10), use_container_width=True)

    # Advanced options
    settings = config_panel(df)

    # Apply loaded config if present
    loaded_cfg = st.session_state.get("loaded_config")
    if loaded_cfg is not None:
        settings["strategies"] = loaded_cfg.to_strategies()
        settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
        settings["date_column"] = loaded_cfg.date_column
        settings["merge"] = loaded_cfg.merge
        # Clear so it doesn't override on every rerun
        del st.session_state["loaded_config"]

    # -----------------------------------------------------------------------
    # Find Duplicates button
    # -----------------------------------------------------------------------

    st.divider()

    if st.button("Find Duplicates", type="primary", use_container_width=True):
        progress_bar = st.progress(0, text="Comparing rows...")

        def _gui_progress(current: int, total: int) -> None:
            """Mirror deduplicate() progress onto the Streamlit progress bar."""
            if total > 0:
                # Clamp to 1.0 in case current overshoots total.
                pct = min(current / total, 1.0)
                progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")

        with st.spinner("Running deduplication..."):
            result = deduplicate(
                df,
                strategies=settings["strategies"],
                survivor_rule=settings["survivor_rule"],
                date_column=settings["date_column"],
                merge=settings["merge"],
                preview=False,
                progress_callback=_gui_progress,
            )

        progress_bar.empty()
        st.session_state["result"] = result
        # A fresh run invalidates decisions made against the previous groups.
        st.session_state["review_decisions"] = {}

    # -----------------------------------------------------------------------
    # Results
    # -----------------------------------------------------------------------

    result: DeduplicationResult | None = st.session_state["result"]

    if result is not None:
        st.divider()
        st.subheader("Results")

        # Summary + download buttons
        results_summary(result, df)

        # Match group review
        if result.match_groups:
            st.divider()
            st.subheader("Match Groups")

            # Batch actions
            action_left, action_mid, action_right = st.columns(3)
            with action_left:
                if st.button("Accept All"):
                    for g in result.match_groups:
                        st.session_state["review_decisions"][g.group_id] = True
                    st.rerun()
            with action_mid:
                if st.button("Reject All"):
                    for g in result.match_groups:
                        st.session_state["review_decisions"][g.group_id] = False
                    st.rerun()
            with action_right:
                if st.button("Clear Decisions"):
                    st.session_state["review_decisions"] = {}
                    st.rerun()

            # Individual group cards
            decisions = st.session_state["review_decisions"]
            for i, group in enumerate(result.match_groups):
                decision = match_group_card(group, df, group_num=i + 1)
                if decision is not None:
                    # Persist the click and rerun so the card shows its status.
                    decisions[group.group_id] = decision
                    st.session_state["review_decisions"] = decisions
                    st.rerun()

            # Show decision summary
            if decisions:
                st.divider()
                accepted = sum(1 for v in decisions.values() if v is True)
                rejected = sum(1 for v in decisions.values() if v is False)
                pending = len(result.match_groups) - len(decisions)
                st.caption(
                    f"Decisions: {accepted} merged, {rejected} kept both, "
                    f"{pending} pending"
                )

                # Re-run dedup with review decisions applied
                if st.button(
                    "Apply Review Decisions & Download",
                    type="primary",
                    use_container_width=True,
                ):
                    def _review_callback(group, _df):
                        """Return the stored decision for a group; undecided groups are accepted."""
                        gid = group.group_id
                        if gid in decisions:
                            return decisions[gid]
                        return True  # default: accept

                    reviewed_result = deduplicate(
                        df,
                        strategies=settings["strategies"],
                        survivor_rule=settings["survivor_rule"],
                        date_column=settings["date_column"],
                        merge=settings["merge"],
                        preview=False,
                        review_callback=_review_callback,
                    )

                    # Update result and show downloads
                    st.session_state["result"] = reviewed_result

                    csv_bytes = reviewed_result.deduplicated_df.to_csv(
                        index=False
                    ).encode("utf-8-sig")
                    st.download_button(
                        "Download Reviewed & Deduplicated CSV",
                        data=csv_bytes,
                        file_name="deduplicated_reviewed.csv",
                        mime="text/csv",
                        key="reviewed_download",
                    )

        # Log entries
        if result.log_entries:
            with st.expander("Processing Log"):
                st.code("\n".join(result.log_entries))

else:
    # No file uploaded — show placeholder
    st.info("Upload a CSV or Excel file to get started.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Footer
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Closing rule plus a one-line privacy/version note under the page content.
st.divider()
_footer_text = (
    "Runs locally. Your data never leaves this computer. "
    "| DataTools Deduplicator v1.0"
)
st.caption(_footer_text)
|
||||
413
src/gui/components.py
Normal file
413
src/gui/components.py
Normal file
@@ -0,0 +1,413 @@
|
||||
"""Reusable Streamlit widgets for the deduplicator GUI."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
from src.core.dedup import (
|
||||
Algorithm,
|
||||
ColumnMatchStrategy,
|
||||
DeduplicationResult,
|
||||
MatchResult,
|
||||
MatchStrategy,
|
||||
SurvivorRule,
|
||||
)
|
||||
from src.core.config import (
|
||||
ColumnStrategyConfig,
|
||||
DeduplicationConfig,
|
||||
StrategyConfig,
|
||||
)
|
||||
from src.core.normalizers import NormalizerType
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Config panel (advanced options)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def config_panel(df: pd.DataFrame) -> dict:
    """Render the Advanced Options expander and return a settings dict.

    Args:
        df: The loaded data; its column names populate the column pickers.

    Returns:
        A dict with these keys:
            strategies: list[MatchStrategy] | None (None when no columns were
                selected, as returned by ``_build_strategies``)
            survivor_rule: SurvivorRule
            date_column: str | None (only set for the "most-recent" rule)
            merge: bool

    Side effects:
        Stores a successfully parsed config upload in
        ``st.session_state["loaded_config"]``.
    """
    import json

    columns = list(df.columns)

    with st.expander("Advanced Options"):
        col_left, col_right = st.columns(2)

        with col_left:
            subset_cols = st.multiselect(
                "Match on columns",
                columns,
                default=[],
                help="Leave empty to auto-detect based on column names.",
            )
            key_cols = st.multiselect(
                "Strong keys",
                columns,
                default=[],
                help="Columns that uniquely identify records (e.g., EIN, SKU). Each is an independent exact-match strategy.",
            )
            fuzzy_cols = st.multiselect(
                "Fuzzy columns",
                columns,
                default=[],
                help="Columns to fuzzy-match. Others use exact matching.",
            )

        with col_right:
            algorithm = st.selectbox(
                "Fuzzy algorithm",
                ["jaro_winkler", "levenshtein", "token_set_ratio"],
                index=0,
                help="jaro_winkler: best for names. levenshtein: best for typos. token_set_ratio: best for addresses.",
            )
            threshold = st.slider(
                "Similarity threshold",
                min_value=50,
                max_value=100,
                value=85,
                help="Lower = more matches but more false positives.",
            )
            survivor = st.selectbox(
                "Survivor rule",
                ["first", "last", "most-complete", "most-recent"],
                index=0,
                help="Which row to keep when duplicates are found.",
            )

        # Second row of options
        col_a, col_b = st.columns(2)

        with col_a:
            normalizer_types = ["auto", "email", "phone", "name", "address", "string", "none"]

            normalize_map: dict[str, str] = {}
            # Offer a normalizer for the columns _build_strategies() matches
            # on (subset first, fuzzy as fallback), so every matched column
            # can be configured.
            target_cols = subset_cols or fuzzy_cols
            if target_cols:
                st.markdown("**Per-column normalizers**")
                for col_name in target_cols:
                    norm = st.selectbox(
                        f"Normalizer for '{col_name}'",
                        normalizer_types,
                        index=0,
                        key=f"norm_{col_name}",
                    )
                    # NOTE(review): "none" is treated like "auto" here (no
                    # explicit normalizer is recorded) — confirm intended.
                    if norm not in ("auto", "none"):
                        normalize_map[col_name] = norm

        with col_b:
            merge = st.checkbox(
                "Merge mode",
                value=False,
                help="Fill missing fields in the surviving row from removed duplicates.",
            )
            date_column: Optional[str] = None
            if survivor == "most-recent":
                date_column = st.selectbox(
                    "Date column",
                    columns,
                    help="Required for most-recent survivor rule.",
                )

        # Config save/load
        st.divider()
        cfg_left, cfg_right = st.columns(2)

        with cfg_left:
            config_file = st.file_uploader(
                "Load config profile",
                type=["json"],
                help="Load previously saved settings.",
                key="config_upload",
            )
            if config_file is not None:
                try:
                    # getvalue() returns the whole upload regardless of the
                    # read position, unlike read().
                    data = json.loads(config_file.getvalue())
                    loaded = DeduplicationConfig.from_dict(data)
                    st.session_state["loaded_config"] = loaded
                    st.success("Config loaded.")
                except Exception as e:
                    st.error(f"Failed to load config: {e}")

        with cfg_right:
            if st.button("Save current settings"):
                cfg = _build_config(
                    subset_cols, key_cols, fuzzy_cols,
                    algorithm, threshold, normalize_map,
                    survivor, date_column, merge,
                )
                cfg_json = cfg.to_dict()
                st.download_button(
                    "Download config JSON",
                    data=json.dumps(cfg_json, indent=2),
                    file_name="dedup_config.json",
                    mime="application/json",
                )

    # Build strategies from selections
    strategies = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )

    # Survivor rule mapping
    survivor_map = {
        "first": SurvivorRule.KEEP_FIRST,
        "last": SurvivorRule.KEEP_LAST,
        "most-complete": SurvivorRule.KEEP_MOST_COMPLETE,
        "most-recent": SurvivorRule.KEEP_MOST_RECENT,
    }

    return {
        "strategies": strategies,
        "survivor_rule": survivor_map[survivor],
        "date_column": date_column,
        "merge": merge,
    }
|
||||
|
||||
|
||||
def _build_strategies(
|
||||
subset_cols: list[str],
|
||||
key_cols: list[str],
|
||||
fuzzy_cols: list[str],
|
||||
algorithm: str,
|
||||
threshold: int,
|
||||
normalize_map: dict[str, str],
|
||||
) -> Optional[list[MatchStrategy]]:
|
||||
"""Build MatchStrategy list from GUI selections. Returns None for auto-detect."""
|
||||
strategies: list[MatchStrategy] = []
|
||||
|
||||
# If user selected columns explicitly, build from those
|
||||
if subset_cols or fuzzy_cols:
|
||||
target_cols = subset_cols if subset_cols else fuzzy_cols
|
||||
fuzzy_set = set(fuzzy_cols)
|
||||
col_strats: list[ColumnMatchStrategy] = []
|
||||
for col in target_cols:
|
||||
norm = None
|
||||
if col in normalize_map:
|
||||
norm = NormalizerType(normalize_map[col])
|
||||
if col in fuzzy_set:
|
||||
algo = Algorithm(algorithm)
|
||||
thresh = float(threshold)
|
||||
else:
|
||||
algo = Algorithm.EXACT
|
||||
thresh = 100.0
|
||||
col_strats.append(ColumnMatchStrategy(
|
||||
column=col, algorithm=algo, threshold=thresh, normalizer=norm,
|
||||
))
|
||||
strategies.append(MatchStrategy(column_strategies=col_strats))
|
||||
|
||||
# Add strong key strategies
|
||||
if key_cols:
|
||||
for col in key_cols:
|
||||
strategies.append(MatchStrategy(column_strategies=[
|
||||
ColumnMatchStrategy(column=col, algorithm=Algorithm.EXACT, threshold=100.0)
|
||||
]))
|
||||
|
||||
return strategies if strategies else None
|
||||
|
||||
|
||||
def _build_config(
    subset_cols, key_cols, fuzzy_cols,
    algorithm, threshold, normalize_map,
    survivor, date_column, merge,
) -> DeduplicationConfig:
    """Assemble a DeduplicationConfig that mirrors the current GUI selections.

    The survivor rule is stored with underscores instead of hyphens, and empty
    selections are stored as None. When the selections yield any strategies
    (per ``_build_strategies``), they are also recorded on ``cfg.strategies``
    as StrategyConfig / ColumnStrategyConfig entries.
    """
    config = DeduplicationConfig(
        survivor_rule=survivor.replace("-", "_"),
        date_column=date_column,
        merge=merge,
        subset_columns=subset_cols or None,
        fuzzy_columns=fuzzy_cols or None,
        default_algorithm=algorithm,
        default_threshold=float(threshold),
        normalize_map=normalize_map or None,
    )

    built = _build_strategies(
        subset_cols, key_cols, fuzzy_cols,
        algorithm, threshold, normalize_map,
    )
    if not built:
        return config

    # Convert each runtime strategy into its serialisable config counterpart.
    strategy_configs = []
    for strategy in built:
        column_configs = []
        for cs in strategy.column_strategies:
            column_configs.append(
                ColumnStrategyConfig(
                    column=cs.column,
                    algorithm=cs.algorithm.value,
                    threshold=cs.threshold,
                    normalizer=cs.normalizer.value if cs.normalizer else None,
                )
            )
        strategy_configs.append(StrategyConfig(columns=column_configs))
    config.strategies = strategy_configs
    return config
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Match group review card
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def match_group_card(
    group: MatchResult,
    df: pd.DataFrame,
    group_num: int,
) -> Optional[bool]:
    """Render an expandable match group card with side-by-side diff.

    Args:
        group: The match group to display; its ``row_indices`` are used as
            positional indices into ``df``.
        df: The DataFrame the group was computed from.
        group_num: 1-based number shown in the card label.

    Returns:
        True  — user clicked Merge (accept match)
        False — user clicked Keep Both (reject match)
        None  — no button was clicked during this run
    """
    confidence = group.confidence
    # Groups below 95% confidence start expanded.
    auto_expand = confidence < 95.0
    matched_on = ", ".join(group.matched_on)
    n_rows = len(group.row_indices)

    label = (
        f"Group {group_num}: {n_rows} rows "
        f"(confidence: {confidence:.0f}%) "
        f"[{matched_on}]"
    )

    with st.expander(label, expanded=auto_expand):
        # Build comparison DataFrame (columns prefixed "_norm_" are hidden)
        display_cols = [c for c in df.columns if not str(c).startswith("_norm_")]
        rows_data = []
        for idx in group.row_indices:
            # Fetch the row once instead of once per column.
            source_row = df.iloc[idx]
            row = {"_row": idx + 1}  # shown 1-based
            for col in display_cols:
                row[col] = source_row.get(col, "")
            rows_data.append(row)

        compare_df = pd.DataFrame(rows_data)
        compare_df = compare_df.set_index("_row")

        def _cell_text(value) -> str:
            """Return the stripped text of a cell; missing values count as empty."""
            # str(nan) is "nan" and str(None) is "None", which would otherwise
            # be compared as real text and never reach the missing-value style.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return ""
            return str(value).strip()

        # Highlight differences
        def _highlight_diffs(s: pd.Series) -> list[str]:
            """Highlight cells that differ from the first row."""
            first_val = _cell_text(s.iloc[0]) if len(s) > 0 else ""
            styles = []
            for val in s:
                val_str = _cell_text(val)
                if val_str != first_val and val_str and first_val:
                    # Both present but different.
                    styles.append("background-color: rgba(245, 166, 35, 0.2)")
                elif not val_str and first_val:
                    # Missing here while the first row has a value.
                    styles.append("background-color: rgba(240, 82, 82, 0.1)")
                else:
                    styles.append("")
            return styles

        styled = compare_df.style.apply(_highlight_diffs, axis=0)
        st.dataframe(styled, use_container_width=True)

        # Action buttons (the third column is left empty as a spacer)
        btn_left, btn_mid, _ = st.columns(3)
        merge_key = f"merge_{group.group_id}"
        keep_key = f"keep_{group.group_id}"

        with btn_left:
            if st.button("Merge", key=merge_key, type="primary"):
                return True
        with btn_mid:
            if st.button("Keep Both", key=keep_key):
                return False

        # Check session state for previous decisions
        decisions = st.session_state.get("review_decisions", {})
        if group.group_id in decisions:
            decision = decisions[group.group_id]
            if decision is True:
                st.success("Decision: Merge")
            elif decision is False:
                st.info("Decision: Keep Both")

    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results summary + downloads
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def results_summary(
    result: DeduplicationResult,
    original_df: pd.DataFrame,
) -> None:
    """Show headline row counts and the CSV download buttons for a result.

    The removed-rows download appears only when ``result.removed_df`` is not
    empty, and the match-groups report only when there are match groups.
    """
    rows_in = result.original_row_count
    rows_out = len(result.deduplicated_df)

    # Summary metrics, one per column.
    headline = [
        ("Rows In", rows_in),
        ("Rows Out", rows_out),
        ("Removed", rows_in - rows_out),
        ("Groups", len(result.match_groups)),
    ]
    for slot, (title, value) in zip(st.columns(4), headline):
        slot.metric(title, value)

    st.divider()

    # Download buttons
    kept_slot, removed_slot, groups_slot = st.columns(3)

    with kept_slot:
        kept_csv = result.deduplicated_df.to_csv(index=False)
        st.download_button(
            "Download Deduplicated CSV",
            data=kept_csv.encode("utf-8-sig"),
            file_name="deduplicated.csv",
            mime="text/csv",
        )

    with removed_slot:
        if not result.removed_df.empty:
            removed_csv = result.removed_df.to_csv(index=False)
            st.download_button(
                "Download Removed Rows",
                data=removed_csv.encode("utf-8-sig"),
                file_name="removed_rows.csv",
                mime="text/csv",
            )

    with groups_slot:
        if result.match_groups:
            st.download_button(
                "Download Match Groups Report",
                data=_build_match_groups_csv(result, original_df),
                file_name="match_groups.csv",
                mime="text/csv",
            )
|
||||
|
||||
|
||||
def _build_match_groups_csv(
|
||||
result: DeduplicationResult,
|
||||
original_df: pd.DataFrame,
|
||||
) -> bytes:
|
||||
"""Build the match groups audit CSV as bytes."""
|
||||
rows = []
|
||||
for g in result.match_groups:
|
||||
for idx in g.row_indices:
|
||||
row_data = {
|
||||
"_group_id": g.group_id + 1,
|
||||
"_is_survivor": idx == g.survivor_index,
|
||||
"_confidence": g.confidence,
|
||||
"_matched_on": ", ".join(g.matched_on),
|
||||
"_original_row": idx + 1,
|
||||
}
|
||||
for col in original_df.columns:
|
||||
if not str(col).startswith("_norm_"):
|
||||
row_data[col] = original_df.iloc[idx].get(col, "") if idx < len(original_df) else ""
|
||||
rows.append(row_data)
|
||||
|
||||
groups_df = pd.DataFrame(rows)
|
||||
return groups_df.to_csv(index=False).encode("utf-8-sig")
|
||||
Reference in New Issue
Block a user