feat: refactor GUI to multi-page Streamlit app with 9 tool pages
Convert the single-page deduplicator into a multi-page suite. The home page shows a grid of tool cards. The Deduplicator is extracted to its own page and is fully working. Eight stub pages are added for Text Cleaner, Format Standardizer, Missing Values, Column Mapper, Outlier Detector, Multi-File Merger, Validator & Reporter, and Pipeline Runner — each with a functional file upload and a coming-soon UI.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
418
src/gui/app.py
418
src/gui/app.py
@@ -1,4 +1,4 @@
|
||||
"""DataTools Deduplicator — Streamlit GUI.
|
||||
"""DataTools — Data Cleaning Mastery Suite.
|
||||
|
||||
Launch:
|
||||
streamlit run src/gui/app.py
|
||||
@@ -6,11 +6,9 @@ Launch:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import streamlit as st
|
||||
|
||||
# Ensure project root is on sys.path so `src.core` imports work
|
||||
@@ -18,24 +16,14 @@ _project_root = Path(__file__).resolve().parent.parent.parent
|
||||
if str(_project_root) not in sys.path:
|
||||
sys.path.insert(0, str(_project_root))
|
||||
|
||||
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
||||
from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter
|
||||
from src.core.config import DeduplicationConfig
|
||||
from src.gui.components import (
|
||||
apply_review_decisions,
|
||||
config_panel,
|
||||
match_group_card,
|
||||
results_summary,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page config
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.set_page_config(
|
||||
page_title="DataTools Deduplicator",
|
||||
page_icon="🔍",
|
||||
page_title="DataTools — Data Cleaning Mastery",
|
||||
page_icon="🧹",
|
||||
layout="wide",
|
||||
)
|
||||
|
||||
@@ -45,331 +33,101 @@ st.markdown(
|
||||
unsafe_allow_html=True,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Session state defaults
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DEFAULTS = {
|
||||
"df": None,
|
||||
"result": None,
|
||||
"review_decisions": {},
|
||||
"config": None,
|
||||
"file_name": "",
|
||||
"sheet_names": [],
|
||||
"detected_delimiter": ",",
|
||||
}
|
||||
for key, default in _DEFAULTS.items():
|
||||
if key not in st.session_state:
|
||||
st.session_state[key] = default
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Header
|
||||
# Home page
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.title("DataTools Deduplicator")
|
||||
st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel files.")
|
||||
st.title("🧹 DataTools — Data Cleaning Mastery")
|
||||
st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular data. Runs 100% locally.")
|
||||
|
||||
st.divider()
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# File upload
|
||||
# Tool cards
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
uploaded = st.file_uploader(
|
||||
"Upload CSV or Excel file",
|
||||
type=["csv", "tsv", "xlsx", "xls"],
|
||||
help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
|
||||
)
|
||||
TOOLS = [
|
||||
{
|
||||
"icon": "🔍",
|
||||
"name": "Deduplicator",
|
||||
"description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
|
||||
"status": "Ready",
|
||||
"page": "1_Deduplicator",
|
||||
},
|
||||
{
|
||||
"icon": "✂️",
|
||||
"name": "Text Cleaner",
|
||||
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
|
||||
"status": "Coming Soon",
|
||||
"page": "2_Text_Cleaner",
|
||||
},
|
||||
{
|
||||
"icon": "📐",
|
||||
"name": "Format Standardizer",
|
||||
"description": "Standardize dates, currencies, names, phone numbers, and addresses.",
|
||||
"status": "Coming Soon",
|
||||
"page": "3_Format_Standardizer",
|
||||
},
|
||||
{
|
||||
"icon": "🕳️",
|
||||
"name": "Missing Value Handler",
|
||||
"description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
|
||||
"status": "Coming Soon",
|
||||
"page": "4_Missing_Values",
|
||||
},
|
||||
{
|
||||
"icon": "🗂️",
|
||||
"name": "Column Mapper",
|
||||
"description": "Rename columns, enforce a target schema, and coerce types.",
|
||||
"status": "Coming Soon",
|
||||
"page": "5_Column_Mapper",
|
||||
},
|
||||
{
|
||||
"icon": "📊",
|
||||
"name": "Outlier Detector",
|
||||
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
|
||||
"status": "Coming Soon",
|
||||
"page": "6_Outlier_Detector",
|
||||
},
|
||||
{
|
||||
"icon": "📎",
|
||||
"name": "Multi-File Merger",
|
||||
"description": "Combine multiple CSV/Excel files with schema alignment.",
|
||||
"status": "Coming Soon",
|
||||
"page": "7_Multi_File_Merger",
|
||||
},
|
||||
{
|
||||
"icon": "✅",
|
||||
"name": "Validator & Reporter",
|
||||
"description": "Validate against rules and generate PDF/Excel quality reports.",
|
||||
"status": "Coming Soon",
|
||||
"page": "8_Validator_Reporter",
|
||||
},
|
||||
{
|
||||
"icon": "⚙️",
|
||||
"name": "Pipeline Runner",
|
||||
"description": "Chain tools in recommended order and pass output between steps.",
|
||||
"status": "Coming Soon",
|
||||
"page": "9_Pipeline_Runner",
|
||||
},
|
||||
]
|
||||
|
||||
if uploaded is not None:
|
||||
# Detect if file changed
|
||||
if uploaded.name != st.session_state["file_name"]:
|
||||
st.session_state["file_name"] = uploaded.name
|
||||
st.session_state["result"] = None
|
||||
st.session_state["review_decisions"] = {}
|
||||
|
||||
# Read the file
|
||||
try:
|
||||
# Write to a temp file for read_file() which needs a path
|
||||
import tempfile
|
||||
suffix = Path(uploaded.name).suffix
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
tmp.write(uploaded.getvalue())
|
||||
tmp_path = Path(tmp.name)
|
||||
|
||||
# Check for Excel sheets / detect delimiter
|
||||
if suffix.lower() in (".xlsx", ".xls"):
|
||||
st.session_state["sheet_names"] = list_sheets(tmp_path)
|
||||
st.session_state["detected_delimiter"] = ","
|
||||
else:
|
||||
st.session_state["sheet_names"] = []
|
||||
enc = detect_encoding(tmp_path)
|
||||
st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc)
|
||||
|
||||
df = read_file(tmp_path)
|
||||
if not isinstance(df, pd.DataFrame):
|
||||
df = pd.concat(list(df), ignore_index=True)
|
||||
|
||||
st.session_state["df"] = df
|
||||
|
||||
# Clean up temp file
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
except Exception as e:
|
||||
st.error(f"Failed to read file: {e}")
|
||||
st.session_state["df"] = None
|
||||
|
||||
df = st.session_state["df"]
|
||||
|
||||
if df is not None:
|
||||
# Sheet selector for Excel files
|
||||
if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
|
||||
sheet = st.selectbox(
|
||||
"Select sheet",
|
||||
st.session_state["sheet_names"],
|
||||
# Render tool cards in a 3-column grid
|
||||
for row_start in range(0, len(TOOLS), 3):
|
||||
cols = st.columns(3)
|
||||
for i, col in enumerate(cols):
|
||||
idx = row_start + i
|
||||
if idx >= len(TOOLS):
|
||||
break
|
||||
tool = TOOLS[idx]
|
||||
with col:
|
||||
status_color = "green" if tool["status"] == "Ready" else "orange"
|
||||
st.markdown(
|
||||
f"### {tool['icon']} {tool['name']}\n\n"
|
||||
f"{tool['description']}\n\n"
|
||||
f":{status_color}[**{tool['status']}**]"
|
||||
)
|
||||
if sheet != st.session_state.get("_current_sheet"):
|
||||
st.session_state["_current_sheet"] = sheet
|
||||
suffix = Path(uploaded.name).suffix
|
||||
import tempfile
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
tmp.write(uploaded.getvalue())
|
||||
tmp_path = Path(tmp.name)
|
||||
df = read_file(tmp_path, sheet_name=sheet)
|
||||
if not isinstance(df, pd.DataFrame):
|
||||
df = pd.concat(list(df), ignore_index=True)
|
||||
st.session_state["df"] = df
|
||||
st.session_state["result"] = None
|
||||
st.session_state["review_decisions"] = {}
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
# Delimiter selector for CSV/TSV files
|
||||
is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls")
|
||||
if is_csv:
|
||||
_DELIMITERS = {
|
||||
"Comma (,)": ",",
|
||||
"Tab (\\t)": "\t",
|
||||
"Semicolon (;)": ";",
|
||||
"Pipe (|)": "|",
|
||||
"Other": None,
|
||||
}
|
||||
_DELIM_LABELS = list(_DELIMITERS.keys())
|
||||
_DELIM_VALUES = list(_DELIMITERS.values())
|
||||
detected = st.session_state.get("detected_delimiter", ",")
|
||||
default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0
|
||||
chosen_label = st.selectbox(
|
||||
"Delimiter",
|
||||
_DELIM_LABELS,
|
||||
index=default_idx,
|
||||
help="Auto-detected on upload. Change if the preview looks wrong.",
|
||||
)
|
||||
if chosen_label == "Other":
|
||||
custom_delim = st.text_input(
|
||||
"Enter delimiter character",
|
||||
max_chars=5,
|
||||
help="Enter the character(s) used to separate fields.",
|
||||
)
|
||||
chosen_delim = custom_delim if custom_delim else ","
|
||||
else:
|
||||
chosen_delim = _DELIMITERS[chosen_label]
|
||||
if chosen_delim != st.session_state.get("_current_delimiter"):
|
||||
st.session_state["_current_delimiter"] = chosen_delim
|
||||
import tempfile
|
||||
suffix = Path(uploaded.name).suffix
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
tmp.write(uploaded.getvalue())
|
||||
tmp_path = Path(tmp.name)
|
||||
df = read_file(tmp_path, delimiter=chosen_delim)
|
||||
if not isinstance(df, pd.DataFrame):
|
||||
df = pd.concat(list(df), ignore_index=True)
|
||||
st.session_state["df"] = df
|
||||
st.session_state["result"] = None
|
||||
st.session_state["review_decisions"] = {}
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
|
||||
# Preview
|
||||
st.subheader(f"Preview: {uploaded.name}")
|
||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||
st.dataframe(df.head(10), use_container_width=True)
|
||||
|
||||
# Advanced options
|
||||
settings = config_panel(df)
|
||||
|
||||
# Apply loaded config if present
|
||||
loaded_cfg = st.session_state.get("loaded_config")
|
||||
if loaded_cfg is not None:
|
||||
settings["strategies"] = loaded_cfg.to_strategies()
|
||||
settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
|
||||
settings["date_column"] = loaded_cfg.date_column
|
||||
settings["merge"] = loaded_cfg.merge
|
||||
# Clear so it doesn't override on every rerun
|
||||
del st.session_state["loaded_config"]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Find Duplicates button
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
st.divider()
|
||||
|
||||
if st.button("Find Duplicates", type="primary", use_container_width=True):
|
||||
progress_bar = st.progress(0, text="Comparing rows...")
|
||||
|
||||
def _gui_progress(current: int, total: int) -> None:
|
||||
if total > 0:
|
||||
pct = min(current / total, 1.0)
|
||||
progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")
|
||||
|
||||
with st.spinner("Running deduplication..."):
|
||||
result = deduplicate(
|
||||
df,
|
||||
strategies=settings["strategies"],
|
||||
survivor_rule=settings["survivor_rule"],
|
||||
date_column=settings["date_column"],
|
||||
merge=settings["merge"],
|
||||
preview=False,
|
||||
progress_callback=_gui_progress,
|
||||
)
|
||||
|
||||
progress_bar.empty()
|
||||
st.session_state["result"] = result
|
||||
st.session_state["review_decisions"] = {}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Results
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
result: DeduplicationResult | None = st.session_state["result"]
|
||||
|
||||
if result is not None:
|
||||
st.divider()
|
||||
st.subheader("Results")
|
||||
|
||||
# Summary + download buttons
|
||||
results_summary(result, df)
|
||||
|
||||
# Match group review
|
||||
if result.match_groups:
|
||||
st.divider()
|
||||
st.subheader("Match Groups")
|
||||
|
||||
# Batch actions
|
||||
def _accept_all():
|
||||
for g in result.match_groups:
|
||||
st.session_state["review_decisions"][g.group_id] = {
|
||||
"keep_indices": [g.survivor_index],
|
||||
"overrides": {},
|
||||
}
|
||||
|
||||
def _reject_all():
|
||||
for g in result.match_groups:
|
||||
st.session_state["review_decisions"][g.group_id] = {
|
||||
"keep_indices": list(g.row_indices),
|
||||
"overrides": {},
|
||||
}
|
||||
|
||||
def _clear_all():
|
||||
st.session_state["review_decisions"] = {}
|
||||
for k in list(st.session_state):
|
||||
if k.startswith("editor_"):
|
||||
del st.session_state[k]
|
||||
|
||||
action_left, action_mid, action_right = st.columns(3)
|
||||
with action_left:
|
||||
st.button("Accept All", on_click=_accept_all)
|
||||
with action_mid:
|
||||
st.button("Reject All", on_click=_reject_all)
|
||||
with action_right:
|
||||
st.button("Clear Decisions", on_click=_clear_all)
|
||||
|
||||
# Individual group cards
|
||||
decisions = st.session_state["review_decisions"]
|
||||
for i, group in enumerate(result.match_groups):
|
||||
match_group_card(group, df, group_num=i + 1)
|
||||
|
||||
# Show decision summary
|
||||
if decisions:
|
||||
st.divider()
|
||||
merged = 0
|
||||
customized = 0
|
||||
split = 0
|
||||
kept_all = 0
|
||||
for v in decisions.values():
|
||||
if not isinstance(v, dict):
|
||||
continue
|
||||
ki = v.get("keep_indices", [])
|
||||
# Find the matching group size
|
||||
gid_for_v = next(
|
||||
(gid for gid, d in decisions.items() if d is v),
|
||||
None,
|
||||
)
|
||||
group_size = next(
|
||||
(len(g.row_indices) for g in result.match_groups
|
||||
if g.group_id == gid_for_v),
|
||||
0,
|
||||
)
|
||||
if len(ki) == group_size:
|
||||
kept_all += 1
|
||||
elif len(ki) == 1:
|
||||
if v.get("overrides"):
|
||||
customized += 1
|
||||
else:
|
||||
merged += 1
|
||||
else:
|
||||
split += 1
|
||||
|
||||
pending = len(result.match_groups) - len(decisions)
|
||||
parts = []
|
||||
if merged:
|
||||
parts.append(f"{merged} merged")
|
||||
if customized:
|
||||
parts.append(f"{customized} customized")
|
||||
if split:
|
||||
parts.append(f"{split} split")
|
||||
if kept_all:
|
||||
parts.append(f"{kept_all} kept all")
|
||||
parts.append(f"{pending} pending")
|
||||
st.caption("Decisions: " + ", ".join(parts))
|
||||
|
||||
# Apply decisions and offer download
|
||||
if st.button(
|
||||
"Apply Review Decisions & Download",
|
||||
type="primary",
|
||||
use_container_width=True,
|
||||
):
|
||||
reviewed_df, reviewed_removed = apply_review_decisions(
|
||||
df, result.match_groups, decisions,
|
||||
)
|
||||
|
||||
csv_bytes = reviewed_df.to_csv(
|
||||
index=False
|
||||
).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download Reviewed & Deduplicated CSV",
|
||||
data=csv_bytes,
|
||||
file_name="deduplicated_reviewed.csv",
|
||||
mime="text/csv",
|
||||
key="reviewed_download",
|
||||
)
|
||||
if not reviewed_removed.empty:
|
||||
removed_bytes = reviewed_removed.to_csv(
|
||||
index=False
|
||||
).encode("utf-8-sig")
|
||||
st.download_button(
|
||||
"Download Reviewed Removed Rows",
|
||||
data=removed_bytes,
|
||||
file_name="removed_reviewed.csv",
|
||||
mime="text/csv",
|
||||
key="reviewed_removed_download",
|
||||
)
|
||||
|
||||
# Log entries
|
||||
if result.log_entries:
|
||||
with st.expander("Processing Log"):
|
||||
st.code("\n".join(result.log_entries))
|
||||
|
||||
else:
|
||||
# No file uploaded — show placeholder
|
||||
st.info("Upload a file to get started.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -379,5 +137,5 @@ else:
|
||||
st.divider()
|
||||
st.caption(
|
||||
"Runs locally. Your data never leaves this computer. "
|
||||
"| DataTools Deduplicator v3.0"
|
||||
"| DataTools v3.0"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user