feat: refactor GUI to multi-page Streamlit app with 9 tool pages
Convert the single-page deduplicator into a multi-page suite. The home page shows a grid of tool cards. The Deduplicator is extracted to its own page (fully working). Eight stub pages are added for Text Cleaner, Format Standardizer, Missing Values, Column Mapper, Outlier Detector, Multi-File Merger, Validator & Reporter, and Pipeline Runner — each with a functional file upload and a coming-soon UI.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
418
src/gui/app.py
418
src/gui/app.py
@@ -1,4 +1,4 @@
|
|||||||
"""DataTools Deduplicator — Streamlit GUI.
|
"""DataTools — Data Cleaning Mastery Suite.
|
||||||
|
|
||||||
Launch:
|
Launch:
|
||||||
streamlit run src/gui/app.py
|
streamlit run src/gui/app.py
|
||||||
@@ -6,11 +6,9 @@ Launch:
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import io
|
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
# Ensure project root is on sys.path so `src.core` imports work
|
# Ensure project root is on sys.path so `src.core` imports work
|
||||||
@@ -18,24 +16,14 @@ _project_root = Path(__file__).resolve().parent.parent.parent
|
|||||||
if str(_project_root) not in sys.path:
|
if str(_project_root) not in sys.path:
|
||||||
sys.path.insert(0, str(_project_root))
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
|
||||||
from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter
|
|
||||||
from src.core.config import DeduplicationConfig
|
|
||||||
from src.gui.components import (
|
|
||||||
apply_review_decisions,
|
|
||||||
config_panel,
|
|
||||||
match_group_card,
|
|
||||||
results_summary,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Page config
|
# Page config
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
st.set_page_config(
|
st.set_page_config(
|
||||||
page_title="DataTools Deduplicator",
|
page_title="DataTools — Data Cleaning Mastery",
|
||||||
page_icon="🔍",
|
page_icon="🧹",
|
||||||
layout="wide",
|
layout="wide",
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -45,331 +33,101 @@ st.markdown(
|
|||||||
unsafe_allow_html=True,
|
unsafe_allow_html=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Session state defaults
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
_DEFAULTS = {
|
|
||||||
"df": None,
|
|
||||||
"result": None,
|
|
||||||
"review_decisions": {},
|
|
||||||
"config": None,
|
|
||||||
"file_name": "",
|
|
||||||
"sheet_names": [],
|
|
||||||
"detected_delimiter": ",",
|
|
||||||
}
|
|
||||||
for key, default in _DEFAULTS.items():
|
|
||||||
if key not in st.session_state:
|
|
||||||
st.session_state[key] = default
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Header
|
# Home page
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
st.title("DataTools Deduplicator")
|
st.title("🧹 DataTools — Data Cleaning Mastery")
|
||||||
st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel files.")
|
st.caption("A 9-tool suite for cleaning, standardizing, and validating tabular data. Runs 100% locally.")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# File upload
|
# Tool cards
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
uploaded = st.file_uploader(
|
TOOLS = [
|
||||||
"Upload CSV or Excel file",
|
{
|
||||||
type=["csv", "tsv", "xlsx", "xls"],
|
"icon": "🔍",
|
||||||
help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
|
"name": "Deduplicator",
|
||||||
)
|
"description": "Fuzzy matching, normalization, survivor selection, and interactive review.",
|
||||||
|
"status": "Ready",
|
||||||
|
"page": "1_Deduplicator",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "✂️",
|
||||||
|
"name": "Text Cleaner",
|
||||||
|
"description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "2_Text_Cleaner",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "📐",
|
||||||
|
"name": "Format Standardizer",
|
||||||
|
"description": "Standardize dates, currencies, names, phone numbers, and addresses.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "3_Format_Standardizer",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "🕳️",
|
||||||
|
"name": "Missing Value Handler",
|
||||||
|
"description": "Detect disguised nulls, missingness analysis, and imputation strategies.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "4_Missing_Values",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "🗂️",
|
||||||
|
"name": "Column Mapper",
|
||||||
|
"description": "Rename columns, enforce a target schema, and coerce types.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "5_Column_Mapper",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "📊",
|
||||||
|
"name": "Outlier Detector",
|
||||||
|
"description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "6_Outlier_Detector",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "📎",
|
||||||
|
"name": "Multi-File Merger",
|
||||||
|
"description": "Combine multiple CSV/Excel files with schema alignment.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "7_Multi_File_Merger",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "✅",
|
||||||
|
"name": "Validator & Reporter",
|
||||||
|
"description": "Validate against rules and generate PDF/Excel quality reports.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "8_Validator_Reporter",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"icon": "⚙️",
|
||||||
|
"name": "Pipeline Runner",
|
||||||
|
"description": "Chain tools in recommended order and pass output between steps.",
|
||||||
|
"status": "Coming Soon",
|
||||||
|
"page": "9_Pipeline_Runner",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
if uploaded is not None:
|
# Render tool cards in a 3-column grid
|
||||||
# Detect if file changed
|
for row_start in range(0, len(TOOLS), 3):
|
||||||
if uploaded.name != st.session_state["file_name"]:
|
cols = st.columns(3)
|
||||||
st.session_state["file_name"] = uploaded.name
|
for i, col in enumerate(cols):
|
||||||
st.session_state["result"] = None
|
idx = row_start + i
|
||||||
st.session_state["review_decisions"] = {}
|
if idx >= len(TOOLS):
|
||||||
|
break
|
||||||
# Read the file
|
tool = TOOLS[idx]
|
||||||
try:
|
with col:
|
||||||
# Write to a temp file for read_file() which needs a path
|
status_color = "green" if tool["status"] == "Ready" else "orange"
|
||||||
import tempfile
|
st.markdown(
|
||||||
suffix = Path(uploaded.name).suffix
|
f"### {tool['icon']} {tool['name']}\n\n"
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
f"{tool['description']}\n\n"
|
||||||
tmp.write(uploaded.getvalue())
|
f":{status_color}[**{tool['status']}**]"
|
||||||
tmp_path = Path(tmp.name)
|
|
||||||
|
|
||||||
# Check for Excel sheets / detect delimiter
|
|
||||||
if suffix.lower() in (".xlsx", ".xls"):
|
|
||||||
st.session_state["sheet_names"] = list_sheets(tmp_path)
|
|
||||||
st.session_state["detected_delimiter"] = ","
|
|
||||||
else:
|
|
||||||
st.session_state["sheet_names"] = []
|
|
||||||
enc = detect_encoding(tmp_path)
|
|
||||||
st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc)
|
|
||||||
|
|
||||||
df = read_file(tmp_path)
|
|
||||||
if not isinstance(df, pd.DataFrame):
|
|
||||||
df = pd.concat(list(df), ignore_index=True)
|
|
||||||
|
|
||||||
st.session_state["df"] = df
|
|
||||||
|
|
||||||
# Clean up temp file
|
|
||||||
tmp_path.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
st.error(f"Failed to read file: {e}")
|
|
||||||
st.session_state["df"] = None
|
|
||||||
|
|
||||||
df = st.session_state["df"]
|
|
||||||
|
|
||||||
if df is not None:
|
|
||||||
# Sheet selector for Excel files
|
|
||||||
if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
|
|
||||||
sheet = st.selectbox(
|
|
||||||
"Select sheet",
|
|
||||||
st.session_state["sheet_names"],
|
|
||||||
)
|
)
|
||||||
if sheet != st.session_state.get("_current_sheet"):
|
|
||||||
st.session_state["_current_sheet"] = sheet
|
|
||||||
suffix = Path(uploaded.name).suffix
|
|
||||||
import tempfile
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
|
||||||
tmp.write(uploaded.getvalue())
|
|
||||||
tmp_path = Path(tmp.name)
|
|
||||||
df = read_file(tmp_path, sheet_name=sheet)
|
|
||||||
if not isinstance(df, pd.DataFrame):
|
|
||||||
df = pd.concat(list(df), ignore_index=True)
|
|
||||||
st.session_state["df"] = df
|
|
||||||
st.session_state["result"] = None
|
|
||||||
st.session_state["review_decisions"] = {}
|
|
||||||
tmp_path.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
# Delimiter selector for CSV/TSV files
|
|
||||||
is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls")
|
|
||||||
if is_csv:
|
|
||||||
_DELIMITERS = {
|
|
||||||
"Comma (,)": ",",
|
|
||||||
"Tab (\\t)": "\t",
|
|
||||||
"Semicolon (;)": ";",
|
|
||||||
"Pipe (|)": "|",
|
|
||||||
"Other": None,
|
|
||||||
}
|
|
||||||
_DELIM_LABELS = list(_DELIMITERS.keys())
|
|
||||||
_DELIM_VALUES = list(_DELIMITERS.values())
|
|
||||||
detected = st.session_state.get("detected_delimiter", ",")
|
|
||||||
default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0
|
|
||||||
chosen_label = st.selectbox(
|
|
||||||
"Delimiter",
|
|
||||||
_DELIM_LABELS,
|
|
||||||
index=default_idx,
|
|
||||||
help="Auto-detected on upload. Change if the preview looks wrong.",
|
|
||||||
)
|
|
||||||
if chosen_label == "Other":
|
|
||||||
custom_delim = st.text_input(
|
|
||||||
"Enter delimiter character",
|
|
||||||
max_chars=5,
|
|
||||||
help="Enter the character(s) used to separate fields.",
|
|
||||||
)
|
|
||||||
chosen_delim = custom_delim if custom_delim else ","
|
|
||||||
else:
|
|
||||||
chosen_delim = _DELIMITERS[chosen_label]
|
|
||||||
if chosen_delim != st.session_state.get("_current_delimiter"):
|
|
||||||
st.session_state["_current_delimiter"] = chosen_delim
|
|
||||||
import tempfile
|
|
||||||
suffix = Path(uploaded.name).suffix
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
|
||||||
tmp.write(uploaded.getvalue())
|
|
||||||
tmp_path = Path(tmp.name)
|
|
||||||
df = read_file(tmp_path, delimiter=chosen_delim)
|
|
||||||
if not isinstance(df, pd.DataFrame):
|
|
||||||
df = pd.concat(list(df), ignore_index=True)
|
|
||||||
st.session_state["df"] = df
|
|
||||||
st.session_state["result"] = None
|
|
||||||
st.session_state["review_decisions"] = {}
|
|
||||||
tmp_path.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
# Preview
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
|
||||||
|
|
||||||
# Advanced options
|
|
||||||
settings = config_panel(df)
|
|
||||||
|
|
||||||
# Apply loaded config if present
|
|
||||||
loaded_cfg = st.session_state.get("loaded_config")
|
|
||||||
if loaded_cfg is not None:
|
|
||||||
settings["strategies"] = loaded_cfg.to_strategies()
|
|
||||||
settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
|
|
||||||
settings["date_column"] = loaded_cfg.date_column
|
|
||||||
settings["merge"] = loaded_cfg.merge
|
|
||||||
# Clear so it doesn't override on every rerun
|
|
||||||
del st.session_state["loaded_config"]
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Find Duplicates button
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.divider()
|
|
||||||
|
|
||||||
if st.button("Find Duplicates", type="primary", use_container_width=True):
|
|
||||||
progress_bar = st.progress(0, text="Comparing rows...")
|
|
||||||
|
|
||||||
def _gui_progress(current: int, total: int) -> None:
|
|
||||||
if total > 0:
|
|
||||||
pct = min(current / total, 1.0)
|
|
||||||
progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")
|
|
||||||
|
|
||||||
with st.spinner("Running deduplication..."):
|
|
||||||
result = deduplicate(
|
|
||||||
df,
|
|
||||||
strategies=settings["strategies"],
|
|
||||||
survivor_rule=settings["survivor_rule"],
|
|
||||||
date_column=settings["date_column"],
|
|
||||||
merge=settings["merge"],
|
|
||||||
preview=False,
|
|
||||||
progress_callback=_gui_progress,
|
|
||||||
)
|
|
||||||
|
|
||||||
progress_bar.empty()
|
|
||||||
st.session_state["result"] = result
|
|
||||||
st.session_state["review_decisions"] = {}
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Results
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
result: DeduplicationResult | None = st.session_state["result"]
|
|
||||||
|
|
||||||
if result is not None:
|
|
||||||
st.divider()
|
|
||||||
st.subheader("Results")
|
|
||||||
|
|
||||||
# Summary + download buttons
|
|
||||||
results_summary(result, df)
|
|
||||||
|
|
||||||
# Match group review
|
|
||||||
if result.match_groups:
|
|
||||||
st.divider()
|
|
||||||
st.subheader("Match Groups")
|
|
||||||
|
|
||||||
# Batch actions
|
|
||||||
def _accept_all():
|
|
||||||
for g in result.match_groups:
|
|
||||||
st.session_state["review_decisions"][g.group_id] = {
|
|
||||||
"keep_indices": [g.survivor_index],
|
|
||||||
"overrides": {},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _reject_all():
|
|
||||||
for g in result.match_groups:
|
|
||||||
st.session_state["review_decisions"][g.group_id] = {
|
|
||||||
"keep_indices": list(g.row_indices),
|
|
||||||
"overrides": {},
|
|
||||||
}
|
|
||||||
|
|
||||||
def _clear_all():
|
|
||||||
st.session_state["review_decisions"] = {}
|
|
||||||
for k in list(st.session_state):
|
|
||||||
if k.startswith("editor_"):
|
|
||||||
del st.session_state[k]
|
|
||||||
|
|
||||||
action_left, action_mid, action_right = st.columns(3)
|
|
||||||
with action_left:
|
|
||||||
st.button("Accept All", on_click=_accept_all)
|
|
||||||
with action_mid:
|
|
||||||
st.button("Reject All", on_click=_reject_all)
|
|
||||||
with action_right:
|
|
||||||
st.button("Clear Decisions", on_click=_clear_all)
|
|
||||||
|
|
||||||
# Individual group cards
|
|
||||||
decisions = st.session_state["review_decisions"]
|
|
||||||
for i, group in enumerate(result.match_groups):
|
|
||||||
match_group_card(group, df, group_num=i + 1)
|
|
||||||
|
|
||||||
# Show decision summary
|
|
||||||
if decisions:
|
|
||||||
st.divider()
|
|
||||||
merged = 0
|
|
||||||
customized = 0
|
|
||||||
split = 0
|
|
||||||
kept_all = 0
|
|
||||||
for v in decisions.values():
|
|
||||||
if not isinstance(v, dict):
|
|
||||||
continue
|
|
||||||
ki = v.get("keep_indices", [])
|
|
||||||
# Find the matching group size
|
|
||||||
gid_for_v = next(
|
|
||||||
(gid for gid, d in decisions.items() if d is v),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
group_size = next(
|
|
||||||
(len(g.row_indices) for g in result.match_groups
|
|
||||||
if g.group_id == gid_for_v),
|
|
||||||
0,
|
|
||||||
)
|
|
||||||
if len(ki) == group_size:
|
|
||||||
kept_all += 1
|
|
||||||
elif len(ki) == 1:
|
|
||||||
if v.get("overrides"):
|
|
||||||
customized += 1
|
|
||||||
else:
|
|
||||||
merged += 1
|
|
||||||
else:
|
|
||||||
split += 1
|
|
||||||
|
|
||||||
pending = len(result.match_groups) - len(decisions)
|
|
||||||
parts = []
|
|
||||||
if merged:
|
|
||||||
parts.append(f"{merged} merged")
|
|
||||||
if customized:
|
|
||||||
parts.append(f"{customized} customized")
|
|
||||||
if split:
|
|
||||||
parts.append(f"{split} split")
|
|
||||||
if kept_all:
|
|
||||||
parts.append(f"{kept_all} kept all")
|
|
||||||
parts.append(f"{pending} pending")
|
|
||||||
st.caption("Decisions: " + ", ".join(parts))
|
|
||||||
|
|
||||||
# Apply decisions and offer download
|
|
||||||
if st.button(
|
|
||||||
"Apply Review Decisions & Download",
|
|
||||||
type="primary",
|
|
||||||
use_container_width=True,
|
|
||||||
):
|
|
||||||
reviewed_df, reviewed_removed = apply_review_decisions(
|
|
||||||
df, result.match_groups, decisions,
|
|
||||||
)
|
|
||||||
|
|
||||||
csv_bytes = reviewed_df.to_csv(
|
|
||||||
index=False
|
|
||||||
).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
|
||||||
"Download Reviewed & Deduplicated CSV",
|
|
||||||
data=csv_bytes,
|
|
||||||
file_name="deduplicated_reviewed.csv",
|
|
||||||
mime="text/csv",
|
|
||||||
key="reviewed_download",
|
|
||||||
)
|
|
||||||
if not reviewed_removed.empty:
|
|
||||||
removed_bytes = reviewed_removed.to_csv(
|
|
||||||
index=False
|
|
||||||
).encode("utf-8-sig")
|
|
||||||
st.download_button(
|
|
||||||
"Download Reviewed Removed Rows",
|
|
||||||
data=removed_bytes,
|
|
||||||
file_name="removed_reviewed.csv",
|
|
||||||
mime="text/csv",
|
|
||||||
key="reviewed_removed_download",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Log entries
|
|
||||||
if result.log_entries:
|
|
||||||
with st.expander("Processing Log"):
|
|
||||||
st.code("\n".join(result.log_entries))
|
|
||||||
|
|
||||||
else:
|
|
||||||
# No file uploaded — show placeholder
|
|
||||||
st.info("Upload a file to get started.")
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -379,5 +137,5 @@ else:
|
|||||||
st.divider()
|
st.divider()
|
||||||
st.caption(
|
st.caption(
|
||||||
"Runs locally. Your data never leaves this computer. "
|
"Runs locally. Your data never leaves this computer. "
|
||||||
"| DataTools Deduplicator v3.0"
|
"| DataTools v3.0"
|
||||||
)
|
)
|
||||||
|
|||||||
355
src/gui/pages/1_Deduplicator.py
Normal file
355
src/gui/pages/1_Deduplicator.py
Normal file
@@ -0,0 +1,355 @@
|
|||||||
|
"""DataTools Deduplicator — full working tool page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
# Ensure project root is on sys.path so `src.core` imports work
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
from src.core.dedup import deduplicate, DeduplicationResult
|
||||||
|
from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter
|
||||||
|
from src.gui.components import (
|
||||||
|
apply_review_decisions,
|
||||||
|
config_panel,
|
||||||
|
match_group_card,
|
||||||
|
results_summary,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Session state defaults
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_DEFAULTS = {
|
||||||
|
"df": None,
|
||||||
|
"result": None,
|
||||||
|
"review_decisions": {},
|
||||||
|
"config": None,
|
||||||
|
"file_name": "",
|
||||||
|
"sheet_names": [],
|
||||||
|
"detected_delimiter": ",",
|
||||||
|
}
|
||||||
|
for key, default in _DEFAULTS.items():
|
||||||
|
if key not in st.session_state:
|
||||||
|
st.session_state[key] = default
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("🔍 Deduplicator")
|
||||||
|
st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel files.")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.",
|
||||||
|
key="dedup_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
if uploaded is not None:
|
||||||
|
# Detect if file changed
|
||||||
|
if uploaded.name != st.session_state["file_name"]:
|
||||||
|
st.session_state["file_name"] = uploaded.name
|
||||||
|
st.session_state["result"] = None
|
||||||
|
st.session_state["review_decisions"] = {}
|
||||||
|
|
||||||
|
# Read the file
|
||||||
|
try:
|
||||||
|
suffix = Path(uploaded.name).suffix
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||||
|
tmp.write(uploaded.getvalue())
|
||||||
|
tmp_path = Path(tmp.name)
|
||||||
|
|
||||||
|
# Check for Excel sheets / detect delimiter
|
||||||
|
if suffix.lower() in (".xlsx", ".xls"):
|
||||||
|
st.session_state["sheet_names"] = list_sheets(tmp_path)
|
||||||
|
st.session_state["detected_delimiter"] = ","
|
||||||
|
else:
|
||||||
|
st.session_state["sheet_names"] = []
|
||||||
|
enc = detect_encoding(tmp_path)
|
||||||
|
st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc)
|
||||||
|
|
||||||
|
df = read_file(tmp_path)
|
||||||
|
if not isinstance(df, pd.DataFrame):
|
||||||
|
df = pd.concat(list(df), ignore_index=True)
|
||||||
|
|
||||||
|
st.session_state["df"] = df
|
||||||
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Failed to read file: {e}")
|
||||||
|
st.session_state["df"] = None
|
||||||
|
|
||||||
|
df = st.session_state["df"]
|
||||||
|
|
||||||
|
if df is not None:
|
||||||
|
# Sheet selector for Excel files
|
||||||
|
if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1:
|
||||||
|
sheet = st.selectbox(
|
||||||
|
"Select sheet",
|
||||||
|
st.session_state["sheet_names"],
|
||||||
|
)
|
||||||
|
if sheet != st.session_state.get("_current_sheet"):
|
||||||
|
st.session_state["_current_sheet"] = sheet
|
||||||
|
suffix = Path(uploaded.name).suffix
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||||
|
tmp.write(uploaded.getvalue())
|
||||||
|
tmp_path = Path(tmp.name)
|
||||||
|
df = read_file(tmp_path, sheet_name=sheet)
|
||||||
|
if not isinstance(df, pd.DataFrame):
|
||||||
|
df = pd.concat(list(df), ignore_index=True)
|
||||||
|
st.session_state["df"] = df
|
||||||
|
st.session_state["result"] = None
|
||||||
|
st.session_state["review_decisions"] = {}
|
||||||
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
# Delimiter selector for CSV/TSV files
|
||||||
|
is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls")
|
||||||
|
if is_csv:
|
||||||
|
_DELIMITERS = {
|
||||||
|
"Comma (,)": ",",
|
||||||
|
"Tab (\\t)": "\t",
|
||||||
|
"Semicolon (;)": ";",
|
||||||
|
"Pipe (|)": "|",
|
||||||
|
"Other": None,
|
||||||
|
}
|
||||||
|
_DELIM_LABELS = list(_DELIMITERS.keys())
|
||||||
|
_DELIM_VALUES = list(_DELIMITERS.values())
|
||||||
|
detected = st.session_state.get("detected_delimiter", ",")
|
||||||
|
default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0
|
||||||
|
chosen_label = st.selectbox(
|
||||||
|
"Delimiter",
|
||||||
|
_DELIM_LABELS,
|
||||||
|
index=default_idx,
|
||||||
|
help="Auto-detected on upload. Change if the preview looks wrong.",
|
||||||
|
)
|
||||||
|
if chosen_label == "Other":
|
||||||
|
custom_delim = st.text_input(
|
||||||
|
"Enter delimiter character",
|
||||||
|
max_chars=5,
|
||||||
|
help="Enter the character(s) used to separate fields.",
|
||||||
|
)
|
||||||
|
chosen_delim = custom_delim if custom_delim else ","
|
||||||
|
else:
|
||||||
|
chosen_delim = _DELIMITERS[chosen_label]
|
||||||
|
if chosen_delim != st.session_state.get("_current_delimiter"):
|
||||||
|
st.session_state["_current_delimiter"] = chosen_delim
|
||||||
|
suffix = Path(uploaded.name).suffix
|
||||||
|
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||||
|
tmp.write(uploaded.getvalue())
|
||||||
|
tmp_path = Path(tmp.name)
|
||||||
|
df = read_file(tmp_path, delimiter=chosen_delim)
|
||||||
|
if not isinstance(df, pd.DataFrame):
|
||||||
|
df = pd.concat(list(df), ignore_index=True)
|
||||||
|
st.session_state["df"] = df
|
||||||
|
st.session_state["result"] = None
|
||||||
|
st.session_state["review_decisions"] = {}
|
||||||
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
# Preview
|
||||||
|
st.subheader(f"Preview: {uploaded.name}")
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
|
|
||||||
|
# Advanced options
|
||||||
|
settings = config_panel(df)
|
||||||
|
|
||||||
|
# Apply loaded config if present
|
||||||
|
loaded_cfg = st.session_state.get("loaded_config")
|
||||||
|
if loaded_cfg is not None:
|
||||||
|
settings["strategies"] = loaded_cfg.to_strategies()
|
||||||
|
settings["survivor_rule"] = loaded_cfg.to_survivor_rule()
|
||||||
|
settings["date_column"] = loaded_cfg.date_column
|
||||||
|
settings["merge"] = loaded_cfg.merge
|
||||||
|
del st.session_state["loaded_config"]
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
# Find Duplicates button
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
if st.button("Find Duplicates", type="primary", use_container_width=True):
|
||||||
|
progress_bar = st.progress(0, text="Comparing rows...")
|
||||||
|
|
||||||
|
def _gui_progress(current: int, total: int) -> None:
|
||||||
|
if total > 0:
|
||||||
|
pct = min(current / total, 1.0)
|
||||||
|
progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}")
|
||||||
|
|
||||||
|
with st.spinner("Running deduplication..."):
|
||||||
|
result = deduplicate(
|
||||||
|
df,
|
||||||
|
strategies=settings["strategies"],
|
||||||
|
survivor_rule=settings["survivor_rule"],
|
||||||
|
date_column=settings["date_column"],
|
||||||
|
merge=settings["merge"],
|
||||||
|
preview=False,
|
||||||
|
progress_callback=_gui_progress,
|
||||||
|
)
|
||||||
|
|
||||||
|
progress_bar.empty()
|
||||||
|
st.session_state["result"] = result
|
||||||
|
st.session_state["review_decisions"] = {}
|
||||||
|
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
# Results
|
||||||
|
# -------------------------------------------------------------------
|
||||||
|
|
||||||
|
result: DeduplicationResult | None = st.session_state["result"]
|
||||||
|
|
||||||
|
if result is not None:
|
||||||
|
st.divider()
|
||||||
|
st.subheader("Results")
|
||||||
|
|
||||||
|
# Summary + download buttons
|
||||||
|
results_summary(result, df)
|
||||||
|
|
||||||
|
# Match group review
|
||||||
|
if result.match_groups:
|
||||||
|
st.divider()
|
||||||
|
st.subheader("Match Groups")
|
||||||
|
|
||||||
|
# Batch actions
|
||||||
|
def _accept_all():
|
||||||
|
for g in result.match_groups:
|
||||||
|
st.session_state["review_decisions"][g.group_id] = {
|
||||||
|
"keep_indices": [g.survivor_index],
|
||||||
|
"overrides": {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def _reject_all():
|
||||||
|
for g in result.match_groups:
|
||||||
|
st.session_state["review_decisions"][g.group_id] = {
|
||||||
|
"keep_indices": list(g.row_indices),
|
||||||
|
"overrides": {},
|
||||||
|
}
|
||||||
|
|
||||||
|
def _clear_all():
|
||||||
|
st.session_state["review_decisions"] = {}
|
||||||
|
for k in list(st.session_state):
|
||||||
|
if k.startswith("editor_"):
|
||||||
|
del st.session_state[k]
|
||||||
|
|
||||||
|
action_left, action_mid, action_right = st.columns(3)
|
||||||
|
with action_left:
|
||||||
|
st.button("Accept All", on_click=_accept_all)
|
||||||
|
with action_mid:
|
||||||
|
st.button("Reject All", on_click=_reject_all)
|
||||||
|
with action_right:
|
||||||
|
st.button("Clear Decisions", on_click=_clear_all)
|
||||||
|
|
||||||
|
# Individual group cards
|
||||||
|
decisions = st.session_state["review_decisions"]
|
||||||
|
for i, group in enumerate(result.match_groups):
|
||||||
|
match_group_card(group, df, group_num=i + 1)
|
||||||
|
|
||||||
|
# Show decision summary
|
||||||
|
if decisions:
|
||||||
|
st.divider()
|
||||||
|
merged = 0
|
||||||
|
customized = 0
|
||||||
|
split = 0
|
||||||
|
kept_all = 0
|
||||||
|
for v in decisions.values():
|
||||||
|
if not isinstance(v, dict):
|
||||||
|
continue
|
||||||
|
ki = v.get("keep_indices", [])
|
||||||
|
gid_for_v = next(
|
||||||
|
(gid for gid, d in decisions.items() if d is v),
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
group_size = next(
|
||||||
|
(len(g.row_indices) for g in result.match_groups
|
||||||
|
if g.group_id == gid_for_v),
|
||||||
|
0,
|
||||||
|
)
|
||||||
|
if len(ki) == group_size:
|
||||||
|
kept_all += 1
|
||||||
|
elif len(ki) == 1:
|
||||||
|
if v.get("overrides"):
|
||||||
|
customized += 1
|
||||||
|
else:
|
||||||
|
merged += 1
|
||||||
|
else:
|
||||||
|
split += 1
|
||||||
|
|
||||||
|
pending = len(result.match_groups) - len(decisions)
|
||||||
|
parts = []
|
||||||
|
if merged:
|
||||||
|
parts.append(f"{merged} merged")
|
||||||
|
if customized:
|
||||||
|
parts.append(f"{customized} customized")
|
||||||
|
if split:
|
||||||
|
parts.append(f"{split} split")
|
||||||
|
if kept_all:
|
||||||
|
parts.append(f"{kept_all} kept all")
|
||||||
|
parts.append(f"{pending} pending")
|
||||||
|
st.caption("Decisions: " + ", ".join(parts))
|
||||||
|
|
||||||
|
# Apply decisions and offer download
|
||||||
|
if st.button(
|
||||||
|
"Apply Review Decisions & Download",
|
||||||
|
type="primary",
|
||||||
|
use_container_width=True,
|
||||||
|
):
|
||||||
|
reviewed_df, reviewed_removed = apply_review_decisions(
|
||||||
|
df, result.match_groups, decisions,
|
||||||
|
)
|
||||||
|
|
||||||
|
csv_bytes = reviewed_df.to_csv(
|
||||||
|
index=False
|
||||||
|
).encode("utf-8-sig")
|
||||||
|
st.download_button(
|
||||||
|
"Download Reviewed & Deduplicated CSV",
|
||||||
|
data=csv_bytes,
|
||||||
|
file_name="deduplicated_reviewed.csv",
|
||||||
|
mime="text/csv",
|
||||||
|
key="reviewed_download",
|
||||||
|
)
|
||||||
|
if not reviewed_removed.empty:
|
||||||
|
removed_bytes = reviewed_removed.to_csv(
|
||||||
|
index=False
|
||||||
|
).encode("utf-8-sig")
|
||||||
|
st.download_button(
|
||||||
|
"Download Reviewed Removed Rows",
|
||||||
|
data=removed_bytes,
|
||||||
|
file_name="removed_reviewed.csv",
|
||||||
|
mime="text/csv",
|
||||||
|
key="reviewed_removed_download",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Log entries
|
||||||
|
if result.log_entries:
|
||||||
|
with st.expander("Processing Log"):
|
||||||
|
st.code("\n".join(result.log_entries))
|
||||||
|
|
||||||
|
else:
|
||||||
|
# No file uploaded — show placeholder
|
||||||
|
st.info("Upload a file to get started.")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools Deduplicator v3.0"
|
||||||
|
)
|
||||||
89
src/gui/pages/2_Text_Cleaner.py
Normal file
89
src/gui/pages/2_Text_Cleaner.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""DataTools Text Cleaner — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("✂️ Text Cleaner")
|
||||||
|
st.caption("Clean and normalize text content across your data.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Trim leading/trailing whitespace
|
||||||
|
- Collapse multiple spaces into one
|
||||||
|
- Unicode normalization (NFC/NFKC)
|
||||||
|
- Strip non-printable / control characters
|
||||||
|
- Remove BOM (byte order mark)
|
||||||
|
- Normalize line endings (CRLF → LF)
|
||||||
|
- Case conversion (upper, lower, title, sentence)
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="textclean_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file: load it into a DataFrame and show the first rows.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Operations")
|
||||||
|
|
||||||
|
st.checkbox("Trim whitespace", value=True, disabled=True)
|
||||||
|
st.checkbox("Collapse multiple spaces", value=True, disabled=True)
|
||||||
|
st.checkbox("Unicode normalization (NFC)", value=False, disabled=True)
|
||||||
|
st.checkbox("Strip non-printable characters", value=False, disabled=True)
|
||||||
|
st.checkbox("Remove BOM", value=False, disabled=True)
|
||||||
|
st.checkbox("Normalize line endings", value=False, disabled=True)
|
||||||
|
st.selectbox("Case conversion", ["None", "UPPER", "lower", "Title Case", "Sentence case"], disabled=True)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Clean Text", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
86
src/gui/pages/3_Format_Standardizer.py
Normal file
86
src/gui/pages/3_Format_Standardizer.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""DataTools Format Standardizer — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("📐 Format Standardizer")
|
||||||
|
st.caption("Standardize formats across columns for consistency.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Date format standardization (e.g., MM/DD/YYYY → YYYY-MM-DD)
|
||||||
|
- Phone number formatting (E.164, national, international)
|
||||||
|
- Currency normalization ($1,000.00 → 1000.00)
|
||||||
|
- Name casing (JOHN DOE → John Doe)
|
||||||
|
- Address abbreviation expansion (St. → Street, Ave. → Avenue)
|
||||||
|
- Boolean standardization (Yes/No/Y/N/1/0 → True/False)
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="fmtstd_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file: load it into a DataFrame and show the first rows.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Format Rules")
|
||||||
|
|
||||||
|
st.selectbox("Date format", ["YYYY-MM-DD", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY"], disabled=True)
|
||||||
|
st.selectbox("Phone format", ["E.164 (+15551234567)", "National ((555) 123-4567)", "Digits only"], disabled=True)
|
||||||
|
st.selectbox("Currency handling", ["Strip symbols, keep number", "Normalize to 2 decimals", "Keep as-is"], disabled=True)
|
||||||
|
st.selectbox("Name casing", ["Title Case", "UPPER", "lower", "As-is"], disabled=True)
|
||||||
|
st.checkbox("Expand address abbreviations", value=False, disabled=True)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Standardize Formats", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
102
src/gui/pages/4_Missing_Values.py
Normal file
102
src/gui/pages/4_Missing_Values.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
"""DataTools Missing Value Handler — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("🕳️ Missing Value Handler")
|
||||||
|
st.caption("Detect, analyze, and handle missing values in your data.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Detect disguised nulls (empty strings, "N/A", "n/a", "-", "NULL", "None", etc.)
|
||||||
|
- Missingness analysis: per-column counts, percentages, and patterns
|
||||||
|
- Visualize missing data heatmap
|
||||||
|
- Imputation strategies: drop rows/columns, fill with mean/median/mode, forward-fill, backward-fill
|
||||||
|
- Custom sentinel value replacement
|
||||||
|
- Before/after comparison
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="missing_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file: load it into a DataFrame and show the first rows.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Detection Settings")
|
||||||
|
|
||||||
|
st.text_input(
|
||||||
|
"Null patterns (comma-separated)",
|
||||||
|
value="N/A, n/a, NA, -, NULL, None, empty, .",
|
||||||
|
disabled=True,
|
||||||
|
help="Values to treat as missing.",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.subheader("Handling Strategy")
|
||||||
|
|
||||||
|
st.selectbox("Strategy", [
|
||||||
|
"Drop rows with any missing",
|
||||||
|
"Drop rows above threshold",
|
||||||
|
"Fill with mean (numeric)",
|
||||||
|
"Fill with median (numeric)",
|
||||||
|
"Fill with mode (categorical)",
|
||||||
|
"Forward-fill",
|
||||||
|
"Backward-fill",
|
||||||
|
"Custom value",
|
||||||
|
], disabled=True)
|
||||||
|
|
||||||
|
st.slider("Drop threshold (%)", 0, 100, 50, disabled=True, help="Drop rows missing more than this % of columns.")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Handle Missing Values", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
93
src/gui/pages/5_Column_Mapper.py
Normal file
93
src/gui/pages/5_Column_Mapper.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
"""DataTools Column Mapper — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("🗂️ Column Mapper")
|
||||||
|
st.caption("Rename columns, enforce a target schema, and coerce types.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Rename columns via interactive mapping table
|
||||||
|
- Load a target schema (JSON/CSV) to auto-map columns
|
||||||
|
- Fuzzy column name matching for automatic suggestions
|
||||||
|
- Type coercion (string → int, string → date, etc.)
|
||||||
|
- Drop unmapped columns or keep as-is
|
||||||
|
- Reorder columns to match target schema
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="colmap_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file and show a read-only identity column mapping.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)

        st.subheader("Column Mapping")
        st.caption("Map source columns to target names. (Interactive mapping coming soon.)")
        # Placeholder table: every source column maps to itself with type "auto".
        mapping_data = pd.DataFrame({
            "Source Column": df.columns.tolist(),
            "Target Column": df.columns.tolist(),
            "Type": ["auto"] * len(df.columns),
        })
        st.dataframe(mapping_data, use_container_width=True, hide_index=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Schema Options")
|
||||||
|
|
||||||
|
st.file_uploader("Load target schema (JSON)", type=["json"], disabled=True, key="colmap_schema")
|
||||||
|
st.checkbox("Drop unmapped columns", value=False, disabled=True)
|
||||||
|
st.checkbox("Reorder to match schema", value=True, disabled=True)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Apply Column Mapping", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
88
src/gui/pages/6_Outlier_Detector.py
Normal file
88
src/gui/pages/6_Outlier_Detector.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
"""DataTools Outlier Detector — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("📊 Outlier Detector")
|
||||||
|
st.caption("Detect and handle outliers in numeric columns.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Z-score detection (configurable threshold)
|
||||||
|
- IQR (interquartile range) detection
|
||||||
|
- MAD (median absolute deviation) detection
|
||||||
|
- Domain-rule violations (e.g., age < 0, price > $1M)
|
||||||
|
- Visual outlier highlighting in data preview
|
||||||
|
- Handling: flag only, remove, cap/winsorize to bounds
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="outlier_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file: load it into a DataFrame and show the first rows.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Detection Method")
|
||||||
|
|
||||||
|
st.selectbox("Method", ["Z-Score", "IQR (Interquartile Range)", "MAD (Median Absolute Deviation)"], disabled=True)
|
||||||
|
st.slider("Z-Score threshold", 1.0, 5.0, 3.0, 0.1, disabled=True)
|
||||||
|
st.slider("IQR multiplier", 1.0, 3.0, 1.5, 0.1, disabled=True)
|
||||||
|
|
||||||
|
st.subheader("Handling")
|
||||||
|
|
||||||
|
st.selectbox("Action", ["Flag only (add column)", "Remove outlier rows", "Cap / Winsorize to bounds"], disabled=True)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Detect Outliers", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
86
src/gui/pages/7_Multi_File_Merger.py
Normal file
86
src/gui/pages/7_Multi_File_Merger.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""DataTools Multi-File Merger — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("📎 Multi-File Merger")
|
||||||
|
st.caption("Combine multiple CSV and Excel files into one dataset.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Upload multiple CSV/Excel files at once
|
||||||
|
- Automatic schema alignment (matching columns by name)
|
||||||
|
- Append mode: stack files vertically (union)
|
||||||
|
- Join mode: merge files on shared key columns
|
||||||
|
- Handle mismatched columns (fill missing with nulls or drop)
|
||||||
|
- Source file tracking column
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Multi-file upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded_files = st.file_uploader(
|
||||||
|
"Upload CSV or Excel files",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
accept_multiple_files=True,
|
||||||
|
help="Upload multiple files to preview. Processing is not yet available.",
|
||||||
|
key="merger_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview each uploaded file independently; one bad file must not hide the others.
if uploaded_files:
    import pandas as pd

    for f in uploaded_files:
        try:
            # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
            filename = f.name.lower()
            if filename.endswith((".xlsx", ".xls")):
                df = pd.read_excel(f)
            elif filename.endswith(".tsv"):
                # The uploader accepts .tsv, but read_csv defaults to a comma
                # separator and would collapse each row into a single column.
                df = pd.read_csv(f, sep="\t")
            else:
                df = pd.read_csv(f)
            st.subheader(f"Preview: {f.name}")
            # Column labels are not guaranteed to be strings (e.g. numeric
            # spreadsheet headers); str.join would raise TypeError on them.
            shown_columns = ", ".join(str(col) for col in df.columns[:10])
            overflow = "..." if len(df.columns) > 10 else ""
            st.caption(f"{len(df)} rows, {len(df.columns)} columns — Columns: {shown_columns}{overflow}")
            st.dataframe(df.head(5), use_container_width=True)
        except Exception as e:
            # UI boundary: report the failure for this file and keep going.
            st.error(f"Failed to read {f.name}: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Merge Strategy")
|
||||||
|
|
||||||
|
st.selectbox("Mode", ["Append (stack vertically)", "Join on key columns", "Schema alignment (smart merge)"], disabled=True)
|
||||||
|
st.selectbox("Mismatched columns", ["Fill with null", "Drop non-shared columns", "Error"], disabled=True)
|
||||||
|
st.checkbox("Add source filename column", value=True, disabled=True)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Merge Files", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
93
src/gui/pages/8_Validator_Reporter.py
Normal file
93
src/gui/pages/8_Validator_Reporter.py
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
"""DataTools Validator & Reporter — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("✅ Validator & Reporter")
|
||||||
|
st.caption("Validate data against rules and generate quality reports.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Column-level validation rules (not null, unique, regex pattern, range, enum)
|
||||||
|
- Cross-column validation (e.g., start_date < end_date)
|
||||||
|
- Data quality score per column and overall
|
||||||
|
- Generate PDF quality report
|
||||||
|
- Generate Excel report with flagged rows highlighted
|
||||||
|
- Summary dashboard: pass/fail counts, severity breakdown
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="validator_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file: load it into a DataFrame and show the first rows.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Placeholder options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Validation Rules")
|
||||||
|
|
||||||
|
st.file_uploader("Load rules file (JSON)", type=["json"], disabled=True, key="validator_rules")
|
||||||
|
st.multiselect("Quick checks", [
|
||||||
|
"No null values",
|
||||||
|
"No duplicate rows",
|
||||||
|
"All emails valid",
|
||||||
|
"All dates parseable",
|
||||||
|
"Numeric columns in range",
|
||||||
|
], disabled=True)
|
||||||
|
|
||||||
|
st.subheader("Report Format")
|
||||||
|
|
||||||
|
st.selectbox("Output format", ["Excel (flagged rows)", "PDF summary", "Both"], disabled=True)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.button("Validate & Generate Report", type="primary", use_container_width=True, disabled=True)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption(
|
||||||
|
"Runs locally. Your data never leaves this computer. "
|
||||||
|
"| DataTools v3.0"
|
||||||
|
)
|
||||||
95
src/gui/pages/9_Pipeline_Runner.py
Normal file
95
src/gui/pages/9_Pipeline_Runner.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""DataTools Pipeline Runner — stub page."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
|
if str(_project_root) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Header
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.title("⚙️ Pipeline Runner")
|
||||||
|
st.caption("Chain tools in sequence and pass output between steps automatically.")
|
||||||
|
|
||||||
|
st.info("This tool is under development.")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# What this tool will do
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.markdown("""
|
||||||
|
**Features:**
|
||||||
|
- Select tools to run in sequence
|
||||||
|
- Recommended order: Text Cleaner → Format Standardizer → Missing Values → Deduplicator → Validator
|
||||||
|
- Each step's output feeds into the next step's input
|
||||||
|
- Per-step configuration overrides
|
||||||
|
- Progress tracking across all steps
|
||||||
|
- Final combined report
|
||||||
|
""")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload (functional)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload CSV or Excel file",
|
||||||
|
type=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
help="Upload a file to preview. Processing is not yet available.",
|
||||||
|
key="pipeline_file_upload",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Preview the uploaded file: load it into a DataFrame and show the first rows.
if uploaded is not None:
    import pandas as pd

    try:
        # Compare on a lower-cased name so "REPORT.XLSX" is not mistaken for CSV.
        filename = uploaded.name.lower()
        if filename.endswith((".xlsx", ".xls")):
            df = pd.read_excel(uploaded)
        elif filename.endswith(".tsv"):
            # The uploader accepts .tsv, but read_csv defaults to a comma
            # separator and would collapse each row into a single column.
            df = pd.read_csv(uploaded, sep="\t")
        else:
            df = pd.read_csv(uploaded)
        st.subheader(f"Preview: {uploaded.name}")
        st.caption(f"{len(df)} rows, {len(df.columns)} columns")
        st.dataframe(df.head(10), use_container_width=True)
    except Exception as e:
        # UI boundary: surface any parse failure to the user instead of crashing.
        st.error(f"Failed to read file: {e}")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pipeline steps (checklist)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Pipeline Steps")
st.caption("Select tools to include in the pipeline (recommended order):")

# (label, pre-checked) for each tool, in display order. Every checkbox is
# rendered disabled.
_PIPELINE_STEPS = (
    ("1. Text Cleaner", True),
    ("2. Format Standardizer", True),
    ("3. Missing Value Handler", True),
    ("4. Column Mapper", False),
    ("5. Outlier Detector", False),
    ("6. Deduplicator", True),
    ("7. Multi-File Merger", False),
    ("8. Validator & Reporter", True),
)
for _step_label, _step_checked in _PIPELINE_STEPS:
    st.checkbox(_step_label, value=_step_checked, disabled=True)
|
||||||
|
|
||||||
|
st.subheader("Pipeline Configuration")

# Error-handling choices offered for a failing step; all controls in this
# section are rendered disabled.
_ON_ERROR_CHOICES = [
    "Stop pipeline",
    "Skip step and continue",
    "Prompt for decision",
]
st.selectbox(label="On error", options=_ON_ERROR_CHOICES, disabled=True)
st.checkbox(label="Generate combined report at end", value=True, disabled=True)

st.divider()
st.button(
    label="Run Pipeline",
    type="primary",
    disabled=True,
    use_container_width=True,
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Footer
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Closing separator followed by the privacy / version footer line.
st.divider()
_FOOTER_TEXT = (
    "Runs locally. Your data never leaves this computer. "
    "| DataTools v3.0"
)
st.caption(_FOOTER_TEXT)
|
||||||
Reference in New Issue
Block a user