From f2fdc10af74a5d546bf7a2668bd3e3199aa570fd Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 29 Apr 2026 01:16:12 +0000 Subject: [PATCH] feat: refactor GUI to multi-page Streamlit app with 9 tool pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert single-page deduplicator into a multi-page suite. Home page shows tool card grid. Deduplicator extracted to its own page (fully working). 8 stub pages added for Text Cleaner, Format Standardizer, Missing Values, Column Mapper, Outlier Detector, Multi-File Merger, Validator & Reporter, and Pipeline Runner — each with functional file upload and coming-soon UI. Co-Authored-By: Claude Opus 4.6 --- src/gui/app.py | 418 ++++++------------------- src/gui/pages/1_Deduplicator.py | 355 +++++++++++++++++++++ src/gui/pages/2_Text_Cleaner.py | 89 ++++++ src/gui/pages/3_Format_Standardizer.py | 86 +++++ src/gui/pages/4_Missing_Values.py | 102 ++++++ src/gui/pages/5_Column_Mapper.py | 93 ++++++ src/gui/pages/6_Outlier_Detector.py | 88 ++++++ src/gui/pages/7_Multi_File_Merger.py | 86 +++++ src/gui/pages/8_Validator_Reporter.py | 93 ++++++ src/gui/pages/9_Pipeline_Runner.py | 95 ++++++ 10 files changed, 1175 insertions(+), 330 deletions(-) create mode 100644 src/gui/pages/1_Deduplicator.py create mode 100644 src/gui/pages/2_Text_Cleaner.py create mode 100644 src/gui/pages/3_Format_Standardizer.py create mode 100644 src/gui/pages/4_Missing_Values.py create mode 100644 src/gui/pages/5_Column_Mapper.py create mode 100644 src/gui/pages/6_Outlier_Detector.py create mode 100644 src/gui/pages/7_Multi_File_Merger.py create mode 100644 src/gui/pages/8_Validator_Reporter.py create mode 100644 src/gui/pages/9_Pipeline_Runner.py diff --git a/src/gui/app.py b/src/gui/app.py index cadd9d8..0b4f7a8 100644 --- a/src/gui/app.py +++ b/src/gui/app.py @@ -1,4 +1,4 @@ -"""DataTools Deduplicator — Streamlit GUI. +"""DataTools — Data Cleaning Mastery Suite. 
Launch: streamlit run src/gui/app.py @@ -6,11 +6,9 @@ Launch: from __future__ import annotations -import io import sys from pathlib import Path -import pandas as pd import streamlit as st # Ensure project root is on sys.path so `src.core` imports work @@ -18,24 +16,14 @@ _project_root = Path(__file__).resolve().parent.parent.parent if str(_project_root) not in sys.path: sys.path.insert(0, str(_project_root)) -from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult -from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter -from src.core.config import DeduplicationConfig -from src.gui.components import ( - apply_review_decisions, - config_panel, - match_group_card, - results_summary, -) - # --------------------------------------------------------------------------- # Page config # --------------------------------------------------------------------------- st.set_page_config( - page_title="DataTools Deduplicator", - page_icon="๐Ÿ”", + page_title="DataTools โ€” Data Cleaning Mastery", + page_icon="๐Ÿงน", layout="wide", ) @@ -45,331 +33,101 @@ st.markdown( unsafe_allow_html=True, ) -# --------------------------------------------------------------------------- -# Session state defaults -# --------------------------------------------------------------------------- - -_DEFAULTS = { - "df": None, - "result": None, - "review_decisions": {}, - "config": None, - "file_name": "", - "sheet_names": [], - "detected_delimiter": ",", -} -for key, default in _DEFAULTS.items(): - if key not in st.session_state: - st.session_state[key] = default - # --------------------------------------------------------------------------- -# Header +# Home page # --------------------------------------------------------------------------- -st.title("DataTools Deduplicator") -st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel files.") +st.title("๐Ÿงน DataTools โ€” Data Cleaning Mastery") +st.caption("A 9-tool suite for 
cleaning, standardizing, and validating tabular data. Runs 100% locally.") +st.divider() # --------------------------------------------------------------------------- -# File upload +# Tool cards # --------------------------------------------------------------------------- -uploaded = st.file_uploader( - "Upload CSV or Excel file", - type=["csv", "tsv", "xlsx", "xls"], - help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.", -) +TOOLS = [ + { + "icon": "๐Ÿ”", + "name": "Deduplicator", + "description": "Fuzzy matching, normalization, survivor selection, and interactive review.", + "status": "Ready", + "page": "1_Deduplicator", + }, + { + "icon": "โœ‚๏ธ", + "name": "Text Cleaner", + "description": "Whitespace trim, multi-space collapse, Unicode normalization, BOM and line-ending handling.", + "status": "Coming Soon", + "page": "2_Text_Cleaner", + }, + { + "icon": "๐Ÿ“", + "name": "Format Standardizer", + "description": "Standardize dates, currencies, names, phone numbers, and addresses.", + "status": "Coming Soon", + "page": "3_Format_Standardizer", + }, + { + "icon": "๐Ÿ•ณ๏ธ", + "name": "Missing Value Handler", + "description": "Detect disguised nulls, missingness analysis, and imputation strategies.", + "status": "Coming Soon", + "page": "4_Missing_Values", + }, + { + "icon": "๐Ÿ—‚๏ธ", + "name": "Column Mapper", + "description": "Rename columns, enforce a target schema, and coerce types.", + "status": "Coming Soon", + "page": "5_Column_Mapper", + }, + { + "icon": "๐Ÿ“Š", + "name": "Outlier Detector", + "description": "Z-score, IQR, and MAD detection with domain-rule violations and winsorization.", + "status": "Coming Soon", + "page": "6_Outlier_Detector", + }, + { + "icon": "๐Ÿ“Ž", + "name": "Multi-File Merger", + "description": "Combine multiple CSV/Excel files with schema alignment.", + "status": "Coming Soon", + "page": "7_Multi_File_Merger", + }, + { + "icon": "โœ…", + "name": "Validator & Reporter", + "description": 
"Validate against rules and generate PDF/Excel quality reports.", + "status": "Coming Soon", + "page": "8_Validator_Reporter", + }, + { + "icon": "โš™๏ธ", + "name": "Pipeline Runner", + "description": "Chain tools in recommended order and pass output between steps.", + "status": "Coming Soon", + "page": "9_Pipeline_Runner", + }, +] -if uploaded is not None: - # Detect if file changed - if uploaded.name != st.session_state["file_name"]: - st.session_state["file_name"] = uploaded.name - st.session_state["result"] = None - st.session_state["review_decisions"] = {} - - # Read the file - try: - # Write to a temp file for read_file() which needs a path - import tempfile - suffix = Path(uploaded.name).suffix - with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: - tmp.write(uploaded.getvalue()) - tmp_path = Path(tmp.name) - - # Check for Excel sheets / detect delimiter - if suffix.lower() in (".xlsx", ".xls"): - st.session_state["sheet_names"] = list_sheets(tmp_path) - st.session_state["detected_delimiter"] = "," - else: - st.session_state["sheet_names"] = [] - enc = detect_encoding(tmp_path) - st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc) - - df = read_file(tmp_path) - if not isinstance(df, pd.DataFrame): - df = pd.concat(list(df), ignore_index=True) - - st.session_state["df"] = df - - # Clean up temp file - tmp_path.unlink(missing_ok=True) - - except Exception as e: - st.error(f"Failed to read file: {e}") - st.session_state["df"] = None - - df = st.session_state["df"] - - if df is not None: - # Sheet selector for Excel files - if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1: - sheet = st.selectbox( - "Select sheet", - st.session_state["sheet_names"], +# Render tool cards in a 3-column grid +for row_start in range(0, len(TOOLS), 3): + cols = st.columns(3) + for i, col in enumerate(cols): + idx = row_start + i + if idx >= len(TOOLS): + break + tool = TOOLS[idx] + with col: + status_color = 
"green" if tool["status"] == "Ready" else "orange" + st.markdown( + f"### {tool['icon']} {tool['name']}\n\n" + f"{tool['description']}\n\n" + f":{status_color}[**{tool['status']}**]" ) - if sheet != st.session_state.get("_current_sheet"): - st.session_state["_current_sheet"] = sheet - suffix = Path(uploaded.name).suffix - import tempfile - with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: - tmp.write(uploaded.getvalue()) - tmp_path = Path(tmp.name) - df = read_file(tmp_path, sheet_name=sheet) - if not isinstance(df, pd.DataFrame): - df = pd.concat(list(df), ignore_index=True) - st.session_state["df"] = df - st.session_state["result"] = None - st.session_state["review_decisions"] = {} - tmp_path.unlink(missing_ok=True) - - # Delimiter selector for CSV/TSV files - is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls") - if is_csv: - _DELIMITERS = { - "Comma (,)": ",", - "Tab (\\t)": "\t", - "Semicolon (;)": ";", - "Pipe (|)": "|", - "Other": None, - } - _DELIM_LABELS = list(_DELIMITERS.keys()) - _DELIM_VALUES = list(_DELIMITERS.values()) - detected = st.session_state.get("detected_delimiter", ",") - default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0 - chosen_label = st.selectbox( - "Delimiter", - _DELIM_LABELS, - index=default_idx, - help="Auto-detected on upload. 
Change if the preview looks wrong.", - ) - if chosen_label == "Other": - custom_delim = st.text_input( - "Enter delimiter character", - max_chars=5, - help="Enter the character(s) used to separate fields.", - ) - chosen_delim = custom_delim if custom_delim else "," - else: - chosen_delim = _DELIMITERS[chosen_label] - if chosen_delim != st.session_state.get("_current_delimiter"): - st.session_state["_current_delimiter"] = chosen_delim - import tempfile - suffix = Path(uploaded.name).suffix - with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: - tmp.write(uploaded.getvalue()) - tmp_path = Path(tmp.name) - df = read_file(tmp_path, delimiter=chosen_delim) - if not isinstance(df, pd.DataFrame): - df = pd.concat(list(df), ignore_index=True) - st.session_state["df"] = df - st.session_state["result"] = None - st.session_state["review_decisions"] = {} - tmp_path.unlink(missing_ok=True) - - # Preview - st.subheader(f"Preview: {uploaded.name}") - st.caption(f"{len(df)} rows, {len(df.columns)} columns") - st.dataframe(df.head(10), use_container_width=True) - - # Advanced options - settings = config_panel(df) - - # Apply loaded config if present - loaded_cfg = st.session_state.get("loaded_config") - if loaded_cfg is not None: - settings["strategies"] = loaded_cfg.to_strategies() - settings["survivor_rule"] = loaded_cfg.to_survivor_rule() - settings["date_column"] = loaded_cfg.date_column - settings["merge"] = loaded_cfg.merge - # Clear so it doesn't override on every rerun - del st.session_state["loaded_config"] - - # --------------------------------------------------------------------------- - # Find Duplicates button - # --------------------------------------------------------------------------- - - st.divider() - - if st.button("Find Duplicates", type="primary", use_container_width=True): - progress_bar = st.progress(0, text="Comparing rows...") - - def _gui_progress(current: int, total: int) -> None: - if total > 0: - pct = min(current / total, 1.0) - 
progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}") - - with st.spinner("Running deduplication..."): - result = deduplicate( - df, - strategies=settings["strategies"], - survivor_rule=settings["survivor_rule"], - date_column=settings["date_column"], - merge=settings["merge"], - preview=False, - progress_callback=_gui_progress, - ) - - progress_bar.empty() - st.session_state["result"] = result - st.session_state["review_decisions"] = {} - - # --------------------------------------------------------------------------- - # Results - # --------------------------------------------------------------------------- - - result: DeduplicationResult | None = st.session_state["result"] - - if result is not None: - st.divider() - st.subheader("Results") - - # Summary + download buttons - results_summary(result, df) - - # Match group review - if result.match_groups: - st.divider() - st.subheader("Match Groups") - - # Batch actions - def _accept_all(): - for g in result.match_groups: - st.session_state["review_decisions"][g.group_id] = { - "keep_indices": [g.survivor_index], - "overrides": {}, - } - - def _reject_all(): - for g in result.match_groups: - st.session_state["review_decisions"][g.group_id] = { - "keep_indices": list(g.row_indices), - "overrides": {}, - } - - def _clear_all(): - st.session_state["review_decisions"] = {} - for k in list(st.session_state): - if k.startswith("editor_"): - del st.session_state[k] - - action_left, action_mid, action_right = st.columns(3) - with action_left: - st.button("Accept All", on_click=_accept_all) - with action_mid: - st.button("Reject All", on_click=_reject_all) - with action_right: - st.button("Clear Decisions", on_click=_clear_all) - - # Individual group cards - decisions = st.session_state["review_decisions"] - for i, group in enumerate(result.match_groups): - match_group_card(group, df, group_num=i + 1) - - # Show decision summary - if decisions: - st.divider() - merged = 0 - customized = 0 - split = 0 - 
kept_all = 0 - for v in decisions.values(): - if not isinstance(v, dict): - continue - ki = v.get("keep_indices", []) - # Find the matching group size - gid_for_v = next( - (gid for gid, d in decisions.items() if d is v), - None, - ) - group_size = next( - (len(g.row_indices) for g in result.match_groups - if g.group_id == gid_for_v), - 0, - ) - if len(ki) == group_size: - kept_all += 1 - elif len(ki) == 1: - if v.get("overrides"): - customized += 1 - else: - merged += 1 - else: - split += 1 - - pending = len(result.match_groups) - len(decisions) - parts = [] - if merged: - parts.append(f"{merged} merged") - if customized: - parts.append(f"{customized} customized") - if split: - parts.append(f"{split} split") - if kept_all: - parts.append(f"{kept_all} kept all") - parts.append(f"{pending} pending") - st.caption("Decisions: " + ", ".join(parts)) - - # Apply decisions and offer download - if st.button( - "Apply Review Decisions & Download", - type="primary", - use_container_width=True, - ): - reviewed_df, reviewed_removed = apply_review_decisions( - df, result.match_groups, decisions, - ) - - csv_bytes = reviewed_df.to_csv( - index=False - ).encode("utf-8-sig") - st.download_button( - "Download Reviewed & Deduplicated CSV", - data=csv_bytes, - file_name="deduplicated_reviewed.csv", - mime="text/csv", - key="reviewed_download", - ) - if not reviewed_removed.empty: - removed_bytes = reviewed_removed.to_csv( - index=False - ).encode("utf-8-sig") - st.download_button( - "Download Reviewed Removed Rows", - data=removed_bytes, - file_name="removed_reviewed.csv", - mime="text/csv", - key="reviewed_removed_download", - ) - - # Log entries - if result.log_entries: - with st.expander("Processing Log"): - st.code("\n".join(result.log_entries)) - -else: - # No file uploaded โ€” show placeholder - st.info("Upload a file to get started.") # --------------------------------------------------------------------------- @@ -379,5 +137,5 @@ else: st.divider() st.caption( "Runs locally. 
Your data never leaves this computer. " - "| DataTools Deduplicator v3.0" + "| DataTools v3.0" ) diff --git a/src/gui/pages/1_Deduplicator.py b/src/gui/pages/1_Deduplicator.py new file mode 100644 index 0000000..125f78b --- /dev/null +++ b/src/gui/pages/1_Deduplicator.py @@ -0,0 +1,355 @@ +"""DataTools Deduplicator โ€” full working tool page.""" + +from __future__ import annotations + +import sys +import tempfile +from pathlib import Path + +import pandas as pd +import streamlit as st + +# Ensure project root is on sys.path so `src.core` imports work +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +from src.core.dedup import deduplicate, DeduplicationResult +from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter +from src.gui.components import ( + apply_review_decisions, + config_panel, + match_group_card, + results_summary, +) + + +# --------------------------------------------------------------------------- +# Session state defaults +# --------------------------------------------------------------------------- + +_DEFAULTS = { + "df": None, + "result": None, + "review_decisions": {}, + "config": None, + "file_name": "", + "sheet_names": [], + "detected_delimiter": ",", +} +for key, default in _DEFAULTS.items(): + if key not in st.session_state: + st.session_state[key] = default + + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("๐Ÿ” Deduplicator") +st.caption("Find and remove duplicate rows in CSV, delimited text, and Excel files.") + + +# --------------------------------------------------------------------------- +# File upload +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", 
"xls"], + help="Supports CSV, TSV, and Excel files. Encoding and delimiters are auto-detected.", + key="dedup_file_upload", +) + +if uploaded is not None: + # Detect if file changed + if uploaded.name != st.session_state["file_name"]: + st.session_state["file_name"] = uploaded.name + st.session_state["result"] = None + st.session_state["review_decisions"] = {} + + # Read the file + try: + suffix = Path(uploaded.name).suffix + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(uploaded.getvalue()) + tmp_path = Path(tmp.name) + + # Check for Excel sheets / detect delimiter + if suffix.lower() in (".xlsx", ".xls"): + st.session_state["sheet_names"] = list_sheets(tmp_path) + st.session_state["detected_delimiter"] = "," + else: + st.session_state["sheet_names"] = [] + enc = detect_encoding(tmp_path) + st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc) + + df = read_file(tmp_path) + if not isinstance(df, pd.DataFrame): + df = pd.concat(list(df), ignore_index=True) + + st.session_state["df"] = df + tmp_path.unlink(missing_ok=True) + + except Exception as e: + st.error(f"Failed to read file: {e}") + st.session_state["df"] = None + + df = st.session_state["df"] + + if df is not None: + # Sheet selector for Excel files + if st.session_state["sheet_names"] and len(st.session_state["sheet_names"]) > 1: + sheet = st.selectbox( + "Select sheet", + st.session_state["sheet_names"], + ) + if sheet != st.session_state.get("_current_sheet"): + st.session_state["_current_sheet"] = sheet + suffix = Path(uploaded.name).suffix + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(uploaded.getvalue()) + tmp_path = Path(tmp.name) + df = read_file(tmp_path, sheet_name=sheet) + if not isinstance(df, pd.DataFrame): + df = pd.concat(list(df), ignore_index=True) + st.session_state["df"] = df + st.session_state["result"] = None + st.session_state["review_decisions"] = {} + tmp_path.unlink(missing_ok=True) + + # 
Delimiter selector for CSV/TSV files + is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls") + if is_csv: + _DELIMITERS = { + "Comma (,)": ",", + "Tab (\\t)": "\t", + "Semicolon (;)": ";", + "Pipe (|)": "|", + "Other": None, + } + _DELIM_LABELS = list(_DELIMITERS.keys()) + _DELIM_VALUES = list(_DELIMITERS.values()) + detected = st.session_state.get("detected_delimiter", ",") + default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0 + chosen_label = st.selectbox( + "Delimiter", + _DELIM_LABELS, + index=default_idx, + help="Auto-detected on upload. Change if the preview looks wrong.", + ) + if chosen_label == "Other": + custom_delim = st.text_input( + "Enter delimiter character", + max_chars=5, + help="Enter the character(s) used to separate fields.", + ) + chosen_delim = custom_delim if custom_delim else "," + else: + chosen_delim = _DELIMITERS[chosen_label] + if chosen_delim != st.session_state.get("_current_delimiter"): + st.session_state["_current_delimiter"] = chosen_delim + suffix = Path(uploaded.name).suffix + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(uploaded.getvalue()) + tmp_path = Path(tmp.name) + df = read_file(tmp_path, delimiter=chosen_delim) + if not isinstance(df, pd.DataFrame): + df = pd.concat(list(df), ignore_index=True) + st.session_state["df"] = df + st.session_state["result"] = None + st.session_state["review_decisions"] = {} + tmp_path.unlink(missing_ok=True) + + # Preview + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + + # Advanced options + settings = config_panel(df) + + # Apply loaded config if present + loaded_cfg = st.session_state.get("loaded_config") + if loaded_cfg is not None: + settings["strategies"] = loaded_cfg.to_strategies() + settings["survivor_rule"] = loaded_cfg.to_survivor_rule() + settings["date_column"] = loaded_cfg.date_column + 
settings["merge"] = loaded_cfg.merge + del st.session_state["loaded_config"] + + # ------------------------------------------------------------------- + # Find Duplicates button + # ------------------------------------------------------------------- + + st.divider() + + if st.button("Find Duplicates", type="primary", use_container_width=True): + progress_bar = st.progress(0, text="Comparing rows...") + + def _gui_progress(current: int, total: int) -> None: + if total > 0: + pct = min(current / total, 1.0) + progress_bar.progress(pct, text=f"Comparing rows... {current:,}/{total:,}") + + with st.spinner("Running deduplication..."): + result = deduplicate( + df, + strategies=settings["strategies"], + survivor_rule=settings["survivor_rule"], + date_column=settings["date_column"], + merge=settings["merge"], + preview=False, + progress_callback=_gui_progress, + ) + + progress_bar.empty() + st.session_state["result"] = result + st.session_state["review_decisions"] = {} + + # ------------------------------------------------------------------- + # Results + # ------------------------------------------------------------------- + + result: DeduplicationResult | None = st.session_state["result"] + + if result is not None: + st.divider() + st.subheader("Results") + + # Summary + download buttons + results_summary(result, df) + + # Match group review + if result.match_groups: + st.divider() + st.subheader("Match Groups") + + # Batch actions + def _accept_all(): + for g in result.match_groups: + st.session_state["review_decisions"][g.group_id] = { + "keep_indices": [g.survivor_index], + "overrides": {}, + } + + def _reject_all(): + for g in result.match_groups: + st.session_state["review_decisions"][g.group_id] = { + "keep_indices": list(g.row_indices), + "overrides": {}, + } + + def _clear_all(): + st.session_state["review_decisions"] = {} + for k in list(st.session_state): + if k.startswith("editor_"): + del st.session_state[k] + + action_left, action_mid, action_right = 
st.columns(3) + with action_left: + st.button("Accept All", on_click=_accept_all) + with action_mid: + st.button("Reject All", on_click=_reject_all) + with action_right: + st.button("Clear Decisions", on_click=_clear_all) + + # Individual group cards + decisions = st.session_state["review_decisions"] + for i, group in enumerate(result.match_groups): + match_group_card(group, df, group_num=i + 1) + + # Show decision summary + if decisions: + st.divider() + merged = 0 + customized = 0 + split = 0 + kept_all = 0 + for v in decisions.values(): + if not isinstance(v, dict): + continue + ki = v.get("keep_indices", []) + gid_for_v = next( + (gid for gid, d in decisions.items() if d is v), + None, + ) + group_size = next( + (len(g.row_indices) for g in result.match_groups + if g.group_id == gid_for_v), + 0, + ) + if len(ki) == group_size: + kept_all += 1 + elif len(ki) == 1: + if v.get("overrides"): + customized += 1 + else: + merged += 1 + else: + split += 1 + + pending = len(result.match_groups) - len(decisions) + parts = [] + if merged: + parts.append(f"{merged} merged") + if customized: + parts.append(f"{customized} customized") + if split: + parts.append(f"{split} split") + if kept_all: + parts.append(f"{kept_all} kept all") + parts.append(f"{pending} pending") + st.caption("Decisions: " + ", ".join(parts)) + + # Apply decisions and offer download + if st.button( + "Apply Review Decisions & Download", + type="primary", + use_container_width=True, + ): + reviewed_df, reviewed_removed = apply_review_decisions( + df, result.match_groups, decisions, + ) + + csv_bytes = reviewed_df.to_csv( + index=False + ).encode("utf-8-sig") + st.download_button( + "Download Reviewed & Deduplicated CSV", + data=csv_bytes, + file_name="deduplicated_reviewed.csv", + mime="text/csv", + key="reviewed_download", + ) + if not reviewed_removed.empty: + removed_bytes = reviewed_removed.to_csv( + index=False + ).encode("utf-8-sig") + st.download_button( + "Download Reviewed Removed Rows", + 
data=removed_bytes, + file_name="removed_reviewed.csv", + mime="text/csv", + key="reviewed_removed_download", + ) + + # Log entries + if result.log_entries: + with st.expander("Processing Log"): + st.code("\n".join(result.log_entries)) + +else: + # No file uploaded โ€” show placeholder + st.info("Upload a file to get started.") + + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. " + "| DataTools Deduplicator v3.0" +) diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py new file mode 100644 index 0000000..c114e09 --- /dev/null +++ b/src/gui/pages/2_Text_Cleaner.py @@ -0,0 +1,89 @@ +"""DataTools Text Cleaner โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("โœ‚๏ธ Text Cleaner") +st.caption("Clean and normalize text content across your data.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Trim leading/trailing whitespace +- Collapse multiple spaces into one +- Unicode normalization (NFC/NFKC) +- Strip non-printable / control characters +- Remove BOM (byte order mark) +- Normalize line endings (CRLF โ†’ LF) +- Case conversion (upper, lower, title, sentence) +""") + +st.divider() + +# 
--------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. Processing is not yet available.", + key="textclean_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Operations") + +st.checkbox("Trim whitespace", value=True, disabled=True) +st.checkbox("Collapse multiple spaces", value=True, disabled=True) +st.checkbox("Unicode normalization (NFC)", value=False, disabled=True) +st.checkbox("Strip non-printable characters", value=False, disabled=True) +st.checkbox("Remove BOM", value=False, disabled=True) +st.checkbox("Normalize line endings", value=False, disabled=True) +st.selectbox("Case conversion", ["None", "UPPER", "lower", "Title Case", "Sentence case"], disabled=True) + +st.divider() +st.button("Clean Text", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/3_Format_Standardizer.py b/src/gui/pages/3_Format_Standardizer.py new file mode 100644 index 0000000..cc3a3b0 --- /dev/null +++ b/src/gui/pages/3_Format_Standardizer.py @@ -0,0 +1,86 @@ +"""DataTools Format Standardizer โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("๐Ÿ“ Format Standardizer") +st.caption("Standardize formats across columns for consistency.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Date format standardization (e.g., MM/DD/YYYY โ†’ YYYY-MM-DD) +- Phone number formatting (E.164, national, international) +- Currency normalization ($1,000.00 โ†’ 1000.00) +- Name casing (JOHN DOE โ†’ John Doe) +- Address abbreviation expansion (St. โ†’ Street, Ave. โ†’ Avenue) +- Boolean standardization (Yes/No/Y/N/1/0 โ†’ True/False) +""") + +st.divider() + +# --------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. 
Processing is not yet available.", + key="fmtstd_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Format Rules") + +st.selectbox("Date format", ["YYYY-MM-DD", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY"], disabled=True) +st.selectbox("Phone format", ["E.164 (+15551234567)", "National ((555) 123-4567)", "Digits only"], disabled=True) +st.selectbox("Currency handling", ["Strip symbols, keep number", "Normalize to 2 decimals", "Keep as-is"], disabled=True) +st.selectbox("Name casing", ["Title Case", "UPPER", "lower", "As-is"], disabled=True) +st.checkbox("Expand address abbreviations", value=False, disabled=True) + +st.divider() +st.button("Standardize Formats", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/4_Missing_Values.py b/src/gui/pages/4_Missing_Values.py new file mode 100644 index 0000000..8db07ab --- /dev/null +++ b/src/gui/pages/4_Missing_Values.py @@ -0,0 +1,102 @@ +"""DataTools Missing Value Handler โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("๐Ÿ•ณ๏ธ Missing Value Handler") +st.caption("Detect, analyze, and handle missing values in your data.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Detect disguised nulls (empty strings, "N/A", "n/a", "-", "NULL", "None", etc.) +- Missingness analysis: per-column counts, percentages, and patterns +- Visualize missing data heatmap +- Imputation strategies: drop rows/columns, fill with mean/median/mode, forward-fill, backward-fill +- Custom sentinel value replacement +- Before/after comparison +""") + +st.divider() + +# --------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. 
Processing is not yet available.", + key="missing_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Detection Settings") + +st.text_input( + "Null patterns (comma-separated)", + value="N/A, n/a, NA, -, NULL, None, empty, .", + disabled=True, + help="Values to treat as missing.", +) + +st.subheader("Handling Strategy") + +st.selectbox("Strategy", [ + "Drop rows with any missing", + "Drop rows above threshold", + "Fill with mean (numeric)", + "Fill with median (numeric)", + "Fill with mode (categorical)", + "Forward-fill", + "Backward-fill", + "Custom value", +], disabled=True) + +st.slider("Drop threshold (%)", 0, 100, 50, disabled=True, help="Drop rows missing more than this % of columns.") + +st.divider() +st.button("Handle Missing Values", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/5_Column_Mapper.py b/src/gui/pages/5_Column_Mapper.py new file mode 100644 index 0000000..b406e48 --- /dev/null +++ b/src/gui/pages/5_Column_Mapper.py @@ -0,0 +1,93 @@ +"""DataTools Column Mapper โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("๐Ÿ—‚๏ธ Column Mapper") +st.caption("Rename columns, enforce a target schema, and coerce types.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Rename columns via interactive mapping table +- Load a target schema (JSON/CSV) to auto-map columns +- Fuzzy column name matching for automatic suggestions +- Type coercion (string โ†’ int, string โ†’ date, etc.) +- Drop unmapped columns or keep as-is +- Reorder columns to match target schema +""") + +st.divider() + +# --------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. 
Processing is not yet available.", + key="colmap_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + + st.subheader("Column Mapping") + st.caption("Map source columns to target names. (Interactive mapping coming soon.)") + mapping_data = pd.DataFrame({ + "Source Column": df.columns.tolist(), + "Target Column": df.columns.tolist(), + "Type": ["auto"] * len(df.columns), + }) + st.dataframe(mapping_data, use_container_width=True, hide_index=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Schema Options") + +st.file_uploader("Load target schema (JSON)", type=["json"], disabled=True, key="colmap_schema") +st.checkbox("Drop unmapped columns", value=False, disabled=True) +st.checkbox("Reorder to match schema", value=True, disabled=True) + +st.divider() +st.button("Apply Column Mapping", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/6_Outlier_Detector.py b/src/gui/pages/6_Outlier_Detector.py new file mode 100644 index 0000000..0860c49 --- /dev/null +++ b/src/gui/pages/6_Outlier_Detector.py @@ -0,0 +1,88 @@ +"""DataTools Outlier Detector โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("๐Ÿ“Š Outlier Detector") +st.caption("Detect and handle outliers in numeric columns.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Z-score detection (configurable threshold) +- IQR (interquartile range) detection +- MAD (median absolute deviation) detection +- Domain-rule violations (e.g., age < 0, price > $1M) +- Visual outlier highlighting in data preview +- Handling: flag only, remove, cap/winsorize to bounds +""") + +st.divider() + +# --------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. 
Processing is not yet available.", + key="outlier_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Detection Method") + +st.selectbox("Method", ["Z-Score", "IQR (Interquartile Range)", "MAD (Median Absolute Deviation)"], disabled=True) +st.slider("Z-Score threshold", 1.0, 5.0, 3.0, 0.1, disabled=True) +st.slider("IQR multiplier", 1.0, 3.0, 1.5, 0.1, disabled=True) + +st.subheader("Handling") + +st.selectbox("Action", ["Flag only (add column)", "Remove outlier rows", "Cap / Winsorize to bounds"], disabled=True) + +st.divider() +st.button("Detect Outliers", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/7_Multi_File_Merger.py b/src/gui/pages/7_Multi_File_Merger.py new file mode 100644 index 0000000..b5a1dec --- /dev/null +++ b/src/gui/pages/7_Multi_File_Merger.py @@ -0,0 +1,86 @@ +"""DataTools Multi-File Merger โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("๐Ÿ“Ž Multi-File Merger") +st.caption("Combine multiple CSV and Excel files into one dataset.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Upload multiple CSV/Excel files at once +- Automatic schema alignment (matching columns by name) +- Append mode: stack files vertically (union) +- Join mode: merge files on shared key columns +- Handle mismatched columns (fill missing with nulls or drop) +- Source file tracking column +""") + +st.divider() + +# --------------------------------------------------------------------------- +# Multi-file upload (functional) +# --------------------------------------------------------------------------- + +uploaded_files = st.file_uploader( + "Upload CSV or Excel files", + type=["csv", "tsv", "xlsx", "xls"], + accept_multiple_files=True, + help="Upload multiple files to preview. 
Processing is not yet available.", + key="merger_file_upload", +) + +if uploaded_files: + import pandas as pd + for f in uploaded_files: + try: + if f.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(f) + else: + df = pd.read_csv(f) + st.subheader(f"Preview: {f.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns โ€” Columns: {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''}") + st.dataframe(df.head(5), use_container_width=True) + except Exception as e: + st.error(f"Failed to read {f.name}: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Merge Strategy") + +st.selectbox("Mode", ["Append (stack vertically)", "Join on key columns", "Schema alignment (smart merge)"], disabled=True) +st.selectbox("Mismatched columns", ["Fill with null", "Drop non-shared columns", "Error"], disabled=True) +st.checkbox("Add source filename column", value=True, disabled=True) + +st.divider() +st.button("Merge Files", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/8_Validator_Reporter.py b/src/gui/pages/8_Validator_Reporter.py new file mode 100644 index 0000000..f5f16a2 --- /dev/null +++ b/src/gui/pages/8_Validator_Reporter.py @@ -0,0 +1,93 @@ +"""DataTools Validator & Reporter โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("โœ… Validator & Reporter") +st.caption("Validate data against rules and generate quality reports.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Column-level validation rules (not null, unique, regex pattern, range, enum) +- Cross-column validation (e.g., start_date < end_date) +- Data quality score per column and overall +- Generate PDF quality report +- Generate Excel report with flagged rows highlighted +- Summary dashboard: pass/fail counts, severity breakdown +""") + +st.divider() + +# --------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. 
Processing is not yet available.", + key="validator_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Placeholder options +# --------------------------------------------------------------------------- + +st.subheader("Validation Rules") + +st.file_uploader("Load rules file (JSON)", type=["json"], disabled=True, key="validator_rules") +st.multiselect("Quick checks", [ + "No null values", + "No duplicate rows", + "All emails valid", + "All dates parseable", + "Numeric columns in range", +], disabled=True) + +st.subheader("Report Format") + +st.selectbox("Output format", ["Excel (flagged rows)", "PDF summary", "Both"], disabled=True) + +st.divider() +st.button("Validate & Generate Report", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. 
" + "| DataTools v3.0" +) diff --git a/src/gui/pages/9_Pipeline_Runner.py b/src/gui/pages/9_Pipeline_Runner.py new file mode 100644 index 0000000..0660481 --- /dev/null +++ b/src/gui/pages/9_Pipeline_Runner.py @@ -0,0 +1,95 @@ +"""DataTools Pipeline Runner โ€” stub page.""" + +from __future__ import annotations + +import sys +from pathlib import Path + +import streamlit as st + +_project_root = Path(__file__).resolve().parent.parent.parent.parent +if str(_project_root) not in sys.path: + sys.path.insert(0, str(_project_root)) + +# --------------------------------------------------------------------------- +# Header +# --------------------------------------------------------------------------- + +st.title("โš™๏ธ Pipeline Runner") +st.caption("Chain tools in sequence and pass output between steps automatically.") + +st.info("This tool is under development.") + +# --------------------------------------------------------------------------- +# What this tool will do +# --------------------------------------------------------------------------- + +st.markdown(""" +**Features:** +- Select tools to run in sequence +- Recommended order: Text Cleaner โ†’ Format Standardizer โ†’ Missing Values โ†’ Deduplicator โ†’ Validator +- Each step's output feeds into the next step's input +- Per-step configuration overrides +- Progress tracking across all steps +- Final combined report +""") + +st.divider() + +# --------------------------------------------------------------------------- +# File upload (functional) +# --------------------------------------------------------------------------- + +uploaded = st.file_uploader( + "Upload CSV or Excel file", + type=["csv", "tsv", "xlsx", "xls"], + help="Upload a file to preview. 
Processing is not yet available.", + key="pipeline_file_upload", +) + +if uploaded is not None: + import pandas as pd + try: + if uploaded.name.endswith((".xlsx", ".xls")): + df = pd.read_excel(uploaded) + else: + df = pd.read_csv(uploaded) + st.subheader(f"Preview: {uploaded.name}") + st.caption(f"{len(df)} rows, {len(df.columns)} columns") + st.dataframe(df.head(10), use_container_width=True) + except Exception as e: + st.error(f"Failed to read file: {e}") + +# --------------------------------------------------------------------------- +# Pipeline steps (checklist) +# --------------------------------------------------------------------------- + +st.subheader("Pipeline Steps") +st.caption("Select tools to include in the pipeline (recommended order):") + +st.checkbox("1. Text Cleaner", value=True, disabled=True) +st.checkbox("2. Format Standardizer", value=True, disabled=True) +st.checkbox("3. Missing Value Handler", value=True, disabled=True) +st.checkbox("4. Column Mapper", value=False, disabled=True) +st.checkbox("5. Outlier Detector", value=False, disabled=True) +st.checkbox("6. Deduplicator", value=True, disabled=True) +st.checkbox("7. Multi-File Merger", value=False, disabled=True) +st.checkbox("8. Validator & Reporter", value=True, disabled=True) + +st.subheader("Pipeline Configuration") + +st.selectbox("On error", ["Stop pipeline", "Skip step and continue", "Prompt for decision"], disabled=True) +st.checkbox("Generate combined report at end", value=True, disabled=True) + +st.divider() +st.button("Run Pipeline", type="primary", use_container_width=True, disabled=True) + +# --------------------------------------------------------------------------- +# Footer +# --------------------------------------------------------------------------- + +st.divider() +st.caption( + "Runs locally. Your data never leaves this computer. " + "| DataTools v3.0" +)