Note: this patch was re-flowed from a copy that had been collapsed onto a
single line. The leading whitespace of the hunk lines is reconstructed and
should be checked against src/gui/app.py before applying. The last hunk was
also edited (temp-file cleanup moved into a finally block, delimiter state
recorded only after a successful parse), so the post-image blob hash on the
index line no longer matches the resulting file.

diff --git a/src/gui/app.py b/src/gui/app.py
index b81e635..b353f58 100644
--- a/src/gui/app.py
+++ b/src/gui/app.py
@@ -19,7 +19,7 @@ if str(_project_root) not in sys.path:
     sys.path.insert(0, str(_project_root))
 
 from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
-from src.core.io import read_file, list_sheets
+from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter
 from src.core.config import DeduplicationConfig
 from src.gui.components import (
     apply_review_decisions,
@@ -56,6 +56,7 @@ _DEFAULTS = {
     "config": None,
     "file_name": "",
     "sheet_names": [],
+    "detected_delimiter": ",",
 }
 for key, default in _DEFAULTS.items():
     if key not in st.session_state:
@@ -96,11 +97,14 @@ if uploaded is not None:
             tmp.write(uploaded.getvalue())
             tmp_path = Path(tmp.name)
 
-        # Check for Excel sheets
+        # Check for Excel sheets / detect delimiter
         if suffix.lower() in (".xlsx", ".xls"):
             st.session_state["sheet_names"] = list_sheets(tmp_path)
+            st.session_state["detected_delimiter"] = ","
         else:
             st.session_state["sheet_names"] = []
+            enc = detect_encoding(tmp_path)
+            st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc)
 
         df = read_file(tmp_path)
         if not isinstance(df, pd.DataFrame):
@@ -139,6 +143,49 @@ if uploaded is not None:
         st.session_state["review_decisions"] = {}
         tmp_path.unlink(missing_ok=True)
 
+    # Delimiter selector for CSV/TSV files (anything that is not an Excel workbook).
+    is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls")
+    if is_csv:
+        _DELIMITERS = {
+            "Comma (,)": ",",
+            "Tab (\\t)": "\t",
+            "Semicolon (;)": ";",
+            "Pipe (|)": "|",
+        }
+        _DELIM_LABELS = list(_DELIMITERS.keys())
+        _DELIM_VALUES = list(_DELIMITERS.values())
+        # Preselect the delimiter detected at upload time; fall back to the
+        # first entry (comma) when the detected value is not an offered choice.
+        detected = st.session_state.get("detected_delimiter", ",")
+        default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0
+        chosen_label = st.selectbox(
+            "Delimiter",
+            _DELIM_LABELS,
+            index=default_idx,
+            help="Auto-detected on upload. Change if the preview looks wrong.",
+        )
+        chosen_delim = _DELIMITERS[chosen_label]
+        # Re-parse only when the choice differs from the delimiter last applied.
+        if chosen_delim != st.session_state.get("_current_delimiter"):
+            import tempfile
+            suffix = Path(uploaded.name).suffix
+            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+                tmp.write(uploaded.getvalue())
+                tmp_path = Path(tmp.name)
+            try:
+                df = read_file(tmp_path, delimiter=chosen_delim)
+                if not isinstance(df, pd.DataFrame):
+                    df = pd.concat(list(df), ignore_index=True)
+            finally:
+                # Always remove the temp copy, even when parsing raises.
+                tmp_path.unlink(missing_ok=True)
+            # Record the delimiter only after a successful parse so a failed
+            # read is retried on the next rerun instead of being skipped.
+            st.session_state["_current_delimiter"] = chosen_delim
+            st.session_state["df"] = df
+            st.session_state["result"] = None
+            st.session_state["review_decisions"] = {}
+
     # Preview
     st.subheader(f"Preview: {uploaded.name}")
     st.caption(f"{len(df)} rows, {len(df.columns)} columns")