feat: add delimiter selector for CSV/TSV files in GUI
Auto-detects the delimiter on upload and shows a selectbox with comma, tab, semicolon, and pipe options. Changing the selection re-reads the file immediately. Line terminators (Windows/Unix/Mac) are already handled by universal newlines. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -19,7 +19,7 @@ if str(_project_root) not in sys.path:
|
|||||||
sys.path.insert(0, str(_project_root))
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
from src.core.dedup import deduplicate, build_default_strategies, DeduplicationResult
|
||||||
from src.core.io import read_file, list_sheets
|
from src.core.io import read_file, list_sheets, detect_encoding, detect_delimiter
|
||||||
from src.core.config import DeduplicationConfig
|
from src.core.config import DeduplicationConfig
|
||||||
from src.gui.components import (
|
from src.gui.components import (
|
||||||
apply_review_decisions,
|
apply_review_decisions,
|
||||||
@@ -56,6 +56,7 @@ _DEFAULTS = {
|
|||||||
"config": None,
|
"config": None,
|
||||||
"file_name": "",
|
"file_name": "",
|
||||||
"sheet_names": [],
|
"sheet_names": [],
|
||||||
|
"detected_delimiter": ",",
|
||||||
}
|
}
|
||||||
for key, default in _DEFAULTS.items():
|
for key, default in _DEFAULTS.items():
|
||||||
if key not in st.session_state:
|
if key not in st.session_state:
|
||||||
@@ -96,11 +97,14 @@ if uploaded is not None:
|
|||||||
tmp.write(uploaded.getvalue())
|
tmp.write(uploaded.getvalue())
|
||||||
tmp_path = Path(tmp.name)
|
tmp_path = Path(tmp.name)
|
||||||
|
|
||||||
# Check for Excel sheets
|
# Check for Excel sheets / detect delimiter
|
||||||
if suffix.lower() in (".xlsx", ".xls"):
|
if suffix.lower() in (".xlsx", ".xls"):
|
||||||
st.session_state["sheet_names"] = list_sheets(tmp_path)
|
st.session_state["sheet_names"] = list_sheets(tmp_path)
|
||||||
|
st.session_state["detected_delimiter"] = ","
|
||||||
else:
|
else:
|
||||||
st.session_state["sheet_names"] = []
|
st.session_state["sheet_names"] = []
|
||||||
|
enc = detect_encoding(tmp_path)
|
||||||
|
st.session_state["detected_delimiter"] = detect_delimiter(tmp_path, enc)
|
||||||
|
|
||||||
df = read_file(tmp_path)
|
df = read_file(tmp_path)
|
||||||
if not isinstance(df, pd.DataFrame):
|
if not isinstance(df, pd.DataFrame):
|
||||||
@@ -139,6 +143,41 @@ if uploaded is not None:
|
|||||||
st.session_state["review_decisions"] = {}
|
st.session_state["review_decisions"] = {}
|
||||||
tmp_path.unlink(missing_ok=True)
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
|
# Delimiter selector for CSV/TSV files.
# Every upload that is not an Excel workbook is treated as delimited text.
is_csv = Path(uploaded.name).suffix.lower() not in (".xlsx", ".xls")
if is_csv:
    # Selectbox label -> delimiter character handed to read_file.
    _DELIMITERS = {
        "Comma (,)": ",",
        "Tab (\\t)": "\t",
        "Semicolon (;)": ";",
        "Pipe (|)": "|",
    }
    _DELIM_LABELS = list(_DELIMITERS.keys())
    _DELIM_VALUES = list(_DELIMITERS.values())

    # Pre-select the delimiter stored in session state at upload time; fall
    # back to the first option (comma) when that value is not one we offer.
    detected = st.session_state.get("detected_delimiter", ",")
    default_idx = _DELIM_VALUES.index(detected) if detected in _DELIM_VALUES else 0
    chosen_label = st.selectbox(
        "Delimiter",
        _DELIM_LABELS,
        index=default_idx,
        help="Auto-detected on upload. Change if the preview looks wrong.",
    )
    chosen_delim = _DELIMITERS[chosen_label]

    # Re-read only when the selection differs from the delimiter last applied.
    if chosen_delim != st.session_state.get("_current_delimiter"):
        import tempfile

        # read_file is given a path, so spill the uploaded bytes to a temp file.
        suffix = Path(uploaded.name).suffix
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp_path = Path(tmp.name)
        try:
            # Close the handle before reading so the file can be reopened by path.
            with tmp:
                tmp.write(uploaded.getvalue())
            df = read_file(tmp_path, delimiter=chosen_delim)
            # Non-DataFrame results (presumably an iterable of chunks -- confirm
            # against read_file) are merged into a single frame.
            if not isinstance(df, pd.DataFrame):
                df = pd.concat(list(df), ignore_index=True)
        finally:
            # Always remove the temp file, even when writing or parsing fails.
            tmp_path.unlink(missing_ok=True)

        # Record the new delimiter only after a successful read, so a failed
        # parse is not remembered as applied; then drop results derived from
        # the previous parse.
        st.session_state["_current_delimiter"] = chosen_delim
        st.session_state["df"] = df
        st.session_state["result"] = None
        st.session_state["review_decisions"] = {}
|
||||||
|
|
||||||
# Preview
|
# Preview
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
st.subheader(f"Preview: {uploaded.name}")
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
|||||||
Reference in New Issue
Block a user