From 26b9771625a5d7df2f3887435f325d64dab44c3b Mon Sep 17 00:00:00 2001 From: Michael Date: Fri, 1 May 2026 02:35:42 +0000 Subject: [PATCH] feat(errors): structured error hierarchy + helpful messages everywhere MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces src/core/errors.py with a small structured error hierarchy that every public entry point now uses. Each error carries the context a user needs to fix it and the context a maintainer needs to trace it. The hierarchy: DataToolsError (base — formats path, column, operation, suggestion) InputValidationError (extends ValueError — bad arg / wrong type) ConfigError (extends ValueError — bad config / options) FileFormatError (extends ValueError — file is not what we expected) FileAccessError (extends OSError — file I/O failure) Subclassing the stdlib bases means existing `except OSError` / `except ValueError` handlers still catch them — no breaking change. Helpers: - ensure_dataframe(value, function=...) — uniform DataFrame guard - ensure_choice(value, name=, choices=) — uniform enum/literal guard - wrap_file_read(path, op, exc) — tag OSError with hint + path - wrap_file_write(path, op, exc) — same, with Windows-aware tip - format_for_user(exc, context=) — user-facing string for st.error / stderr Library hardening: - io.read_file: missing files surface FileAccessError listing whether the parent directory exists, and the suggestion to check the path. - io.read_file: chunk_size <= 0 now raises InputValidationError with a positive-integer suggestion. - io._read_excel: openpyxl BadZipFile / InvalidFileException / pandas ValueError ("sheet not found") wrapped as FileFormatError listing the path and a "list sheets with list_sheets()" hint. - io._detect_excel_header_row: bare except narrowed to specific openpyxl exceptions; falls back gracefully and logs at debug so the real error surfaces from pd.read_excel. - io.write_file: OSError / PermissionError on to_csv/to_excel wrapped with file path and Windows-aware "file may be open in another program" hint. - dedup._parse_date: bare `except Exception` narrowed to (TypeError, ValueError, OutOfBoundsDatetime); failed values logged at debug for survivor-selection forensics. - dedup._select_survivor: KEEP_MOST_RECENT now raises InputValidationError instead of silently falling back to keep_first. - dedup.deduplicate: input validation errors are InputValidationError with operation/column/suggestion fields. - format_standardize.from_dict: invalid FieldType for a column raises ConfigError naming the column AND the bad value AND listing valid values; same for date_order / phone_format / etc. - format_standardize.from_file: OSError / JSON decode wrapped with path AND line/column where parsing failed. - format_standardize.to_file: TypeError on json.dumps wrapped as ConfigError with the suspected source (extra_abbreviations). - format_standardize._apply_field_type: dispatcher's "unknown field type" branch now raises AssertionError (it's an internal invariant, not user error — a new enum value was added without a branch). - format_standardize._resolve_column_types: missing-column error now InputValidationError with a "check for typos / unparsed header" suggestion. - format_standardize.standardize_dataframe: ensure_dataframe at entry. - text_clean.clean_dataframe: ensure_dataframe at entry. - config.to_strategies: invalid Algorithm/NormalizerType wrapped as ConfigError naming the strategy index AND the column. - config.to_survivor_rule: invalid SurvivorRule wrapped as ConfigError listing valid values. - config.from_file: OSError / JSON decode wrapped (mirror of StandardizeOptions.from_file). - fixes.repair_mojibake: ImportError on ftfy now logged at info level with the underlying ImportError so a corrupt-package vs not-installed distinction is visible in the logs. - normalizers.normalize_phone: phonenumbers.NumberParseException now logged at debug when the digits-only fallback drops extension / country-code information — gives a trail when matching results look wrong. GUI / CLI surfaces: - All 9 page handlers (`except Exception as e: st.error(...)`) now use format_for_user(), which renders DataToolsError fields nicely and falls back to "ClassName: message" for unrecognized errors. - 2_Text_Cleaner and 3_Format_Standardizer additionally distinguish UnicodeDecodeError with an "re-save as UTF-8" suggestion before the generic handler. - cli.py's "Error reading file" handler now uses format_for_user() and includes the input path in the prefix. Tests: - tests/test_errors.py — 22 new tests covering: base class formatting, stdlib inheritance, ensure_dataframe / ensure_choice helpers, wrap_file_read / wrap_file_write, format_for_user behavior, and end-to-end integration (missing file, missing dir, bad JSON, bad algorithm, bad enum, missing column). - tests/test_audit_fixes.py + tests/test_io.py — updated 4 tests for the new exception types (InputValidationError replaces TypeError, FileAccessError extends OSError). Full project suite: 1230 passed, 4 skipped, 17 xfailed. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/cli.py | 6 +- src/core/config.py | 56 +++++- src/core/dedup.py | 65 ++++--- src/core/errors.py | 185 ++++++++++++++++++++ src/core/fixes.py | 7 +- src/core/format_standardize.py | 90 +++++++--- src/core/io.py | 113 ++++++++---- src/core/normalizers.py | 11 +- src/core/text_clean.py | 7 +- src/gui/pages/1_Deduplicator.py | 6 +- src/gui/pages/2_Text_Cleaner.py | 15 +- src/gui/pages/3_Format_Standardizer.py | 14 +- src/gui/pages/4_Missing_Values.py | 6 +- src/gui/pages/5_Column_Mapper.py | 6 +- src/gui/pages/6_Outlier_Detector.py | 6 +- src/gui/pages/7_Multi_File_Merger.py | 6 +- src/gui/pages/8_Validator_Reporter.py | 6 +- src/gui/pages/9_Pipeline_Runner.py | 6 +- tests/test_audit_fixes.py | 9 +- tests/test_errors.py | 230 +++++++++++++++++++++++++ tests/test_io.py | 5 +- 21 files changed, 751 insertions(+), 104 deletions(-) create mode 100644 src/core/errors.py create mode 100644 tests/test_errors.py diff --git a/src/cli.py b/src/cli.py index a5e2eff..204fd73 100644 --- a/src/cli.py +++ b/src/cli.py @@ -251,7 +251,11 @@ def dedup( import pandas as pd df = pd.concat(list(df), ignore_index=True) except Exception as e: - typer.echo(f"Error reading file: {e}", err=True) + from src.core.errors import format_for_user + typer.echo( + f"Error reading {input_path}:\n{format_for_user(e)}", + err=True, + ) raise typer.Exit(1) typer.echo(f" {len(df)} rows, {len(df.columns)} columns") diff --git a/src/core/config.py b/src/core/config.py index 3857a06..fe0bd10 100644 --- a/src/core/config.py +++ b/src/core/config.py @@ -86,18 +86,24 @@ class DeduplicationConfig: @classmethod def from_file(cls, path: str | Path) -> DeduplicationConfig: """Load configuration from a JSON file.""" + from .errors import ConfigError, wrap_file_read path = Path(path) try: text = path.read_text() except OSError as e: - raise OSError( - f"Could not read DeduplicationConfig from {path}: {e}" - ) from e + raise wrap_file_read(path, "DeduplicationConfig.from_file", e) from e try: data = json.loads(text) except json.JSONDecodeError as e: - raise ValueError( - f"Invalid JSON in DeduplicationConfig file {path}: {e}" + raise ConfigError( + "Invalid JSON in DeduplicationConfig file", + path=path, + operation="DeduplicationConfig.from_file", + cause=e, + suggestion=( + f"JSON parser failed at line {e.lineno}, column {e.colno}. " + "Validate with `python -m json.tool < file.json`." + ), ) from e return cls.from_dict(data) @@ -119,18 +125,50 @@ class DeduplicationConfig: if not self.strategies: return None + from .errors import ConfigError result: list[MatchStrategy] = [] - for sc in self.strategies: + for s_idx, sc in enumerate(self.strategies): col_strats = [] for cc in sc.columns: + try: + algorithm = Algorithm(cc.algorithm) + except ValueError as e: + raise ConfigError( + f"Invalid algorithm {cc.algorithm!r}", + column=cc.column, + operation=f"strategy[{s_idx}]", + cause=e, + suggestion=f"Valid: {sorted(a.value for a in Algorithm)}", + ) from e + try: + normalizer = ( + NormalizerType(cc.normalizer) if cc.normalizer else None + ) + except ValueError as e: + raise ConfigError( + f"Invalid normalizer {cc.normalizer!r}", + column=cc.column, + operation=f"strategy[{s_idx}]", + cause=e, + suggestion=f"Valid: {sorted(n.value for n in NormalizerType)}", + ) from e col_strats.append(ColumnMatchStrategy( column=cc.column, - algorithm=Algorithm(cc.algorithm), + algorithm=algorithm, threshold=cc.threshold, - normalizer=NormalizerType(cc.normalizer) if cc.normalizer else None, + normalizer=normalizer, )) result.append(MatchStrategy(column_strategies=col_strats)) return result def to_survivor_rule(self) -> SurvivorRule: - return SurvivorRule(self.survivor_rule) + from .errors import ConfigError + try: + return SurvivorRule(self.survivor_rule) + except ValueError as e: + raise ConfigError( + f"Invalid survivor_rule {self.survivor_rule!r}", + operation="DeduplicationConfig.to_survivor_rule", + cause=e, + suggestion=f"Valid: {sorted(r.value for r in SurvivorRule)}", + ) from e diff --git a/src/core/dedup.py b/src/core/dedup.py index e3de301..72e37bc 100644 --- a/src/core/dedup.py +++ b/src/core/dedup.py @@ -331,13 +331,14 @@ def _select_survivor( if rule == SurvivorRule.KEEP_MOST_RECENT: if not date_column or date_column not in df.columns: - # The public ``deduplicate()`` validates this earlier, so - # reaching here means a caller invoked the helper directly - # with bad arguments — surface a clear error instead of a - # silent fallback that produces wrong-but-plausible output. - raise ValueError( - f"KEEP_MOST_RECENT requires date_column to be a column in df; " - f"got {date_column!r} (available: {list(df.columns)})" + from .errors import InputValidationError + raise InputValidationError( + "KEEP_MOST_RECENT requires date_column to be a column in df", + operation="_select_survivor", + column=date_column, + suggestion=( + f"Got {date_column!r}; available columns: {list(df.columns)}" + ), ) best_idx = indices[0] best_date = _parse_date(df.iloc[indices[0]].get(date_column, "")) @@ -362,10 +363,22 @@ def _count_empty(row: pd.Series) -> int: return count -def _parse_date(value) -> Optional[pd.Timestamp]: +def _parse_date(value: Any) -> Optional[pd.Timestamp]: + """Best-effort date parse for survivor selection. + + Returns None for empty / unparseable values; logs at debug so a + survivor-selection oddity ("the wrong row got kept") can be traced + by enabling debug logs without changing code. + """ + if value is None or (isinstance(value, float) and pd.isna(value)): + return None try: return pd.to_datetime(value) - except Exception: + except (TypeError, ValueError, pd.errors.OutOfBoundsDatetime) as e: + logger.debug( + "_parse_date: could not parse {!r} ({}): {}", + value, type(e).__name__, e, + ) return None @@ -517,22 +530,27 @@ def deduplicate( Returns a ``DeduplicationResult``. """ - if not isinstance(df, pd.DataFrame): - raise TypeError( - f"deduplicate() requires a pandas DataFrame; got {type(df).__name__}" - ) + from .errors import ensure_dataframe, InputValidationError + ensure_dataframe(df, function="deduplicate") if survivor_rule == SurvivorRule.KEEP_MOST_RECENT and not date_column: - raise ValueError( - "survivor_rule=KEEP_MOST_RECENT requires date_column to be set" + raise InputValidationError( + "survivor_rule=KEEP_MOST_RECENT requires date_column", + operation="deduplicate", + suggestion=( + "Pass date_column='created_at' (or whichever column holds " + "the timestamp). Without it, 'most recent' has no reference." + ), ) if ( survivor_rule == SurvivorRule.KEEP_MOST_RECENT and date_column and date_column not in df.columns ): - raise ValueError( - f"date_column={date_column!r} not found in input. " - f"Available columns: {list(df.columns)}" + raise InputValidationError( + f"date_column={date_column!r} not found in input", + operation="deduplicate", + column=date_column, + suggestion=f"Available columns: {list(df.columns)}", ) log_entries: list[str] = [] @@ -561,9 +579,14 @@ def deduplicate( referenced = {cs.column for s in strategies for cs in s.column_strategies} missing = sorted(c for c in referenced if c not in df.columns) if missing: - raise ValueError( - f"Strategy references columns not present in the input: {missing}. " - f"Available columns: {list(df.columns)}" + raise InputValidationError( + f"Strategy references columns not present in the input: {missing}", + operation="deduplicate", + suggestion=( + f"Available columns: {list(df.columns)}. " + "Check for typos (e.g., 'e_mail' vs 'email') or for " + "header rows that didn't get parsed." + ), ) # Log strategies diff --git a/src/core/errors.py b/src/core/errors.py new file mode 100644 index 0000000..7c09fd4 --- /dev/null +++ b/src/core/errors.py @@ -0,0 +1,185 @@ +"""Shared error-formatting helpers. + +These keep error messages uniform across modules: same "what failed, +where, and what to try next" structure regardless of which layer +raises. Public CLIs / GUIs can rely on the message format being +consistent enough to surface to end users without further wrapping. + +Usage patterns: + + raise DataToolsError( + "Could not read input file", + path=path, + suggestion="Check that the file exists and is readable.", + ) + + # Wrapping a library error: + try: + wb = load_workbook(path) + except (BadZipFile, InvalidFileException) as e: + raise FileFormatError( + "Excel file is corrupted or not a valid .xlsx", + path=path, + cause=e, + ) from e +""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Iterable, Optional + + +class DataToolsError(Exception): + """Base class for all DataTools-raised errors. + + Carries optional structured fields so GUIs / logs can render them + consistently rather than re-parsing free-form messages. + """ + + def __init__( + self, + message: str, + *, + path: Optional[Path | str] = None, + column: Optional[str] = None, + operation: Optional[str] = None, + suggestion: Optional[str] = None, + cause: Optional[BaseException] = None, + ): + self.message = message + self.path = Path(path) if path is not None else None + self.column = column + self.operation = operation + self.suggestion = suggestion + self.cause = cause + super().__init__(self.format()) + + def format(self) -> str: + """Render a human-friendly multi-line message.""" + lines = [self.message] + if self.operation: + lines.append(f" while: {self.operation}") + if self.path: + lines.append(f" file: {self.path}") + if self.column: + lines.append(f" column: {self.column!r}") + if self.cause: + lines.append(f" underlying: {type(self.cause).__name__}: {self.cause}") + if self.suggestion: + lines.append(f" suggestion: {self.suggestion}") + return "\n".join(lines) + + +class InputValidationError(DataToolsError, ValueError): + """Caller passed a bad argument — e.g., non-DataFrame, bad enum value.""" + + +class ConfigError(DataToolsError, ValueError): + """Configuration file or options object is invalid.""" + + +class FileFormatError(DataToolsError, ValueError): + """File exists but is not in the expected format (corrupted, wrong schema).""" + + +class FileAccessError(DataToolsError, OSError): + """File could not be read or written — permissions, missing parent, full disk.""" + + +# --------------------------------------------------------------------------- +# Convenience constructors +# --------------------------------------------------------------------------- + + +def ensure_dataframe(value: Any, *, function: str, parameter: str = "df") -> None: + """Raise InputValidationError if *value* isn't a pandas DataFrame. + + Centralizes the repetitive guard so every public entry point gives + the same message shape. + """ + import pandas as pd # lazy — keeps this module dependency-light + if not isinstance(value, pd.DataFrame): + raise InputValidationError( + f"{function}() requires a pandas DataFrame for {parameter!r}", + operation=function, + suggestion=( + f"Got {type(value).__name__}. " + "Pass a DataFrame loaded via src.core.io.read_file() " + "or constructed with pd.DataFrame(...)." + ), + ) + + +def ensure_choice( + value: Any, + *, + name: str, + choices: Iterable[Any], + function: Optional[str] = None, +) -> None: + """Raise InputValidationError if *value* isn't in *choices*.""" + choices = list(choices) + if value in choices: + return + raise InputValidationError( + f"Invalid {name}={value!r}", + operation=function, + suggestion=f"Valid: {sorted(map(str, choices))}", + ) + + +def wrap_file_read(path: Path | str, operation: str, exc: BaseException) -> FileAccessError: + """Build a FileAccessError describing a read failure with helpful context.""" + return FileAccessError( + f"Could not read file ({type(exc).__name__})", + path=path, + operation=operation, + cause=exc, + suggestion=( + "Check that the file exists, you have read permission, and the " + "path isn't on a network mount that may have disconnected." + ), + ) + + +def wrap_file_write(path: Path | str, operation: str, exc: BaseException) -> FileAccessError: + """Build a FileAccessError describing a write failure with helpful context.""" + suggestion = ( + "Check that the parent directory exists, you have write permission, " + "and there is enough free disk space." + ) + if isinstance(exc, PermissionError): + suggestion = ( + "Check write permissions on the parent directory. " + "On Windows, also ensure the file is not open in another program." + ) + return FileAccessError( + f"Could not write file ({type(exc).__name__})", + path=path, + operation=operation, + cause=exc, + suggestion=suggestion, + ) + + +# --------------------------------------------------------------------------- +# Friendly formatter for end-user surfaces (CLI stderr, GUI st.error) +# --------------------------------------------------------------------------- + + +def format_for_user(exc: BaseException, *, context: Optional[str] = None) -> str: + """Render an exception for end-user display. + + Recognizes :class:`DataToolsError` and uses its structured fields; + falls back to a generic message + class name for unrecognized + exceptions. ``context`` is an optional one-line prefix describing + what the user was trying to do (e.g., ``"Failed to read upload"``). + """ + if isinstance(exc, DataToolsError): + body = exc.format() + else: + body = f"{type(exc).__name__}: {exc}" + if context: + return f"{context}\n\n{body}" + return body diff --git a/src/core/fixes.py b/src/core/fixes.py index 20ce00a..45d5944 100644 --- a/src/core/fixes.py +++ b/src/core/fixes.py @@ -369,7 +369,12 @@ def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[p """ try: import ftfy # type: ignore - except ImportError: + except ImportError as e: + from loguru import logger as _log + _log.info( + "repair_mojibake: ftfy not installed ({}). " + "Skipping mojibake repair — install ftfy to enable.", e, + ) return df, 0 def fix(s: str) -> str: diff --git a/src/core/format_standardize.py b/src/core/format_standardize.py index 963daf4..41187de 100644 --- a/src/core/format_standardize.py +++ b/src/core/format_standardize.py @@ -1527,19 +1527,26 @@ class StandardizeOptions: @classmethod def from_dict(cls, data: dict) -> StandardizeOptions: + from .errors import ConfigError known = {f for f in cls.__dataclass_fields__} kwargs = {k: v for k, v in data.items() if k in known} column_types = kwargs.get("column_types") or {} - try: - kwargs["column_types"] = { - c: FieldType(t) if not isinstance(t, FieldType) else t - for c, t in column_types.items() - } - except ValueError as e: - valid = ", ".join(sorted(t.value for t in FieldType)) - raise ValueError( - f"Invalid field type in column_types: {e}. Valid: {valid}" - ) from e + resolved: dict[str, FieldType] = {} + for col, raw in column_types.items(): + try: + resolved[col] = ( + FieldType(raw) if not isinstance(raw, FieldType) else raw + ) + except ValueError as e: + valid = sorted(t.value for t in FieldType) + raise ConfigError( + f"Invalid field type {raw!r} for column {col!r}", + column=col, + operation="StandardizeOptions.from_dict", + cause=e, + suggestion=f"Valid field types: {valid}", + ) from e + kwargs["column_types"] = resolved # Surface enum-string mismatches early — bad date_order ("xyz") # would otherwise crash deep inside standardize_date. for field_name, valid in ( @@ -1555,8 +1562,10 @@ class StandardizeOptions: ): value = kwargs.get(field_name) if value is not None and value not in valid: - raise ValueError( - f"Invalid {field_name}={value!r}. Valid: {sorted(valid)}" + raise ConfigError( + f"Invalid {field_name}={value!r}", + operation="StandardizeOptions.from_dict", + suggestion=f"Valid values: {sorted(valid)}", ) return cls(**kwargs) @@ -1567,24 +1576,47 @@ class StandardizeOptions: return d def to_file(self, path: str | Path) -> Path: + from .errors import ConfigError, wrap_file_write out = Path(path) - out.write_text(json.dumps(self.to_dict(), indent=2)) + try: + payload = json.dumps(self.to_dict(), indent=2) + except TypeError as e: + raise ConfigError( + "Could not serialize StandardizeOptions to JSON", + operation="StandardizeOptions.to_file", + cause=e, + suggestion=( + "extra_abbreviations or column_types likely contains a " + "non-string/non-enum value. Inspect with .to_dict() and " + "remove the offending entry." + ), + ) from e + try: + out.write_text(payload) + except (OSError, PermissionError) as e: + raise wrap_file_write(out, "StandardizeOptions.to_file", e) from e return out @classmethod def from_file(cls, path: str | Path) -> StandardizeOptions: + from .errors import ConfigError, wrap_file_read path = Path(path) try: text = path.read_text() except OSError as e: - raise OSError( - f"Could not read StandardizeOptions config from {path}: {e}" - ) from e + raise wrap_file_read(path, "StandardizeOptions.from_file", e) from e try: data = json.loads(text) except json.JSONDecodeError as e: - raise ValueError( - f"Invalid JSON in StandardizeOptions config {path}: {e}" + raise ConfigError( + "Invalid JSON in StandardizeOptions config", + path=path, + operation="StandardizeOptions.from_file", + cause=e, + suggestion=( + f"JSON parser failed at line {e.lineno}, column {e.colno}. " + "Validate the file with `python -m json.tool < file.json`." + ), ) from e return cls.from_dict(data) @@ -1679,7 +1711,14 @@ def _apply_field_type( elif field_type == FieldType.BOOLEAN: new, changed = standardize_boolean(value, style=options.boolean_style) else: - raise ValueError(f"Unknown field type: {field_type}") + # Unreachable for well-formed input — _resolve_column_types + # would have rejected the bad enum at the entry point. Hitting + # this means an internal invariant was broken, not user error. + raise AssertionError( + f"Unhandled FieldType in dispatcher: {field_type!r}. " + "This indicates a code bug — a new FieldType was added to " + "the enum without a matching branch here." + ) # ``changed=False`` on a non-empty cell means the standardizer either # accepted the input as already-canonical OR couldn't parse it. The @@ -1760,9 +1799,14 @@ def _resolve_column_types( continue resolved[col] = ft if isinstance(ft, FieldType) else FieldType(ft) if missing: - raise ValueError( - f"Columns not found in input: {missing}. " - f"Available: {list(df_columns)}" + from .errors import InputValidationError + raise InputValidationError( + f"Columns referenced by column_types not found in input: {missing}", + operation="standardize_dataframe", + suggestion=( + f"Available columns: {list(df_columns)}. " + "Check for typos and for header rows that didn't get parsed." + ), ) return resolved @@ -1776,6 +1820,8 @@ def standardize_dataframe( Columns absent from ``options.column_types`` pass through unchanged. The input DataFrame is not mutated. """ + from .errors import ensure_dataframe + ensure_dataframe(df, function="standardize_dataframe") options = options or StandardizeOptions() out = df.copy() column_types = _resolve_column_types(options, out.columns) diff --git a/src/core/io.py b/src/core/io.py index 29ff4b9..c3cb8ec 100644 --- a/src/core/io.py +++ b/src/core/io.py @@ -182,14 +182,25 @@ def read_file( Returns a DataFrame (or generator when *chunk_size* is set). """ + from .errors import FileAccessError, InputValidationError filepath = Path(path) if not filepath.exists(): - raise FileNotFoundError( - f"Input file not found: {filepath} " - f"(required for encoding/delimiter detection and reading)" + raise FileAccessError( + "Input file not found", + path=filepath, + operation="read_file", + suggestion=( + f"Check the path is correct. Parent directory " + f"{filepath.parent} " + f"{'exists' if filepath.parent.exists() else 'does NOT exist'}." + ), ) if chunk_size is not None and chunk_size <= 0: - raise ValueError(f"chunk_size must be positive; got {chunk_size}") + raise InputValidationError( + f"chunk_size must be positive; got {chunk_size}", + operation="read_file", + suggestion="Pass a positive integer (e.g., chunk_size=10000) or omit for non-streaming reads.", + ) suffix = filepath.suffix.lower() logger.info( @@ -288,14 +299,42 @@ def _read_excel( else _detect_excel_header_row(path, sheet_name) ) logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr) - return pd.read_excel( - path, - sheet_name=sheet_name, - header=hdr, - dtype=str, - keep_default_na=False, - engine="openpyxl", - ) + try: + return pd.read_excel( + path, + sheet_name=sheet_name, + header=hdr, + dtype=str, + keep_default_na=False, + engine="openpyxl", + ) + except ValueError as e: + # pandas raises ValueError for "Worksheet named 'X' not found". + from .errors import FileFormatError + raise FileFormatError( + "Could not read Excel sheet", + path=path, + operation=f"open sheet {sheet_name!r}", + cause=e, + suggestion=( + "Check the sheet name exists. List available sheets with " + "`from src.core.io import list_sheets; list_sheets(path)`." + ), + ) from e + except Exception as e: + # openpyxl can raise BadZipFile, InvalidFileException for + # corrupt / non-xlsx inputs. Wrap with file context. + from .errors import FileFormatError + raise FileFormatError( + "Excel file could not be parsed", + path=path, + operation="pd.read_excel", + cause=e, + suggestion=( + "Confirm the file is a valid .xlsx workbook and not " + "renamed/corrupted. Try opening it in Excel to verify." + ), + ) from e def _detect_excel_header_row( @@ -308,18 +347,20 @@ def _detect_excel_header_row( Scans the first *max_scan* rows of *sheet_name* in read-only mode (so a 100 MB workbook doesn't get fully materialized) and returns the index of the first row where every non-empty cell looks like a - column header. Falls back to 0. + column header. Falls back to 0 on parse failure (logged at debug — + the caller's ``pd.read_excel`` will raise a useful FileFormatError + with full context). """ try: from openpyxl import load_workbook - except ImportError: + from openpyxl.utils.exceptions import InvalidFileException + except ImportError as e: + logger.debug("openpyxl unavailable for header detection: {}", e) return 0 + wb = None try: wb = load_workbook(path, read_only=True, data_only=True) - except Exception: - return 0 - try: if isinstance(sheet_name, int): names = wb.sheetnames target = names[sheet_name] if 0 <= sheet_name < len(names) else names[0] @@ -340,8 +381,18 @@ def _detect_excel_header_row( ): return idx return 0 + except (InvalidFileException, KeyError, IndexError, OSError) as e: + # Corrupt workbook, missing sheet name, or read failure — fall + # back to row 0 and let pd.read_excel raise the user-facing error + # with full context. + logger.debug( + "Excel header detection failed for {} (sheet={}): {}", + path, sheet_name, e, + ) + return 0 finally: - wb.close() + if wb is not None: + wb.close() # --------------------------------------------------------------------------- @@ -371,20 +422,22 @@ def write_file( Returns the resolved output Path. """ - if not isinstance(df, pd.DataFrame): - raise TypeError( - f"write_file() requires a pandas DataFrame; got {type(df).__name__}" - ) + from .errors import ensure_dataframe, wrap_file_write + ensure_dataframe(df, function="write_file") + out = Path(path) fmt = file_format or out.suffix.lstrip(".").lower() - if fmt in ("xlsx", "xls"): - df.to_excel(out, index=False, engine="openpyxl") - else: - sep = delimiter if delimiter is not None else ( - "\t" if fmt == "tsv" else "," - ) - df.to_csv(out, index=False, encoding=encoding, sep=sep) - logger.info("Wrote {} rows to {}", len(df), out) + try: + if fmt in ("xlsx", "xls"): + df.to_excel(out, index=False, engine="openpyxl") + else: + sep = delimiter if delimiter is not None else ( + "\t" if fmt == "tsv" else "," + ) + df.to_csv(out, index=False, encoding=encoding, sep=sep) + except (OSError, PermissionError) as e: + raise wrap_file_write(out, f"write_file (format={fmt})", e) from e + logger.info("Wrote {} rows × {} cols to {}", len(df), len(df.columns), out) return out diff --git a/src/core/normalizers.py b/src/core/normalizers.py index 7e57c0e..49ec95b 100644 --- a/src/core/normalizers.py +++ b/src/core/normalizers.py @@ -89,8 +89,15 @@ def normalize_phone(value: Optional[str], default_region: str = "US") -> str: if parsed.extension: return f"{base};ext={parsed.extension}" return base - except phonenumbers.NumberParseException: - pass + except phonenumbers.NumberParseException as e: + # Surface the fallback so a "wrong duplicate match" investigation + # can be traced — the fallback only keeps digits, so extensions + # and country codes inferred from formatting are lost. + from loguru import logger as _log + _log.debug( + "normalize_phone fallback for {!r} ({}): " + "dropping to digits-only.", stripped, e, + ) # Fallback: digits only digits = re.sub(r"\D", "", stripped) diff --git a/src/core/text_clean.py b/src/core/text_clean.py index ecbb25c..f5af0cd 100644 --- a/src/core/text_clean.py +++ b/src/core/text_clean.py @@ -536,11 +536,8 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) -> Numeric, datetime, and boolean columns are skipped by default. The input DataFrame is not mutated; a copy is returned in ``CleanResult.cleaned_df``. """ - if not isinstance(df, pd.DataFrame): - raise TypeError( - f"clean_dataframe() requires a pandas DataFrame; " - f"got {type(df).__name__}" - ) + from .errors import ensure_dataframe + ensure_dataframe(df, function="clean_dataframe") options = options or CleanOptions() logger.debug( "clean_dataframe: rows={}, cols={}, case={}", diff --git a/src/gui/pages/1_Deduplicator.py b/src/gui/pages/1_Deduplicator.py index 4da19b5..4cf5b0a 100644 --- a/src/gui/pages/1_Deduplicator.py +++ b/src/gui/pages/1_Deduplicator.py @@ -97,7 +97,11 @@ if uploaded is not None: tmp_path.unlink(missing_ok=True) except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) st.session_state["df"] = None df = st.session_state["df"] diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py index 80ba7e6..f7e74cc 100644 --- a/src/gui/pages/2_Text_Cleaner.py +++ b/src/gui/pages/2_Text_Cleaner.py @@ -81,8 +81,21 @@ def _read_uploaded(name: str, data: bytes) -> pd.DataFrame: try: df = _read_uploaded(uploaded.name, uploaded.getvalue()) +except UnicodeDecodeError as e: + st.error( + f"**Could not decode `{uploaded.name}`**\n\n" + f"The file isn't UTF-8, UTF-8-with-BOM, or Latin-1.\n\n" + f"_Underlying error: {e}_\n\n" + f"Try re-saving the file as UTF-8 from the source application, " + f"or convert it with `iconv -f -t utf-8`." + ) + st.stop() except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) st.stop() st.subheader(f"Preview: {uploaded.name}") diff --git a/src/gui/pages/3_Format_Standardizer.py b/src/gui/pages/3_Format_Standardizer.py index e3e01b3..4c605e7 100644 --- a/src/gui/pages/3_Format_Standardizer.py +++ b/src/gui/pages/3_Format_Standardizer.py @@ -80,8 +80,20 @@ def _read_uploaded(name: str, data: bytes) -> pd.DataFrame: try: df = _read_uploaded(uploaded.name, uploaded.getvalue()) +except UnicodeDecodeError as e: + st.error( + f"**Could not decode `{uploaded.name}`**\n\n" + f"The file isn't UTF-8, UTF-8-with-BOM, or Latin-1.\n\n" + f"_Underlying error: {e}_\n\n" + f"Try re-saving the file as UTF-8 from the source application." + ) + st.stop() except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) st.stop() st.subheader(f"Preview: {uploaded.name}") diff --git a/src/gui/pages/4_Missing_Values.py b/src/gui/pages/4_Missing_Values.py index 8a181ed..0f89284 100644 --- a/src/gui/pages/4_Missing_Values.py +++ b/src/gui/pages/4_Missing_Values.py @@ -63,7 +63,11 @@ if uploaded is not None: st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), use_container_width=True) except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) # --------------------------------------------------------------------------- # Placeholder options diff --git a/src/gui/pages/5_Column_Mapper.py b/src/gui/pages/5_Column_Mapper.py index d36cc05..60b3a91 100644 --- a/src/gui/pages/5_Column_Mapper.py +++ b/src/gui/pages/5_Column_Mapper.py @@ -72,7 +72,11 @@ if uploaded is not None: }) st.dataframe(mapping_data, use_container_width=True, hide_index=True) except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) # --------------------------------------------------------------------------- # Placeholder options diff --git a/src/gui/pages/6_Outlier_Detector.py b/src/gui/pages/6_Outlier_Detector.py index 02fbdc7..169c816 100644 --- a/src/gui/pages/6_Outlier_Detector.py +++ b/src/gui/pages/6_Outlier_Detector.py @@ -63,7 +63,11 @@ if uploaded is not None: st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), use_container_width=True) except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) # --------------------------------------------------------------------------- # Placeholder options diff --git a/src/gui/pages/7_Multi_File_Merger.py b/src/gui/pages/7_Multi_File_Merger.py index 7b28fc1..4c41c79 100644 --- a/src/gui/pages/7_Multi_File_Merger.py +++ b/src/gui/pages/7_Multi_File_Merger.py @@ -65,7 +65,11 @@ if uploaded_files: st.caption(f"{len(df)} rows, {len(df.columns)} columns — Columns: {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''}") st.dataframe(df.head(5), use_container_width=True) except Exception as e: - st.error(f"Failed to read {f.name}: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{f.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) # --------------------------------------------------------------------------- # Placeholder options diff --git a/src/gui/pages/8_Validator_Reporter.py b/src/gui/pages/8_Validator_Reporter.py index 6a6b2cf..dac5819 100644 --- a/src/gui/pages/8_Validator_Reporter.py +++ b/src/gui/pages/8_Validator_Reporter.py @@ -63,7 +63,11 @@ if uploaded is not None: st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), use_container_width=True) except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) # --------------------------------------------------------------------------- # Placeholder options diff --git a/src/gui/pages/9_Pipeline_Runner.py b/src/gui/pages/9_Pipeline_Runner.py index 8057e80..175f4cf 100644 --- a/src/gui/pages/9_Pipeline_Runner.py +++ b/src/gui/pages/9_Pipeline_Runner.py @@ -63,7 +63,11 @@ if uploaded is not None: st.caption(f"{len(df)} rows, {len(df.columns)} columns") st.dataframe(df.head(10), use_container_width=True) except Exception as e: - st.error(f"Failed to read file: {e}") + from src.core.errors import format_for_user + st.error( + f"**Could not read `{uploaded.name}`**\n\n" + f"```\n{format_for_user(e)}\n```" + ) # --------------------------------------------------------------------------- # Pipeline steps (checklist) diff --git a/tests/test_audit_fixes.py b/tests/test_audit_fixes.py index 9ef847c..c1ab141 100644 --- a/tests/test_audit_fixes.py +++ b/tests/test_audit_fixes.py @@ -319,17 +319,20 @@ class TestProductionReadyValidation: def test_write_file_rejects_non_dataframe(self, tmp_path: Path): from src.core.io import write_file - with pytest.raises(TypeError, match="requires a pandas DataFrame"): + from src.core.errors import InputValidationError + with pytest.raises(InputValidationError, match="requires a pandas DataFrame"): write_file({"a": [1]}, tmp_path / "out.csv") # type: ignore[arg-type] def test_clean_dataframe_rejects_non_dataframe(self): from src.core.text_clean import clean_dataframe - with pytest.raises(TypeError, match="requires a pandas DataFrame"): + from src.core.errors import InputValidationError + with pytest.raises(InputValidationError, match="requires a pandas DataFrame"): clean_dataframe([{"a": 1}]) # type: ignore[arg-type] def test_deduplicate_rejects_non_dataframe(self): from src.core.dedup import deduplicate - with pytest.raises(TypeError, match="requires a pandas DataFrame"): + from src.core.errors import InputValidationError + with pytest.raises(InputValidationError, match="requires a pandas DataFrame"): deduplicate({"x": [1]}) # type: ignore[arg-type] def test_keep_most_recent_requires_date_column(self): diff --git a/tests/test_errors.py b/tests/test_errors.py new file mode 100644 index 0000000..07b49d1 --- /dev/null +++ b/tests/test_errors.py @@ -0,0 +1,230 @@ +"""Tests for the structured error-handling infrastructure. + +Covers: +- DataToolsError base class formatting (path, column, operation, suggestion). +- Specialized subclasses inherit from the right stdlib bases so existing + ``except OSError`` / ``except ValueError`` handlers still catch them. +- ensure_dataframe / ensure_choice raise the right structured errors. +- format_for_user produces readable output for both DataTools and + unrecognized exceptions. +- Per-module integration: bad config / bad file / bad input each + surface a helpful error rather than a deep library traceback. +""" + +from __future__ import annotations + +import json +from pathlib import Path + +import pandas as pd +import pytest + +from src.core.errors import ( + ConfigError, + DataToolsError, + FileAccessError, + FileFormatError, + InputValidationError, + ensure_choice, + ensure_dataframe, + format_for_user, + wrap_file_read, + wrap_file_write, +) + + +# --------------------------------------------------------------------------- +# Base class +# --------------------------------------------------------------------------- + +class TestDataToolsError: + def test_message_only(self): + err = DataToolsError("something failed") + assert "something failed" in str(err) + + def test_full_context(self): + err = DataToolsError( + "could not parse", + path="/tmp/foo.csv", + column="email", + operation="read_file", + suggestion="check encoding", + cause=ValueError("inner"), + ) + text = str(err) + assert "could not parse" in text + assert "read_file" in text + assert "/tmp/foo.csv" in text + assert "'email'" in text + assert "ValueError" in text + assert "check encoding" in text + + def test_inheritance_for_oserror_handlers(self): + # FileAccessError must be catchable as OSError so callers using + # the stdlib hierarchy continue to work. + with pytest.raises(OSError): + raise FileAccessError("nope", path="/tmp/x") + + def test_inheritance_for_valueerror_handlers(self): + for cls in (InputValidationError, ConfigError, FileFormatError): + with pytest.raises(ValueError): + raise cls("nope") + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +class TestEnsureDataframe: + def test_passes_real_df(self): + ensure_dataframe(pd.DataFrame({"a": [1]}), function="x") + + def test_rejects_dict(self): + with pytest.raises(InputValidationError, match="DataFrame"): + ensure_dataframe({"a": 1}, function="my_func") + + def test_includes_function_name(self): + try: + ensure_dataframe(None, function="my_func") + except InputValidationError as e: + assert "my_func" in str(e) + else: # pragma: no cover + pytest.fail("should have raised") + + def test_includes_actual_type(self): + try: + ensure_dataframe([1, 2, 3], function="x") + except InputValidationError as e: + assert "list" in str(e) + + +class TestEnsureChoice: + def test_passes_valid(self): + ensure_choice("a", name="mode", choices=["a", "b"]) + + def test_rejects_invalid(self): + with pytest.raises(InputValidationError, match="Invalid mode"): + ensure_choice("c", name="mode", choices=["a", "b"]) + + def test_lists_choices_in_message(self): + try: + ensure_choice("c", name="mode", choices=["a", "b"]) + except InputValidationError as e: + assert "'a'" in str(e) and "'b'" in str(e) + + +class TestWrapFileHelpers: + def test_wrap_read_keeps_cause(self): + inner = OSError("disk error") + wrapped = wrap_file_read("/tmp/x", "read_file", inner) + assert wrapped.cause is inner + assert "/tmp/x" in str(wrapped) + + def test_wrap_write_permission_hint(self): + inner = PermissionError("no perm") + wrapped = wrap_file_write("/tmp/x", "save", inner) + # Permission failures get a Windows-aware suggestion + assert "Windows" in str(wrapped) or "permission" in str(wrapped).lower() + + +# --------------------------------------------------------------------------- +# format_for_user +# --------------------------------------------------------------------------- + +class TestFormatForUser: + def test_datatools_error(self): + err = InputValidationError( + "bad date_order", suggestion="use MDY or DMY", + ) + out = format_for_user(err) + assert "bad date_order" in out + assert "use MDY or DMY" in out + + def test_with_context_prefix(self): + err = ValueError("inner") + out = format_for_user(err, context="Failed to read upload") + assert out.startswith("Failed to read upload") + assert "ValueError" in out + + def test_unrecognized_exception(self): + err = RuntimeError("oops") + out = format_for_user(err) + assert "RuntimeError" in out + assert "oops" in out + + +# --------------------------------------------------------------------------- +# Integration — every public entry point surfaces structured errors +# --------------------------------------------------------------------------- + +class TestIntegration: + def test_io_read_missing_file_is_structured(self, tmp_path): + from src.core.io import read_file + with pytest.raises(FileAccessError) as exc_info: + read_file(tmp_path / "missing.csv") + msg = str(exc_info.value) + assert "Input file not found" in msg + assert str(tmp_path) in msg + assert "exists" in msg or "does NOT exist" in msg + + def test_io_write_to_missing_dir(self, tmp_path): + from src.core.io import write_file + # Writing into a non-existent directory raises a wrapped + # FileAccessError rather than a raw FileNotFoundError, so the + # user sees the path and a recovery hint. + df = pd.DataFrame({"a": [1]}) + with pytest.raises(FileAccessError) as exc_info: + write_file(df, tmp_path / "no_such_dir" / "out.csv") + msg = str(exc_info.value) + assert "Could not write" in msg + assert "no_such_dir" in msg + + def test_config_bad_json(self, tmp_path): + from src.core.config import DeduplicationConfig + path = tmp_path / "bad.json" + path.write_text("{not json") + with pytest.raises(ConfigError) as exc_info: + DeduplicationConfig.from_file(path) + assert "Invalid JSON" in str(exc_info.value) + assert "line" in str(exc_info.value) + + def test_config_bad_algorithm_includes_strategy_index(self, tmp_path): + from src.core.config import DeduplicationConfig + path = tmp_path / "cfg.json" + path.write_text(json.dumps({ + "strategies": [{ + "columns": [{ + "column": "name", + "algorithm": "not_a_real_algo", + "threshold": 90.0, + }], + }], + })) + loaded = DeduplicationConfig.from_file(path) + with pytest.raises(ConfigError) as exc_info: + loaded.to_strategies() + msg = str(exc_info.value) + assert "not_a_real_algo" in msg + assert "name" in msg # column name + assert "strategy[0]" in msg # strategy index + + def test_standardize_options_bad_field_type_includes_column(self): + from src.core.format_standardize import StandardizeOptions + with pytest.raises(ConfigError) as exc_info: + StandardizeOptions.from_dict({ + "column_types": {"my_col": "made_up"}, + }) + msg = str(exc_info.value) + assert "my_col" in msg + assert "made_up" in msg + + def test_standardize_dataframe_unknown_column(self): + from src.core.format_standardize import ( + FieldType, StandardizeOptions, standardize_dataframe, + ) + df = pd.DataFrame({"name": ["a"]}) + opts = StandardizeOptions(column_types={"missing": FieldType.DATE}) + with pytest.raises(InputValidationError) as exc_info: + standardize_dataframe(df, opts) + assert "missing" in str(exc_info.value) + assert "['name']" in str(exc_info.value) diff --git a/tests/test_io.py b/tests/test_io.py index e455c5d..9f85eb6 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -85,7 +85,10 @@ class TestReadFile: assert "customer_name" in df.columns def test_read_nonexistent(self): - with pytest.raises(FileNotFoundError): + # FileAccessError extends OSError so existing `except OSError` + # handlers still catch it. + from src.core.errors import FileAccessError + with pytest.raises((FileAccessError, OSError)): read_file("/tmp/nonexistent_file_xyz.csv") def test_read_with_encoding_override(self, sample_csv_path):