feat(errors): structured error hierarchy + helpful messages everywhere
Introduces src/core/errors.py with a small structured error hierarchy
that every public entry point now uses. Each error carries the
context a user needs to fix it and the context a maintainer needs to
trace it.
The hierarchy:
DataToolsError (base — formats path, column, operation, suggestion)
InputValidationError (extends ValueError — bad arg / wrong type)
ConfigError (extends ValueError — bad config / options)
FileFormatError (extends ValueError — file is not what we expected)
FileAccessError (extends OSError — file I/O failure)
Subclassing the stdlib bases means existing `except OSError` /
`except ValueError` handlers still catch them — no breaking change.
Helpers:
- ensure_dataframe(value, function=...) — uniform DataFrame guard
- ensure_choice(value, name=, choices=) — uniform enum/literal guard
- wrap_file_read(path, op, exc) — tag OSError with hint + path
- wrap_file_write(path, op, exc) — same, with Windows-aware tip
- format_for_user(exc, context=) — user-facing string for st.error / stderr
Library hardening:
- io.read_file: missing files surface FileAccessError listing whether
the parent directory exists, and the suggestion to check the path.
- io.read_file: chunk_size <= 0 now raises InputValidationError with
a positive-integer suggestion.
- io._read_excel: openpyxl BadZipFile / InvalidFileException / pandas
ValueError ("sheet not found") wrapped as FileFormatError listing
the path and a "list sheets with list_sheets()" hint.
- io._detect_excel_header_row: bare except narrowed to specific
openpyxl exceptions; falls back gracefully and logs at debug so
the real error surfaces from pd.read_excel.
- io.write_file: OSError / PermissionError on to_csv/to_excel wrapped
with file path and Windows-aware "file may be open in another
program" hint.
- dedup._parse_date: bare `except Exception` narrowed to
(TypeError, ValueError, OutOfBoundsDatetime); failed values
logged at debug for survivor-selection forensics.
- dedup._select_survivor: KEEP_MOST_RECENT now raises
InputValidationError instead of silently falling back to keep_first.
- dedup.deduplicate: input validation errors are InputValidationError
with operation/column/suggestion fields.
- format_standardize.from_dict: invalid FieldType for a column raises
ConfigError naming the column AND the bad value AND listing valid
values; same for date_order / phone_format / etc.
- format_standardize.from_file: OSError / JSON decode wrapped with
path AND line/column where parsing failed.
- format_standardize.to_file: TypeError on json.dumps wrapped as
ConfigError with the suspected source (extra_abbreviations).
- format_standardize._apply_field_type: dispatcher's "unknown field
type" branch now raises AssertionError (it's an internal invariant,
not user error — a new enum value was added without a branch).
- format_standardize._resolve_column_types: missing-column error now
InputValidationError with a "check for typos / unparsed header"
suggestion.
- format_standardize.standardize_dataframe: ensure_dataframe at entry.
- text_clean.clean_dataframe: ensure_dataframe at entry.
- config.to_strategies: invalid Algorithm/NormalizerType wrapped as
ConfigError naming the strategy index AND the column.
- config.to_survivor_rule: invalid SurvivorRule wrapped as ConfigError
listing valid values.
- config.from_file: OSError / JSON decode wrapped (mirror of
StandardizeOptions.from_file).
- fixes.repair_mojibake: ImportError on ftfy now logged at info level
with the underlying ImportError so a corrupt-package vs not-installed
distinction is visible in the logs.
- normalizers.normalize_phone: phonenumbers.NumberParseException now
logged at debug when the digits-only fallback drops extension /
country-code information — gives a trail when matching results
look wrong.
GUI / CLI surfaces:
- All 9 page handlers (`except Exception as e: st.error(...)`) now
use format_for_user(), which renders DataToolsError fields nicely
and falls back to "ClassName: message" for unrecognized errors.
- 2_Text_Cleaner and 3_Format_Standardizer additionally distinguish
UnicodeDecodeError with an "re-save as UTF-8" suggestion before
the generic handler.
- cli.py's "Error reading file" handler now uses format_for_user()
and includes the input path in the prefix.
Tests:
- tests/test_errors.py — 22 new tests covering: base class formatting,
stdlib inheritance, ensure_dataframe / ensure_choice helpers,
wrap_file_read / wrap_file_write, format_for_user behavior, and
end-to-end integration (missing file, missing dir, bad JSON, bad
algorithm, bad enum, missing column).
- tests/test_audit_fixes.py + tests/test_io.py — updated 4 tests for
the new exception types (InputValidationError replaces TypeError,
FileAccessError extends OSError).
Full project suite: 1230 passed, 4 skipped, 17 xfailed.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -251,7 +251,11 @@ def dedup(
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
df = pd.concat(list(df), ignore_index=True)
|
df = pd.concat(list(df), ignore_index=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
typer.echo(f"Error reading file: {e}", err=True)
|
from src.core.errors import format_for_user
|
||||||
|
typer.echo(
|
||||||
|
f"Error reading {input_path}:\n{format_for_user(e)}",
|
||||||
|
err=True,
|
||||||
|
)
|
||||||
raise typer.Exit(1)
|
raise typer.Exit(1)
|
||||||
|
|
||||||
typer.echo(f" {len(df)} rows, {len(df.columns)} columns")
|
typer.echo(f" {len(df)} rows, {len(df.columns)} columns")
|
||||||
|
|||||||
@@ -86,18 +86,24 @@ class DeduplicationConfig:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def from_file(cls, path: str | Path) -> DeduplicationConfig:
|
def from_file(cls, path: str | Path) -> DeduplicationConfig:
|
||||||
"""Load configuration from a JSON file."""
|
"""Load configuration from a JSON file."""
|
||||||
|
from .errors import ConfigError, wrap_file_read
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
try:
|
try:
|
||||||
text = path.read_text()
|
text = path.read_text()
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
raise OSError(
|
raise wrap_file_read(path, "DeduplicationConfig.from_file", e) from e
|
||||||
f"Could not read DeduplicationConfig from {path}: {e}"
|
|
||||||
) from e
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(text)
|
data = json.loads(text)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise ValueError(
|
raise ConfigError(
|
||||||
f"Invalid JSON in DeduplicationConfig file {path}: {e}"
|
"Invalid JSON in DeduplicationConfig file",
|
||||||
|
path=path,
|
||||||
|
operation="DeduplicationConfig.from_file",
|
||||||
|
cause=e,
|
||||||
|
suggestion=(
|
||||||
|
f"JSON parser failed at line {e.lineno}, column {e.colno}. "
|
||||||
|
"Validate with `python -m json.tool < file.json`."
|
||||||
|
),
|
||||||
) from e
|
) from e
|
||||||
return cls.from_dict(data)
|
return cls.from_dict(data)
|
||||||
|
|
||||||
@@ -119,18 +125,50 @@ class DeduplicationConfig:
|
|||||||
if not self.strategies:
|
if not self.strategies:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
from .errors import ConfigError
|
||||||
result: list[MatchStrategy] = []
|
result: list[MatchStrategy] = []
|
||||||
for sc in self.strategies:
|
for s_idx, sc in enumerate(self.strategies):
|
||||||
col_strats = []
|
col_strats = []
|
||||||
for cc in sc.columns:
|
for cc in sc.columns:
|
||||||
|
try:
|
||||||
|
algorithm = Algorithm(cc.algorithm)
|
||||||
|
except ValueError as e:
|
||||||
|
raise ConfigError(
|
||||||
|
f"Invalid algorithm {cc.algorithm!r}",
|
||||||
|
column=cc.column,
|
||||||
|
operation=f"strategy[{s_idx}]",
|
||||||
|
cause=e,
|
||||||
|
suggestion=f"Valid: {sorted(a.value for a in Algorithm)}",
|
||||||
|
) from e
|
||||||
|
try:
|
||||||
|
normalizer = (
|
||||||
|
NormalizerType(cc.normalizer) if cc.normalizer else None
|
||||||
|
)
|
||||||
|
except ValueError as e:
|
||||||
|
raise ConfigError(
|
||||||
|
f"Invalid normalizer {cc.normalizer!r}",
|
||||||
|
column=cc.column,
|
||||||
|
operation=f"strategy[{s_idx}]",
|
||||||
|
cause=e,
|
||||||
|
suggestion=f"Valid: {sorted(n.value for n in NormalizerType)}",
|
||||||
|
) from e
|
||||||
col_strats.append(ColumnMatchStrategy(
|
col_strats.append(ColumnMatchStrategy(
|
||||||
column=cc.column,
|
column=cc.column,
|
||||||
algorithm=Algorithm(cc.algorithm),
|
algorithm=algorithm,
|
||||||
threshold=cc.threshold,
|
threshold=cc.threshold,
|
||||||
normalizer=NormalizerType(cc.normalizer) if cc.normalizer else None,
|
normalizer=normalizer,
|
||||||
))
|
))
|
||||||
result.append(MatchStrategy(column_strategies=col_strats))
|
result.append(MatchStrategy(column_strategies=col_strats))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def to_survivor_rule(self) -> SurvivorRule:
|
def to_survivor_rule(self) -> SurvivorRule:
|
||||||
return SurvivorRule(self.survivor_rule)
|
from .errors import ConfigError
|
||||||
|
try:
|
||||||
|
return SurvivorRule(self.survivor_rule)
|
||||||
|
except ValueError as e:
|
||||||
|
raise ConfigError(
|
||||||
|
f"Invalid survivor_rule {self.survivor_rule!r}",
|
||||||
|
operation="DeduplicationConfig.to_survivor_rule",
|
||||||
|
cause=e,
|
||||||
|
suggestion=f"Valid: {sorted(r.value for r in SurvivorRule)}",
|
||||||
|
) from e
|
||||||
|
|||||||
@@ -331,13 +331,14 @@ def _select_survivor(
|
|||||||
|
|
||||||
if rule == SurvivorRule.KEEP_MOST_RECENT:
|
if rule == SurvivorRule.KEEP_MOST_RECENT:
|
||||||
if not date_column or date_column not in df.columns:
|
if not date_column or date_column not in df.columns:
|
||||||
# The public ``deduplicate()`` validates this earlier, so
|
from .errors import InputValidationError
|
||||||
# reaching here means a caller invoked the helper directly
|
raise InputValidationError(
|
||||||
# with bad arguments — surface a clear error instead of a
|
"KEEP_MOST_RECENT requires date_column to be a column in df",
|
||||||
# silent fallback that produces wrong-but-plausible output.
|
operation="_select_survivor",
|
||||||
raise ValueError(
|
column=date_column,
|
||||||
f"KEEP_MOST_RECENT requires date_column to be a column in df; "
|
suggestion=(
|
||||||
f"got {date_column!r} (available: {list(df.columns)})"
|
f"Got {date_column!r}; available columns: {list(df.columns)}"
|
||||||
|
),
|
||||||
)
|
)
|
||||||
best_idx = indices[0]
|
best_idx = indices[0]
|
||||||
best_date = _parse_date(df.iloc[indices[0]].get(date_column, ""))
|
best_date = _parse_date(df.iloc[indices[0]].get(date_column, ""))
|
||||||
@@ -362,10 +363,22 @@ def _count_empty(row: pd.Series) -> int:
|
|||||||
return count
|
return count
|
||||||
|
|
||||||
|
|
||||||
def _parse_date(value) -> Optional[pd.Timestamp]:
|
def _parse_date(value: Any) -> Optional[pd.Timestamp]:
|
||||||
|
"""Best-effort date parse for survivor selection.
|
||||||
|
|
||||||
|
Returns None for empty / unparseable values; logs at debug so a
|
||||||
|
survivor-selection oddity ("the wrong row got kept") can be traced
|
||||||
|
by enabling debug logs without changing code.
|
||||||
|
"""
|
||||||
|
if value is None or (isinstance(value, float) and pd.isna(value)):
|
||||||
|
return None
|
||||||
try:
|
try:
|
||||||
return pd.to_datetime(value)
|
return pd.to_datetime(value)
|
||||||
except Exception:
|
except (TypeError, ValueError, pd.errors.OutOfBoundsDatetime) as e:
|
||||||
|
logger.debug(
|
||||||
|
"_parse_date: could not parse {!r} ({}): {}",
|
||||||
|
value, type(e).__name__, e,
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@@ -517,22 +530,27 @@ def deduplicate(
|
|||||||
|
|
||||||
Returns a ``DeduplicationResult``.
|
Returns a ``DeduplicationResult``.
|
||||||
"""
|
"""
|
||||||
if not isinstance(df, pd.DataFrame):
|
from .errors import ensure_dataframe, InputValidationError
|
||||||
raise TypeError(
|
ensure_dataframe(df, function="deduplicate")
|
||||||
f"deduplicate() requires a pandas DataFrame; got {type(df).__name__}"
|
|
||||||
)
|
|
||||||
if survivor_rule == SurvivorRule.KEEP_MOST_RECENT and not date_column:
|
if survivor_rule == SurvivorRule.KEEP_MOST_RECENT and not date_column:
|
||||||
raise ValueError(
|
raise InputValidationError(
|
||||||
"survivor_rule=KEEP_MOST_RECENT requires date_column to be set"
|
"survivor_rule=KEEP_MOST_RECENT requires date_column",
|
||||||
|
operation="deduplicate",
|
||||||
|
suggestion=(
|
||||||
|
"Pass date_column='created_at' (or whichever column holds "
|
||||||
|
"the timestamp). Without it, 'most recent' has no reference."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
if (
|
if (
|
||||||
survivor_rule == SurvivorRule.KEEP_MOST_RECENT
|
survivor_rule == SurvivorRule.KEEP_MOST_RECENT
|
||||||
and date_column
|
and date_column
|
||||||
and date_column not in df.columns
|
and date_column not in df.columns
|
||||||
):
|
):
|
||||||
raise ValueError(
|
raise InputValidationError(
|
||||||
f"date_column={date_column!r} not found in input. "
|
f"date_column={date_column!r} not found in input",
|
||||||
f"Available columns: {list(df.columns)}"
|
operation="deduplicate",
|
||||||
|
column=date_column,
|
||||||
|
suggestion=f"Available columns: {list(df.columns)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
log_entries: list[str] = []
|
log_entries: list[str] = []
|
||||||
@@ -561,9 +579,14 @@ def deduplicate(
|
|||||||
referenced = {cs.column for s in strategies for cs in s.column_strategies}
|
referenced = {cs.column for s in strategies for cs in s.column_strategies}
|
||||||
missing = sorted(c for c in referenced if c not in df.columns)
|
missing = sorted(c for c in referenced if c not in df.columns)
|
||||||
if missing:
|
if missing:
|
||||||
raise ValueError(
|
raise InputValidationError(
|
||||||
f"Strategy references columns not present in the input: {missing}. "
|
f"Strategy references columns not present in the input: {missing}",
|
||||||
f"Available columns: {list(df.columns)}"
|
operation="deduplicate",
|
||||||
|
suggestion=(
|
||||||
|
f"Available columns: {list(df.columns)}. "
|
||||||
|
"Check for typos (e.g., 'e_mail' vs 'email') or for "
|
||||||
|
"header rows that didn't get parsed."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Log strategies
|
# Log strategies
|
||||||
|
|||||||
185
src/core/errors.py
Normal file
185
src/core/errors.py
Normal file
@@ -0,0 +1,185 @@
|
|||||||
|
"""Shared error-formatting helpers.
|
||||||
|
|
||||||
|
These keep error messages uniform across modules: same "what failed,
|
||||||
|
where, and what to try next" structure regardless of which layer
|
||||||
|
raises. Public CLIs / GUIs can rely on the message format being
|
||||||
|
consistent enough to surface to end users without further wrapping.
|
||||||
|
|
||||||
|
Usage patterns:
|
||||||
|
|
||||||
|
raise DataToolsError(
|
||||||
|
"Could not read input file",
|
||||||
|
path=path,
|
||||||
|
suggestion="Check that the file exists and is readable.",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Wrapping a library error:
|
||||||
|
try:
|
||||||
|
wb = load_workbook(path)
|
||||||
|
except (BadZipFile, InvalidFileException) as e:
|
||||||
|
raise FileFormatError(
|
||||||
|
"Excel file is corrupted or not a valid .xlsx",
|
||||||
|
path=path,
|
||||||
|
cause=e,
|
||||||
|
) from e
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Iterable, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class DataToolsError(Exception):
|
||||||
|
"""Base class for all DataTools-raised errors.
|
||||||
|
|
||||||
|
Carries optional structured fields so GUIs / logs can render them
|
||||||
|
consistently rather than re-parsing free-form messages.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
message: str,
|
||||||
|
*,
|
||||||
|
path: Optional[Path | str] = None,
|
||||||
|
column: Optional[str] = None,
|
||||||
|
operation: Optional[str] = None,
|
||||||
|
suggestion: Optional[str] = None,
|
||||||
|
cause: Optional[BaseException] = None,
|
||||||
|
):
|
||||||
|
self.message = message
|
||||||
|
self.path = Path(path) if path is not None else None
|
||||||
|
self.column = column
|
||||||
|
self.operation = operation
|
||||||
|
self.suggestion = suggestion
|
||||||
|
self.cause = cause
|
||||||
|
super().__init__(self.format())
|
||||||
|
|
||||||
|
def format(self) -> str:
|
||||||
|
"""Render a human-friendly multi-line message."""
|
||||||
|
lines = [self.message]
|
||||||
|
if self.operation:
|
||||||
|
lines.append(f" while: {self.operation}")
|
||||||
|
if self.path:
|
||||||
|
lines.append(f" file: {self.path}")
|
||||||
|
if self.column:
|
||||||
|
lines.append(f" column: {self.column!r}")
|
||||||
|
if self.cause:
|
||||||
|
lines.append(f" underlying: {type(self.cause).__name__}: {self.cause}")
|
||||||
|
if self.suggestion:
|
||||||
|
lines.append(f" suggestion: {self.suggestion}")
|
||||||
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
class InputValidationError(DataToolsError, ValueError):
|
||||||
|
"""Caller passed a bad argument — e.g., non-DataFrame, bad enum value."""
|
||||||
|
|
||||||
|
|
||||||
|
class ConfigError(DataToolsError, ValueError):
|
||||||
|
"""Configuration file or options object is invalid."""
|
||||||
|
|
||||||
|
|
||||||
|
class FileFormatError(DataToolsError, ValueError):
|
||||||
|
"""File exists but is not in the expected format (corrupted, wrong schema)."""
|
||||||
|
|
||||||
|
|
||||||
|
class FileAccessError(DataToolsError, OSError):
|
||||||
|
"""File could not be read or written — permissions, missing parent, full disk."""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Convenience constructors
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_dataframe(value: Any, *, function: str, parameter: str = "df") -> None:
|
||||||
|
"""Raise InputValidationError if *value* isn't a pandas DataFrame.
|
||||||
|
|
||||||
|
Centralizes the repetitive guard so every public entry point gives
|
||||||
|
the same message shape.
|
||||||
|
"""
|
||||||
|
import pandas as pd # lazy — keeps this module dependency-light
|
||||||
|
if not isinstance(value, pd.DataFrame):
|
||||||
|
raise InputValidationError(
|
||||||
|
f"{function}() requires a pandas DataFrame for {parameter!r}",
|
||||||
|
operation=function,
|
||||||
|
suggestion=(
|
||||||
|
f"Got {type(value).__name__}. "
|
||||||
|
"Pass a DataFrame loaded via src.core.io.read_file() "
|
||||||
|
"or constructed with pd.DataFrame(...)."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_choice(
|
||||||
|
value: Any,
|
||||||
|
*,
|
||||||
|
name: str,
|
||||||
|
choices: Iterable[Any],
|
||||||
|
function: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
"""Raise InputValidationError if *value* isn't in *choices*."""
|
||||||
|
choices = list(choices)
|
||||||
|
if value in choices:
|
||||||
|
return
|
||||||
|
raise InputValidationError(
|
||||||
|
f"Invalid {name}={value!r}",
|
||||||
|
operation=function,
|
||||||
|
suggestion=f"Valid: {sorted(map(str, choices))}",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_file_read(path: Path | str, operation: str, exc: BaseException) -> FileAccessError:
|
||||||
|
"""Build a FileAccessError describing a read failure with helpful context."""
|
||||||
|
return FileAccessError(
|
||||||
|
f"Could not read file ({type(exc).__name__})",
|
||||||
|
path=path,
|
||||||
|
operation=operation,
|
||||||
|
cause=exc,
|
||||||
|
suggestion=(
|
||||||
|
"Check that the file exists, you have read permission, and the "
|
||||||
|
"path isn't on a network mount that may have disconnected."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def wrap_file_write(path: Path | str, operation: str, exc: BaseException) -> FileAccessError:
|
||||||
|
"""Build a FileAccessError describing a write failure with helpful context."""
|
||||||
|
suggestion = (
|
||||||
|
"Check that the parent directory exists, you have write permission, "
|
||||||
|
"and there is enough free disk space."
|
||||||
|
)
|
||||||
|
if isinstance(exc, PermissionError):
|
||||||
|
suggestion = (
|
||||||
|
"Check write permissions on the parent directory. "
|
||||||
|
"On Windows, also ensure the file is not open in another program."
|
||||||
|
)
|
||||||
|
return FileAccessError(
|
||||||
|
f"Could not write file ({type(exc).__name__})",
|
||||||
|
path=path,
|
||||||
|
operation=operation,
|
||||||
|
cause=exc,
|
||||||
|
suggestion=suggestion,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Friendly formatter for end-user surfaces (CLI stderr, GUI st.error)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def format_for_user(exc: BaseException, *, context: Optional[str] = None) -> str:
|
||||||
|
"""Render an exception for end-user display.
|
||||||
|
|
||||||
|
Recognizes :class:`DataToolsError` and uses its structured fields;
|
||||||
|
falls back to a generic message + class name for unrecognized
|
||||||
|
exceptions. ``context`` is an optional one-line prefix describing
|
||||||
|
what the user was trying to do (e.g., ``"Failed to read upload"``).
|
||||||
|
"""
|
||||||
|
if isinstance(exc, DataToolsError):
|
||||||
|
body = exc.format()
|
||||||
|
else:
|
||||||
|
body = f"{type(exc).__name__}: {exc}"
|
||||||
|
if context:
|
||||||
|
return f"{context}\n\n{body}"
|
||||||
|
return body
|
||||||
@@ -369,7 +369,12 @@ def repair_mojibake(df: pd.DataFrame, payload: Optional[dict] = None) -> tuple[p
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import ftfy # type: ignore
|
import ftfy # type: ignore
|
||||||
except ImportError:
|
except ImportError as e:
|
||||||
|
from loguru import logger as _log
|
||||||
|
_log.info(
|
||||||
|
"repair_mojibake: ftfy not installed ({}). "
|
||||||
|
"Skipping mojibake repair — install ftfy to enable.", e,
|
||||||
|
)
|
||||||
return df, 0
|
return df, 0
|
||||||
|
|
||||||
def fix(s: str) -> str:
|
def fix(s: str) -> str:
|
||||||
|
|||||||
@@ -1527,19 +1527,26 @@ class StandardizeOptions:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, data: dict) -> StandardizeOptions:
|
def from_dict(cls, data: dict) -> StandardizeOptions:
|
||||||
|
from .errors import ConfigError
|
||||||
known = {f for f in cls.__dataclass_fields__}
|
known = {f for f in cls.__dataclass_fields__}
|
||||||
kwargs = {k: v for k, v in data.items() if k in known}
|
kwargs = {k: v for k, v in data.items() if k in known}
|
||||||
column_types = kwargs.get("column_types") or {}
|
column_types = kwargs.get("column_types") or {}
|
||||||
try:
|
resolved: dict[str, FieldType] = {}
|
||||||
kwargs["column_types"] = {
|
for col, raw in column_types.items():
|
||||||
c: FieldType(t) if not isinstance(t, FieldType) else t
|
try:
|
||||||
for c, t in column_types.items()
|
resolved[col] = (
|
||||||
}
|
FieldType(raw) if not isinstance(raw, FieldType) else raw
|
||||||
except ValueError as e:
|
)
|
||||||
valid = ", ".join(sorted(t.value for t in FieldType))
|
except ValueError as e:
|
||||||
raise ValueError(
|
valid = sorted(t.value for t in FieldType)
|
||||||
f"Invalid field type in column_types: {e}. Valid: {valid}"
|
raise ConfigError(
|
||||||
) from e
|
f"Invalid field type {raw!r} for column {col!r}",
|
||||||
|
column=col,
|
||||||
|
operation="StandardizeOptions.from_dict",
|
||||||
|
cause=e,
|
||||||
|
suggestion=f"Valid field types: {valid}",
|
||||||
|
) from e
|
||||||
|
kwargs["column_types"] = resolved
|
||||||
# Surface enum-string mismatches early — bad date_order ("xyz")
|
# Surface enum-string mismatches early — bad date_order ("xyz")
|
||||||
# would otherwise crash deep inside standardize_date.
|
# would otherwise crash deep inside standardize_date.
|
||||||
for field_name, valid in (
|
for field_name, valid in (
|
||||||
@@ -1555,8 +1562,10 @@ class StandardizeOptions:
|
|||||||
):
|
):
|
||||||
value = kwargs.get(field_name)
|
value = kwargs.get(field_name)
|
||||||
if value is not None and value not in valid:
|
if value is not None and value not in valid:
|
||||||
raise ValueError(
|
raise ConfigError(
|
||||||
f"Invalid {field_name}={value!r}. Valid: {sorted(valid)}"
|
f"Invalid {field_name}={value!r}",
|
||||||
|
operation="StandardizeOptions.from_dict",
|
||||||
|
suggestion=f"Valid values: {sorted(valid)}",
|
||||||
)
|
)
|
||||||
return cls(**kwargs)
|
return cls(**kwargs)
|
||||||
|
|
||||||
@@ -1567,24 +1576,47 @@ class StandardizeOptions:
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def to_file(self, path: str | Path) -> Path:
|
def to_file(self, path: str | Path) -> Path:
|
||||||
|
from .errors import ConfigError, wrap_file_write
|
||||||
out = Path(path)
|
out = Path(path)
|
||||||
out.write_text(json.dumps(self.to_dict(), indent=2))
|
try:
|
||||||
|
payload = json.dumps(self.to_dict(), indent=2)
|
||||||
|
except TypeError as e:
|
||||||
|
raise ConfigError(
|
||||||
|
"Could not serialize StandardizeOptions to JSON",
|
||||||
|
operation="StandardizeOptions.to_file",
|
||||||
|
cause=e,
|
||||||
|
suggestion=(
|
||||||
|
"extra_abbreviations or column_types likely contains a "
|
||||||
|
"non-string/non-enum value. Inspect with .to_dict() and "
|
||||||
|
"remove the offending entry."
|
||||||
|
),
|
||||||
|
) from e
|
||||||
|
try:
|
||||||
|
out.write_text(payload)
|
||||||
|
except (OSError, PermissionError) as e:
|
||||||
|
raise wrap_file_write(out, "StandardizeOptions.to_file", e) from e
|
||||||
return out
|
return out
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_file(cls, path: str | Path) -> StandardizeOptions:
|
def from_file(cls, path: str | Path) -> StandardizeOptions:
|
||||||
|
from .errors import ConfigError, wrap_file_read
|
||||||
path = Path(path)
|
path = Path(path)
|
||||||
try:
|
try:
|
||||||
text = path.read_text()
|
text = path.read_text()
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
raise OSError(
|
raise wrap_file_read(path, "StandardizeOptions.from_file", e) from e
|
||||||
f"Could not read StandardizeOptions config from {path}: {e}"
|
|
||||||
) from e
|
|
||||||
try:
|
try:
|
||||||
data = json.loads(text)
|
data = json.loads(text)
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
raise ValueError(
|
raise ConfigError(
|
||||||
f"Invalid JSON in StandardizeOptions config {path}: {e}"
|
"Invalid JSON in StandardizeOptions config",
|
||||||
|
path=path,
|
||||||
|
operation="StandardizeOptions.from_file",
|
||||||
|
cause=e,
|
||||||
|
suggestion=(
|
||||||
|
f"JSON parser failed at line {e.lineno}, column {e.colno}. "
|
||||||
|
"Validate the file with `python -m json.tool < file.json`."
|
||||||
|
),
|
||||||
) from e
|
) from e
|
||||||
return cls.from_dict(data)
|
return cls.from_dict(data)
|
||||||
|
|
||||||
@@ -1679,7 +1711,14 @@ def _apply_field_type(
|
|||||||
elif field_type == FieldType.BOOLEAN:
|
elif field_type == FieldType.BOOLEAN:
|
||||||
new, changed = standardize_boolean(value, style=options.boolean_style)
|
new, changed = standardize_boolean(value, style=options.boolean_style)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown field type: {field_type}")
|
# Unreachable for well-formed input — _resolve_column_types
|
||||||
|
# would have rejected the bad enum at the entry point. Hitting
|
||||||
|
# this means an internal invariant was broken, not user error.
|
||||||
|
raise AssertionError(
|
||||||
|
f"Unhandled FieldType in dispatcher: {field_type!r}. "
|
||||||
|
"This indicates a code bug — a new FieldType was added to "
|
||||||
|
"the enum without a matching branch here."
|
||||||
|
)
|
||||||
|
|
||||||
# ``changed=False`` on a non-empty cell means the standardizer either
|
# ``changed=False`` on a non-empty cell means the standardizer either
|
||||||
# accepted the input as already-canonical OR couldn't parse it. The
|
# accepted the input as already-canonical OR couldn't parse it. The
|
||||||
@@ -1760,9 +1799,14 @@ def _resolve_column_types(
|
|||||||
continue
|
continue
|
||||||
resolved[col] = ft if isinstance(ft, FieldType) else FieldType(ft)
|
resolved[col] = ft if isinstance(ft, FieldType) else FieldType(ft)
|
||||||
if missing:
|
if missing:
|
||||||
raise ValueError(
|
from .errors import InputValidationError
|
||||||
f"Columns not found in input: {missing}. "
|
raise InputValidationError(
|
||||||
f"Available: {list(df_columns)}"
|
f"Columns referenced by column_types not found in input: {missing}",
|
||||||
|
operation="standardize_dataframe",
|
||||||
|
suggestion=(
|
||||||
|
f"Available columns: {list(df_columns)}. "
|
||||||
|
"Check for typos and for header rows that didn't get parsed."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
return resolved
|
return resolved
|
||||||
|
|
||||||
@@ -1776,6 +1820,8 @@ def standardize_dataframe(
|
|||||||
Columns absent from ``options.column_types`` pass through unchanged.
|
Columns absent from ``options.column_types`` pass through unchanged.
|
||||||
The input DataFrame is not mutated.
|
The input DataFrame is not mutated.
|
||||||
"""
|
"""
|
||||||
|
from .errors import ensure_dataframe
|
||||||
|
ensure_dataframe(df, function="standardize_dataframe")
|
||||||
options = options or StandardizeOptions()
|
options = options or StandardizeOptions()
|
||||||
out = df.copy()
|
out = df.copy()
|
||||||
column_types = _resolve_column_types(options, out.columns)
|
column_types = _resolve_column_types(options, out.columns)
|
||||||
|
|||||||
113
src/core/io.py
113
src/core/io.py
@@ -182,14 +182,25 @@ def read_file(
|
|||||||
|
|
||||||
Returns a DataFrame (or generator when *chunk_size* is set).
|
Returns a DataFrame (or generator when *chunk_size* is set).
|
||||||
"""
|
"""
|
||||||
|
from .errors import FileAccessError, InputValidationError
|
||||||
filepath = Path(path)
|
filepath = Path(path)
|
||||||
if not filepath.exists():
|
if not filepath.exists():
|
||||||
raise FileNotFoundError(
|
raise FileAccessError(
|
||||||
f"Input file not found: {filepath} "
|
"Input file not found",
|
||||||
f"(required for encoding/delimiter detection and reading)"
|
path=filepath,
|
||||||
|
operation="read_file",
|
||||||
|
suggestion=(
|
||||||
|
f"Check the path is correct. Parent directory "
|
||||||
|
f"{filepath.parent} "
|
||||||
|
f"{'exists' if filepath.parent.exists() else 'does NOT exist'}."
|
||||||
|
),
|
||||||
)
|
)
|
||||||
if chunk_size is not None and chunk_size <= 0:
|
if chunk_size is not None and chunk_size <= 0:
|
||||||
raise ValueError(f"chunk_size must be positive; got {chunk_size}")
|
raise InputValidationError(
|
||||||
|
f"chunk_size must be positive; got {chunk_size}",
|
||||||
|
operation="read_file",
|
||||||
|
suggestion="Pass a positive integer (e.g., chunk_size=10000) or omit for non-streaming reads.",
|
||||||
|
)
|
||||||
|
|
||||||
suffix = filepath.suffix.lower()
|
suffix = filepath.suffix.lower()
|
||||||
logger.info(
|
logger.info(
|
||||||
@@ -288,14 +299,42 @@ def _read_excel(
|
|||||||
else _detect_excel_header_row(path, sheet_name)
|
else _detect_excel_header_row(path, sheet_name)
|
||||||
)
|
)
|
||||||
logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
|
logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
|
||||||
return pd.read_excel(
|
try:
|
||||||
path,
|
return pd.read_excel(
|
||||||
sheet_name=sheet_name,
|
path,
|
||||||
header=hdr,
|
sheet_name=sheet_name,
|
||||||
dtype=str,
|
header=hdr,
|
||||||
keep_default_na=False,
|
dtype=str,
|
||||||
engine="openpyxl",
|
keep_default_na=False,
|
||||||
)
|
engine="openpyxl",
|
||||||
|
)
|
||||||
|
except ValueError as e:
|
||||||
|
# pandas raises ValueError for "Worksheet named 'X' not found".
|
||||||
|
from .errors import FileFormatError
|
||||||
|
raise FileFormatError(
|
||||||
|
"Could not read Excel sheet",
|
||||||
|
path=path,
|
||||||
|
operation=f"open sheet {sheet_name!r}",
|
||||||
|
cause=e,
|
||||||
|
suggestion=(
|
||||||
|
"Check the sheet name exists. List available sheets with "
|
||||||
|
"`from src.core.io import list_sheets; list_sheets(path)`."
|
||||||
|
),
|
||||||
|
) from e
|
||||||
|
except Exception as e:
|
||||||
|
# openpyxl can raise BadZipFile, InvalidFileException for
|
||||||
|
# corrupt / non-xlsx inputs. Wrap with file context.
|
||||||
|
from .errors import FileFormatError
|
||||||
|
raise FileFormatError(
|
||||||
|
"Excel file could not be parsed",
|
||||||
|
path=path,
|
||||||
|
operation="pd.read_excel",
|
||||||
|
cause=e,
|
||||||
|
suggestion=(
|
||||||
|
"Confirm the file is a valid .xlsx workbook and not "
|
||||||
|
"renamed/corrupted. Try opening it in Excel to verify."
|
||||||
|
),
|
||||||
|
) from e
|
||||||
|
|
||||||
|
|
||||||
def _detect_excel_header_row(
|
def _detect_excel_header_row(
|
||||||
@@ -308,18 +347,20 @@ def _detect_excel_header_row(
|
|||||||
Scans the first *max_scan* rows of *sheet_name* in read-only mode
|
Scans the first *max_scan* rows of *sheet_name* in read-only mode
|
||||||
(so a 100 MB workbook doesn't get fully materialized) and returns
|
(so a 100 MB workbook doesn't get fully materialized) and returns
|
||||||
the index of the first row where every non-empty cell looks like a
|
the index of the first row where every non-empty cell looks like a
|
||||||
column header. Falls back to 0.
|
column header. Falls back to 0 on parse failure (logged at debug —
|
||||||
|
the caller's ``pd.read_excel`` will raise a useful FileFormatError
|
||||||
|
with full context).
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
from openpyxl import load_workbook
|
from openpyxl import load_workbook
|
||||||
except ImportError:
|
from openpyxl.utils.exceptions import InvalidFileException
|
||||||
|
except ImportError as e:
|
||||||
|
logger.debug("openpyxl unavailable for header detection: {}", e)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
wb = None
|
||||||
try:
|
try:
|
||||||
wb = load_workbook(path, read_only=True, data_only=True)
|
wb = load_workbook(path, read_only=True, data_only=True)
|
||||||
except Exception:
|
|
||||||
return 0
|
|
||||||
try:
|
|
||||||
if isinstance(sheet_name, int):
|
if isinstance(sheet_name, int):
|
||||||
names = wb.sheetnames
|
names = wb.sheetnames
|
||||||
target = names[sheet_name] if 0 <= sheet_name < len(names) else names[0]
|
target = names[sheet_name] if 0 <= sheet_name < len(names) else names[0]
|
||||||
@@ -340,8 +381,18 @@ def _detect_excel_header_row(
|
|||||||
):
|
):
|
||||||
return idx
|
return idx
|
||||||
return 0
|
return 0
|
||||||
|
except (InvalidFileException, KeyError, IndexError, OSError) as e:
|
||||||
|
# Corrupt workbook, missing sheet name, or read failure — fall
|
||||||
|
# back to row 0 and let pd.read_excel raise the user-facing error
|
||||||
|
# with full context.
|
||||||
|
logger.debug(
|
||||||
|
"Excel header detection failed for {} (sheet={}): {}",
|
||||||
|
path, sheet_name, e,
|
||||||
|
)
|
||||||
|
return 0
|
||||||
finally:
|
finally:
|
||||||
wb.close()
|
if wb is not None:
|
||||||
|
wb.close()
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -371,20 +422,22 @@ def write_file(
|
|||||||
|
|
||||||
Returns the resolved output Path.
|
Returns the resolved output Path.
|
||||||
"""
|
"""
|
||||||
if not isinstance(df, pd.DataFrame):
|
from .errors import ensure_dataframe, wrap_file_write
|
||||||
raise TypeError(
|
ensure_dataframe(df, function="write_file")
|
||||||
f"write_file() requires a pandas DataFrame; got {type(df).__name__}"
|
|
||||||
)
|
|
||||||
out = Path(path)
|
out = Path(path)
|
||||||
fmt = file_format or out.suffix.lstrip(".").lower()
|
fmt = file_format or out.suffix.lstrip(".").lower()
|
||||||
if fmt in ("xlsx", "xls"):
|
try:
|
||||||
df.to_excel(out, index=False, engine="openpyxl")
|
if fmt in ("xlsx", "xls"):
|
||||||
else:
|
df.to_excel(out, index=False, engine="openpyxl")
|
||||||
sep = delimiter if delimiter is not None else (
|
else:
|
||||||
"\t" if fmt == "tsv" else ","
|
sep = delimiter if delimiter is not None else (
|
||||||
)
|
"\t" if fmt == "tsv" else ","
|
||||||
df.to_csv(out, index=False, encoding=encoding, sep=sep)
|
)
|
||||||
logger.info("Wrote {} rows to {}", len(df), out)
|
df.to_csv(out, index=False, encoding=encoding, sep=sep)
|
||||||
|
except (OSError, PermissionError) as e:
|
||||||
|
raise wrap_file_write(out, f"write_file (format={fmt})", e) from e
|
||||||
|
logger.info("Wrote {} rows × {} cols to {}", len(df), len(df.columns), out)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -89,8 +89,15 @@ def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
|
|||||||
if parsed.extension:
|
if parsed.extension:
|
||||||
return f"{base};ext={parsed.extension}"
|
return f"{base};ext={parsed.extension}"
|
||||||
return base
|
return base
|
||||||
except phonenumbers.NumberParseException:
|
except phonenumbers.NumberParseException as e:
|
||||||
pass
|
# Surface the fallback so a "wrong duplicate match" investigation
|
||||||
|
# can be traced — the fallback only keeps digits, so extensions
|
||||||
|
# and country codes inferred from formatting are lost.
|
||||||
|
from loguru import logger as _log
|
||||||
|
_log.debug(
|
||||||
|
"normalize_phone fallback for {!r} ({}): "
|
||||||
|
"dropping to digits-only.", stripped, e,
|
||||||
|
)
|
||||||
|
|
||||||
# Fallback: digits only
|
# Fallback: digits only
|
||||||
digits = re.sub(r"\D", "", stripped)
|
digits = re.sub(r"\D", "", stripped)
|
||||||
|
|||||||
@@ -536,11 +536,8 @@ def clean_dataframe(df: pd.DataFrame, options: Optional[CleanOptions] = None) ->
|
|||||||
Numeric, datetime, and boolean columns are skipped by default. The input
|
Numeric, datetime, and boolean columns are skipped by default. The input
|
||||||
DataFrame is not mutated; a copy is returned in ``CleanResult.cleaned_df``.
|
DataFrame is not mutated; a copy is returned in ``CleanResult.cleaned_df``.
|
||||||
"""
|
"""
|
||||||
if not isinstance(df, pd.DataFrame):
|
from .errors import ensure_dataframe
|
||||||
raise TypeError(
|
ensure_dataframe(df, function="clean_dataframe")
|
||||||
f"clean_dataframe() requires a pandas DataFrame; "
|
|
||||||
f"got {type(df).__name__}"
|
|
||||||
)
|
|
||||||
options = options or CleanOptions()
|
options = options or CleanOptions()
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"clean_dataframe: rows={}, cols={}, case={}",
|
"clean_dataframe: rows={}, cols={}, case={}",
|
||||||
|
|||||||
@@ -97,7 +97,11 @@ if uploaded is not None:
|
|||||||
tmp_path.unlink(missing_ok=True)
|
tmp_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
st.session_state["df"] = None
|
st.session_state["df"] = None
|
||||||
|
|
||||||
df = st.session_state["df"]
|
df = st.session_state["df"]
|
||||||
|
|||||||
@@ -81,8 +81,21 @@ def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
st.error(
|
||||||
|
f"**Could not decode `{uploaded.name}`**\n\n"
|
||||||
|
f"The file isn't UTF-8, UTF-8-with-BOM, or Latin-1.\n\n"
|
||||||
|
f"_Underlying error: {e}_\n\n"
|
||||||
|
f"Try re-saving the file as UTF-8 from the source application, "
|
||||||
|
f"or convert it with `iconv -f <source-encoding> -t utf-8`."
|
||||||
|
)
|
||||||
|
st.stop()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
st.subheader(f"Preview: {uploaded.name}")
|
||||||
|
|||||||
@@ -80,8 +80,20 @@ def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
st.error(
|
||||||
|
f"**Could not decode `{uploaded.name}`**\n\n"
|
||||||
|
f"The file isn't UTF-8, UTF-8-with-BOM, or Latin-1.\n\n"
|
||||||
|
f"_Underlying error: {e}_\n\n"
|
||||||
|
f"Try re-saving the file as UTF-8 from the source application."
|
||||||
|
)
|
||||||
|
st.stop()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
st.stop()
|
st.stop()
|
||||||
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
st.subheader(f"Preview: {uploaded.name}")
|
||||||
|
|||||||
@@ -63,7 +63,11 @@ if uploaded is not None:
|
|||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Placeholder options
|
# Placeholder options
|
||||||
|
|||||||
@@ -72,7 +72,11 @@ if uploaded is not None:
|
|||||||
})
|
})
|
||||||
st.dataframe(mapping_data, use_container_width=True, hide_index=True)
|
st.dataframe(mapping_data, use_container_width=True, hide_index=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Placeholder options
|
# Placeholder options
|
||||||
|
|||||||
@@ -63,7 +63,11 @@ if uploaded is not None:
|
|||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Placeholder options
|
# Placeholder options
|
||||||
|
|||||||
@@ -65,7 +65,11 @@ if uploaded_files:
|
|||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns — Columns: {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''}")
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns — Columns: {', '.join(df.columns[:10])}{'...' if len(df.columns) > 10 else ''}")
|
||||||
st.dataframe(df.head(5), use_container_width=True)
|
st.dataframe(df.head(5), use_container_width=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read {f.name}: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{f.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Placeholder options
|
# Placeholder options
|
||||||
|
|||||||
@@ -63,7 +63,11 @@ if uploaded is not None:
|
|||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Placeholder options
|
# Placeholder options
|
||||||
|
|||||||
@@ -63,7 +63,11 @@ if uploaded is not None:
|
|||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
st.error(f"Failed to read file: {e}")
|
from src.core.errors import format_for_user
|
||||||
|
st.error(
|
||||||
|
f"**Could not read `{uploaded.name}`**\n\n"
|
||||||
|
f"```\n{format_for_user(e)}\n```"
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Pipeline steps (checklist)
|
# Pipeline steps (checklist)
|
||||||
|
|||||||
@@ -319,17 +319,20 @@ class TestProductionReadyValidation:
|
|||||||
|
|
||||||
def test_write_file_rejects_non_dataframe(self, tmp_path: Path):
|
def test_write_file_rejects_non_dataframe(self, tmp_path: Path):
|
||||||
from src.core.io import write_file
|
from src.core.io import write_file
|
||||||
with pytest.raises(TypeError, match="requires a pandas DataFrame"):
|
from src.core.errors import InputValidationError
|
||||||
|
with pytest.raises(InputValidationError, match="requires a pandas DataFrame"):
|
||||||
write_file({"a": [1]}, tmp_path / "out.csv") # type: ignore[arg-type]
|
write_file({"a": [1]}, tmp_path / "out.csv") # type: ignore[arg-type]
|
||||||
|
|
||||||
def test_clean_dataframe_rejects_non_dataframe(self):
|
def test_clean_dataframe_rejects_non_dataframe(self):
|
||||||
from src.core.text_clean import clean_dataframe
|
from src.core.text_clean import clean_dataframe
|
||||||
with pytest.raises(TypeError, match="requires a pandas DataFrame"):
|
from src.core.errors import InputValidationError
|
||||||
|
with pytest.raises(InputValidationError, match="requires a pandas DataFrame"):
|
||||||
clean_dataframe([{"a": 1}]) # type: ignore[arg-type]
|
clean_dataframe([{"a": 1}]) # type: ignore[arg-type]
|
||||||
|
|
||||||
def test_deduplicate_rejects_non_dataframe(self):
|
def test_deduplicate_rejects_non_dataframe(self):
|
||||||
from src.core.dedup import deduplicate
|
from src.core.dedup import deduplicate
|
||||||
with pytest.raises(TypeError, match="requires a pandas DataFrame"):
|
from src.core.errors import InputValidationError
|
||||||
|
with pytest.raises(InputValidationError, match="requires a pandas DataFrame"):
|
||||||
deduplicate({"x": [1]}) # type: ignore[arg-type]
|
deduplicate({"x": [1]}) # type: ignore[arg-type]
|
||||||
|
|
||||||
def test_keep_most_recent_requires_date_column(self):
|
def test_keep_most_recent_requires_date_column(self):
|
||||||
|
|||||||
230
tests/test_errors.py
Normal file
230
tests/test_errors.py
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
"""Tests for the structured error-handling infrastructure.
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- DataToolsError base class formatting (path, column, operation, suggestion).
|
||||||
|
- Specialized subclasses inherit from the right stdlib bases so existing
|
||||||
|
``except OSError`` / ``except ValueError`` handlers still catch them.
|
||||||
|
- ensure_dataframe / ensure_choice raise the right structured errors.
|
||||||
|
- format_for_user produces readable output for both DataTools and
|
||||||
|
unrecognized exceptions.
|
||||||
|
- Per-module integration: bad config / bad file / bad input each
|
||||||
|
surface a helpful error rather than a deep library traceback.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.errors import (
|
||||||
|
ConfigError,
|
||||||
|
DataToolsError,
|
||||||
|
FileAccessError,
|
||||||
|
FileFormatError,
|
||||||
|
InputValidationError,
|
||||||
|
ensure_choice,
|
||||||
|
ensure_dataframe,
|
||||||
|
format_for_user,
|
||||||
|
wrap_file_read,
|
||||||
|
wrap_file_write,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Base class
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDataToolsError:
|
||||||
|
def test_message_only(self):
|
||||||
|
err = DataToolsError("something failed")
|
||||||
|
assert "something failed" in str(err)
|
||||||
|
|
||||||
|
def test_full_context(self):
|
||||||
|
err = DataToolsError(
|
||||||
|
"could not parse",
|
||||||
|
path="/tmp/foo.csv",
|
||||||
|
column="email",
|
||||||
|
operation="read_file",
|
||||||
|
suggestion="check encoding",
|
||||||
|
cause=ValueError("inner"),
|
||||||
|
)
|
||||||
|
text = str(err)
|
||||||
|
assert "could not parse" in text
|
||||||
|
assert "read_file" in text
|
||||||
|
assert "/tmp/foo.csv" in text
|
||||||
|
assert "'email'" in text
|
||||||
|
assert "ValueError" in text
|
||||||
|
assert "check encoding" in text
|
||||||
|
|
||||||
|
def test_inheritance_for_oserror_handlers(self):
|
||||||
|
# FileAccessError must be catchable as OSError so callers using
|
||||||
|
# the stdlib hierarchy continue to work.
|
||||||
|
with pytest.raises(OSError):
|
||||||
|
raise FileAccessError("nope", path="/tmp/x")
|
||||||
|
|
||||||
|
def test_inheritance_for_valueerror_handlers(self):
|
||||||
|
for cls in (InputValidationError, ConfigError, FileFormatError):
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
raise cls("nope")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestEnsureDataframe:
|
||||||
|
def test_passes_real_df(self):
|
||||||
|
ensure_dataframe(pd.DataFrame({"a": [1]}), function="x")
|
||||||
|
|
||||||
|
def test_rejects_dict(self):
|
||||||
|
with pytest.raises(InputValidationError, match="DataFrame"):
|
||||||
|
ensure_dataframe({"a": 1}, function="my_func")
|
||||||
|
|
||||||
|
def test_includes_function_name(self):
|
||||||
|
try:
|
||||||
|
ensure_dataframe(None, function="my_func")
|
||||||
|
except InputValidationError as e:
|
||||||
|
assert "my_func" in str(e)
|
||||||
|
else: # pragma: no cover
|
||||||
|
pytest.fail("should have raised")
|
||||||
|
|
||||||
|
def test_includes_actual_type(self):
|
||||||
|
try:
|
||||||
|
ensure_dataframe([1, 2, 3], function="x")
|
||||||
|
except InputValidationError as e:
|
||||||
|
assert "list" in str(e)
|
||||||
|
|
||||||
|
|
||||||
|
class TestEnsureChoice:
|
||||||
|
def test_passes_valid(self):
|
||||||
|
ensure_choice("a", name="mode", choices=["a", "b"])
|
||||||
|
|
||||||
|
def test_rejects_invalid(self):
|
||||||
|
with pytest.raises(InputValidationError, match="Invalid mode"):
|
||||||
|
ensure_choice("c", name="mode", choices=["a", "b"])
|
||||||
|
|
||||||
|
def test_lists_choices_in_message(self):
|
||||||
|
try:
|
||||||
|
ensure_choice("c", name="mode", choices=["a", "b"])
|
||||||
|
except InputValidationError as e:
|
||||||
|
assert "'a'" in str(e) and "'b'" in str(e)
|
||||||
|
|
||||||
|
|
||||||
|
class TestWrapFileHelpers:
|
||||||
|
def test_wrap_read_keeps_cause(self):
|
||||||
|
inner = OSError("disk error")
|
||||||
|
wrapped = wrap_file_read("/tmp/x", "read_file", inner)
|
||||||
|
assert wrapped.cause is inner
|
||||||
|
assert "/tmp/x" in str(wrapped)
|
||||||
|
|
||||||
|
def test_wrap_write_permission_hint(self):
|
||||||
|
inner = PermissionError("no perm")
|
||||||
|
wrapped = wrap_file_write("/tmp/x", "save", inner)
|
||||||
|
# Permission failures get a Windows-aware suggestion
|
||||||
|
assert "Windows" in str(wrapped) or "permission" in str(wrapped).lower()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# format_for_user
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestFormatForUser:
|
||||||
|
def test_datatools_error(self):
|
||||||
|
err = InputValidationError(
|
||||||
|
"bad date_order", suggestion="use MDY or DMY",
|
||||||
|
)
|
||||||
|
out = format_for_user(err)
|
||||||
|
assert "bad date_order" in out
|
||||||
|
assert "use MDY or DMY" in out
|
||||||
|
|
||||||
|
def test_with_context_prefix(self):
|
||||||
|
err = ValueError("inner")
|
||||||
|
out = format_for_user(err, context="Failed to read upload")
|
||||||
|
assert out.startswith("Failed to read upload")
|
||||||
|
assert "ValueError" in out
|
||||||
|
|
||||||
|
def test_unrecognized_exception(self):
|
||||||
|
err = RuntimeError("oops")
|
||||||
|
out = format_for_user(err)
|
||||||
|
assert "RuntimeError" in out
|
||||||
|
assert "oops" in out
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Integration — every public entry point surfaces structured errors
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestIntegration:
|
||||||
|
def test_io_read_missing_file_is_structured(self, tmp_path):
|
||||||
|
from src.core.io import read_file
|
||||||
|
with pytest.raises(FileAccessError) as exc_info:
|
||||||
|
read_file(tmp_path / "missing.csv")
|
||||||
|
msg = str(exc_info.value)
|
||||||
|
assert "Input file not found" in msg
|
||||||
|
assert str(tmp_path) in msg
|
||||||
|
assert "exists" in msg or "does NOT exist" in msg
|
||||||
|
|
||||||
|
def test_io_write_to_missing_dir(self, tmp_path):
|
||||||
|
from src.core.io import write_file
|
||||||
|
# Writing into a non-existent directory raises a wrapped
|
||||||
|
# FileAccessError rather than a raw FileNotFoundError, so the
|
||||||
|
# user sees the path and a recovery hint.
|
||||||
|
df = pd.DataFrame({"a": [1]})
|
||||||
|
with pytest.raises(FileAccessError) as exc_info:
|
||||||
|
write_file(df, tmp_path / "no_such_dir" / "out.csv")
|
||||||
|
msg = str(exc_info.value)
|
||||||
|
assert "Could not write" in msg
|
||||||
|
assert "no_such_dir" in msg
|
||||||
|
|
||||||
|
def test_config_bad_json(self, tmp_path):
|
||||||
|
from src.core.config import DeduplicationConfig
|
||||||
|
path = tmp_path / "bad.json"
|
||||||
|
path.write_text("{not json")
|
||||||
|
with pytest.raises(ConfigError) as exc_info:
|
||||||
|
DeduplicationConfig.from_file(path)
|
||||||
|
assert "Invalid JSON" in str(exc_info.value)
|
||||||
|
assert "line" in str(exc_info.value)
|
||||||
|
|
||||||
|
def test_config_bad_algorithm_includes_strategy_index(self, tmp_path):
|
||||||
|
from src.core.config import DeduplicationConfig
|
||||||
|
path = tmp_path / "cfg.json"
|
||||||
|
path.write_text(json.dumps({
|
||||||
|
"strategies": [{
|
||||||
|
"columns": [{
|
||||||
|
"column": "name",
|
||||||
|
"algorithm": "not_a_real_algo",
|
||||||
|
"threshold": 90.0,
|
||||||
|
}],
|
||||||
|
}],
|
||||||
|
}))
|
||||||
|
loaded = DeduplicationConfig.from_file(path)
|
||||||
|
with pytest.raises(ConfigError) as exc_info:
|
||||||
|
loaded.to_strategies()
|
||||||
|
msg = str(exc_info.value)
|
||||||
|
assert "not_a_real_algo" in msg
|
||||||
|
assert "name" in msg # column name
|
||||||
|
assert "strategy[0]" in msg # strategy index
|
||||||
|
|
||||||
|
def test_standardize_options_bad_field_type_includes_column(self):
|
||||||
|
from src.core.format_standardize import StandardizeOptions
|
||||||
|
with pytest.raises(ConfigError) as exc_info:
|
||||||
|
StandardizeOptions.from_dict({
|
||||||
|
"column_types": {"my_col": "made_up"},
|
||||||
|
})
|
||||||
|
msg = str(exc_info.value)
|
||||||
|
assert "my_col" in msg
|
||||||
|
assert "made_up" in msg
|
||||||
|
|
||||||
|
def test_standardize_dataframe_unknown_column(self):
|
||||||
|
from src.core.format_standardize import (
|
||||||
|
FieldType, StandardizeOptions, standardize_dataframe,
|
||||||
|
)
|
||||||
|
df = pd.DataFrame({"name": ["a"]})
|
||||||
|
opts = StandardizeOptions(column_types={"missing": FieldType.DATE})
|
||||||
|
with pytest.raises(InputValidationError) as exc_info:
|
||||||
|
standardize_dataframe(df, opts)
|
||||||
|
assert "missing" in str(exc_info.value)
|
||||||
|
assert "['name']" in str(exc_info.value)
|
||||||
@@ -85,7 +85,10 @@ class TestReadFile:
|
|||||||
assert "customer_name" in df.columns
|
assert "customer_name" in df.columns
|
||||||
|
|
||||||
def test_read_nonexistent(self):
|
def test_read_nonexistent(self):
|
||||||
with pytest.raises(FileNotFoundError):
|
# FileAccessError extends OSError so existing `except OSError`
|
||||||
|
# handlers still catch it.
|
||||||
|
from src.core.errors import FileAccessError
|
||||||
|
with pytest.raises((FileAccessError, OSError)):
|
||||||
read_file("/tmp/nonexistent_file_xyz.csv")
|
read_file("/tmp/nonexistent_file_xyz.csv")
|
||||||
|
|
||||||
def test_read_with_encoding_override(self, sample_csv_path):
|
def test_read_with_encoding_override(self, sample_csv_path):
|
||||||
|
|||||||
Reference in New Issue
Block a user