# datatools-dev/tests/test_errors.py

"""Tests for the structured error-handling infrastructure.

Covers:
- DataToolsError base class formatting (path, column, operation, suggestion).
- Specialized subclasses inherit from the right stdlib bases so existing
  ``except OSError`` / ``except ValueError`` handlers still catch them.
- ensure_dataframe / ensure_choice raise the right structured errors.
- format_for_user produces readable output for both DataTools and
  unrecognized exceptions.
- Per-module integration: bad config / bad file / bad input each
  surface a helpful error rather than a deep library traceback.
"""

from __future__ import annotations

import json
from pathlib import Path

import pandas as pd
import pytest

from src.core.errors import (
    ConfigError,
    DataToolsError,
    FileAccessError,
    FileFormatError,
    InputValidationError,
    ensure_choice,
    ensure_dataframe,
    format_for_user,
    wrap_file_read,
    wrap_file_write,
)


# ---------------------------------------------------------------------------
# Base class
# ---------------------------------------------------------------------------

class TestDataToolsError:
    """Rendering and stdlib-compatibility checks for the error classes."""

    def test_message_only(self):
        """The bare message appears in the string form of the error."""
        rendered = str(DataToolsError("something failed"))
        assert "something failed" in rendered

    def test_full_context(self):
        """Each supplied context field is reflected in the rendered text."""
        context = {
            "path": "/tmp/foo.csv",
            "column": "email",
            "operation": "read_file",
            "suggestion": "check encoding",
            "cause": ValueError("inner"),
        }
        rendered = str(DataToolsError("could not parse", **context))

        expected_fragments = (
            "could not parse",
            "read_file",
            "/tmp/foo.csv",
            "'email'",  # the column is expected in quoted form
            "ValueError",  # the cause is expected to show its type name
            "check encoding",
        )
        absent = [frag for frag in expected_fragments if frag not in rendered]
        assert not absent

    def test_inheritance_for_oserror_handlers(self):
        """FileAccessError is caught by a plain ``OSError`` handler."""
        # Callers relying on the stdlib hierarchy must keep working, so the
        # structured error has to be catchable through its stdlib base.
        with pytest.raises(OSError):
            raise FileAccessError("nope", path="/tmp/x")

    def test_inheritance_for_valueerror_handlers(self):
        """Validation-style errors are caught by a ``ValueError`` handler."""
        value_error_types = (InputValidationError, ConfigError, FileFormatError)
        for error_type in value_error_types:
            with pytest.raises(ValueError):
                raise error_type("nope")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

class TestEnsureDataframe:
    """``ensure_dataframe`` accepts a DataFrame and rejects other inputs.

    Every rejection test uses ``pytest.raises`` so that a call which
    unexpectedly returns without raising fails the test instead of
    passing silently.
    """

    def test_passes_real_df(self):
        """A real DataFrame is accepted without raising."""
        ensure_dataframe(pd.DataFrame({"a": [1]}), function="x")

    def test_rejects_dict(self):
        """A plain dict is rejected with a message mentioning DataFrame."""
        with pytest.raises(InputValidationError, match="DataFrame"):
            ensure_dataframe({"a": 1}, function="my_func")

    def test_includes_function_name(self):
        """The name passed as ``function`` appears in the error text."""
        with pytest.raises(InputValidationError) as exc_info:
            ensure_dataframe(None, function="my_func")
        assert "my_func" in str(exc_info.value)

    def test_includes_actual_type(self):
        """The type name of the rejected value appears in the error text."""
        with pytest.raises(InputValidationError) as exc_info:
            ensure_dataframe([1, 2, 3], function="x")
        assert "list" in str(exc_info.value)


class TestEnsureChoice:
    """``ensure_choice`` accepts listed values and rejects anything else."""

    def test_passes_valid(self):
        """A value present in ``choices`` is accepted without raising."""
        ensure_choice("a", name="mode", choices=["a", "b"])

    def test_rejects_invalid(self):
        """A value outside ``choices`` raises with an 'Invalid mode' message."""
        with pytest.raises(InputValidationError, match="Invalid mode"):
            ensure_choice("c", name="mode", choices=["a", "b"])

    def test_lists_choices_in_message(self):
        """Each allowed choice appears, quoted, in the error text."""
        # pytest.raises (rather than a bare try/except) makes the test fail
        # if no exception is raised at all.
        with pytest.raises(InputValidationError) as exc_info:
            ensure_choice("c", name="mode", choices=["a", "b"])
        message = str(exc_info.value)
        assert "'a'" in message
        assert "'b'" in message


class TestWrapFileHelpers:
    """Checks for the ``wrap_file_read`` / ``wrap_file_write`` helpers."""

    def test_wrap_read_keeps_cause(self):
        """The wrapper keeps the original exception and mentions the path."""
        original = OSError("disk error")
        result = wrap_file_read("/tmp/x", "read_file", original)
        assert result.cause is original
        assert "/tmp/x" in str(result)

    def test_wrap_write_permission_hint(self):
        """A PermissionError yields text with a permission-related hint."""
        original = PermissionError("no perm")
        result = wrap_file_write("/tmp/x", "save", original)
        # Permission failures get a Windows-aware suggestion.
        # NOTE(review): if the rendered text includes the cause's type name
        # ("PermissionError"), the lowercase check below passes regardless of
        # the suggestion -- confirm and consider asserting on the hint text.
        assert "Windows" in str(result) or "permission" in str(result).lower()


# ---------------------------------------------------------------------------
# format_for_user
# ---------------------------------------------------------------------------

class TestFormatForUser:
    """``format_for_user`` output for structured and plain exceptions."""

    def test_datatools_error(self):
        """Message and suggestion of a structured error both appear."""
        rendered = format_for_user(
            InputValidationError("bad date_order", suggestion="use MDY or DMY")
        )
        for fragment in ("bad date_order", "use MDY or DMY"):
            assert fragment in rendered

    def test_with_context_prefix(self):
        """The ``context`` string leads the output; the type name follows."""
        rendered = format_for_user(
            ValueError("inner"), context="Failed to read upload"
        )
        assert rendered.startswith("Failed to read upload")
        assert "ValueError" in rendered

    def test_unrecognized_exception(self):
        """A non-DataTools exception shows its type name and message."""
        rendered = format_for_user(RuntimeError("oops"))
        for fragment in ("RuntimeError", "oops"):
            assert fragment in rendered


# ---------------------------------------------------------------------------
# Integration — every public entry point surfaces structured errors
# ---------------------------------------------------------------------------

class TestIntegration:
    """Public entry points raise structured errors on bad input.

    Project modules are imported inside each test, so a failing import
    only affects the test that needs that module.
    """

    def test_io_read_missing_file_is_structured(self, tmp_path):
        """Reading a nonexistent file raises FileAccessError with the path."""
        from src.core.io import read_file

        target = tmp_path / "missing.csv"
        with pytest.raises(FileAccessError) as caught:
            read_file(target)

        rendered = str(caught.value)
        assert "Input file not found" in rendered
        assert str(tmp_path) in rendered
        # Either wording is accepted.
        assert any(s in rendered for s in ("exists", "does NOT exist"))

    def test_io_write_to_missing_dir(self, tmp_path):
        """Writing into a missing directory raises FileAccessError."""
        from src.core.io import write_file

        # The failure should arrive wrapped as FileAccessError instead of a
        # raw FileNotFoundError, so the user sees the path and a recovery
        # hint.
        frame = pd.DataFrame({"a": [1]})
        destination = tmp_path / "no_such_dir" / "out.csv"
        with pytest.raises(FileAccessError) as caught:
            write_file(frame, destination)

        rendered = str(caught.value)
        assert "Could not write" in rendered
        assert "no_such_dir" in rendered

    def test_config_bad_json(self, tmp_path):
        """Malformed JSON raises ConfigError mentioning the line."""
        from src.core.config import DeduplicationConfig

        config_file = tmp_path / "bad.json"
        config_file.write_text("{not json")
        with pytest.raises(ConfigError) as caught:
            DeduplicationConfig.from_file(config_file)

        rendered = str(caught.value)
        assert "Invalid JSON" in rendered
        assert "line" in rendered

    def test_config_bad_algorithm_includes_strategy_index(self, tmp_path):
        """An unknown algorithm is reported with column and strategy index."""
        from src.core.config import DeduplicationConfig

        column_spec = {
            "column": "name",
            "algorithm": "not_a_real_algo",
            "threshold": 90.0,
        }
        payload = {"strategies": [{"columns": [column_spec]}]}
        config_file = tmp_path / "cfg.json"
        config_file.write_text(json.dumps(payload))

        # Loading succeeds; the bad algorithm only surfaces on conversion.
        config = DeduplicationConfig.from_file(config_file)
        with pytest.raises(ConfigError) as caught:
            config.to_strategies()

        rendered = str(caught.value)
        assert "not_a_real_algo" in rendered
        assert "name" in rendered          # column name
        assert "strategy[0]" in rendered   # strategy index

    def test_standardize_options_bad_field_type_includes_column(self):
        """An unknown field type is reported with the column it came from."""
        from src.core.format_standardize import StandardizeOptions

        raw_options = {"column_types": {"my_col": "made_up"}}
        with pytest.raises(ConfigError) as caught:
            StandardizeOptions.from_dict(raw_options)

        rendered = str(caught.value)
        assert "my_col" in rendered
        assert "made_up" in rendered

    def test_standardize_dataframe_unknown_column(self):
        """A configured column absent from the frame raises a structured error."""
        from src.core.format_standardize import (
            FieldType,
            StandardizeOptions,
            standardize_dataframe,
        )

        frame = pd.DataFrame({"name": ["a"]})
        options = StandardizeOptions(column_types={"missing": FieldType.DATE})
        with pytest.raises(InputValidationError) as caught:
            standardize_dataframe(frame, options)

        rendered = str(caught.value)
        assert "missing" in rendered
        assert "['name']" in rendered