fix: cross-tool audit findings + alignment with format standardizer

Closes 12 bugs and 8 gaps surfaced by parallel audits across all core
modules, plus aligns the dedup-side normalizers with the new
format_standardize behavior where they had silently diverged.

Bugs (data integrity / correctness):
- dedup: NaN/None values matched as duplicates because str(None)='None'.
  Two rows with missing email silently merged.
- dedup: removed_df had 0 columns when nothing was removed; downstream
  code expecting matching schema broke. Now preserves column shape.
- dedup: ColumnMatchStrategy threshold accepted any value; out-of-range
  silently broke matching. Validated to [0, 100] in __post_init__.
- dedup: strategy referencing a missing column was silently skipped.
  Now raises ValueError listing available columns.
- fixes: replace_null_sentinels crashed on non-string sentinels (int/None
  from JSON payload). Now coerces each sentinel to str and skips None.
- fixes: _vectorized_regex_sub raised raw re.error on bad patterns. Now
  wraps as ValueError with clear message.
- io: detect_header_row mis-identified all-empty and metadata-only rows
  as headers (all([]) is True). Now requires ≥2 non-empty cells (≥1 for
  single-column files).
- config: from_dict crashed when JSON had unknown fields, breaking
  forward compat. Now filters to known fields.
- analyze: mixed-case email detector flagged all-None columns because
  str(None)='None' contains both an uppercase "N" and lowercase "one".
  Now drops NaN before stringifying.

New features and gap closures:
- io: _detect_excel_header_row mirrors detect_header_row for Excel via
  openpyxl read-only; _read_excel uses it when header_row=None.
- io: write_file gains delimiter + encoding params; .tsv extension
  defaults to tab.
- normalizers: normalize_phone preserves extensions as ;ext=N suffix.
- normalizers: normalize_address folds spelled-out US state names to
  2-letter codes (California ≡ CA).
- normalizers: normalize_name drops surname particles (van, de, von)
  so "Charles de Gaulle" ≡ "Charles Gaulle" for matching.
- analyze: new _detect_inconsistent_date_format detector flags columns
  with mixed ISO/US/EU date shapes; routes to format standardizer.
- analyze: _NULL_LIKE recognizes "<na>" (pd.NA repr).
- analyze: duplicate-row finding renamed count → n_extra (rows that
  would actually be removed) with clarified description.
- dedup: group_confidence no longer falsely 100.0 when transitive group
  members lack a recorded direct pair; falls back to 100.0 only when
  truly no pairs were observed.
- dedup: MatchResult / DeduplicationResult docstrings clarify that
  row_indices refer to the input frame's positional index (output index
  is reset).
- text_clean: visualize_hidden_html(None) now returns None (matches
  visualize_hidden_text); strip_bom strips at most one BOM per call;
  sentence_case dead elif branch removed.

Tests:
- tests/test_audit_fixes.py — 28 regression tests, one or more per
  numbered finding, named after BUG/GAP/NIT tags so future readers
  can trace each test back to its audit.
- tests/test_fixes_unit.py — 26 isolated unit tests for previously
  integration-only fix functions (trim_whitespace, strip_nbsp,
  strip_zero_width, normalize_line_endings, clean_headers,
  repair_mojibake — last skipped if ftfy unavailable).
- tests/test_io.py — adds CSV / TSV / semicolon / UTF-8-BOM round-trip
  tests + Excel auto-header-detection tests.
- tests/test_normalizers.py — adds 8 tests for the alignment work
  above (phone extension, state names, particles).

Adds .claude/ to .gitignore (agent worktrees + local settings).

Full project suite: 1197 passed, 4 skipped, 17 xfailed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-01 02:11:57 +00:00
parent 4adeb5c7f3
commit b23a27d4e3
13 changed files with 997 additions and 41 deletions

3
.gitignore vendored
View File

@@ -7,3 +7,6 @@ logs/
dist/ dist/
build/ build/
.pytest_cache/ .pytest_cache/
# Claude Code agent worktrees + local settings
.claude/

View File

@@ -125,6 +125,8 @@ _ZERO_WIDTH_CHARS = set("­")
_NULL_LIKE = { _NULL_LIKE = {
"n/a", "na", "nan", "null", "none", "#n/a", "#na", "-", "--", "n/a", "na", "nan", "null", "none", "#n/a", "#na", "-", "--",
"tbd", "unknown", "n.a.", "(null)", "tbd", "unknown", "n.a.", "(null)",
# Pandas-specific: NA values stringified via str(pd.NA) → "<NA>".
"<na>",
} }
# Mojibake fingerprints: classic UTF-8-as-cp1252 corruptions. # Mojibake fingerprints: classic UTF-8-as-cp1252 corruptions.
@@ -358,12 +360,80 @@ def _detect_mojibake(df: pd.DataFrame) -> list[Finding]:
)] )]
# Date-shaped patterns for the inconsistent-format detector.
_DATE_FORMAT_PATTERNS: dict[str, str] = {
"iso": r"^\d{4}-\d{1,2}-\d{1,2}$",
"us_slash": r"^\d{1,2}/\d{1,2}/\d{2,4}$",
"eu_dot": r"^\d{1,2}\.\d{1,2}\.\d{2,4}$",
"eu_slash": r"^\d{1,2}/\d{1,2}/\d{4}$", # may overlap us_slash; resolved by us_slash first
}
_DATE_FORMAT_RE: dict[str, "re.Pattern"] = {
name: re.compile(pat) for name, pat in _DATE_FORMAT_PATTERNS.items()
}
def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
    """Flag columns whose date-shaped values use multiple incompatible formats.

    Each non-blank value is assigned to at most one format: the first
    entry of ``_DATE_FORMAT_RE`` (in dict order) whose regex it matches.
    Without that exclusivity a value such as ``01/02/2024`` matches both
    the ``us_slash`` and ``eu_slash`` patterns, so a perfectly consistent
    column would be reported as mixed and its values counted twice.

    A finding is emitted for a column when all of the following hold:

    - it has at least 4 non-null, non-blank values;
    - at least two formats each claim 2 or more values;
    - the values claimed by those formats make up at least half of the
      non-blank values.

    Returns a list with one ``Finding`` per offending column (empty when
    no column qualifies). Columns that cannot be stringified are skipped.
    """
    findings: list[Finding] = []
    for col in df.columns:
        try:
            ser = df[col].dropna().astype(str)
        except Exception:
            # Best-effort detector: a column that cannot be stringified
            # is simply not inspected.
            continue
        nonempty = ser[ser.str.strip().astype(bool)]
        if len(nonempty) < 4:
            continue

        # Classify values exclusively: once a pattern claims a value it
        # is removed from the pool, so overlapping regexes cannot count
        # the same value under two format names.
        format_counts: dict[str, int] = {}
        unclaimed = nonempty
        for name, pat in _DATE_FORMAT_RE.items():
            if unclaimed.empty:
                break
            hits = unclaimed.str.match(pat).astype(bool)
            count = int(hits.sum())
            if count >= 2:
                format_counts[name] = count
            unclaimed = unclaimed[~hits]
        if len(format_counts) < 2:
            continue

        # Require at least 50% of values to be date-shaped overall.
        total_date_shaped = sum(format_counts.values())
        if total_date_shaped < len(nonempty) * 0.5:
            continue

        # Most frequent format first, e.g. "iso(12), us_slash(3)".
        format_summary = ", ".join(
            f"{n}({c})" for n, c in sorted(
                format_counts.items(), key=lambda kv: -kv[1]
            )
        )
        # NOTE(review): int(i) assumes integer index labels — confirm
        # callers never pass a frame with a non-integer index.
        samples_idx = nonempty.head(5)
        samples = [(int(i), str(col), str(v)) for i, v in samples_idx.items()]
        findings.append(Finding(
            id="inconsistent_date_format",
            severity="info",
            tool=TOOL_FORMAT_STANDARDIZER,
            count=int(total_date_shaped),
            description=(
                f"Column '{col}' contains dates in multiple formats: "
                f"{format_summary}. Run format standardizer to normalize."
            ),
            column=str(col),
            samples=samples,
            confidence="medium",
            fix_action=FIX_NONE,
        ))
    return findings
def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]: def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
findings: list[Finding] = [] findings: list[Finding] = []
for col in df.columns: for col in df.columns:
if not isinstance(col, str) or not _EMAIL_LIKE_COL.search(col): if not isinstance(col, str) or not _EMAIL_LIKE_COL.search(col):
continue continue
ser = df[col].astype(str) # Drop NaN/None *before* astype(str), otherwise None becomes the
# string "None" — which contains both upper "N" and lower "one"
# and would trigger a false-positive mixed-case finding on a
# column that has no real emails at all.
ser = df[col].dropna().astype(str)
nonempty = ser[ser.str.strip().astype(bool)] nonempty = ser[ser.str.strip().astype(bool)]
if nonempty.empty: if nonempty.empty:
continue continue
@@ -410,8 +480,12 @@ def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
n_dupes = int(dup_mask.sum()) n_dupes = int(dup_mask.sum())
if n_dupes < 2: if n_dupes < 2:
return [] return []
# Count *extra* copies, not total members of duplicate groups. # ``n_groups`` is the count of unique duplicate signatures; each
# group contains 2+ rows. ``n_extra`` is rows that would be removed
# by dedup (total in groups minus one survivor per group) — that's
# the number the user usually wants ("remove X to fix").
n_groups = int(norm[dup_mask].drop_duplicates().shape[0]) n_groups = int(norm[dup_mask].drop_duplicates().shape[0])
n_extra = n_dupes - n_groups
samples: list[tuple[int, str, str]] = [] samples: list[tuple[int, str, str]] = []
for i in df[dup_mask].index[:5]: for i in df[dup_mask].index[:5]:
# Render the first textual column's value as a sample. # Render the first textual column's value as a sample.
@@ -424,11 +498,12 @@ def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
id="near_duplicate_rows", id="near_duplicate_rows",
severity="info", severity="info",
tool=TOOL_DEDUPLICATOR, tool=TOOL_DEDUPLICATOR,
count=n_dupes, count=n_extra,
description=( description=(
f"{n_dupes} row(s) across ~{n_groups} group(s) are duplicates " f"{n_extra} extra copy(ies) across {n_groups} duplicate group(s) "
f"after stripping whitespace and lowercasing string columns. " f"({n_dupes} rows total) — duplicates after stripping whitespace "
f"Run the deduplicator to merge or remove." f"and lowercasing string columns. Run the deduplicator to merge "
f"or remove."
), ),
samples=samples, samples=samples,
confidence="medium", confidence="medium",
@@ -799,6 +874,7 @@ def analyze(
findings.extend(_detect_null_like_sentinels(df)) findings.extend(_detect_null_like_sentinels(df))
findings.extend(_detect_mojibake(df)) findings.extend(_detect_mojibake(df))
findings.extend(_detect_mixed_case_email(df)) findings.extend(_detect_mixed_case_email(df))
findings.extend(_detect_inconsistent_date_format(df))
findings.extend(_detect_leading_zero_ids(df)) findings.extend(_detect_leading_zero_ids(df))
findings.extend(_detect_near_duplicates(df)) findings.extend(_detect_near_duplicates(df))
return findings return findings

View File

@@ -3,7 +3,7 @@
from __future__ import annotations from __future__ import annotations
import json import json
from dataclasses import dataclass, field, asdict from dataclasses import dataclass, field, fields, asdict
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional
@@ -60,9 +60,16 @@ class DeduplicationConfig:
@classmethod @classmethod
def from_dict(cls, data: dict) -> DeduplicationConfig: def from_dict(cls, data: dict) -> DeduplicationConfig:
# Filter unknown fields silently — keeps loading forward-compatible
# when older code reads a config written by a newer version that
# added fields to ColumnStrategyConfig.
col_known = {f.name for f in fields(ColumnStrategyConfig)}
strategies = [] strategies = []
for s in data.get("strategies", []): for s in data.get("strategies", []):
cols = [ColumnStrategyConfig(**c) for c in s.get("columns", [])] cols = [
ColumnStrategyConfig(**{k: v for k, v in c.items() if k in col_known})
for c in s.get("columns", [])
]
strategies.append(StrategyConfig(columns=cols)) strategies.append(StrategyConfig(columns=cols))
return cls( return cls(
strategies=strategies, strategies=strategies,

View File

@@ -49,6 +49,18 @@ class ColumnMatchStrategy:
threshold: float = 100.0 # 0-100 scale threshold: float = 100.0 # 0-100 scale
normalizer: Optional[NormalizerType] = None normalizer: Optional[NormalizerType] = None
def __post_init__(self) -> None:
    """Validate ``threshold`` after the instance's fields are assigned.

    Raises:
        TypeError: if ``threshold`` is not an ``int`` or ``float``.
        ValueError: if ``threshold`` is outside ``[0, 100]``.
    """
    if not isinstance(self.threshold, (int, float)):
        raise TypeError(
            f"threshold must be a number, got {type(self.threshold).__name__}"
        )
    # The chained comparison is False for NaN, so NaN is rejected here
    # along with genuinely out-of-range values.
    if not 0 <= self.threshold <= 100:
        raise ValueError(
            f"threshold must be in [0, 100]; got {self.threshold}. "
            "Match scores are on a 0-100 scale, so values outside this "
            "range either always match or never match."
        )
@dataclass @dataclass
class MatchStrategy: class MatchStrategy:
@@ -61,7 +73,13 @@ class MatchStrategy:
@dataclass @dataclass
class MatchResult: class MatchResult:
"""One group of duplicate rows.""" """One group of duplicate rows.
``row_indices`` and ``survivor_index`` are positional indexes into
the *input* DataFrame (0-based, matching ``df.iloc[]``), not the
output ``deduplicated_df`` (whose index is reset to 0..N-1). To map
back to the original frame, use ``df.iloc[row_indices]``.
"""
group_id: int group_id: int
row_indices: list[int] row_indices: list[int]
confidence: float # min confidence across pairs in the group confidence: float # min confidence across pairs in the group
@@ -71,7 +89,13 @@ class MatchResult:
@dataclass @dataclass
class DeduplicationResult: class DeduplicationResult:
"""Full result of a deduplication run.""" """Full result of a deduplication run.
``deduplicated_df`` and ``removed_df`` both have their indexes reset
to a fresh 0..N-1 range. ``match_groups[*].row_indices`` keeps the
original positional indexes of the *input* frame so callers can
cross-reference back to it (e.g., for an audit log).
"""
original_row_count: int original_row_count: int
deduplicated_df: pd.DataFrame deduplicated_df: pd.DataFrame
removed_df: pd.DataFrame removed_df: pd.DataFrame
@@ -153,8 +177,21 @@ def _compare_pair(
for cs in strategy.column_strategies: for cs in strategy.column_strategies:
col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column
va = str(row_a.get(col, "")) raw_a = row_a.get(col, "")
vb = str(row_b.get(col, "")) raw_b = row_b.get(col, "")
# NaN / None always count as "empty" — never as the literal
# string "None" or "nan", which would otherwise let two rows
# with missing data in this column match at 100% similarity.
a_missing = raw_a is None or (
isinstance(raw_a, float) and pd.isna(raw_a)
) or raw_a is pd.NA
b_missing = raw_b is None or (
isinstance(raw_b, float) and pd.isna(raw_b)
) or raw_b is pd.NA
va = "" if a_missing else str(raw_a)
vb = "" if b_missing else str(raw_b)
# Skip if both empty # Skip if both empty
if not va and not vb: if not va and not vb:
@@ -221,17 +258,29 @@ def _find_match_groups(
raw_groups = uf.groups() raw_groups = uf.groups()
match_groups: list[MatchResult] = [] match_groups: list[MatchResult] = []
for gid, (root, members) in enumerate(sorted(raw_groups.items())): for gid, (root, members) in enumerate(sorted(raw_groups.items())):
# Confidence = min across all pairs in the group # Confidence = min across all directly-recorded pairs in the
group_confidence = 100.0 # group. Transitive members (A→B and B→C imply A→C) may not have
# a direct pair_info entry; we only count the recorded ones, so
# the score reflects observed evidence rather than the optimistic
# 100.0 default that masks weak links.
observed_confidences: list[float] = []
group_cols: set[str] = set() group_cols: set[str] = set()
for idx_a, m in enumerate(members): for idx_a, m in enumerate(members):
for idx_b in range(idx_a + 1, len(members)): for idx_b in range(idx_a + 1, len(members)):
key = (min(m, members[idx_b]), max(m, members[idx_b])) key = (min(m, members[idx_b]), max(m, members[idx_b]))
if key in pair_info: if key in pair_info:
conf, cols = pair_info[key] conf, cols = pair_info[key]
group_confidence = min(group_confidence, conf) observed_confidences.append(conf)
group_cols.update(cols) group_cols.update(cols)
if observed_confidences:
group_confidence = min(observed_confidences)
else:
# Edge case: a group with no recorded pair info (shouldn't
# happen for groups built from union-find on pair_info, but
# be defensive). Fall back to 100.0 only for trivial groups.
group_confidence = 100.0
match_groups.append(MatchResult( match_groups.append(MatchResult(
group_id=gid, group_id=gid,
row_indices=members, row_indices=members,
@@ -462,6 +511,17 @@ def deduplicate(
strategies = build_default_strategies(df) strategies = build_default_strategies(df)
log_entries.append(f"Auto-detected {len(strategies)} match strategies") log_entries.append(f"Auto-detected {len(strategies)} match strategies")
# Validate every strategy references real columns — silent skip
# would let a typo (``e_mail`` instead of ``email``) produce a
# confidently-empty result.
referenced = {cs.column for s in strategies for cs in s.column_strategies}
missing = sorted(c for c in referenced if c not in df.columns)
if missing:
raise ValueError(
f"Strategy references columns not present in the input: {missing}. "
f"Available columns: {list(df.columns)}"
)
# Log strategies # Log strategies
for i, s in enumerate(strategies): for i, s in enumerate(strategies):
cols_desc = ", ".join( cols_desc = ", ".join(
@@ -542,18 +602,21 @@ def deduplicate(
else: else:
deduplicated_df = df_work.iloc[keep_indices].copy() deduplicated_df = df_work.iloc[keep_indices].copy()
removed_df = df_work.iloc[sorted(remove_indices)].copy() if remove_indices else pd.DataFrame() if remove_indices:
removed_df = df_work.iloc[sorted(remove_indices)].copy()
else:
# Empty result: preserve column schema so downstream code can
# rely on ``removed_df.columns == deduplicated_df.columns``.
removed_df = df_work.iloc[0:0].copy()
# Drop shadow columns from output # Drop shadow columns from output
norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")] norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore") deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
if not removed_df.empty: removed_df = removed_df.drop(columns=norm_cols, errors="ignore")
removed_df = removed_df.drop(columns=norm_cols, errors="ignore")
# Reset index # Reset index
deduplicated_df = deduplicated_df.reset_index(drop=True) deduplicated_df = deduplicated_df.reset_index(drop=True)
if not removed_df.empty: removed_df = removed_df.reset_index(drop=True)
removed_df = removed_df.reset_index(drop=True)
removed_count = original_count - len(deduplicated_df) removed_count = original_count - len(deduplicated_df)
log_entries.append(f"Result: {original_count}{len(deduplicated_df)} rows ({removed_count} removed)") log_entries.append(f"Result: {original_count}{len(deduplicated_df)} rows ({removed_count} removed)")

View File

@@ -152,7 +152,17 @@ def _vectorized_translate(
def _vectorized_regex_sub( def _vectorized_regex_sub(
df: pd.DataFrame, pattern, repl: str, *, inplace: bool = False, df: pd.DataFrame, pattern, repl: str, *, inplace: bool = False,
) -> tuple[pd.DataFrame, int]: ) -> tuple[pd.DataFrame, int]:
"""``str.replace(regex=True)`` shortcut for regex-based fixes.""" """``str.replace(regex=True)`` shortcut for regex-based fixes.
Raises ``ValueError`` if *pattern* is malformed — callers (GUI/CLI)
surface this with a clear message rather than letting an
unannotated ``re.error`` propagate.
"""
try:
re.compile(pattern)
except re.error as e:
raise ValueError(f"Invalid regex pattern {pattern!r}: {e}") from e
out = df if inplace else df.copy() out = df if inplace else df.copy()
changed = 0 changed = 0
for col in out.columns: for col in out.columns:
@@ -319,7 +329,11 @@ def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) ->
sentinels = payload.get("sentinels") sentinels = payload.get("sentinels")
if sentinels is None: if sentinels is None:
sentinels = list(_a._NULL_LIKE) sentinels = list(_a._NULL_LIKE)
sentinel_set = {s.strip().lower() for s in sentinels} # Coerce non-string sentinels (the GUI / JSON payload may produce
# ints, floats, bools) instead of crashing on .strip().
sentinel_set = {
str(s).strip().lower() for s in sentinels if s is not None
}
def fix(s: str) -> str: def fix(s: str) -> str:
return "" if s.strip().lower() in sentinel_set else s return "" if s.strip().lower() in sentinel_set else s

View File

@@ -109,8 +109,18 @@ def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
break break
if not row: if not row:
continue continue
# All cells must be non-empty, non-numeric strings # Header heuristic:
if all(_looks_like_header(cell) for cell in row if cell.strip()): # - every non-empty cell looks like a header;
# - at least 2 non-empty cells (or just 1 in a single-column
# file). Without the count check, blank rows match
# vacuously (``all([])`` is True) and metadata banners
# like ``["Report 2024", "", ""]`` claim row 0 falsely.
non_empty = [cell for cell in row if cell.strip()]
min_required = 1 if len(row) <= 1 else 2
if (
len(non_empty) >= min_required
and all(_looks_like_header(cell) for cell in non_empty)
):
return idx return idx
return 0 return 0
@@ -263,7 +273,11 @@ def _read_excel(
header_row: Optional[int] = None, header_row: Optional[int] = None,
sheet_name: Optional[str | int] = 0, sheet_name: Optional[str | int] = 0,
) -> pd.DataFrame: ) -> pd.DataFrame:
hdr = header_row if header_row is not None else 0 hdr = (
header_row
if header_row is not None
else _detect_excel_header_row(path, sheet_name)
)
logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr) logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
return pd.read_excel( return pd.read_excel(
path, path,
@@ -275,6 +289,52 @@ def _read_excel(
) )
def _detect_excel_header_row(
    path: Path,
    sheet_name: Optional[str | int] = 0,
    max_scan: int = 20,
) -> int:
    """Guess the header row index of an Excel sheet.

    Excel counterpart of :func:`detect_header_row`. The workbook is
    opened through openpyxl in read-only mode and at most *max_scan*
    leading rows of the chosen sheet are inspected. The first row whose
    non-empty cells all satisfy ``_looks_like_header`` wins; such a row
    needs at least two non-empty cells, or one when the row has a single
    cell.

    *sheet_name* may be a positional index or a sheet title; an
    out-of-range index, an unknown title, or any other value selects the
    first sheet.

    Returns 0 when openpyxl is not installed, the workbook cannot be
    opened, or no scanned row qualifies.
    """
    try:
        from openpyxl import load_workbook
    except ImportError:
        return 0

    try:
        workbook = load_workbook(path, read_only=True, data_only=True)
    except Exception:
        return 0

    try:
        titles = workbook.sheetnames
        # Start from the first sheet and override only on a valid request.
        chosen = titles[0]
        if isinstance(sheet_name, int):
            if 0 <= sheet_name < len(titles):
                chosen = titles[sheet_name]
        elif isinstance(sheet_name, str) and sheet_name in titles:
            chosen = sheet_name

        sheet = workbook[chosen]
        for row_no, values in enumerate(sheet.iter_rows(values_only=True)):
            if row_no >= max_scan:
                break
            # Empty cells (None) and whitespace-only cells are ignored.
            filled = [
                str(v) for v in values
                if v is not None and str(v).strip()
            ]
            needed = 2 if len(values) > 1 else 1
            if len(filled) < needed:
                continue
            if all(_looks_like_header(text) for text in filled):
                return row_no
        return 0
    finally:
        # Release the file handle held by the read-only workbook.
        workbook.close()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Writing # Writing
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -285,6 +345,7 @@ def write_file(
*, *,
file_format: Optional[str] = None, file_format: Optional[str] = None,
encoding: str = "utf-8-sig", encoding: str = "utf-8-sig",
delimiter: Optional[str] = None,
) -> Path: ) -> Path:
"""Write a DataFrame to CSV or Excel. """Write a DataFrame to CSV or Excel.
@@ -292,8 +353,12 @@ def write_file(
---------- ----------
df : DataFrame to write df : DataFrame to write
path : output file path path : output file path
file_format : ``"csv"`` or ``"xlsx"``; auto-detected from *path* suffix if *None* file_format : ``"csv"``, ``"tsv"``, or ``"xlsx"``; auto-detected from
*path* suffix if *None*
encoding : output encoding (default ``utf-8-sig`` for Windows Excel compat) encoding : output encoding (default ``utf-8-sig`` for Windows Excel compat)
delimiter : field separator for delimited output. Defaults to ``,``
for ``.csv``, ``\\t`` for ``.tsv``, and the explicit value
otherwise. Ignored for Excel formats.
Returns the resolved output Path. Returns the resolved output Path.
""" """
@@ -302,7 +367,10 @@ def write_file(
if fmt in ("xlsx", "xls"): if fmt in ("xlsx", "xls"):
df.to_excel(out, index=False, engine="openpyxl") df.to_excel(out, index=False, engine="openpyxl")
else: else:
df.to_csv(out, index=False, encoding=encoding) sep = delimiter if delimiter is not None else (
"\t" if fmt == "tsv" else ","
)
df.to_csv(out, index=False, encoding=encoding, sep=sep)
logger.info("Wrote {} rows to {}", len(df), out) logger.info("Wrote {} rows to {}", len(df), out)
return out return out

View File

@@ -69,7 +69,13 @@ def normalize_email(value: Optional[str]) -> str:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
def normalize_phone(value: Optional[str], default_region: str = "US") -> str: def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
"""Parse with phonenumbers lib, return E.164. Fallback: digits-only.""" """Parse with phonenumbers lib, return E.164. Fallback: digits-only.
Extensions are preserved as a ``;ext=N`` suffix (RFC 3966 syntax) so
two records ``+15551234567 ext 100`` and ``+15551234567 ext 200``
don't normalize to the same key — they're different people at the
same business.
"""
if not value or not isinstance(value, str): if not value or not isinstance(value, str):
return "" return ""
stripped = value.strip() stripped = value.strip()
@@ -79,7 +85,10 @@ def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
try: try:
parsed = phonenumbers.parse(stripped, default_region) parsed = phonenumbers.parse(stripped, default_region)
if phonenumbers.is_possible_number(parsed): if phonenumbers.is_possible_number(parsed):
return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164) base = phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
if parsed.extension:
return f"{base};ext={parsed.extension}"
return base
except phonenumbers.NumberParseException: except phonenumbers.NumberParseException:
pass pass
@@ -100,10 +109,16 @@ _NAME_SUFFIXES = {
"jr", "sr", "ii", "iii", "iv", "v", "jr", "sr", "ii", "iii", "iv", "v",
"phd", "md", "esq", "dds", "rn", "phd", "md", "esq", "dds", "rn",
} }
# Surname particles dropped during normalization so that
# ``Charles de Gaulle`` and ``Charles Gaulle`` produce the same key.
_NAME_PARTICLES_DROP = {
"van", "von", "de", "da", "del", "della", "di", "du",
"der", "den", "le", "la", "el",
}
def normalize_name(value: Optional[str]) -> str: def normalize_name(value: Optional[str]) -> str:
"""Strip titles/suffixes, collapse whitespace, case-fold.""" """Strip titles/suffixes/particles, collapse whitespace, case-fold."""
if not value or not isinstance(value, str): if not value or not isinstance(value, str):
return "" return ""
name = value.strip() name = value.strip()
@@ -126,6 +141,9 @@ def normalize_name(value: Optional[str]) -> str:
while parts and parts[-1].rstrip(".") in _NAME_SUFFIXES: while parts and parts[-1].rstrip(".") in _NAME_SUFFIXES:
parts.pop() parts.pop()
# Drop surname particles wherever they appear.
parts = [p for p in parts if p not in _NAME_PARTICLES_DROP]
return " ".join(parts) return " ".join(parts)
@@ -178,8 +196,34 @@ _USPS_ABBREVIATIONS: dict[str, str] = {
} }
# US state name → 2-letter postal code. Substituted before tokenization
# so ``California`` and ``CA`` normalize to the same key.
_US_STATE_NAMES_NORM: dict[str, str] = {
"alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
"california": "ca", "colorado": "co", "connecticut": "ct",
"delaware": "de", "florida": "fl", "georgia": "ga", "hawaii": "hi",
"idaho": "id", "illinois": "il", "indiana": "in", "iowa": "ia",
"kansas": "ks", "kentucky": "ky", "louisiana": "la", "maine": "me",
"maryland": "md", "massachusetts": "ma", "michigan": "mi",
"minnesota": "mn", "mississippi": "ms", "missouri": "mo",
"montana": "mt", "nebraska": "ne", "nevada": "nv",
"new hampshire": "nh", "new jersey": "nj", "new mexico": "nm",
"new york": "ny", "north carolina": "nc", "north dakota": "nd",
"ohio": "oh", "oklahoma": "ok", "oregon": "or", "pennsylvania": "pa",
"rhode island": "ri", "south carolina": "sc", "south dakota": "sd",
"tennessee": "tn", "texas": "tx", "utah": "ut", "vermont": "vt",
"virginia": "va", "washington": "wa", "west virginia": "wv",
"wisconsin": "wi", "wyoming": "wy",
"district of columbia": "dc",
}
def normalize_address(value: Optional[str]) -> str: def normalize_address(value: Optional[str]) -> str:
"""USPS abbreviation normalization, collapse whitespace, case-fold.""" """USPS abbreviation normalization, collapse whitespace, case-fold.
Spelled-out US state names are folded to their 2-letter codes so
``California`` and ``CA`` normalize to the same matching key.
"""
if not value or not isinstance(value, str): if not value or not isinstance(value, str):
return "" return ""
addr = value.strip() addr = value.strip()
@@ -190,6 +234,13 @@ def normalize_address(value: Optional[str]) -> str:
addr = addr.casefold() addr = addr.casefold()
addr = addr.replace(".", " ").replace(",", " ") addr = addr.replace(".", " ").replace(",", " ")
# State names → 2-letter codes (longest first so ``new york`` wins
# over ``new``-as-a-fragment).
for full, code in sorted(
_US_STATE_NAMES_NORM.items(), key=lambda kv: -len(kv[0])
):
addr = re.sub(rf"(?<!\w){re.escape(full)}(?!\w)", code, addr)
parts = addr.split() parts = addr.split()
normalized_parts = [] normalized_parts = []
for part in parts: for part in parts:

View File

@@ -191,10 +191,15 @@ def strip_zero_width(s: str) -> str:
def strip_bom(s: str) -> str: def strip_bom(s: str) -> str:
"""Remove a leading ``U+FEFF`` (BOM) from the start of the string.""" """Remove a leading ``U+FEFF`` (BOM) from the start of the string.
Strips at most one BOM — multiple consecutive BOMs are unusual and
the second one likely indicates concatenation artifact the caller
should preserve so the issue stays visible.
"""
if not isinstance(s, str): if not isinstance(s, str):
return s return s
return s.lstrip("") return s[1:] if s.startswith("") else s
def strip_control(s: str) -> str: def strip_control(s: str) -> str:
@@ -252,6 +257,9 @@ def smart_title_case(s: str) -> str:
out.append(tok) out.append(tok)
continue continue
lowered = tok.lower() lowered = tok.lower()
# Particles stay lowercase only mid-string. The first and last
# words of a title always capitalize, even when they're particles
# (``A Story to Tell`` — first word ``A`` is capitalized).
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES: if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
out.append(lowered) out.append(lowered)
continue continue
@@ -278,7 +286,12 @@ def smart_title_case(s: str) -> str:
def sentence_case(s: str) -> str: def sentence_case(s: str) -> str:
"""Lowercase, then capitalize the first cased letter after each ``. ! ?``.""" """Lowercase, then capitalize the first cased letter after each ``. ! ?``.
Non-letter, non-terminator characters (like opening quotes or
parens) don't consume the "next letter" trigger, so ``"hello." "world"``
becomes ``"Hello." "World"``.
"""
if not isinstance(s, str) or not s: if not isinstance(s, str) or not s:
return s return s
lowered = s.lower() lowered = s.lower()
@@ -291,11 +304,6 @@ def sentence_case(s: str) -> str:
if capitalize_next and c.isalpha(): if capitalize_next and c.isalpha():
chars[i] = c.upper() chars[i] = c.upper()
capitalize_next = False capitalize_next = False
elif c.strip():
# Any non-whitespace, non-letter (e.g., quote, paren) doesn't
# consume the "next letter" trigger.
if c.isalpha():
capitalize_next = False
return "".join(chars) return "".join(chars)
@@ -698,7 +706,7 @@ def visualize_hidden_html(s: str, *, mark_outer_whitespace: bool = False) -> str
the page. the page.
""" """
if not isinstance(s, str): if not isinstance(s, str):
return "" return s # mirror visualize_hidden_text: pass non-strings through
leading = "" leading = ""
trailing = "" trailing = ""

303
tests/test_audit_fixes.py Normal file
View File

@@ -0,0 +1,303 @@
"""Regression tests for bugs surfaced by the cross-tool audit.
Each test pins a specific behavioral bug or gap that an audit
identified. Test names match the BUG-N / GAP-N tags in the audit
notes so a future reader can trace why each test exists.
"""
from __future__ import annotations
import json
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
from src.core.analyze import _NULL_LIKE, _detect_mixed_case_email
import src.core.fixes as f
from src.core.config import (
ColumnStrategyConfig,
DeduplicationConfig,
StrategyConfig,
)
from src.core.dedup import (
Algorithm,
ColumnMatchStrategy,
MatchStrategy,
deduplicate,
)
from src.core.io import detect_header_row
from src.core.text_clean import sentence_case, smart_title_case, strip_bom
# ---------------------------------------------------------------------------
# BUG-1: dedup NaN values must not match as duplicates
# ---------------------------------------------------------------------------
class TestDedupNaNHandling:
    """Missing values in a match column must never form a duplicate group."""

    def test_two_nan_emails_do_not_match(self):
        # Every email is NaN and no other column takes part in matching.
        # Before the fix the stringified missing values compared equal
        # under EXACT matching, so the two rows were silently merged.
        frame = pd.DataFrame({
            "id": [1, 2],
            "email": [np.nan, np.nan],
        })
        email_exact = ColumnMatchStrategy(
            column="email", algorithm=Algorithm.EXACT, threshold=100.0,
        )
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[email_exact])],
        )
        assert len(outcome.deduplicated_df) == 2
        assert len(outcome.match_groups) == 0

    def test_one_nan_one_real_does_not_match(self):
        # A missing value must not pair up with a real address either.
        frame = pd.DataFrame({"email": [np.nan, "alice@example.com"]})
        email_exact = ColumnMatchStrategy(
            column="email", algorithm=Algorithm.EXACT,
        )
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[email_exact])],
        )
        assert len(outcome.deduplicated_df) == 2

    def test_none_does_not_match_string_none(self):
        # A real ``None`` and the literal text "None" are different values.
        frame = pd.DataFrame({"name": [None, "None"]})
        name_exact = ColumnMatchStrategy(
            column="name", algorithm=Algorithm.EXACT,
        )
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[name_exact])],
        )
        assert len(outcome.deduplicated_df) == 2
# ---------------------------------------------------------------------------
# BUG-2: removed_df must preserve column schema even when empty
# ---------------------------------------------------------------------------
class TestDedupRemovedDfSchema:
    """``removed_df`` keeps the input's column layout even when it is empty."""

    def test_empty_removed_df_has_same_columns(self):
        # Three distinct emails, so nothing is removed.
        frame = pd.DataFrame({
            "name": ["alice", "bob", "carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        })
        email_exact = ColumnMatchStrategy(
            column="email", algorithm=Algorithm.EXACT,
        )
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[email_exact])],
        )
        removed = outcome.removed_df
        kept = outcome.deduplicated_df
        # Zero rows removed, yet the column list must mirror the kept frame.
        assert len(removed) == 0
        assert list(removed.columns) == list(kept.columns)
# ---------------------------------------------------------------------------
# GAP-3: missing column reference should raise
# ---------------------------------------------------------------------------
class TestDedupMissingColumn:
    """A strategy that names an absent column must fail loudly."""

    def test_missing_column_raises(self):
        # The frame has "email"; the strategy asks for the misspelt "e_mail".
        frame = pd.DataFrame({"email": ["a@x.com"]})
        misspelt = ColumnMatchStrategy(
            column="e_mail", algorithm=Algorithm.EXACT,
        )
        plan = [MatchStrategy(column_strategies=[misspelt])]
        with pytest.raises(ValueError, match="not present in the input"):
            deduplicate(frame, strategies=plan)
# ---------------------------------------------------------------------------
# GAP-4: threshold must be in [0, 100]
# ---------------------------------------------------------------------------
class TestThresholdValidation:
    """``ColumnMatchStrategy`` validates ``threshold`` when constructed."""

    def test_negative_threshold_rejected(self):
        below_range = -1
        with pytest.raises(ValueError, match=r"\[0, 100\]"):
            ColumnMatchStrategy(column="x", threshold=below_range)

    def test_over_hundred_rejected(self):
        above_range = 101
        with pytest.raises(ValueError, match=r"\[0, 100\]"):
            ColumnMatchStrategy(column="x", threshold=above_range)

    def test_zero_and_hundred_allowed(self):
        # Both ends of the closed interval are valid: construction must
        # simply not raise.
        for boundary in (0, 100):
            ColumnMatchStrategy(column="x", threshold=boundary)

    def test_non_numeric_rejected(self):
        with pytest.raises(TypeError):
            ColumnMatchStrategy(column="x", threshold="high")  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# BUG-9: replace_null_sentinels must coerce non-string sentinels
# ---------------------------------------------------------------------------
class TestReplaceNullSentinelsTypes:
    """``replace_null_sentinels`` tolerates non-string entries in ``sentinels``."""

    def test_int_sentinels_do_not_crash(self):
        frame = pd.DataFrame({"x": ["0", "5", ""]})
        options = {"sentinels": [0, "5"]}
        cleaned, _ = f.replace_null_sentinels(frame, options)
        # Row 0: "0" matches the int sentinel 0 once it is stringified.
        # Row 1: "5" matches the string sentinel.
        # Row 2: was already empty.
        for row in (0, 1, 2):
            assert cleaned.loc[row, "x"] == ""

    def test_none_sentinel_skipped(self):
        frame = pd.DataFrame({"x": ["a", "b"]})
        # A ``None`` entry in the sentinel list must not crash the call.
        options = {"sentinels": ["a", None]}
        cleaned, _ = f.replace_null_sentinels(frame, options)
        assert cleaned.loc[0, "x"] == ""
        assert cleaned.loc[1, "x"] == "b"
# ---------------------------------------------------------------------------
# BUG-10: malformed regex should raise ValueError, not re.error
# ---------------------------------------------------------------------------
class TestVectorizedRegexErrorHandling:
    """A bad pattern surfaces as ``ValueError`` with a readable message."""

    def test_malformed_pattern_raises_valueerror(self):
        frame = pd.DataFrame({"x": ["abc"]})
        unterminated_class = "[invalid"  # character class is never closed
        with pytest.raises(ValueError, match="Invalid regex pattern"):
            f._vectorized_regex_sub(frame, unterminated_class, "")
# ---------------------------------------------------------------------------
# NIT-12: strip_bom strips at most one BOM
# ---------------------------------------------------------------------------
class TestStripBomSingleChar:
    """``strip_bom`` removes at most one leading byte-order mark (U+FEFF).

    The BOM is written as an explicit escape sequence rather than a literal
    character: a literal U+FEFF is invisible in editors and diffs and is
    easily dropped by tooling, which would silently turn these tests into
    ``"hello" == "hello"`` no-ops.
    """

    def test_strips_one_leading_bom(self):
        assert strip_bom("\ufeffhello") == "hello"

    def test_does_not_strip_multiple_consecutive_boms(self):
        # Per docstring: "at most one BOM". The second BOM stays so the
        # caller can see something odd happened.
        assert strip_bom("\ufeff\ufeffhello") == "\ufeffhello"

    def test_no_bom_unchanged(self):
        assert strip_bom("hello") == "hello"

    def test_non_string_passthrough(self):
        assert strip_bom(None) is None  # type: ignore[arg-type]
# ---------------------------------------------------------------------------
# Smart title case — particle behavior at boundaries (regression / docs)
# ---------------------------------------------------------------------------
class TestSmartTitleCaseBoundaries:
    """Particles are lowercased mid-title but capitalized at either end."""

    def test_first_word_particle_capitalized(self):
        # "a" is a particle, but as the first word of the title it must
        # still be capitalized.
        titled = smart_title_case("a story")
        assert titled == "A Story"

    def test_last_word_particle_capitalized(self):
        # "to" is the final word here, so it must be capitalized too.
        titled = smart_title_case("things to")
        assert titled == "Things To"

    def test_mid_string_particles_lowercase(self):
        titled = smart_title_case("the cat in the hat")
        assert titled == "The Cat in the Hat"
# ---------------------------------------------------------------------------
# NIT-14: sentence_case dead branch removed — regression guard
# ---------------------------------------------------------------------------
class TestSentenceCaseUnchanged:
    """Guards ``sentence_case`` output after the dead-branch removal."""

    def test_basic(self):
        cased = sentence_case("hello. world.")
        assert cased == "Hello. World."

    def test_open_paren_does_not_consume_trigger(self):
        # Removing the dead branch was not meant to change behavior.
        # Opening punctuation (here a double quote) is not itself
        # capitalized and does not reset the pending "capitalize next
        # letter" trigger, so the letter after it is still upper-cased.
        cased = sentence_case('hello. "world"')
        assert cased == 'Hello. "World"'
# ---------------------------------------------------------------------------
# BUG-18: detect_header_row must not pick all-empty rows
# ---------------------------------------------------------------------------
class TestDetectHeaderRowEmptyRows:
    """``detect_header_row`` must not select a row with only empty cells."""

    def test_all_empty_first_row_skipped(self, tmp_path: Path):
        # Row 0 holds only empty cells, so the real header sits on row 1.
        csv_path = tmp_path / "blank_first.csv"
        csv_path.write_text(",,\nname,email,phone\nalice,a@x.com,555\n")
        detected = detect_header_row(csv_path)
        assert detected == 1

    def test_pure_header_at_row_zero(self, tmp_path: Path):
        # Ordinary file: the header is the very first row.
        csv_path = tmp_path / "normal.csv"
        csv_path.write_text("name,email,phone\nalice,a@x.com,555\n")
        detected = detect_header_row(csv_path)
        assert detected == 0
# ---------------------------------------------------------------------------
# BUG-20: config.from_dict must accept unknown fields (forward compat)
# ---------------------------------------------------------------------------
class TestConfigForwardCompat:
    """Configs carrying fields this version does not know must still load."""

    def test_extra_field_in_column_config_ignored(self):
        # Simulate a config written by a future version with an extra
        # ``priority`` field. No file is involved, so no ``tmp_path``
        # fixture is requested.
        config_dict = {
            "strategies": [{
                "columns": [{
                    "column": "email",
                    "algorithm": "exact",
                    "threshold": 100.0,
                    "normalizer": None,
                    "priority": 5,  # future field — must not crash
                }],
            }],
            "survivor_rule": "first",
            "merge": False,
        }
        loaded = DeduplicationConfig.from_dict(config_dict)
        assert len(loaded.strategies) == 1
        assert loaded.strategies[0].columns[0].column == "email"

    def test_roundtrip_then_reload_with_extra(self, tmp_path: Path):
        cfg = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[
                ColumnStrategyConfig(column="email"),
            ])],
        )
        path = tmp_path / "cfg.json"
        cfg.to_file(path)
        # Manually inject an unknown field into the serialized file to
        # simulate a newer writer, then reload through the file API.
        data = json.loads(path.read_text())
        data["strategies"][0]["columns"][0]["future_thing"] = "abc"
        path.write_text(json.dumps(data))
        loaded = DeduplicationConfig.from_file(path)
        assert loaded.strategies[0].columns[0].column == "email"
# ---------------------------------------------------------------------------
# BUG-22: mixed-case email detector must not flag all-None columns
# ---------------------------------------------------------------------------
class TestMixedCaseEmailFalsePositive:
    """The mixed-case email detector ignores columns holding only ``None``."""

    def test_all_none_email_column_no_finding(self):
        frame = pd.DataFrame({"email": [None, None, None]})
        reported = _detect_mixed_case_email(frame)
        assert reported == []

    def test_real_mixed_case_still_flagged(self):
        # One address has upper-case letters, the other is all lower-case.
        frame = pd.DataFrame({"email": ["Alice@X.com", "bob@y.com"]})
        reported = _detect_mixed_case_email(frame)
        assert len(reported) == 1
        only_finding = reported[0]
        assert only_finding.column == "email"
# ---------------------------------------------------------------------------
# NIT-24: <NA> recognized as a null-like sentinel
# ---------------------------------------------------------------------------
class TestNullLikeIncludesPandasNA:
    """``_NULL_LIKE`` contains the text form of ``pd.NA``."""

    def test_pd_na_string_repr_recognized(self):
        # str(pd.NA) is "<NA>". When a DataFrame is loaded with
        # keep_default_na=False, pandas NA values show up as that literal
        # string and the analyzer should flag them.
        # NOTE(review): the lookup key is lower-case, so the analyzer
        # presumably lower-cases values before consulting _NULL_LIKE —
        # confirm against src.core.analyze.
        lowered_repr = "<na>"
        assert lowered_repr in _NULL_LIKE

238
tests/test_fixes_unit.py Normal file
View File

@@ -0,0 +1,238 @@
"""Isolated unit tests for individual fix functions in src.core.fixes.
The integration tests at tests/test_normalize.py exercise these
functions through the full analyze→fix pipeline. These tests pin each
function's behavior in isolation so a regression surfaces close to the
broken function rather than at the pipeline output.
"""
from __future__ import annotations
import pandas as pd
import pytest
from src.core.fixes import (
clean_headers,
normalize_line_endings,
repair_mojibake,
strip_nbsp,
strip_zero_width,
trim_whitespace,
)
# ---------------------------------------------------------------------------
# trim_whitespace
# ---------------------------------------------------------------------------
class TestTrimWhitespace:
    """Unit tests for ``trim_whitespace``.

    Each test unpacks the result as ``(frame, changed)``.
    """

    def test_strips_leading_trailing(self):
        df = pd.DataFrame({"x": [" hello ", " world "]})
        out, changed = trim_whitespace(df)
        assert list(out["x"]) == ["hello", "world"]
        assert changed == 2

    def test_collapses_internal_runs(self):
        # The input must really contain multi-space runs; with single
        # spaces the expected output equals the input and the test would
        # pass even if collapsing were broken.
        df = pd.DataFrame({"x": ["a   b    c"]})
        out, _ = trim_whitespace(df)
        assert out.loc[0, "x"] == "a b c"

    def test_preserves_internal_in_structured(self):
        # Phone-shaped strings keep internal spacing (often semantic).
        df = pd.DataFrame({"x": ["(555) 123-4567"]})
        out, changed = trim_whitespace(df)
        assert out.loc[0, "x"] == "(555) 123-4567"
        assert changed == 0

    def test_empty_df(self):
        df = pd.DataFrame({"x": []})
        out, changed = trim_whitespace(df)
        assert len(out) == 0
        assert changed == 0

    def test_no_string_columns(self):
        df = pd.DataFrame({"n": [1, 2, 3]})
        out, changed = trim_whitespace(df)
        assert changed == 0
        assert list(out["n"]) == [1, 2, 3]

    def test_nan_preserved(self):
        df = pd.DataFrame({"x": [" ok ", None]})
        out, _ = trim_whitespace(df)
        assert out.loc[0, "x"] == "ok"
        # The missing cell either stays missing or becomes an empty string.
        # pd.isna covers None, float NaN and pd.NA alike; a bare
        # ``is None`` check fails for NaN, and ``pd.NA == ""`` cannot be
        # used as a boolean.
        missing = out.loc[1, "x"]
        assert pd.isna(missing) or missing == ""

    def test_idempotent(self):
        # Outer padding plus an internal run, so the first pass has work
        # to do and the second pass must find nothing left.
        df = pd.DataFrame({"x": ["  hello   world  "]})
        out1, _ = trim_whitespace(df)
        out2, changed2 = trim_whitespace(out1)
        assert changed2 == 0
        assert list(out2["x"]) == list(out1["x"])
# ---------------------------------------------------------------------------
# strip_nbsp
# ---------------------------------------------------------------------------
class TestStripNbsp:
    """Unit tests for ``strip_nbsp``.

    Non-ASCII space characters are written as escape sequences so that
    they stay visible in review and survive whitespace-normalizing tools.
    """

    def test_replaces_nbsp_with_ascii_space(self):
        # U+00A0 NO-BREAK SPACE between the letters.
        df = pd.DataFrame({"x": ["a\u00a0b"]})
        out, changed = strip_nbsp(df)
        assert out.loc[0, "x"] == "a b"
        assert changed == 1

    def test_no_change_when_clean(self):
        df = pd.DataFrame({"x": ["a b c"]})
        out, changed = strip_nbsp(df)
        assert changed == 0

    def test_other_unicode_spaces(self):
        # Em space (U+2003), thin space (U+2009)
        df = pd.DataFrame({"x": ["a\u2003b\u2009c"]})
        out, _ = strip_nbsp(df)
        assert out.loc[0, "x"] == "a b c"

    def test_idempotent(self):
        # Two consecutive no-break spaces; a second pass must be a no-op.
        df = pd.DataFrame({"x": ["a\u00a0\u00a0b"]})
        out1, _ = strip_nbsp(df)
        out2, changed2 = strip_nbsp(out1)
        assert changed2 == 0
# ---------------------------------------------------------------------------
# strip_zero_width
# ---------------------------------------------------------------------------
class TestStripZeroWidth:
    """Unit tests for ``strip_zero_width``.

    Zero-width characters are written as escape sequences: as literals
    they are invisible, and a test whose input silently lost them would
    assert ``changed == 1`` on already-clean text.
    """

    def test_removes_zero_width_space(self):
        # U+200B ZERO WIDTH SPACE
        df = pd.DataFrame({"x": ["a\u200bb"]})
        out, changed = strip_zero_width(df)
        assert out.loc[0, "x"] == "ab"
        assert changed == 1

    def test_removes_zero_width_joiner(self):
        # U+200D ZERO WIDTH JOINER
        df = pd.DataFrame({"x": ["a\u200db"]})
        out, _ = strip_zero_width(df)
        assert out.loc[0, "x"] == "ab"

    def test_clean_passthrough(self):
        df = pd.DataFrame({"x": ["clean"]})
        out, changed = strip_zero_width(df)
        assert changed == 0

    def test_idempotent(self):
        # Mixed zero-width characters; a second pass must be a no-op.
        df = pd.DataFrame({"x": ["a\u200bb\u200dc"]})
        out1, _ = strip_zero_width(df)
        out2, changed2 = strip_zero_width(out1)
        assert changed2 == 0
# ---------------------------------------------------------------------------
# normalize_line_endings
# ---------------------------------------------------------------------------
class TestNormalizeLineEndings:
    """Unit tests for ``normalize_line_endings`` (CRLF and bare CR become LF)."""

    def test_crlf_to_lf(self):
        frame = pd.DataFrame({"x": ["line1\r\nline2"]})
        fixed, n_changed = normalize_line_endings(frame)
        assert fixed.loc[0, "x"] == "line1\nline2"
        assert n_changed == 1

    def test_bare_cr_to_lf(self):
        frame = pd.DataFrame({"x": ["line1\rline2"]})
        fixed, _ = normalize_line_endings(frame)
        assert fixed.loc[0, "x"] == "line1\nline2"

    def test_already_lf_unchanged(self):
        frame = pd.DataFrame({"x": ["line1\nline2"]})
        _, n_changed = normalize_line_endings(frame)
        assert n_changed == 0

    def test_idempotent(self):
        # One CRLF and one bare CR in the same cell; the second pass must
        # report no changes.
        frame = pd.DataFrame({"x": ["a\r\nb\rc"]})
        first_pass, _ = normalize_line_endings(frame)
        _, n_changed_again = normalize_line_endings(first_pass)
        assert n_changed_again == 0
# ---------------------------------------------------------------------------
# clean_headers
# ---------------------------------------------------------------------------
class TestCleanHeaders:
    """Unit tests for ``clean_headers``.

    Invisible characters in column labels are written as escape sequences;
    as literals the "dirty" and "clean" labels look identical, which makes
    the in / not-in assertions unreadable and easy to break.
    """

    def test_strips_bom_from_header(self):
        # First label starts with U+FEFF (byte-order mark).
        df = pd.DataFrame({"\ufeffname": [1], "email": [2]})
        out, changed = clean_headers(df)
        assert "name" in out.columns
        assert "\ufeffname" not in out.columns
        assert changed >= 1

    def test_strips_nbsp_from_header(self):
        # U+00A0 NO-BREAK SPACE inside the label.
        df = pd.DataFrame({"first\u00a0name": [1]})
        out, _ = clean_headers(df)
        assert "first name" in out.columns

    def test_strips_trailing_whitespace_from_header(self):
        df = pd.DataFrame({"Email ": [1]})
        out, _ = clean_headers(df)
        assert "Email" in out.columns
        assert "Email " not in out.columns

    def test_non_string_label_preserved(self):
        # Integer labels must pass through untouched.
        df = pd.DataFrame({0: [1], 1: [2]})
        out, changed = clean_headers(df)
        assert list(out.columns) == [0, 1]
        assert changed == 0

    def test_clean_headers_idempotent(self):
        # Start from a dirty label so the first pass has work to do; the
        # second pass must then change nothing.
        df = pd.DataFrame({"\ufeffname": [1]})
        out1, _ = clean_headers(df)
        out2, changed2 = clean_headers(out1)
        assert changed2 == 0
        assert list(out2.columns) == list(out1.columns)
# ---------------------------------------------------------------------------
# repair_mojibake
# ---------------------------------------------------------------------------
# Record whether the optional ftfy package can be imported; the mojibake
# tests use this flag in their skip conditions.
try:
    import ftfy  # noqa: F401
except ImportError:
    _HAS_FTFY = False
else:
    _HAS_FTFY = True
@pytest.mark.skipif(not _HAS_FTFY, reason="ftfy library not installed — fix is a no-op")
class TestRepairMojibake:
    """Unit tests for ``repair_mojibake`` when ftfy is available.

    Mojibake inputs are spelled with escape sequences. Written literally,
    the miscoded and the correct text are easy to confuse and can be
    "repaired" by editors or text tooling, leaving a test that expects a
    change on already-clean input.
    """

    def test_classic_cafe_repair(self):
        # "café" encoded as UTF-8 (C3 A9) but decoded as Latin-1 shows up
        # as "caf" + U+00C3 + U+00A9.
        df = pd.DataFrame({"x": ["caf\u00c3\u00a9"]})
        out, changed = repair_mojibake(df)
        assert out.loc[0, "x"] == "caf\u00e9"
        assert changed == 1

    def test_clean_text_unchanged(self):
        # Correctly encoded text (U+00E9) must be left alone.
        df = pd.DataFrame({"x": ["caf\u00e9"]})
        out, changed = repair_mojibake(df)
        assert changed == 0

    def test_no_string_columns(self):
        df = pd.DataFrame({"n": [1, 2]})
        out, changed = repair_mojibake(df)
        assert changed == 0

    def test_idempotent(self):
        # Start from miscoded text so the first pass repairs something;
        # the second pass must then report no changes.
        df = pd.DataFrame({"x": ["caf\u00c3\u00a9"]})
        out1, _ = repair_mojibake(df)
        out2, changed2 = repair_mojibake(out1)
        assert changed2 == 0
class TestRepairMojibakeNoFtfy:
    """``repair_mojibake`` when ftfy cannot be imported."""

    @pytest.mark.skipif(_HAS_FTFY, reason="ftfy installed — exercises the no-op path")
    def test_returns_input_unchanged_without_ftfy(self):
        # Feed genuinely miscoded text ("café" read as Latin-1) so the
        # test shows the value is passed through as-is, not merely that
        # clean text stays clean.
        miscoded = "caf\u00c3\u00a9"
        df = pd.DataFrame({"x": [miscoded]})
        out, changed = repair_mojibake(df)
        assert changed == 0
        assert out.loc[0, "x"] == miscoded

View File

@@ -261,3 +261,78 @@ class TestReadCsvRepaired:
df, repair = read_csv_repaired(f) df, repair = read_csv_repaired(f)
assert len(df) == 2 assert len(df) == 2
assert repair.changed is False assert repair.changed is False
# ---------------------------------------------------------------------------
# Round-trip integrity (audit GAP-19, GAP-21)
# ---------------------------------------------------------------------------
class TestRoundTrip:
    """Write a frame with ``write_file``, read it back with ``read_file``."""

    def test_csv_roundtrip_preserves_values(self, tmp_path):
        df = pd.DataFrame({
            "id": ["1", "2", "3"],
            "name": ["Alice", "Bob", "Carol"],
            "amount": ["10.50", "20.25", "30.00"],
        })
        path = tmp_path / "rt.csv"
        write_file(df, path)
        loaded = read_file(path)
        assert list(loaded.columns) == list(df.columns)
        assert len(loaded) == len(df)
        for col in df.columns:
            assert list(loaded[col]) == list(df[col])

    def test_tsv_roundtrip_via_extension(self, tmp_path):
        df = pd.DataFrame({"a": ["1", "2"], "b": ["x", "y, z"]})
        path = tmp_path / "rt.tsv"
        write_file(df, path)
        # Confirm a tab really is used as the delimiter. Checked on raw
        # bytes so the assertion does not depend on the output encoding.
        assert b"\t" in path.read_bytes()
        # The embedded comma in 'b' must survive the round trip.
        loaded = read_file(path)
        assert list(loaded.columns) == ["a", "b"]
        assert loaded.iloc[1]["b"] == "y, z"

    def test_semicolon_roundtrip_via_explicit_delimiter(self, tmp_path):
        df = pd.DataFrame({"a": ["1", "2"], "b": ["x", "y"]})
        path = tmp_path / "rt.csv"
        write_file(df, path, delimiter=";")
        # read_file is given no delimiter hint here.
        loaded = read_file(path)
        assert list(loaded.columns) == ["a", "b"]
        assert loaded.iloc[0]["a"] == "1"

    def test_utf8_bom_non_ascii_roundtrip(self, tmp_path):
        df = pd.DataFrame({"name": ["café", "naïve", "résumé"]})
        path = tmp_path / "utf8.csv"
        write_file(df, path)
        loaded = read_file(path)
        assert list(loaded["name"]) == ["café", "naïve", "résumé"]
class TestExcelHeaderDetection:
    """``read_file`` locates the header row of an ``.xlsx`` workbook."""

    @staticmethod
    def _save_sheet(target, rows):
        """Write ``rows`` to the active sheet of a new workbook at ``target``."""
        from openpyxl import Workbook

        book = Workbook()
        sheet = book.active
        for row in rows:
            sheet.append(row)
        book.save(target)

    def test_excel_with_metadata_rows(self, tmp_path):
        xlsx_path = tmp_path / "report.xlsx"
        # One metadata row and one blank row come before the header.
        self._save_sheet(xlsx_path, [
            ["Report generated 2024-01-15", None, None],
            [None, None, None],
            ["name", "email", "phone"],
            ["alice", "a@x.com", "555-1234"],
            ["bob", "b@x.com", "555-5678"],
        ])
        frame = read_file(xlsx_path)
        # Header auto-detected on row 2, so columns are name/email/phone
        # and only the two data rows remain.
        assert list(frame.columns) == ["name", "email", "phone"]
        assert len(frame) == 2

    def test_excel_normal_header_row_zero(self, tmp_path):
        xlsx_path = tmp_path / "normal.xlsx"
        # Ordinary layout: header first, then a single data row.
        self._save_sheet(xlsx_path, [
            ["name", "email"],
            ["alice", "a@x.com"],
        ])
        frame = read_file(xlsx_path)
        assert list(frame.columns) == ["name", "email"]
        assert len(frame) == 1

View File

@@ -156,3 +156,51 @@ class TestGetNormalizer:
def test_unknown_raises(self): def test_unknown_raises(self):
with pytest.raises(ValueError): with pytest.raises(ValueError):
get_normalizer("unknown_type") get_normalizer("unknown_type")
# ---------------------------------------------------------------------------
# Alignment with format_standardize: extension preservation, state codes,
# particle handling. See audit GAPs 15/16/17.
# ---------------------------------------------------------------------------
class TestNormalizerAudit:
    """Matching-key checks for the phone, address and name normalizers."""

    def test_phone_extension_preserved(self):
        # Same number, different extensions: these are different people at
        # the same business, so the keys must NOT collapse into one.
        ext_100 = normalize_phone("+15551234567 ext 100")
        ext_200 = normalize_phone("+15551234567 ext 200")
        assert ext_100 != ext_200
        assert ext_100 == "+15551234567;ext=100"

    def test_phone_no_extension_unchanged(self):
        plain = "+15551234567"
        assert normalize_phone(plain) == plain

    def test_address_state_name_to_code(self):
        # Spelled-out "California" and the code "CA" must give one key.
        spelled = normalize_address("123 Main St, Los Angeles, California 90001")
        coded = normalize_address("123 Main St, Los Angeles, CA 90001")
        assert spelled == coded

    def test_address_multiword_state_name(self):
        spelled = normalize_address("100 Beacon St, Boston, Massachusetts 02101")
        coded = normalize_address("100 Beacon St, Boston, MA 02101")
        assert spelled == coded

    def test_address_does_not_butcher_city_named_after_state(self):
        # "New York" as a city is still expected to fold to "ny". That is
        # intentional for matching keys (``New York, NY`` and ``NY, NY``
        # should be the same record), even though the display-side
        # standardizer would keep the city name.
        # NOTE(review): this only checks that "ny" occurs somewhere in the
        # key, which the state code alone may already satisfy — consider
        # pinning the full expected key.
        key = normalize_address("123 Main St, New York, NY 10001")
        assert "ny" in key

    def test_name_particle_dropped(self):
        # With and without the particle "de", the key must be the same.
        with_particle = normalize_name("Charles de Gaulle")
        without_particle = normalize_name("Charles Gaulle")
        assert with_particle == without_particle

    def test_name_van_dropped(self):
        with_particle = normalize_name("Vincent van Gogh")
        without_particle = normalize_name("Vincent Gogh")
        assert with_particle == without_particle

    def test_name_particle_idempotent(self):
        # Normalizing an already-normalized key must not change it.
        once = normalize_name("Vincent van Gogh")
        twice = normalize_name(once)
        assert twice == once

View File

@@ -537,8 +537,10 @@ class TestVisualizeHidden:
def test_non_string_passthrough(self): def test_non_string_passthrough(self):
from src.core.text_clean import visualize_hidden_text, visualize_hidden_html from src.core.text_clean import visualize_hidden_text, visualize_hidden_html
# Both functions now consistently pass non-strings through
# unchanged (audit NIT-13).
assert visualize_hidden_text(None) is None # type: ignore[arg-type] assert visualize_hidden_text(None) is None # type: ignore[arg-type]
assert visualize_hidden_html(None) == "" assert visualize_hidden_html(None) is None # type: ignore[arg-type]
def test_html_marks_leading_trailing_ascii_space(self): def test_html_marks_leading_trailing_ascii_space(self):
from src.core.text_clean import visualize_hidden_html from src.core.text_clean import visualize_hidden_html
out = visualize_hidden_html(" Alice ", mark_outer_whitespace=True) out = visualize_hidden_html(" Alice ", mark_outer_whitespace=True)