Compare commits
2 Commits
3f007ef3d6
...
b23a27d4e3
| Author | SHA1 | Date | |
|---|---|---|---|
| b23a27d4e3 | |||
| 4adeb5c7f3 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -7,3 +7,6 @@ logs/
|
|||||||
dist/
|
dist/
|
||||||
build/
|
build/
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Claude Code agent worktrees + local settings
|
||||||
|
.claude/
|
||||||
|
|||||||
@@ -91,6 +91,20 @@ from .text_clean import (
|
|||||||
visualize_hidden_html,
|
visualize_hidden_html,
|
||||||
visualize_hidden_text,
|
visualize_hidden_text,
|
||||||
)
|
)
|
||||||
|
from .format_standardize import (
|
||||||
|
FieldType,
|
||||||
|
PRESETS as STANDARDIZE_PRESETS,
|
||||||
|
StandardizeOptions,
|
||||||
|
StandardizeResult,
|
||||||
|
detect_currency_code,
|
||||||
|
standardize_address,
|
||||||
|
standardize_boolean,
|
||||||
|
standardize_currency,
|
||||||
|
standardize_dataframe,
|
||||||
|
standardize_date,
|
||||||
|
standardize_name,
|
||||||
|
standardize_phone,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
# Core
|
# Core
|
||||||
@@ -152,4 +166,17 @@ __all__ = [
|
|||||||
"visualize_hidden_text",
|
"visualize_hidden_text",
|
||||||
"visualize_hidden_html",
|
"visualize_hidden_html",
|
||||||
"hidden_char_css",
|
"hidden_char_css",
|
||||||
|
# Format standardization
|
||||||
|
"FieldType",
|
||||||
|
"STANDARDIZE_PRESETS",
|
||||||
|
"StandardizeOptions",
|
||||||
|
"StandardizeResult",
|
||||||
|
"detect_currency_code",
|
||||||
|
"standardize_dataframe",
|
||||||
|
"standardize_date",
|
||||||
|
"standardize_phone",
|
||||||
|
"standardize_currency",
|
||||||
|
"standardize_name",
|
||||||
|
"standardize_address",
|
||||||
|
"standardize_boolean",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -125,6 +125,8 @@ _ZERO_WIDTH_CHARS = set("")
|
|||||||
_NULL_LIKE = {
|
_NULL_LIKE = {
|
||||||
"n/a", "na", "nan", "null", "none", "#n/a", "#na", "-", "--",
|
"n/a", "na", "nan", "null", "none", "#n/a", "#na", "-", "--",
|
||||||
"tbd", "unknown", "n.a.", "(null)",
|
"tbd", "unknown", "n.a.", "(null)",
|
||||||
|
# Pandas-specific: NA values stringified via str(pd.NA) → "<NA>".
|
||||||
|
"<na>",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Mojibake fingerprints: classic UTF-8-as-cp1252 corruptions.
|
# Mojibake fingerprints: classic UTF-8-as-cp1252 corruptions.
|
||||||
@@ -358,12 +360,80 @@ def _detect_mojibake(df: pd.DataFrame) -> list[Finding]:
|
|||||||
)]
|
)]
|
||||||
|
|
||||||
|
|
||||||
|
# Date-shaped patterns for the inconsistent-format detector.
#
# Insertion order matters: the detector assigns each value to the FIRST
# pattern it matches, so overlapping patterns never count the same value
# twice.
_DATE_FORMAT_PATTERNS: dict[str, str] = {
    "iso": r"^\d{4}-\d{1,2}-\d{1,2}$",
    "us_slash": r"^\d{1,2}/\d{1,2}/\d{2,4}$",
    "eu_dot": r"^\d{1,2}\.\d{1,2}\.\d{2,4}$",
    # Strict subset of us_slash: under first-match-wins every slash date is
    # attributed to us_slash, so this entry never produces a count of its own.
    "eu_slash": r"^\d{1,2}/\d{1,2}/\d{4}$",
}
_DATE_FORMAT_RE: dict[str, "re.Pattern"] = {
    name: re.compile(pat) for name, pat in _DATE_FORMAT_PATTERNS.items()
}


def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
    """Flag columns whose date-shaped values use more than one format.

    For every column, null values are dropped and the rest are stringified;
    blank strings are ignored. A finding is emitted when all of these hold:

    * the column has at least 4 non-empty values;
    * at least two distinct formats are each used by 2 or more values;
    * the values in those formats make up at least half of the non-empty
      values.

    Each value is attributed to the first pattern in ``_DATE_FORMAT_RE`` it
    matches, so overlapping patterns (``eu_slash`` is a subset of
    ``us_slash``) cannot make a uniformly formatted column look mixed, and
    the reported counts never exceed the number of values.

    Args:
        df: Frame to inspect; it is not modified.

    Returns:
        One ``Finding`` per offending column, routed to the format
        standardizer. Empty when no column qualifies.
    """
    findings: list[Finding] = []
    for col in df.columns:
        try:
            ser = df[col].dropna().astype(str)
        except Exception:
            # Best effort: a column that cannot be stringified is skipped.
            continue
        nonempty = ser[ser.str.strip().astype(bool)]
        if len(nonempty) < 4:
            continue

        format_counts: dict[str, int] = {}
        unclassified = nonempty
        for name, pat in _DATE_FORMAT_RE.items():
            if unclassified.empty:
                break
            hits = unclassified.str.match(pat).astype(bool)
            count = int(hits.sum())
            # First match wins: claimed values are removed before the next
            # (possibly overlapping) pattern is tried.
            unclassified = unclassified[~hits]
            if count >= 2:
                format_counts[name] = count
        if len(format_counts) < 2:
            continue

        # Require at least 50% of values to be date-shaped overall.
        total_date_shaped = sum(format_counts.values())
        if total_date_shaped < len(nonempty) * 0.5:
            continue

        format_summary = ", ".join(
            f"{n}({c})" for n, c in sorted(
                format_counts.items(), key=lambda kv: -kv[1]
            )
        )
        samples_idx = nonempty.head(5)
        samples = [(int(i), str(col), str(v)) for i, v in samples_idx.items()]
        findings.append(Finding(
            id="inconsistent_date_format",
            severity="info",
            tool=TOOL_FORMAT_STANDARDIZER,
            count=int(total_date_shaped),
            description=(
                f"Column '{col}' contains dates in multiple formats: "
                f"{format_summary}. Run format standardizer to normalize."
            ),
            column=str(col),
            samples=samples,
            confidence="medium",
            fix_action=FIX_NONE,
        ))
    return findings
|
||||||
|
|
||||||
|
|
||||||
def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
|
def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
|
||||||
findings: list[Finding] = []
|
findings: list[Finding] = []
|
||||||
for col in df.columns:
|
for col in df.columns:
|
||||||
if not isinstance(col, str) or not _EMAIL_LIKE_COL.search(col):
|
if not isinstance(col, str) or not _EMAIL_LIKE_COL.search(col):
|
||||||
continue
|
continue
|
||||||
ser = df[col].astype(str)
|
# Drop NaN/None *before* astype(str), otherwise None becomes the
|
||||||
|
# string "None" — which contains both upper "N" and lower "one"
|
||||||
|
# and would trigger a false-positive mixed-case finding on a
|
||||||
|
# column that has no real emails at all.
|
||||||
|
ser = df[col].dropna().astype(str)
|
||||||
nonempty = ser[ser.str.strip().astype(bool)]
|
nonempty = ser[ser.str.strip().astype(bool)]
|
||||||
if nonempty.empty:
|
if nonempty.empty:
|
||||||
continue
|
continue
|
||||||
@@ -410,8 +480,12 @@ def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
|
|||||||
n_dupes = int(dup_mask.sum())
|
n_dupes = int(dup_mask.sum())
|
||||||
if n_dupes < 2:
|
if n_dupes < 2:
|
||||||
return []
|
return []
|
||||||
# Count *extra* copies, not total members of duplicate groups.
|
# ``n_groups`` is the count of unique duplicate signatures; each
|
||||||
|
# group contains 2+ rows. ``n_extra`` is rows that would be removed
|
||||||
|
# by dedup (total in groups minus one survivor per group) — that's
|
||||||
|
# the number the user usually wants ("remove X to fix").
|
||||||
n_groups = int(norm[dup_mask].drop_duplicates().shape[0])
|
n_groups = int(norm[dup_mask].drop_duplicates().shape[0])
|
||||||
|
n_extra = n_dupes - n_groups
|
||||||
samples: list[tuple[int, str, str]] = []
|
samples: list[tuple[int, str, str]] = []
|
||||||
for i in df[dup_mask].index[:5]:
|
for i in df[dup_mask].index[:5]:
|
||||||
# Render the first textual column's value as a sample.
|
# Render the first textual column's value as a sample.
|
||||||
@@ -424,11 +498,12 @@ def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
|
|||||||
id="near_duplicate_rows",
|
id="near_duplicate_rows",
|
||||||
severity="info",
|
severity="info",
|
||||||
tool=TOOL_DEDUPLICATOR,
|
tool=TOOL_DEDUPLICATOR,
|
||||||
count=n_dupes,
|
count=n_extra,
|
||||||
description=(
|
description=(
|
||||||
f"{n_dupes} row(s) across ~{n_groups} group(s) are duplicates "
|
f"{n_extra} extra copy(ies) across {n_groups} duplicate group(s) "
|
||||||
f"after stripping whitespace and lowercasing string columns. "
|
f"({n_dupes} rows total) — duplicates after stripping whitespace "
|
||||||
f"Run the deduplicator to merge or remove."
|
f"and lowercasing string columns. Run the deduplicator to merge "
|
||||||
|
f"or remove."
|
||||||
),
|
),
|
||||||
samples=samples,
|
samples=samples,
|
||||||
confidence="medium",
|
confidence="medium",
|
||||||
@@ -799,6 +874,7 @@ def analyze(
|
|||||||
findings.extend(_detect_null_like_sentinels(df))
|
findings.extend(_detect_null_like_sentinels(df))
|
||||||
findings.extend(_detect_mojibake(df))
|
findings.extend(_detect_mojibake(df))
|
||||||
findings.extend(_detect_mixed_case_email(df))
|
findings.extend(_detect_mixed_case_email(df))
|
||||||
|
findings.extend(_detect_inconsistent_date_format(df))
|
||||||
findings.extend(_detect_leading_zero_ids(df))
|
findings.extend(_detect_leading_zero_ids(df))
|
||||||
findings.extend(_detect_near_duplicates(df))
|
findings.extend(_detect_near_duplicates(df))
|
||||||
return findings
|
return findings
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
from dataclasses import dataclass, field, asdict
|
from dataclasses import dataclass, field, fields, asdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
@@ -60,9 +60,16 @@ class DeduplicationConfig:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, data: dict) -> DeduplicationConfig:
|
def from_dict(cls, data: dict) -> DeduplicationConfig:
|
||||||
|
# Filter unknown fields silently — keeps loading forward-compatible
|
||||||
|
# when older code reads a config written by a newer version that
|
||||||
|
# added fields to ColumnStrategyConfig.
|
||||||
|
col_known = {f.name for f in fields(ColumnStrategyConfig)}
|
||||||
strategies = []
|
strategies = []
|
||||||
for s in data.get("strategies", []):
|
for s in data.get("strategies", []):
|
||||||
cols = [ColumnStrategyConfig(**c) for c in s.get("columns", [])]
|
cols = [
|
||||||
|
ColumnStrategyConfig(**{k: v for k, v in c.items() if k in col_known})
|
||||||
|
for c in s.get("columns", [])
|
||||||
|
]
|
||||||
strategies.append(StrategyConfig(columns=cols))
|
strategies.append(StrategyConfig(columns=cols))
|
||||||
return cls(
|
return cls(
|
||||||
strategies=strategies,
|
strategies=strategies,
|
||||||
|
|||||||
@@ -49,6 +49,18 @@ class ColumnMatchStrategy:
|
|||||||
threshold: float = 100.0 # 0-100 scale
|
threshold: float = 100.0 # 0-100 scale
|
||||||
normalizer: Optional[NormalizerType] = None
|
normalizer: Optional[NormalizerType] = None
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
    """Validate ``threshold`` once the dataclass fields are populated.

    Raises:
        TypeError: if ``threshold`` is not an ``int`` or ``float``.
        ValueError: if ``threshold`` lies outside the closed range 0-100.
    """
    value = self.threshold
    if isinstance(value, (int, float)):
        # The chained comparison is False for NaN, so NaN is rejected
        # through the ValueError path as well.
        if 0 <= value <= 100:
            return
        raise ValueError(
            f"threshold must be in [0, 100]; got {value}. "
            "Match scores are on a 0–100 scale, so values outside this "
            "range either always match or never match."
        )
    raise TypeError(
        f"threshold must be a number, got {type(value).__name__}"
    )
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MatchStrategy:
|
class MatchStrategy:
|
||||||
@@ -61,7 +73,13 @@ class MatchStrategy:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MatchResult:
|
class MatchResult:
|
||||||
"""One group of duplicate rows."""
|
"""One group of duplicate rows.
|
||||||
|
|
||||||
|
``row_indices`` and ``survivor_index`` are positional indexes into
|
||||||
|
the *input* DataFrame (0-based, matching ``df.iloc[]``), not the
|
||||||
|
output ``deduplicated_df`` (whose index is reset to 0..N-1). To map
|
||||||
|
back to the original frame, use ``df.iloc[row_indices]``.
|
||||||
|
"""
|
||||||
group_id: int
|
group_id: int
|
||||||
row_indices: list[int]
|
row_indices: list[int]
|
||||||
confidence: float # min confidence across pairs in the group
|
confidence: float # min confidence across pairs in the group
|
||||||
@@ -71,7 +89,13 @@ class MatchResult:
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DeduplicationResult:
|
class DeduplicationResult:
|
||||||
"""Full result of a deduplication run."""
|
"""Full result of a deduplication run.
|
||||||
|
|
||||||
|
``deduplicated_df`` and ``removed_df`` both have their indexes reset
|
||||||
|
to a fresh 0..N-1 range. ``match_groups[*].row_indices`` keeps the
|
||||||
|
original positional indexes of the *input* frame so callers can
|
||||||
|
cross-reference back to it (e.g., for an audit log).
|
||||||
|
"""
|
||||||
original_row_count: int
|
original_row_count: int
|
||||||
deduplicated_df: pd.DataFrame
|
deduplicated_df: pd.DataFrame
|
||||||
removed_df: pd.DataFrame
|
removed_df: pd.DataFrame
|
||||||
@@ -153,8 +177,21 @@ def _compare_pair(
|
|||||||
|
|
||||||
for cs in strategy.column_strategies:
|
for cs in strategy.column_strategies:
|
||||||
col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column
|
col = f"{norm_prefix}{cs.column}" if cs.normalizer else cs.column
|
||||||
va = str(row_a.get(col, ""))
|
raw_a = row_a.get(col, "")
|
||||||
vb = str(row_b.get(col, ""))
|
raw_b = row_b.get(col, "")
|
||||||
|
|
||||||
|
# NaN / None always count as "empty" — never as the literal
|
||||||
|
# string "None" or "nan", which would otherwise let two rows
|
||||||
|
# with missing data in this column match at 100% similarity.
|
||||||
|
a_missing = raw_a is None or (
|
||||||
|
isinstance(raw_a, float) and pd.isna(raw_a)
|
||||||
|
) or raw_a is pd.NA
|
||||||
|
b_missing = raw_b is None or (
|
||||||
|
isinstance(raw_b, float) and pd.isna(raw_b)
|
||||||
|
) or raw_b is pd.NA
|
||||||
|
|
||||||
|
va = "" if a_missing else str(raw_a)
|
||||||
|
vb = "" if b_missing else str(raw_b)
|
||||||
|
|
||||||
# Skip if both empty
|
# Skip if both empty
|
||||||
if not va and not vb:
|
if not va and not vb:
|
||||||
@@ -221,17 +258,29 @@ def _find_match_groups(
|
|||||||
raw_groups = uf.groups()
|
raw_groups = uf.groups()
|
||||||
match_groups: list[MatchResult] = []
|
match_groups: list[MatchResult] = []
|
||||||
for gid, (root, members) in enumerate(sorted(raw_groups.items())):
|
for gid, (root, members) in enumerate(sorted(raw_groups.items())):
|
||||||
# Confidence = min across all pairs in the group
|
# Confidence = min across all directly-recorded pairs in the
|
||||||
group_confidence = 100.0
|
# group. Transitive members (A→B and B→C imply A→C) may not have
|
||||||
|
# a direct pair_info entry; we only count the recorded ones, so
|
||||||
|
# the score reflects observed evidence rather than the optimistic
|
||||||
|
# 100.0 default that masks weak links.
|
||||||
|
observed_confidences: list[float] = []
|
||||||
group_cols: set[str] = set()
|
group_cols: set[str] = set()
|
||||||
for idx_a, m in enumerate(members):
|
for idx_a, m in enumerate(members):
|
||||||
for idx_b in range(idx_a + 1, len(members)):
|
for idx_b in range(idx_a + 1, len(members)):
|
||||||
key = (min(m, members[idx_b]), max(m, members[idx_b]))
|
key = (min(m, members[idx_b]), max(m, members[idx_b]))
|
||||||
if key in pair_info:
|
if key in pair_info:
|
||||||
conf, cols = pair_info[key]
|
conf, cols = pair_info[key]
|
||||||
group_confidence = min(group_confidence, conf)
|
observed_confidences.append(conf)
|
||||||
group_cols.update(cols)
|
group_cols.update(cols)
|
||||||
|
|
||||||
|
if observed_confidences:
|
||||||
|
group_confidence = min(observed_confidences)
|
||||||
|
else:
|
||||||
|
# Edge case: a group with no recorded pair info (shouldn't
|
||||||
|
# happen for groups built from union-find on pair_info, but
|
||||||
|
# be defensive). Fall back to 100.0 only for trivial groups.
|
||||||
|
group_confidence = 100.0
|
||||||
|
|
||||||
match_groups.append(MatchResult(
|
match_groups.append(MatchResult(
|
||||||
group_id=gid,
|
group_id=gid,
|
||||||
row_indices=members,
|
row_indices=members,
|
||||||
@@ -462,6 +511,17 @@ def deduplicate(
|
|||||||
strategies = build_default_strategies(df)
|
strategies = build_default_strategies(df)
|
||||||
log_entries.append(f"Auto-detected {len(strategies)} match strategies")
|
log_entries.append(f"Auto-detected {len(strategies)} match strategies")
|
||||||
|
|
||||||
|
# Validate every strategy references real columns — silent skip
|
||||||
|
# would let a typo (``e_mail`` instead of ``email``) produce a
|
||||||
|
# confidently-empty result.
|
||||||
|
referenced = {cs.column for s in strategies for cs in s.column_strategies}
|
||||||
|
missing = sorted(c for c in referenced if c not in df.columns)
|
||||||
|
if missing:
|
||||||
|
raise ValueError(
|
||||||
|
f"Strategy references columns not present in the input: {missing}. "
|
||||||
|
f"Available columns: {list(df.columns)}"
|
||||||
|
)
|
||||||
|
|
||||||
# Log strategies
|
# Log strategies
|
||||||
for i, s in enumerate(strategies):
|
for i, s in enumerate(strategies):
|
||||||
cols_desc = ", ".join(
|
cols_desc = ", ".join(
|
||||||
@@ -542,17 +602,20 @@ def deduplicate(
|
|||||||
else:
|
else:
|
||||||
deduplicated_df = df_work.iloc[keep_indices].copy()
|
deduplicated_df = df_work.iloc[keep_indices].copy()
|
||||||
|
|
||||||
removed_df = df_work.iloc[sorted(remove_indices)].copy() if remove_indices else pd.DataFrame()
|
if remove_indices:
|
||||||
|
removed_df = df_work.iloc[sorted(remove_indices)].copy()
|
||||||
|
else:
|
||||||
|
# Empty result: preserve column schema so downstream code can
|
||||||
|
# rely on ``removed_df.columns == deduplicated_df.columns``.
|
||||||
|
removed_df = df_work.iloc[0:0].copy()
|
||||||
|
|
||||||
# Drop shadow columns from output
|
# Drop shadow columns from output
|
||||||
norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
|
norm_cols = [c for c in deduplicated_df.columns if str(c).startswith("_norm_")]
|
||||||
deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
|
deduplicated_df = deduplicated_df.drop(columns=norm_cols, errors="ignore")
|
||||||
if not removed_df.empty:
|
|
||||||
removed_df = removed_df.drop(columns=norm_cols, errors="ignore")
|
removed_df = removed_df.drop(columns=norm_cols, errors="ignore")
|
||||||
|
|
||||||
# Reset index
|
# Reset index
|
||||||
deduplicated_df = deduplicated_df.reset_index(drop=True)
|
deduplicated_df = deduplicated_df.reset_index(drop=True)
|
||||||
if not removed_df.empty:
|
|
||||||
removed_df = removed_df.reset_index(drop=True)
|
removed_df = removed_df.reset_index(drop=True)
|
||||||
|
|
||||||
removed_count = original_count - len(deduplicated_df)
|
removed_count = original_count - len(deduplicated_df)
|
||||||
|
|||||||
@@ -152,7 +152,17 @@ def _vectorized_translate(
|
|||||||
def _vectorized_regex_sub(
|
def _vectorized_regex_sub(
|
||||||
df: pd.DataFrame, pattern, repl: str, *, inplace: bool = False,
|
df: pd.DataFrame, pattern, repl: str, *, inplace: bool = False,
|
||||||
) -> tuple[pd.DataFrame, int]:
|
) -> tuple[pd.DataFrame, int]:
|
||||||
"""``str.replace(regex=True)`` shortcut for regex-based fixes."""
|
"""``str.replace(regex=True)`` shortcut for regex-based fixes.
|
||||||
|
|
||||||
|
Raises ``ValueError`` if *pattern* is malformed — callers (GUI/CLI)
|
||||||
|
surface this with a clear message rather than letting an
|
||||||
|
unannotated ``re.error`` propagate.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
re.compile(pattern)
|
||||||
|
except re.error as e:
|
||||||
|
raise ValueError(f"Invalid regex pattern {pattern!r}: {e}") from e
|
||||||
|
|
||||||
out = df if inplace else df.copy()
|
out = df if inplace else df.copy()
|
||||||
changed = 0
|
changed = 0
|
||||||
for col in out.columns:
|
for col in out.columns:
|
||||||
@@ -319,7 +329,11 @@ def replace_null_sentinels(df: pd.DataFrame, payload: Optional[dict] = None) ->
|
|||||||
sentinels = payload.get("sentinels")
|
sentinels = payload.get("sentinels")
|
||||||
if sentinels is None:
|
if sentinels is None:
|
||||||
sentinels = list(_a._NULL_LIKE)
|
sentinels = list(_a._NULL_LIKE)
|
||||||
sentinel_set = {s.strip().lower() for s in sentinels}
|
# Coerce non-string sentinels (the GUI / JSON payload may produce
|
||||||
|
# ints, floats, bools) instead of crashing on .strip().
|
||||||
|
sentinel_set = {
|
||||||
|
str(s).strip().lower() for s in sentinels if s is not None
|
||||||
|
}
|
||||||
|
|
||||||
def fix(s: str) -> str:
|
def fix(s: str) -> str:
|
||||||
return "" if s.strip().lower() in sentinel_set else s
|
return "" if s.strip().lower() in sentinel_set else s
|
||||||
|
|||||||
1836
src/core/format_standardize.py
Normal file
1836
src/core/format_standardize.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -109,8 +109,18 @@ def detect_header_row(path: Path, encoding: str = "utf-8", delimiter: str = ",",
|
|||||||
break
|
break
|
||||||
if not row:
|
if not row:
|
||||||
continue
|
continue
|
||||||
# All cells must be non-empty, non-numeric strings
|
# Header heuristic:
|
||||||
if all(_looks_like_header(cell) for cell in row if cell.strip()):
|
# - every non-empty cell looks like a header;
|
||||||
|
# - at least 2 non-empty cells (or just 1 in a single-column
|
||||||
|
# file). Without the count check, blank rows match
|
||||||
|
# vacuously (``all([])`` is True) and metadata banners
|
||||||
|
# like ``["Report 2024", "", ""]`` claim row 0 falsely.
|
||||||
|
non_empty = [cell for cell in row if cell.strip()]
|
||||||
|
min_required = 1 if len(row) <= 1 else 2
|
||||||
|
if (
|
||||||
|
len(non_empty) >= min_required
|
||||||
|
and all(_looks_like_header(cell) for cell in non_empty)
|
||||||
|
):
|
||||||
return idx
|
return idx
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
@@ -263,7 +273,11 @@ def _read_excel(
|
|||||||
header_row: Optional[int] = None,
|
header_row: Optional[int] = None,
|
||||||
sheet_name: Optional[str | int] = 0,
|
sheet_name: Optional[str | int] = 0,
|
||||||
) -> pd.DataFrame:
|
) -> pd.DataFrame:
|
||||||
hdr = header_row if header_row is not None else 0
|
hdr = (
|
||||||
|
header_row
|
||||||
|
if header_row is not None
|
||||||
|
else _detect_excel_header_row(path, sheet_name)
|
||||||
|
)
|
||||||
logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
|
logger.debug("Reading Excel {} (sheet={}, header_row={})", path.name, sheet_name, hdr)
|
||||||
return pd.read_excel(
|
return pd.read_excel(
|
||||||
path,
|
path,
|
||||||
@@ -275,6 +289,52 @@ def _read_excel(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_excel_header_row(
    path: Path,
    sheet_name: Optional[str | int] = 0,
    max_scan: int = 20,
) -> int:
    """Guess the 0-based header row of an Excel sheet.

    The workbook is opened with openpyxl in read-only mode and at most
    *max_scan* rows of the chosen sheet are examined. A row qualifies when
    every non-empty cell passes ``_looks_like_header`` and it holds at least
    two non-empty cells (one is enough when the row has a single cell).

    Args:
        path: Workbook to inspect.
        sheet_name: Sheet position or title. An out-of-range position, an
            unknown title, or any other value selects the first sheet.
        max_scan: Maximum number of leading rows to examine.

    Returns:
        Index of the first qualifying row. ``0`` when none qualifies, when
        openpyxl is not installed, or when the workbook cannot be opened.
    """
    try:
        from openpyxl import load_workbook
    except ImportError:
        return 0

    try:
        workbook = load_workbook(path, read_only=True, data_only=True)
    except Exception:
        return 0

    try:
        titles = workbook.sheetnames
        # Default to the first sheet; override only for a valid selector.
        chosen = titles[0]
        if isinstance(sheet_name, int):
            if 0 <= sheet_name < len(titles):
                chosen = titles[sheet_name]
        elif isinstance(sheet_name, str) and sheet_name in titles:
            chosen = sheet_name

        sheet = workbook[chosen]
        for row_no, values in enumerate(sheet.iter_rows(values_only=True)):
            if row_no >= max_scan:
                break
            texts = [str(v) if v is not None else "" for v in values]
            filled = [t for t in texts if t.strip()]
            needed = 2 if len(texts) > 1 else 1
            if len(filled) < needed:
                # Blank rows and single-cell banners are not headers.
                continue
            if all(_looks_like_header(t) for t in filled):
                return row_no
        return 0
    finally:
        workbook.close()
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Writing
|
# Writing
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -285,6 +345,7 @@ def write_file(
|
|||||||
*,
|
*,
|
||||||
file_format: Optional[str] = None,
|
file_format: Optional[str] = None,
|
||||||
encoding: str = "utf-8-sig",
|
encoding: str = "utf-8-sig",
|
||||||
|
delimiter: Optional[str] = None,
|
||||||
) -> Path:
|
) -> Path:
|
||||||
"""Write a DataFrame to CSV or Excel.
|
"""Write a DataFrame to CSV or Excel.
|
||||||
|
|
||||||
@@ -292,8 +353,12 @@ def write_file(
|
|||||||
----------
|
----------
|
||||||
df : DataFrame to write
|
df : DataFrame to write
|
||||||
path : output file path
|
path : output file path
|
||||||
file_format : ``"csv"`` or ``"xlsx"``; auto-detected from *path* suffix if *None*
|
file_format : ``"csv"``, ``"tsv"``, or ``"xlsx"``; auto-detected from
|
||||||
|
*path* suffix if *None*
|
||||||
encoding : output encoding (default ``utf-8-sig`` for Windows Excel compat)
|
encoding : output encoding (default ``utf-8-sig`` for Windows Excel compat)
|
||||||
|
delimiter : field separator for delimited output. Defaults to ``,``
|
||||||
|
for ``.csv``, ``\\t`` for ``.tsv``, and the explicit value
|
||||||
|
otherwise. Ignored for Excel formats.
|
||||||
|
|
||||||
Returns the resolved output Path.
|
Returns the resolved output Path.
|
||||||
"""
|
"""
|
||||||
@@ -302,7 +367,10 @@ def write_file(
|
|||||||
if fmt in ("xlsx", "xls"):
|
if fmt in ("xlsx", "xls"):
|
||||||
df.to_excel(out, index=False, engine="openpyxl")
|
df.to_excel(out, index=False, engine="openpyxl")
|
||||||
else:
|
else:
|
||||||
df.to_csv(out, index=False, encoding=encoding)
|
sep = delimiter if delimiter is not None else (
|
||||||
|
"\t" if fmt == "tsv" else ","
|
||||||
|
)
|
||||||
|
df.to_csv(out, index=False, encoding=encoding, sep=sep)
|
||||||
logger.info("Wrote {} rows to {}", len(df), out)
|
logger.info("Wrote {} rows to {}", len(df), out)
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|||||||
@@ -69,7 +69,13 @@ def normalize_email(value: Optional[str]) -> str:
|
|||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
|
def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
|
||||||
"""Parse with phonenumbers lib, return E.164. Fallback: digits-only."""
|
"""Parse with phonenumbers lib, return E.164. Fallback: digits-only.
|
||||||
|
|
||||||
|
Extensions are preserved as a ``;ext=N`` suffix (RFC 3966 syntax) so
|
||||||
|
two records ``+15551234567 ext 100`` and ``+15551234567 ext 200``
|
||||||
|
don't normalize to the same key — they're different people at the
|
||||||
|
same business.
|
||||||
|
"""
|
||||||
if not value or not isinstance(value, str):
|
if not value or not isinstance(value, str):
|
||||||
return ""
|
return ""
|
||||||
stripped = value.strip()
|
stripped = value.strip()
|
||||||
@@ -79,7 +85,10 @@ def normalize_phone(value: Optional[str], default_region: str = "US") -> str:
|
|||||||
try:
|
try:
|
||||||
parsed = phonenumbers.parse(stripped, default_region)
|
parsed = phonenumbers.parse(stripped, default_region)
|
||||||
if phonenumbers.is_possible_number(parsed):
|
if phonenumbers.is_possible_number(parsed):
|
||||||
return phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
|
base = phonenumbers.format_number(parsed, phonenumbers.PhoneNumberFormat.E164)
|
||||||
|
if parsed.extension:
|
||||||
|
return f"{base};ext={parsed.extension}"
|
||||||
|
return base
|
||||||
except phonenumbers.NumberParseException:
|
except phonenumbers.NumberParseException:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@@ -100,10 +109,16 @@ _NAME_SUFFIXES = {
|
|||||||
"jr", "sr", "ii", "iii", "iv", "v",
|
"jr", "sr", "ii", "iii", "iv", "v",
|
||||||
"phd", "md", "esq", "dds", "rn",
|
"phd", "md", "esq", "dds", "rn",
|
||||||
}
|
}
|
||||||
|
# Surname particles dropped during normalization so that
|
||||||
|
# ``Charles de Gaulle`` and ``Charles Gaulle`` produce the same key.
|
||||||
|
_NAME_PARTICLES_DROP = {
|
||||||
|
"van", "von", "de", "da", "del", "della", "di", "du",
|
||||||
|
"der", "den", "le", "la", "el",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def normalize_name(value: Optional[str]) -> str:
|
def normalize_name(value: Optional[str]) -> str:
|
||||||
"""Strip titles/suffixes, collapse whitespace, case-fold."""
|
"""Strip titles/suffixes/particles, collapse whitespace, case-fold."""
|
||||||
if not value or not isinstance(value, str):
|
if not value or not isinstance(value, str):
|
||||||
return ""
|
return ""
|
||||||
name = value.strip()
|
name = value.strip()
|
||||||
@@ -126,6 +141,9 @@ def normalize_name(value: Optional[str]) -> str:
|
|||||||
while parts and parts[-1].rstrip(".") in _NAME_SUFFIXES:
|
while parts and parts[-1].rstrip(".") in _NAME_SUFFIXES:
|
||||||
parts.pop()
|
parts.pop()
|
||||||
|
|
||||||
|
# Drop surname particles wherever they appear.
|
||||||
|
parts = [p for p in parts if p not in _NAME_PARTICLES_DROP]
|
||||||
|
|
||||||
return " ".join(parts)
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
@@ -178,8 +196,34 @@ _USPS_ABBREVIATIONS: dict[str, str] = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# US state name → 2-letter postal code. Substituted before tokenization
|
||||||
|
# so ``California`` and ``CA`` normalize to the same key.
|
||||||
|
_US_STATE_NAMES_NORM: dict[str, str] = {
|
||||||
|
"alabama": "al", "alaska": "ak", "arizona": "az", "arkansas": "ar",
|
||||||
|
"california": "ca", "colorado": "co", "connecticut": "ct",
|
||||||
|
"delaware": "de", "florida": "fl", "georgia": "ga", "hawaii": "hi",
|
||||||
|
"idaho": "id", "illinois": "il", "indiana": "in", "iowa": "ia",
|
||||||
|
"kansas": "ks", "kentucky": "ky", "louisiana": "la", "maine": "me",
|
||||||
|
"maryland": "md", "massachusetts": "ma", "michigan": "mi",
|
||||||
|
"minnesota": "mn", "mississippi": "ms", "missouri": "mo",
|
||||||
|
"montana": "mt", "nebraska": "ne", "nevada": "nv",
|
||||||
|
"new hampshire": "nh", "new jersey": "nj", "new mexico": "nm",
|
||||||
|
"new york": "ny", "north carolina": "nc", "north dakota": "nd",
|
||||||
|
"ohio": "oh", "oklahoma": "ok", "oregon": "or", "pennsylvania": "pa",
|
||||||
|
"rhode island": "ri", "south carolina": "sc", "south dakota": "sd",
|
||||||
|
"tennessee": "tn", "texas": "tx", "utah": "ut", "vermont": "vt",
|
||||||
|
"virginia": "va", "washington": "wa", "west virginia": "wv",
|
||||||
|
"wisconsin": "wi", "wyoming": "wy",
|
||||||
|
"district of columbia": "dc",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def normalize_address(value: Optional[str]) -> str:
|
def normalize_address(value: Optional[str]) -> str:
|
||||||
"""USPS abbreviation normalization, collapse whitespace, case-fold."""
|
"""USPS abbreviation normalization, collapse whitespace, case-fold.
|
||||||
|
|
||||||
|
Spelled-out US state names are folded to their 2-letter codes so
|
||||||
|
``California`` and ``CA`` normalize to the same matching key.
|
||||||
|
"""
|
||||||
if not value or not isinstance(value, str):
|
if not value or not isinstance(value, str):
|
||||||
return ""
|
return ""
|
||||||
addr = value.strip()
|
addr = value.strip()
|
||||||
@@ -190,6 +234,13 @@ def normalize_address(value: Optional[str]) -> str:
|
|||||||
addr = addr.casefold()
|
addr = addr.casefold()
|
||||||
addr = addr.replace(".", " ").replace(",", " ")
|
addr = addr.replace(".", " ").replace(",", " ")
|
||||||
|
|
||||||
|
# State names → 2-letter codes (longest first so ``new york`` wins
|
||||||
|
# over ``new``-as-a-fragment).
|
||||||
|
for full, code in sorted(
|
||||||
|
_US_STATE_NAMES_NORM.items(), key=lambda kv: -len(kv[0])
|
||||||
|
):
|
||||||
|
addr = re.sub(rf"(?<!\w){re.escape(full)}(?!\w)", code, addr)
|
||||||
|
|
||||||
parts = addr.split()
|
parts = addr.split()
|
||||||
normalized_parts = []
|
normalized_parts = []
|
||||||
for part in parts:
|
for part in parts:
|
||||||
|
|||||||
@@ -191,10 +191,15 @@ def strip_zero_width(s: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def strip_bom(s: str) -> str:
    """Remove a single leading ``U+FEFF`` (BOM) from the start of the string.

    Strips at most one BOM. Multiple consecutive BOMs are unusual, and a
    second one likely indicates a concatenation artifact that the caller
    should preserve so the issue stays visible.

    Non-string input is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    # Spell the BOM as an escape sequence. A literal U+FEFF is invisible in
    # editors and diffs and is easily dropped by tooling; if it is lost the
    # prefix degrades to "" and ``startswith`` would match every string,
    # silently eating the first character of all input.
    return s[1:] if s.startswith("\ufeff") else s
|
||||||
|
|
||||||
|
|
||||||
def strip_control(s: str) -> str:
|
def strip_control(s: str) -> str:
|
||||||
@@ -252,6 +257,9 @@ def smart_title_case(s: str) -> str:
|
|||||||
out.append(tok)
|
out.append(tok)
|
||||||
continue
|
continue
|
||||||
lowered = tok.lower()
|
lowered = tok.lower()
|
||||||
|
# Particles stay lowercase only mid-string. The first and last
|
||||||
|
# words of a title always capitalize, even when they're particles
|
||||||
|
# (``A Story to Tell`` — first word ``A`` is capitalized).
|
||||||
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
|
if 0 < i < last_idx and lowered in _TITLE_LOWERCASE_PARTICLES:
|
||||||
out.append(lowered)
|
out.append(lowered)
|
||||||
continue
|
continue
|
||||||
@@ -278,7 +286,12 @@ def smart_title_case(s: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def sentence_case(s: str) -> str:
|
def sentence_case(s: str) -> str:
|
||||||
"""Lowercase, then capitalize the first cased letter after each ``. ! ?``."""
|
"""Lowercase, then capitalize the first cased letter after each ``. ! ?``.
|
||||||
|
|
||||||
|
Non-letter, non-terminator characters (like opening quotes or
|
||||||
|
parens) don't consume the "next letter" trigger, so ``"hello." "world"``
|
||||||
|
becomes ``"Hello." "World"``.
|
||||||
|
"""
|
||||||
if not isinstance(s, str) or not s:
|
if not isinstance(s, str) or not s:
|
||||||
return s
|
return s
|
||||||
lowered = s.lower()
|
lowered = s.lower()
|
||||||
@@ -291,11 +304,6 @@ def sentence_case(s: str) -> str:
|
|||||||
if capitalize_next and c.isalpha():
|
if capitalize_next and c.isalpha():
|
||||||
chars[i] = c.upper()
|
chars[i] = c.upper()
|
||||||
capitalize_next = False
|
capitalize_next = False
|
||||||
elif c.strip():
|
|
||||||
# Any non-whitespace, non-letter (e.g., quote, paren) doesn't
|
|
||||||
# consume the "next letter" trigger.
|
|
||||||
if c.isalpha():
|
|
||||||
capitalize_next = False
|
|
||||||
return "".join(chars)
|
return "".join(chars)
|
||||||
|
|
||||||
|
|
||||||
@@ -698,7 +706,7 @@ def visualize_hidden_html(s: str, *, mark_outer_whitespace: bool = False) -> str
|
|||||||
the page.
|
the page.
|
||||||
"""
|
"""
|
||||||
if not isinstance(s, str):
|
if not isinstance(s, str):
|
||||||
return ""
|
return s # mirror visualize_hidden_text: pass non-strings through
|
||||||
|
|
||||||
leading = ""
|
leading = ""
|
||||||
trailing = ""
|
trailing = ""
|
||||||
|
|||||||
@@ -1,91 +1,594 @@
|
|||||||
"""DataTools Format Standardizer — stub page."""
|
"""DataTools Format Standardizer — Streamlit page."""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
|
|
||||||
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
_project_root = Path(__file__).resolve().parent.parent.parent.parent
|
||||||
if str(_project_root) not in sys.path:
|
if str(_project_root) not in sys.path:
|
||||||
sys.path.insert(0, str(_project_root))
|
sys.path.insert(0, str(_project_root))
|
||||||
|
|
||||||
from src.gui.components import hide_streamlit_chrome, require_normalization_gate
|
from src.gui.components import (
|
||||||
|
hide_streamlit_chrome,
|
||||||
|
pickup_or_upload,
|
||||||
|
require_normalization_gate,
|
||||||
|
)
|
||||||
|
from src.core.format_standardize import (
|
||||||
|
PRESETS,
|
||||||
|
FieldType,
|
||||||
|
StandardizeOptions,
|
||||||
|
standardize_dataframe,
|
||||||
|
)
|
||||||
|
|
||||||
hide_streamlit_chrome()
|
hide_streamlit_chrome()
|
||||||
require_normalization_gate()
|
require_normalization_gate()
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Header
|
# Header
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
st.title("📐 Format Standardizer")
|
st.title("📐 Format Standardizer")
|
||||||
st.caption("Standardize formats across columns for consistency.")
|
|
||||||
|
|
||||||
st.info("This tool is under development.")
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# What this tool will do
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.markdown("""
|
|
||||||
**Features:**
|
|
||||||
- Date format standardization (e.g., MM/DD/YYYY → YYYY-MM-DD)
|
|
||||||
- Phone number formatting (E.164, national, international)
|
|
||||||
- Currency normalization ($1,000.00 → 1000.00)
|
|
||||||
- Name casing (JOHN DOE → John Doe)
|
|
||||||
- Address abbreviation expansion (St. → Street, Ave. → Avenue)
|
|
||||||
- Boolean standardization (Yes/No/Y/N/1/0 → True/False)
|
|
||||||
""")
|
|
||||||
|
|
||||||
st.divider()
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# File upload (functional)
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
uploaded = st.file_uploader(
|
|
||||||
"Upload CSV or Excel file",
|
|
||||||
type=["csv", "tsv", "xlsx", "xls"],
|
|
||||||
help="Upload a file to preview. Processing is not yet available.",
|
|
||||||
key="fmtstd_file_upload",
|
|
||||||
)
|
|
||||||
|
|
||||||
if uploaded is not None:
|
|
||||||
import pandas as pd
|
|
||||||
try:
|
|
||||||
if uploaded.name.endswith((".xlsx", ".xls")):
|
|
||||||
df = pd.read_excel(uploaded)
|
|
||||||
else:
|
|
||||||
df = pd.read_csv(uploaded)
|
|
||||||
st.subheader(f"Preview: {uploaded.name}")
|
|
||||||
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
|
||||||
st.dataframe(df.head(10), use_container_width=True)
|
|
||||||
except Exception as e:
|
|
||||||
st.error(f"Failed to read file: {e}")
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Placeholder options
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.subheader("Format Rules")
|
|
||||||
|
|
||||||
st.selectbox("Date format", ["YYYY-MM-DD", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY"], disabled=True)
|
|
||||||
st.selectbox("Phone format", ["E.164 (+15551234567)", "National ((555) 123-4567)", "Digits only"], disabled=True)
|
|
||||||
st.selectbox("Currency handling", ["Strip symbols, keep number", "Normalize to 2 decimals", "Keep as-is"], disabled=True)
|
|
||||||
st.selectbox("Name casing", ["Title Case", "UPPER", "lower", "As-is"], disabled=True)
|
|
||||||
st.checkbox("Expand address abbreviations", value=False, disabled=True)
|
|
||||||
|
|
||||||
st.divider()
|
|
||||||
st.button("Standardize Formats", type="primary", use_container_width=True, disabled=True)
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Footer
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
st.divider()
|
|
||||||
st.caption(
|
st.caption(
|
||||||
"Runs locally. Your data never leaves this computer. "
|
"Canonicalize dates, phone numbers, currency, names, addresses, and "
|
||||||
"| DataTools v3.0"
|
"booleans on a per-column basis. Runs locally — your data never leaves "
|
||||||
|
"this computer."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# File upload
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
uploaded = pickup_or_upload(
|
||||||
|
label="Upload CSV or Excel file",
|
||||||
|
key="fmtstd_file_upload",
|
||||||
|
types=["csv", "tsv", "xlsx", "xls"],
|
||||||
|
)
|
||||||
|
|
||||||
|
if uploaded is None:
|
||||||
|
st.info("Upload a CSV, TSV, or Excel file to begin.")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
|
||||||
|
@st.cache_data(show_spinner=False)
def _read_uploaded(name: str, data: bytes) -> pd.DataFrame:
    """Read the uploaded bytes into a DataFrame, treating all cells as strings.

    Args:
        name: Original file name. Only its suffix is used, to pick the
            parser: ``.xlsx``/``.xls`` → Excel, ``.tsv`` → tab-separated,
            anything else → comma-separated.
        data: Raw file contents.

    Returns:
        The parsed frame, read with ``dtype=str`` and
        ``keep_default_na=False`` (pandas' default NA-string detection is
        disabled).

    Raises:
        Whatever pandas raises for content it cannot parse; the caller
        reports it to the user.
    """
    suffix = Path(name).suffix.lower()
    if suffix in (".xlsx", ".xls"):
        return pd.read_excel(io.BytesIO(data), dtype=str, keep_default_na=False)

    sep = "\t" if suffix == ".tsv" else ","

    def _parse_csv(encoding: str) -> pd.DataFrame:
        # A fresh buffer per attempt, so a failed decode can't leave the
        # stream positioned mid-file.
        return pd.read_csv(
            io.BytesIO(data), dtype=str, keep_default_na=False,
            encoding=encoding, sep=sep, on_bad_lines="warn",
        )

    # ``utf-8-sig`` is tried first: it decodes BOM-less UTF-8 exactly like
    # ``utf-8`` but also strips a leading BOM, which plain ``utf-8`` accepts
    # and leaves glued to the first column header. ``latin-1`` maps every
    # byte value, so it is the last resort and cannot fail to decode.
    try:
        return _parse_csv("utf-8-sig")
    except UnicodeDecodeError:
        return _parse_csv("latin-1")
|
||||||
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
df = _read_uploaded(uploaded.name, uploaded.getvalue())
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Failed to read file: {e}")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
st.subheader(f"Preview: {uploaded.name}")
|
||||||
|
st.caption(f"{len(df)} rows, {len(df.columns)} columns")
|
||||||
|
st.dataframe(df.head(10), use_container_width=True)
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Auto-detect column types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# A first pass over a 200-row sample picks a likely field type per column.
|
||||||
|
# It's a hint, not a commitment — every column shows a selectbox the user
|
||||||
|
# can override. Heuristics deliberately err toward "(skip)" rather than
|
||||||
|
# guessing wrong, since wrong guesses produce misleading change audits.
|
||||||
|
|
||||||
|
import re as _re
|
||||||
|
|
||||||
|
_DATE_HINT_RE = _re.compile(
|
||||||
|
r"^\s*\d{1,4}[-/.]\d{1,2}[-/.]\d{1,4}\s*$"
|
||||||
|
r"|^\s*[A-Za-z]{3,9}\s+\d{1,2}[, ]+\d{2,4}\s*$"
|
||||||
|
r"|^\s*\d{1,2}\s+[A-Za-z]{3,9}\s+\d{2,4}\s*$"
|
||||||
|
)
|
||||||
|
_PHONE_HINT_RE = _re.compile(r"^[\s\d().+\-]+$")
|
||||||
|
_CURRENCY_HINT_RE = _re.compile(r"^[\s$€£¥]?\s*-?\d[\d,. ]*\d?\s*$|^\s*\(\s*[$€£¥]?\d.*\)\s*$")
|
||||||
|
_BOOL_TOKENS = {"yes", "no", "y", "n", "true", "false", "t", "f", "0", "1"}
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_field_type(col: str, samples: list[str]) -> FieldType | None:
|
||||||
|
"""Return a likely :class:`FieldType` for *col*, or None when unsure.
|
||||||
|
|
||||||
|
Strategy: drop empties, then require ≥80% of remaining sample cells to
|
||||||
|
fit the type's hint regex. Boolean check runs first because ``0/1`` also
|
||||||
|
matches the currency regex; date/phone/currency next; address/name fall
|
||||||
|
back to header-name keywords because their cell shapes overlap with
|
||||||
|
plain free text.
|
||||||
|
"""
|
||||||
|
cells = [s.strip() for s in samples if isinstance(s, str) and s.strip()]
|
||||||
|
if not cells:
|
||||||
|
return None
|
||||||
|
n = len(cells)
|
||||||
|
threshold = max(1, int(n * 0.8))
|
||||||
|
|
||||||
|
bool_hits = sum(1 for c in cells if c.casefold() in _BOOL_TOKENS)
|
||||||
|
if bool_hits >= threshold:
|
||||||
|
return FieldType.BOOLEAN
|
||||||
|
|
||||||
|
date_hits = sum(1 for c in cells if _DATE_HINT_RE.match(c))
|
||||||
|
if date_hits >= threshold:
|
||||||
|
return FieldType.DATE
|
||||||
|
|
||||||
|
# Phone: digit-heavy, 7+ digits, no letters.
|
||||||
|
phone_hits = 0
|
||||||
|
for c in cells:
|
||||||
|
if _PHONE_HINT_RE.match(c) and sum(1 for ch in c if ch.isdigit()) >= 7:
|
||||||
|
phone_hits += 1
|
||||||
|
if phone_hits >= threshold:
|
||||||
|
return FieldType.PHONE
|
||||||
|
|
||||||
|
currency_hits = sum(1 for c in cells if _CURRENCY_HINT_RE.match(c))
|
||||||
|
if currency_hits >= threshold:
|
||||||
|
return FieldType.CURRENCY
|
||||||
|
|
||||||
|
header = col.lower()
|
||||||
|
if any(tok in header for tok in ("address", "addr", "street")):
|
||||||
|
return FieldType.ADDRESS
|
||||||
|
if any(tok in header for tok in ("name", "customer", "contact")):
|
||||||
|
return FieldType.NAME
|
||||||
|
if any(tok in header for tok in ("date", "dob", "birth", "joined", "created")):
|
||||||
|
return FieldType.DATE
|
||||||
|
if any(tok in header for tok in ("phone", "mobile", "tel")):
|
||||||
|
return FieldType.PHONE
|
||||||
|
if any(tok in header for tok in ("price", "amount", "cost", "total", "fee")):
|
||||||
|
return FieldType.CURRENCY
|
||||||
|
if any(tok in header for tok in ("active", "enabled", "is_", "has_", "flag")):
|
||||||
|
return FieldType.BOOLEAN
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Column types")
|
||||||
|
st.caption(
|
||||||
|
"Assign each column to a field type. Auto-detected suggestions are "
|
||||||
|
"pre-filled; pick **(skip)** to leave a column untouched."
|
||||||
|
)
|
||||||
|
|
||||||
|
_FIELD_LABELS = {
|
||||||
|
"(skip)": None,
|
||||||
|
"Date": FieldType.DATE,
|
||||||
|
"Phone": FieldType.PHONE,
|
||||||
|
"Currency": FieldType.CURRENCY,
|
||||||
|
"Name": FieldType.NAME,
|
||||||
|
"Address": FieldType.ADDRESS,
|
||||||
|
"Boolean": FieldType.BOOLEAN,
|
||||||
|
}
|
||||||
|
_LABEL_BY_TYPE = {v: k for k, v in _FIELD_LABELS.items()}
|
||||||
|
_LABELS = list(_FIELD_LABELS.keys())
|
||||||
|
|
||||||
|
sample_size = min(len(df), 200)
|
||||||
|
sample_df = df.head(sample_size)
|
||||||
|
|
||||||
|
column_types: dict[str, FieldType] = {}
|
||||||
|
cols_per_row = 3
|
||||||
|
columns_iter = list(df.columns)
|
||||||
|
for i in range(0, len(columns_iter), cols_per_row):
|
||||||
|
cols_block = st.columns(cols_per_row)
|
||||||
|
for j, col_name in enumerate(columns_iter[i:i + cols_per_row]):
|
||||||
|
with cols_block[j]:
|
||||||
|
detected = _detect_field_type(col_name, sample_df[col_name].tolist())
|
||||||
|
default_label = _LABEL_BY_TYPE.get(detected, "(skip)")
|
||||||
|
chosen = st.selectbox(
|
||||||
|
col_name,
|
||||||
|
_LABELS,
|
||||||
|
index=_LABELS.index(default_label),
|
||||||
|
key=f"fmtstd_type__{col_name}",
|
||||||
|
)
|
||||||
|
ft = _FIELD_LABELS[chosen]
|
||||||
|
if ft is not None:
|
||||||
|
column_types[col_name] = ft
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.subheader("Format options")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preset bundle picker
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Picking a preset rewrites every option below to that preset's defaults.
|
||||||
|
# It does NOT touch column-type assignments — those are user-driven and
|
||||||
|
# orthogonal. To make the rewrite stick across the rerun, we clear the
|
||||||
|
# per-option session keys whenever the preset changes; the widgets below
|
||||||
|
# then pick up the preset's defaults via their ``index``/``value`` arguments.
|
||||||
|
|
||||||
|
_PRESET_LABELS = {
|
||||||
|
"us-default": "US (default) — ISO 8601 dates · E.164 phones · USD",
|
||||||
|
"european": "European — DMY input · INTL phones · EUR comma decimal",
|
||||||
|
"uk": "UK — DD/MM/YYYY · GB phones · Yes/No booleans",
|
||||||
|
"iso-strict": "ISO Strict — ISO 8601 · bare-number currency · true/false",
|
||||||
|
"legacy-us": "Legacy US — MM/DD/YYYY · National phones · Yes/No",
|
||||||
|
"custom": "Custom — keep current settings",
|
||||||
|
}
|
||||||
|
|
||||||
|
preset_choice = st.radio(
|
||||||
|
"Standards preset",
|
||||||
|
list(_PRESET_LABELS.keys()),
|
||||||
|
format_func=lambda k: _PRESET_LABELS[k],
|
||||||
|
index=0,
|
||||||
|
horizontal=False,
|
||||||
|
key="fmtstd_preset",
|
||||||
|
help=(
|
||||||
|
"Pick a published standard or regional convention as the baseline. "
|
||||||
|
"Every option below is still individually overridable; choose "
|
||||||
|
"**Custom** to keep whatever you've manually adjusted."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Detect a preset switch since the last rerun; when it changes (and the
|
||||||
|
# new choice isn't ``custom``), purge the dependent widget keys so
|
||||||
|
# Streamlit lets their ``index=``/``value=`` defaults take effect on the
|
||||||
|
# new render. Without this clear, prior session_state pins the widget to
|
||||||
|
# the previous preset's choice and the apparent picker becomes a no-op.
|
||||||
|
_DEPENDENT_KEYS = [
|
||||||
|
"fmtstd_date_format", "fmtstd_date_order",
|
||||||
|
"fmtstd_phone_format", "fmtstd_phone_region",
|
||||||
|
"fmtstd_currency_decimal", "fmtstd_currency_decimals",
|
||||||
|
"fmtstd_currency_preserve", "fmtstd_currency_preserve_code",
|
||||||
|
"fmtstd_name_case", "fmtstd_bool_style",
|
||||||
|
]
|
||||||
|
_last = st.session_state.get("fmtstd_preset_last")
|
||||||
|
if _last != preset_choice:
|
||||||
|
st.session_state["fmtstd_preset_last"] = preset_choice
|
||||||
|
if preset_choice != "custom":
|
||||||
|
for k in _DEPENDENT_KEYS:
|
||||||
|
st.session_state.pop(k, None)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Map preset → widget-state defaults. Done as labels so the radios/selects
|
||||||
|
# below pick up the right index without us re-implementing each map twice.
|
||||||
|
_PRESET_TO_WIDGETS: dict[str, dict[str, str]] = {
|
||||||
|
"us-default": {
|
||||||
|
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
|
||||||
|
"phone_format": "E.164 (+15551234567)", "phone_region": "US",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": False,
|
||||||
|
"name_case": "Title Case", "boolean_style": "True/False",
|
||||||
|
},
|
||||||
|
"european": {
|
||||||
|
"date_format": "YYYY-MM-DD (ISO)", "date_order": "DMY (EU)",
|
||||||
|
"phone_format": "International (+1 555-123-4567)", "phone_region": "DE",
|
||||||
|
"currency_decimal": "comma (1.234,56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": True,
|
||||||
|
"name_case": "Title Case", "boolean_style": "True/False",
|
||||||
|
},
|
||||||
|
"uk": {
|
||||||
|
"date_format": "DD/MM/YYYY", "date_order": "DMY (EU)",
|
||||||
|
"phone_format": "International (+1 555-123-4567)", "phone_region": "GB",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": False,
|
||||||
|
"name_case": "Title Case", "boolean_style": "Yes/No",
|
||||||
|
},
|
||||||
|
"iso-strict": {
|
||||||
|
"date_format": "YYYY-MM-DD (ISO)", "date_order": "MDY (US)",
|
||||||
|
"phone_format": "E.164 (+15551234567)", "phone_region": "US",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 0,
|
||||||
|
"currency_preserve_code": True,
|
||||||
|
"name_case": "Title Case", "boolean_style": "true/false",
|
||||||
|
},
|
||||||
|
"legacy-us": {
|
||||||
|
"date_format": "MM/DD/YYYY", "date_order": "MDY (US)",
|
||||||
|
"phone_format": "National ((555) 123-4567)", "phone_region": "US",
|
||||||
|
"currency_decimal": "dot (1,234.56)", "currency_decimals": 2,
|
||||||
|
"currency_preserve_code": False,
|
||||||
|
"name_case": "Title Case", "boolean_style": "Yes/No",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
# ``iso-strict`` wants currency with no rounding; the GUI exposes that via
|
||||||
|
# the "preserve original precision" checkbox rather than a sentinel value
|
||||||
|
# in the number-input. Map that here.
|
||||||
|
_PRESET_PRESERVE_DECIMALS: dict[str, bool] = {
|
||||||
|
"iso-strict": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _preset_default(key: str, fallback):
    """Return the active preset's default for *key*.

    *fallback* is returned when the **Custom** preset is selected, or when
    the selected preset defines no value for *key*.
    """
    if preset_choice != "custom":
        preset_values = _PRESET_TO_WIDGETS[preset_choice]
        return preset_values.get(key, fallback)
    return fallback
|
||||||
|
|
||||||
|
|
||||||
|
opt_cols = st.columns(2)
|
||||||
|
with opt_cols[0]:
|
||||||
|
st.markdown("**Dates**")
|
||||||
|
_DATE_LABELS = ["YYYY-MM-DD (ISO)", "MM/DD/YYYY", "DD/MM/YYYY", "DD-Mon-YYYY", "Mon DD, YYYY"]
|
||||||
|
date_format_label = st.selectbox(
|
||||||
|
"Output format",
|
||||||
|
_DATE_LABELS,
|
||||||
|
index=_DATE_LABELS.index(_preset_default("date_format", "YYYY-MM-DD (ISO)")),
|
||||||
|
key="fmtstd_date_format",
|
||||||
|
)
|
||||||
|
date_format_map = {
|
||||||
|
"YYYY-MM-DD (ISO)": "%Y-%m-%d",
|
||||||
|
"MM/DD/YYYY": "%m/%d/%Y",
|
||||||
|
"DD/MM/YYYY": "%d/%m/%Y",
|
||||||
|
"DD-Mon-YYYY": "%d-%b-%Y",
|
||||||
|
"Mon DD, YYYY": "%b %d, %Y",
|
||||||
|
}
|
||||||
|
_DATE_ORDER_LABELS = ["MDY (US)", "DMY (EU)"]
|
||||||
|
date_order = st.radio(
|
||||||
|
"Ambiguous input order (e.g. 01/02/2024)",
|
||||||
|
_DATE_ORDER_LABELS,
|
||||||
|
index=_DATE_ORDER_LABELS.index(_preset_default("date_order", "MDY (US)")),
|
||||||
|
horizontal=True,
|
||||||
|
key="fmtstd_date_order",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.markdown("**Phones**")
|
||||||
|
_PHONE_LABELS = [
|
||||||
|
"E.164 (+15551234567)", "International (+1 555-123-4567)",
|
||||||
|
"National ((555) 123-4567)", "Digits only",
|
||||||
|
]
|
||||||
|
phone_format_label = st.selectbox(
|
||||||
|
"Output format",
|
||||||
|
_PHONE_LABELS,
|
||||||
|
index=_PHONE_LABELS.index(_preset_default("phone_format", "E.164 (+15551234567)")),
|
||||||
|
key="fmtstd_phone_format",
|
||||||
|
)
|
||||||
|
phone_format_map = {
|
||||||
|
"E.164 (+15551234567)": "E164",
|
||||||
|
"International (+1 555-123-4567)": "INTERNATIONAL",
|
||||||
|
"National ((555) 123-4567)": "NATIONAL",
|
||||||
|
"Digits only": "DIGITS",
|
||||||
|
}
|
||||||
|
phone_region = st.text_input(
|
||||||
|
"Default region (ISO-2)",
|
||||||
|
value=_preset_default("phone_region", "US"),
|
||||||
|
max_chars=2,
|
||||||
|
help="Region used when the input has no country code. ``US``, ``GB``, ``DE``, etc.",
|
||||||
|
key="fmtstd_phone_region",
|
||||||
|
).upper() or "US"
|
||||||
|
|
||||||
|
with opt_cols[1]:
|
||||||
|
st.markdown("**Currency**")
|
||||||
|
_CURR_DECIMAL_LABELS = ["dot (1,234.56)", "comma (1.234,56)"]
|
||||||
|
currency_decimal = st.radio(
|
||||||
|
"Decimal separator in input",
|
||||||
|
_CURR_DECIMAL_LABELS,
|
||||||
|
index=_CURR_DECIMAL_LABELS.index(_preset_default("currency_decimal", "dot (1,234.56)")),
|
||||||
|
horizontal=True,
|
||||||
|
key="fmtstd_currency_decimal",
|
||||||
|
)
|
||||||
|
currency_decimals = st.number_input(
|
||||||
|
"Round to decimals",
|
||||||
|
min_value=0, max_value=8,
|
||||||
|
value=int(_preset_default("currency_decimals", 2)),
|
||||||
|
step=1,
|
||||||
|
key="fmtstd_currency_decimals",
|
||||||
|
)
|
||||||
|
preserve_decimals = st.checkbox(
|
||||||
|
"Preserve original precision (don't round)",
|
||||||
|
value=_PRESET_PRESERVE_DECIMALS.get(preset_choice, False),
|
||||||
|
key="fmtstd_currency_preserve",
|
||||||
|
)
|
||||||
|
currency_preserve_code = st.checkbox(
|
||||||
|
"Preserve currency code (emit `USD 1234.56`, `EUR 99.00`, etc.)",
|
||||||
|
value=bool(_preset_default("currency_preserve_code", False)),
|
||||||
|
help=(
|
||||||
|
"Detects an ISO 4217 code or symbol in the input ($/€/£/¥/USD/"
|
||||||
|
"EUR/...) and re-emits it as a space-separated prefix on the "
|
||||||
|
"standardized number. Cells without a currency marker emit "
|
||||||
|
"just the number."
|
||||||
|
),
|
||||||
|
key="fmtstd_currency_preserve_code",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.markdown("**Names**")
|
||||||
|
_NAME_CASE_LABELS = ["Title Case", "UPPER", "lower"]
|
||||||
|
name_case_label = st.selectbox(
|
||||||
|
"Casing",
|
||||||
|
_NAME_CASE_LABELS,
|
||||||
|
index=_NAME_CASE_LABELS.index(_preset_default("name_case", "Title Case")),
|
||||||
|
key="fmtstd_name_case",
|
||||||
|
)
|
||||||
|
name_case_map = {"Title Case": "title", "UPPER": "upper", "lower": "lower"}
|
||||||
|
|
||||||
|
st.markdown("**Booleans**")
|
||||||
|
_BOOL_LABELS = ["True/False", "true/false", "Yes/No", "Y/N", "1/0"]
|
||||||
|
boolean_style = st.selectbox(
|
||||||
|
"Output style",
|
||||||
|
_BOOL_LABELS,
|
||||||
|
index=_BOOL_LABELS.index(_preset_default("boolean_style", "True/False")),
|
||||||
|
key="fmtstd_bool_style",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Address abbreviations — built-in USPS table is editable
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Users with international addresses (German Strasse, Spanish-language
|
||||||
|
# Avenida, French Boulevard variants) need to override the built-in
|
||||||
|
# table. Show it in a data_editor so the override is visible — the table
|
||||||
|
# is small, this is the right surface.
|
||||||
|
|
||||||
|
extra_abbreviations: dict[str, str] = {}
|
||||||
|
if any(ft == FieldType.ADDRESS for ft in column_types.values()):
|
||||||
|
with st.expander("Custom address abbreviations (advanced)", expanded=False):
|
||||||
|
st.caption(
|
||||||
|
"Add or override entries in the address abbreviation table. "
|
||||||
|
"Each row maps a short form (case-insensitive, periods OK) to "
|
||||||
|
"the long form the standardizer should emit. Built-in USPS "
|
||||||
|
"Pub. 28 entries (`St` → `Street`, `Ave` → `Avenue`, …) apply "
|
||||||
|
"automatically; rows here merge on top and can override them."
|
||||||
|
)
|
||||||
|
starter = pd.DataFrame(
|
||||||
|
[
|
||||||
|
{"abbreviation": "", "expansion": ""},
|
||||||
|
{"abbreviation": "", "expansion": ""},
|
||||||
|
{"abbreviation": "", "expansion": ""},
|
||||||
|
]
|
||||||
|
)
|
||||||
|
edited = st.data_editor(
|
||||||
|
starter,
|
||||||
|
num_rows="dynamic",
|
||||||
|
use_container_width=True,
|
||||||
|
column_config={
|
||||||
|
"abbreviation": st.column_config.TextColumn(
|
||||||
|
"Short form",
|
||||||
|
help="Case-insensitive, trailing period optional. e.g. ``Strasse``",
|
||||||
|
),
|
||||||
|
"expansion": st.column_config.TextColumn(
|
||||||
|
"Long form",
|
||||||
|
help="What the standardizer emits. e.g. ``Straße``",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
key="fmtstd_extra_abbrev",
|
||||||
|
)
|
||||||
|
for _, row in edited.iterrows():
|
||||||
|
k = str(row.get("abbreviation") or "").strip()
|
||||||
|
v = str(row.get("expansion") or "").strip()
|
||||||
|
if k and v:
|
||||||
|
extra_abbreviations[k] = v
|
||||||
|
if extra_abbreviations:
|
||||||
|
st.success(
|
||||||
|
f"{len(extra_abbreviations)} custom mapping(s) will merge "
|
||||||
|
"with the built-in table."
|
||||||
|
)
|
||||||
|
|
||||||
|
options = StandardizeOptions(
|
||||||
|
column_types=column_types,
|
||||||
|
date_output_format=date_format_map[date_format_label],
|
||||||
|
date_order="MDY" if date_order.startswith("MDY") else "DMY",
|
||||||
|
phone_format=phone_format_map[phone_format_label], # type: ignore[arg-type]
|
||||||
|
phone_region=phone_region,
|
||||||
|
currency_decimal="dot" if currency_decimal.startswith("dot") else "comma",
|
||||||
|
currency_decimals=None if preserve_decimals else int(currency_decimals),
|
||||||
|
currency_preserve_code=currency_preserve_code,
|
||||||
|
name_case=name_case_map[name_case_label], # type: ignore[arg-type]
|
||||||
|
boolean_style=boolean_style, # type: ignore[arg-type]
|
||||||
|
extra_abbreviations=extra_abbreviations,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Run
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
if not column_types:
|
||||||
|
st.warning("Pick a field type for at least one column to enable standardization.")
|
||||||
|
|
||||||
|
run_disabled = not column_types
|
||||||
|
if st.button(
|
||||||
|
"Standardize Formats",
|
||||||
|
type="primary",
|
||||||
|
use_container_width=True,
|
||||||
|
disabled=run_disabled,
|
||||||
|
):
|
||||||
|
with st.spinner("Standardizing..."):
|
||||||
|
try:
|
||||||
|
result = standardize_dataframe(df, options)
|
||||||
|
except ValueError as e:
|
||||||
|
st.error(str(e))
|
||||||
|
st.stop()
|
||||||
|
st.session_state["fmtstd_result"] = result
|
||||||
|
st.session_state["fmtstd_input_name"] = uploaded.name
|
||||||
|
|
||||||
|
result = st.session_state.get("fmtstd_result")
|
||||||
|
if result is None:
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Results
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.subheader("Results")
|
||||||
|
|
||||||
|
pct = (result.cells_changed / result.cells_total * 100.0) if result.cells_total else 0.0
|
||||||
|
m1, m2, m3, m4 = st.columns(4)
|
||||||
|
m1.metric("Cells scanned", result.cells_total)
|
||||||
|
m2.metric("Cells changed", result.cells_changed)
|
||||||
|
m3.metric("% changed", f"{pct:.1f}%")
|
||||||
|
m4.metric("Unparseable", result.cells_unparseable)
|
||||||
|
|
||||||
|
if result.cells_unparseable:
|
||||||
|
st.info(
|
||||||
|
f"{result.cells_unparseable} cell(s) in typed columns didn't match a "
|
||||||
|
"recognizable shape and were left as-is. Check the changes audit "
|
||||||
|
"below to find them, or re-classify the column to **(skip)**."
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.cells_changed:
|
||||||
|
counts = result.changes.groupby(["column", "field_type"]).size()
|
||||||
|
st.markdown("**Changes by column**")
|
||||||
|
st.dataframe(
|
||||||
|
counts.rename("cells_changed").to_frame(),
|
||||||
|
use_container_width=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
st.markdown("**Examples (first 25 changes)**")
|
||||||
|
examples = result.changes.head(25).copy()
|
||||||
|
examples["row"] = examples["row"] + 1
|
||||||
|
st.dataframe(examples, use_container_width=True, hide_index=True)
|
||||||
|
|
||||||
|
st.markdown("**Standardized preview (first 10 rows)**")
|
||||||
|
st.dataframe(result.standardized_df.head(10), use_container_width=True)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Downloads
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
stem = Path(st.session_state.get("fmtstd_input_name", "input")).stem
|
||||||
|
|
||||||
|
dl_a, dl_b, dl_c = st.columns(3)
|
||||||
|
with dl_a:
|
||||||
|
standardized_bytes = result.standardized_df.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
st.download_button(
|
||||||
|
"Download standardized CSV",
|
||||||
|
data=standardized_bytes,
|
||||||
|
file_name=f"{stem}_standardized.csv",
|
||||||
|
mime="text/csv",
|
||||||
|
)
|
||||||
|
with dl_b:
|
||||||
|
if not result.changes.empty:
|
||||||
|
changes_bytes = result.changes.to_csv(index=False).encode("utf-8-sig")
|
||||||
|
st.download_button(
|
||||||
|
"Download changes audit",
|
||||||
|
data=changes_bytes,
|
||||||
|
file_name=f"{stem}_changes.csv",
|
||||||
|
mime="text/csv",
|
||||||
|
)
|
||||||
|
with dl_c:
|
||||||
|
config_bytes = json.dumps(options.to_dict(), indent=2).encode("utf-8")
|
||||||
|
st.download_button(
|
||||||
|
"Download config JSON",
|
||||||
|
data=config_bytes,
|
||||||
|
file_name="format_standardize_config.json",
|
||||||
|
mime="application/json",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption("Runs locally. Your data never leaves this computer. | DataTools v3.0")
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ TOOLS: list[Tool] = [
|
|||||||
"Standardize dates, currencies, names, phone numbers, and addresses."
|
"Standardize dates, currencies, names, phone numbers, and addresses."
|
||||||
),
|
),
|
||||||
page_slug="3_Format_Standardizer",
|
page_slug="3_Format_Standardizer",
|
||||||
status="Coming Soon",
|
status="Ready",
|
||||||
),
|
),
|
||||||
Tool(
|
Tool(
|
||||||
tool_id="04_missing_handler",
|
tool_id="04_missing_handler",
|
||||||
|
|||||||
46
test-cases/format-cleaner-corpus/24_format_dates.csv
Normal file
46
test-cases/format-cleaner-corpus/24_format_dates.csv
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
case_id,category,description,input
|
||||||
|
FD01,iso,ISO date plain,2024-01-15
|
||||||
|
FD02,iso,ISO datetime no zone,2024-01-15T10:30:00
|
||||||
|
FD03,iso,ISO datetime UTC,2024-01-15T10:30:00Z
|
||||||
|
FD04,iso,ISO datetime offset,2024-01-15T10:30:00+05:00
|
||||||
|
FD05,iso,ISO datetime with millis,2024-01-15T10:30:00.123Z
|
||||||
|
FD06,iso,ISO datetime space separator,2024-01-15 10:30:00
|
||||||
|
FD07,us,US slash 4-digit year,01/15/2024
|
||||||
|
FD08,us,US slash 2-digit year,1/15/24
|
||||||
|
FD09,us,US slash no leading zero,1/5/2024
|
||||||
|
FD10,us,US slash unambiguous (day > 12),5/30/2024
|
||||||
|
FD11,eu,EU dot 4-digit year,15.01.2024
|
||||||
|
FD12,eu,EU dot 2-digit year,15.01.24
|
||||||
|
FD13,eu,EU slash 4-digit year,15/01/2024
|
||||||
|
FD14,eu,EU slash unambiguous (day > 12),30/05/2024
|
||||||
|
FD15,eu,EU dash format,15-01-2024
|
||||||
|
FD16,longform,Month name long,"January 15, 2024"
|
||||||
|
FD17,longform,Month name short,"Jan 15, 2024"
|
||||||
|
FD18,longform,Day-month-year long,15 January 2024
|
||||||
|
FD19,longform,Day-month-year short,15 Jan 2024
|
||||||
|
FD20,longform,With weekday,"Monday, January 15, 2024"
|
||||||
|
FD21,longform,All caps month,JAN 15 2024
|
||||||
|
FD22,excel,Excel serial date,45306
|
||||||
|
FD23,excel,Excel serial with fractional time,45306.4375
|
||||||
|
FD24,unix,Unix timestamp seconds,1705320000
|
||||||
|
FD25,unix,Unix timestamp milliseconds,1705320000000
|
||||||
|
FD26,partial,Year-month only ISO,2024-01
|
||||||
|
FD27,partial,Year-month text,January 2024
|
||||||
|
FD28,partial,Quarter notation,Q1 2024
|
||||||
|
FD29,partial,Year only,2024
|
||||||
|
FD30,edge,Two-digit year ambiguity (1969 vs 2069),1/15/69
|
||||||
|
FD31,edge,Leap day valid,2024-02-29
|
||||||
|
FD32,edge,Leap day invalid (not a leap year),2023-02-29
|
||||||
|
FD33,edge,Excel 1900 leap year bug,1900-02-29
|
||||||
|
FD34,edge,Invalid month,2024-13-15
|
||||||
|
FD35,edge,Invalid day,2024-04-31
|
||||||
|
FD36,edge,Date with extraneous text,Date: 2024-01-15
|
||||||
|
FD37,edge,Date in parens annotation,2024-01-15 (verified)
|
||||||
|
FD38,edge,Empty,
|
||||||
|
FD39,edge,Whitespace-only,
|
||||||
|
FD40,edge,Garbage,not a date
|
||||||
|
FD41,locale,French month name,15 janvier 2024
|
||||||
|
FD42,locale,German month name,15. Januar 2024
|
||||||
|
FD43,timezone,Datetime with named tz,2024-01-15 10:30:00 EST
|
||||||
|
FD44,timezone,Datetime with offset and DST ambiguity,2024-03-10 02:30:00-05:00
|
||||||
|
FD45,padding,Already-clean: pass through,2024-01-15
|
||||||
|
32
test-cases/format-cleaner-corpus/25_format_phones.csv
Normal file
32
test-cases/format-cleaner-corpus/25_format_phones.csv
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
case_id,category,description,input
|
||||||
|
FP01,us,Plain digits 10,5551234567
|
||||||
|
FP02,us,Standard formatting,(555) 123-4567
|
||||||
|
FP03,us,Dashes,555-123-4567
|
||||||
|
FP04,us,Dots,555.123.4567
|
||||||
|
FP05,us,Spaces,555 123 4567
|
||||||
|
FP06,us,With country code +1,+1 555 123 4567
|
||||||
|
FP07,us,With country code 1- prefix,1-555-123-4567
|
||||||
|
FP08,us,With 001 prefix,001 555 123 4567
|
||||||
|
FP09,ext,Extension ext keyword,555-123-4567 ext 123
|
||||||
|
FP10,ext,Extension x abbreviation,555-123-4567 x123
|
||||||
|
FP11,ext,Extension hash,555-123-4567 #123
|
||||||
|
FP12,vanity,Vanity number 1-800-FLOWERS,1-800-FLOWERS
|
||||||
|
FP13,vanity,Mixed letters and digits,555-CALL-NOW
|
||||||
|
FP14,intl,UK with +44,+44 20 7946 0958
|
||||||
|
FP15,intl,UK domestic,020 7946 0958
|
||||||
|
FP16,intl,Germany with +49,+49 30 12345678
|
||||||
|
FP17,intl,France with +33,+33 1 23 45 67 89
|
||||||
|
FP18,intl,Japan with +81,+81-3-1234-5678
|
||||||
|
FP19,intl,Australia with +61,+61 2 1234 5678
|
||||||
|
FP20,e164,Already E.164 format,+15551234567
|
||||||
|
FP21,edge,Too few digits (local-only),555-1234
|
||||||
|
FP22,edge,Too many digits,1-555-123-4567-extra-99
|
||||||
|
FP23,edge,All-zeros placeholder,000-000-0000
|
||||||
|
FP24,edge,All-nines placeholder,999-999-9999
|
||||||
|
FP25,edge,Multiple numbers in cell,555-123-4567 / 555-987-6543
|
||||||
|
FP26,edge,Mismatched parens,555-(123)-4567
|
||||||
|
FP27,edge,NBSP in number,555 123 4567
|
||||||
|
FP28,edge,Very spaced,5 5 5 1 2 3 4 5 6 7
|
||||||
|
FP29,edge,Empty,
|
||||||
|
FP30,edge,Non-phone string,TBD
|
||||||
|
FP31,edge,Smart-apostrophe contamination,555’s 123-4567
|
||||||
|
32
test-cases/format-cleaner-corpus/26_format_emails.csv
Normal file
32
test-cases/format-cleaner-corpus/26_format_emails.csv
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
case_id,category,description,input
|
||||||
|
FE01,basic,Plain ASCII,alice@example.com
|
||||||
|
FE02,basic,Mixed case,Alice@Example.COM
|
||||||
|
FE03,basic,All caps,ALICE@EXAMPLE.COM
|
||||||
|
FE04,basic,Whitespace padding, alice@example.com
|
||||||
|
FE05,displayname,Display name no quotes,Alice Smith <alice@example.com>
|
||||||
|
FE06,displayname,Display name with quotes,"""Alice Smith"" <alice@example.com>"
|
||||||
|
FE07,displayname,Wrapped in angle brackets only,<alice@example.com>
|
||||||
|
FE08,prefix,mailto: prefix,mailto:alice@example.com
|
||||||
|
FE09,prefix,MAILTO: caps,MAILTO:Alice@Example.com
|
||||||
|
FE10,gmail,Gmail with dots,a.l.i.c.e@gmail.com
|
||||||
|
FE11,gmail,Gmail with +tag,alice+newsletter@gmail.com
|
||||||
|
FE12,gmail,Gmail with both,a.l.i.c.e+work@gmail.com
|
||||||
|
FE13,gmail,Non-Gmail with dots (don't touch),a.l.i.c.e@example.com
|
||||||
|
FE14,gmail,Non-Gmail with +tag (don't touch),alice+newsletter@example.com
|
||||||
|
FE15,idn,Unicode in domain,alice@münchen.de
|
||||||
|
FE16,idn,Unicode in local,アリス@example.jp
|
||||||
|
FE17,trailing,Trailing comma,"alice@example.com,"
|
||||||
|
FE18,trailing,Trailing period,alice@example.com.
|
||||||
|
FE19,trailing,Trailing closing paren,alice@example.com)
|
||||||
|
FE20,trailing,Trailing semicolon,alice@example.com;
|
||||||
|
FE21,smartquote,Wrapped in curly quotes,“alice@example.com”
|
||||||
|
FE22,invalid,Missing @,aliceexample.com
|
||||||
|
FE23,invalid,Double @,alice@@example.com
|
||||||
|
FE24,invalid,Multiple @,alice@example@com
|
||||||
|
FE25,invalid,Spaces inside,alice @ example.com
|
||||||
|
FE26,invalid,TLD-less local network,alice@localhost
|
||||||
|
FE27,multiple,Two comma-separated,"alice@example.com, bob@example.com"
|
||||||
|
FE28,multiple,Two semicolon-separated,alice@example.com; bob@example.com
|
||||||
|
FE29,edge,Empty,
|
||||||
|
FE30,edge,Whitespace-only,
|
||||||
|
FE31,edge,Already perfect,alice@example.com
|
||||||
|
34
test-cases/format-cleaner-corpus/27_format_addresses.csv
Normal file
34
test-cases/format-cleaner-corpus/27_format_addresses.csv
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
case_id,category,description,input
|
||||||
|
FA01,clean,Already USPS-formatted,"123 Main St, New York, NY 10001"
|
||||||
|
FA02,case,All caps,"123 MAIN STREET, NEW YORK, NY 10001"
|
||||||
|
FA03,case,All lowercase,"123 main street, new york, ny 10001"
|
||||||
|
FA04,case,Mixed case (preserve),"123 Main Street, New York, NY 10001"
|
||||||
|
FA05,abbrev,Street spelled out,"123 Main Street, New York, NY 10001"
|
||||||
|
FA06,abbrev,Avenue spelled out,"456 Park Avenue, New York, NY 10001"
|
||||||
|
FA07,abbrev,Boulevard spelled out,"789 Sunset Boulevard, Los Angeles, CA 90028"
|
||||||
|
FA08,abbrev,St with period,"123 Main St., New York, NY 10001"
|
||||||
|
FA09,directional,North spelled out,"123 North Main St, City, ST 12345"
|
||||||
|
FA10,directional,NORTH all caps,"123 NORTH Main St, City, ST 12345"
|
||||||
|
FA11,directional,NE compound,"123 NE Main St, City, ST 12345"
|
||||||
|
FA12,unit,Apartment spelled out,"123 Main St, Apartment 4B, City, ST 12345"
|
||||||
|
FA13,unit,Hash sign,"123 Main St, # 4B, City, ST 12345"
|
||||||
|
FA14,unit,Suite spelled out,"123 Main St, Suite 200, City, ST 12345"
|
||||||
|
FA15,state,State spelled out,"123 Main St, New York, New York 10001"
|
||||||
|
FA16,state,State all caps spelled out,"123 Main St, New York, NEW YORK 10001"
|
||||||
|
FA17,zip,ZIP+4,"123 Main St, New York, NY 10001-1234"
|
||||||
|
FA18,zip,Leading-zero ZIP (MA),"123 Main St, Boston, MA 02101"
|
||||||
|
FA19,multiline,Multi-line address,"123 Main St
|
||||||
|
Apt 4B
|
||||||
|
New York, NY 10001"
|
||||||
|
FA20,pobox,PO Box with periods,"P.O. Box 123, City, ST 12345"
|
||||||
|
FA21,pobox,PO Box without periods,"PO Box 123, City, ST 12345"
|
||||||
|
FA22,pobox,Post Office Box spelled out,"Post Office Box 123, City, ST 12345"
|
||||||
|
FA23,housenum,Letter suffix,"123A Main St, City, ST 12345"
|
||||||
|
FA24,housenum,Hyphen number,"123-1 Main St, City, ST 12345"
|
||||||
|
FA25,housenum,Half number,"123 1/2 Main St, City, ST 12345"
|
||||||
|
FA26,non_us,UK postcode address,"10 Downing Street, London, SW1A 2AA"
|
||||||
|
FA27,non_us,Canada postal code,"1 Yonge St, Toronto, ON M5E 1W7"
|
||||||
|
FA28,non_us,Japan reverse-order,"100-0001, Tokyo, Chiyoda, Marunouchi 1-1"
|
||||||
|
FA29,edge,Empty,
|
||||||
|
FA30,edge,Just a city,New York
|
||||||
|
FA31,edge,Trailing comma,"123 Main St, New York, NY 10001,"
|
||||||
|
35
test-cases/format-cleaner-corpus/28_format_names.csv
Normal file
35
test-cases/format-cleaner-corpus/28_format_names.csv
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
case_id,category,description,input
|
||||||
|
FN01,case,All caps,ALICE SMITH
|
||||||
|
FN02,case,All lowercase,alice smith
|
||||||
|
FN03,case,Already title case (preserve),Alice Smith
|
||||||
|
FN04,case,Random case (preserve),aLiCe SmItH
|
||||||
|
FN05,scots,McDonald lowercase,mcdonald
|
||||||
|
FN06,scots,MCDONALD all caps,MCDONALD
|
||||||
|
FN07,scots,MacDonald,macdonald
|
||||||
|
FN08,scots,McTaggart already correct,McTaggart
|
||||||
|
FN09,irish,O'Connor lowercase,o'connor
|
||||||
|
FN10,irish,O'CONNOR all caps,O'CONNOR
|
||||||
|
FN11,irish,O'Brien preserve,O'Brien
|
||||||
|
FN12,hyphen,Mary-Jane lowercase,mary-jane smith
|
||||||
|
FN13,hyphen,Smith-Jones,smith-jones
|
||||||
|
FN14,particle,von Trapp,von trapp
|
||||||
|
FN15,particle,Vincent van Gogh,vincent van gogh
|
||||||
|
FN16,particle,Charles de Gaulle,charles de gaulle
|
||||||
|
FN17,particle,Leonardo da Vinci,leonardo da vinci
|
||||||
|
FN18,title,Mr period,Mr. John Smith
|
||||||
|
FN19,title,DR caps,DR JANE DOE
|
||||||
|
FN20,title,Prof preserve,Prof Alice Williams
|
||||||
|
FN21,suffix,Jr period,John Smith Jr.
|
||||||
|
FN22,suffix,III roman numeral,John Smith III
|
||||||
|
FN23,suffix,PhD,Jane Doe PhD
|
||||||
|
FN24,comma,"Last, First","Smith, John"
|
||||||
|
FN25,comma,"LAST, FIRST","SMITH, JOHN"
|
||||||
|
FN26,comma,"Last, First Middle","Smith, John Andrew"
|
||||||
|
FN27,initial,Middle initial,John A. Smith
|
||||||
|
FN28,initial,Multi-initial author,j.k. rowling
|
||||||
|
FN29,nonlatin,Korean,김철수
|
||||||
|
FN30,nonlatin,Japanese,田中太郎
|
||||||
|
FN31,nonlatin,Russian,Иван Иванов
|
||||||
|
FN32,edge,Single name,Madonna
|
||||||
|
FN33,edge,Empty,
|
||||||
|
FN34,edge,Whitespace-only,
|
||||||
|
28
test-cases/format-cleaner-corpus/29_format_currencies.csv
Normal file
28
test-cases/format-cleaner-corpus/29_format_currencies.csv
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
case_id,category,description,input
|
||||||
|
FC01,us,Standard US dollar,"$1,234.56"
|
||||||
|
FC02,us,US no comma,$1234.56
|
||||||
|
FC03,us,US space after symbol,"$ 1,234.56"
|
||||||
|
FC04,us,US no symbol,"1,234.56"
|
||||||
|
FC05,us,US with code suffix,"1,234.56 USD"
|
||||||
|
FC06,us,US with code prefix,"USD 1,234.56"
|
||||||
|
FC07,us,US trailing symbol,1234.56$
|
||||||
|
FC08,eu,Euro standard,"€1.234,56"
|
||||||
|
FC09,eu,Euro space thousand,"€1 234,56"
|
||||||
|
FC10,eu,Euro code suffix,"1.234,56 EUR"
|
||||||
|
FC11,eu,Swiss apostrophe thousand,1'234.56
|
||||||
|
FC12,intl,GBP,"£1,234.56"
|
||||||
|
FC13,intl,JPY no decimal,"¥1,234"
|
||||||
|
FC14,intl,Indian rupees lakhs,"₹1,23,456.78"
|
||||||
|
FC15,negative,Leading minus,-$100.00
|
||||||
|
FC16,negative,Accounting parens,($100.00)
|
||||||
|
FC17,negative,Sign after symbol,$-100.00
|
||||||
|
FC18,edge,Zero,$0.00
|
||||||
|
FC19,edge,Scientific notation,1.5e6
|
||||||
|
FC20,edge,Percentage,15.5%
|
||||||
|
FC21,edge,Range (not normalizable),$50-$100
|
||||||
|
FC22,edge,Word value,Free
|
||||||
|
FC23,edge,TBD placeholder,TBD
|
||||||
|
FC24,edge,Empty,
|
||||||
|
FC25,edge,Already clean,1234.56
|
||||||
|
FC26,ambig,"1,234 - could be US 1234 or EU 1.234","1,234"
|
||||||
|
FC27,ambig,1.234 - could be US 1.234 or EU 1234,1.234
|
||||||
|
@@ -0,0 +1,6 @@
|
|||||||
|
case_id,name,email,phone,date,amount,address
|
||||||
|
FI01,ALICE SMITH,Alice@Example.COM,(555) 123-4567,1/15/24,"$1,234.56","123 main street, new york, ny 10001"
|
||||||
|
FI02,"mcdonald, john",mailto:John@gmail.com,+44 20 7946 0958,15.01.2024,"€1.234,56","10 DOWNING STREET, LONDON, SW1A 2AA"
|
||||||
|
FI03,DR JANE DOE PHD,"""Jane Doe"" <jane@example.com>",555-1234,"Jan 15, 2024",($100.00),"456 Park Avenue, Apt 12, New York, NEW YORK 10001"
|
||||||
|
FI04,,,,,,
|
||||||
|
FI05,Already Clean,alice@example.com,+15551234567,2024-01-15,1234.56,"123 Main St, New York, NY 10001"
|
||||||
|
513
test-cases/format-cleaner-corpus/FORMATS-CASES.md
Normal file
513
test-cases/format-cleaner-corpus/FORMATS-CASES.md
Normal file
@@ -0,0 +1,513 @@
|
|||||||
|
# FORMATS-CASES.md - `03_format_standardizer.py` Test Corpus
|
||||||
|
|
||||||
|
**Version**: 1.0
|
||||||
|
**Last updated**: April 30, 2026
|
||||||
|
**Companion to**: TEST-CASES.md (cleaning rules), QUOTE-CASES.md (parser robustness), ENCODINGS-CASES.md (I/O layer).
|
||||||
|
|
||||||
|
This corpus tests `03_format_standardizer.py`, which owns "what's there but in the wrong format." Six domains: dates, phones, emails, addresses, names, currencies. Plus a cross-domain integration fixture.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Scope clarifications you should read first
|
||||||
|
|
||||||
|
Three issues to surface before the per-domain sections, because they affect what tests are valid in the first place.
|
||||||
|
|
||||||
|
### 0.1 Email scope conflict with TECHNICAL.md
|
||||||
|
|
||||||
|
USER-GUIDE.md Section 2 lists 03's purpose as "dates, currencies, names, phone numbers, addresses." TECHNICAL.md Section 10.1 item 8 puts email normalization inside `01_deduplicator`'s Tier 1 spec. **Email appears in neither place as part of 03.**
|
||||||
|
|
||||||
|
This corpus tests email normalization as if it lives in 03. The reasoning: 03 is the "format standardizer" and email is a format like any other. Putting it in 01 means there's no public API for the buyer to normalize emails outside of running dedup, which is awkward ergonomics for the GUI ("To clean my emails I have to run the deduplicator?"). Better factoring: 03 owns email normalization as a public operation; 01 calls into the same `core/` function for matching.
|
||||||
|
|
||||||
|
If you disagree, fixture `26_format_emails.csv` and its expected output drop out cleanly without affecting the other five domains. If you agree, update USER-GUIDE.md Section 2 and TECHNICAL.md Section 7's per-bundle technical notes.
|
||||||
|
|
||||||
|
### 0.2 Schema preservation rule (TECHNICAL.md Section 9 invariant)
|
||||||
|
|
||||||
|
03 changes cell content, never schema. Row count, column count, column order all unchanged. This rules out a few tempting designs:
|
||||||
|
|
||||||
|
- Currency normalization that splits `$1,234.56` into separate amount and currency columns — **rejected**. Output stays in one cell.
|
||||||
|
- Address normalization that splits a single-line address into structured street/city/state/zip columns — **rejected**. Output stays in one cell.
|
||||||
|
- Phone normalization that splits phone + extension into two columns — **rejected**. Extension goes inline as `;ext=123` (RFC 3966 syntax).
|
||||||
|
|
||||||
|
If you want structured output, that's a different script (a parser, not a standardizer).
|
||||||
|
|
||||||
|
### 0.3 Boundary with neighboring scripts
|
||||||
|
|
||||||
|
| If the cell is... | Owner | 03's behavior |
|
||||||
|
|---|---|---|
|
||||||
|
| Empty string | 04 (missing values) | Pass through unchanged. Don't decide if it means "missing." |
|
||||||
|
| Whitespace-only | 02 (text cleaner) | Should already be empty by the time 03 sees it. If not (CLI user skipped 02), trim defensively. |
|
||||||
|
| Statistically extreme but format-valid (date in year 1700, phone with 10 zeros) | 06 (outliers) | Format-normalize anyway. Don't flag unusual values. |
|
||||||
|
| Format-invalid (Feb 30, missing @, letters in numeric) | 03 | Emit error sentinel `<error: reason>`, where `reason` is a short description of the failure. |
|
||||||
|
| Already correctly formatted | 03 | Pass through. Idempotency required. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Default configuration
|
||||||
|
|
||||||
|
Tests assume the defaults below. Per-flag deviations are called out per case.
|
||||||
|
|
||||||
|
| Setting | Default | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `--date-format` | ISO 8601 | `YYYY-MM-DD` for dates, `YYYY-MM-DDTHH:MM:SS[±HH:MM]` for datetimes (UTC offset only when the input carries one) |
|
||||||
|
| `--locale` | auto-detect | Per-column. Falls back to error if column has no disambiguating value |
|
||||||
|
| `--two-digit-year-cutoff` | 69 | Python default: years 00-68 → 2000-2068, 69-99 → 1969-1999 |
|
||||||
|
| `--phone-format` | E.164 | `+<country><digits>`, extensions via `;ext=` |
|
||||||
|
| `--default-country` | US | Used for phones with no country code |
|
||||||
|
| `--gmail-canonical` | off | Strip Gmail dots and +tags. Destructive, opt-in |
|
||||||
|
| `--expand-abbrev` | off | Expand St → Street etc. USPS abbreviation is the default |
|
||||||
|
| `--name-conservative` | on | Title-case only ALL CAPS or all-lowercase input |
|
||||||
|
| `--currency-locale` | auto-detect | Per-column. Same fallback as date locale |
|
||||||
|
| `--error-policy` | sentinel | Errors written as `<error: reason>`. Alternative: raise, skip-row |
|
||||||
|
| `--columns` | all | All text columns processed; `--columns date,phone` restricts |
|
||||||
|
|
||||||
|
**Idempotency requirement**: `format(format(x)) == format(x)` for every cell. Already-clean input passes through unchanged.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Test corpus index
|
||||||
|
|
||||||
|
| File | Domain | Cases | Expected outputs |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `24_format_dates.csv` | Dates | 45 | Single column |
|
||||||
|
| `25_format_phones.csv` | Phones | 31 | Single column |
|
||||||
|
| `26_format_emails.csv` | Emails | 31 | Two columns (default + gmail-canonical) |
|
||||||
|
| `27_format_addresses.csv` | Addresses | 31 | Two columns (default + expand-abbrev) |
|
||||||
|
| `28_format_names.csv` | Names | 34 | Single column |
|
||||||
|
| `29_format_currencies.csv` | Currencies | 27 | Single column |
|
||||||
|
| `30_format_integration.csv` | Cross-domain | 5 | Multi-column (full row) |
|
||||||
|
|
||||||
|
All input fixtures share the schema `case_id, category, description, input` (except integration, which has the full multi-column shape). Expected output files key by `case_id` for diff-by-join testing.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. DATES (`24_format_dates.csv`)
|
||||||
|
|
||||||
|
### 3.1 Use cases by buyer persona
|
||||||
|
|
||||||
|
- **Shopify**: Order export dates joined against manual entries that used a different format. Bookkeeping reports needing consistent date format for sorting.
|
||||||
|
- **Bookkeeper**: Bank export reconciliation across multiple banks, each using its own date convention. Tax reports requiring consistent year-month grouping.
|
||||||
|
- **Freelancer**: Client data dumps where the date column is in whatever format the client's locale or software produces.
|
||||||
|
- **Marketing agency**: Campaign performance data joined across platforms (Google Ads, Facebook Ads, Mailchimp) that all use different date formats.
|
||||||
|
|
||||||
|
### 3.2 Test categories
|
||||||
|
|
||||||
|
| Category | Cases | What it tests |
|
||||||
|
|---|---|---|
|
||||||
|
| iso | FD01-FD06 | ISO 8601 baseline. Already-clean and minor variants (Z vs offset, T vs space) |
|
||||||
|
| us | FD07-FD10 | M/D/Y format with 2-digit and 4-digit years. Includes one unambiguous case (day > 12) |
|
||||||
|
| eu | FD11-FD15 | D/M/Y format with various separators. Includes one unambiguous case |
|
||||||
|
| longform | FD16-FD21 | Month-name formats (full, abbreviated, with weekday, all caps) |
|
||||||
|
| excel | FD22-FD23 | Excel serial numbers (45306 = 2024-01-15). Critical: Excel CSV exports often have date columns leak through as numbers |
|
||||||
|
| unix | FD24-FD25 | Unix timestamps in seconds and milliseconds |
|
||||||
|
| partial | FD26-FD29 | Year-month, quarter, year-only. Coarser-than-day precision |
|
||||||
|
| edge | FD30-FD40 | Two-digit year ambiguity, leap day validity, Excel 1900 leap year bug, invalid dates, dates buried in other text |
|
||||||
|
| locale | FD41-FD42 | French and German month names |
|
||||||
|
| timezone | FD43-FD44 | Named time zones, DST transitions |
|
||||||
|
| padding | FD45 | Already-clean idempotency check |
|
||||||
|
|
||||||
|
### 3.3 Critical policy decisions
|
||||||
|
|
||||||
|
**Locale ambiguity (M/D/Y vs D/M/Y)**: Per-column inspection. The cleaner scans all values in the column; if any value has a first field > 12 (impossible as a month), the column is unambiguously D/M/Y; if any has a second field > 12 (impossible as a month), the column is unambiguously M/D/Y. If nothing disambiguates, error out and require `--locale us|eu`. **Do not silently guess.** Fixture rows FD13 (`15/01/2024`) and FD14 (`30/05/2024`) each have a first field > 12, so either one makes the column unambiguously D/M/Y and FD13 resolves to `2024-01-15`; a value such as `05/01/2024` is ambiguous in isolation and resolves (to `2024-01-05`) only because the column also contains a disambiguating row.
|
||||||
|
|
||||||
|
**Two-digit year cutoff**: Python's default of 69 (years 00-68 → 2000s, 69-99 → 1969-1999). FD30 is `1/15/69` and resolves to `1969-01-15`. This is opinionated and frequently wrong for birth-year columns. Document the flag clearly; the buyer cleaning customer DOB data needs to override.
|
||||||
|
|
||||||
|
**Excel serial dates** (FD22, FD23): Detection heuristic — column header contains "date", or all values are integers/floats in range 25569–73050 (Jan 1 1970 to Dec 31 2099 in Excel serial). Outside that heuristic the cleaner can't distinguish a date serial from any other number.
|
||||||
|
|
||||||
|
**Excel 1900 leap year bug** (FD33): Excel claims 1900-02-29 exists; it doesn't. Detect and emit error. Don't silently accept and roll over to March 1.
|
||||||
|
|
||||||
|
**Localized month names** (FD41, FD42): Default cleaner ships with English month names. French/German/Spanish/etc. require a locale dictionary. Either ship one (adds size) or document the limitation. **Recommendation**: ship English + opt-in `--month-locale=fr|de|es` for the others. This corpus tests as if French and German are supported.
|
||||||
|
|
||||||
|
**Time zones** (FD43, FD44): Named zones (EST, PST) resolve to fixed offsets, NOT dynamically interpreted with DST rules. EST → -05:00 always. If buyers need DST-aware handling, that's a 04-bundle (out of scope) or an opt-in flag backed by Python's standard-library `zoneinfo` module.
|
||||||
|
|
||||||
|
### 3.4 Edge case: dates buried in text (FD36, FD37)
|
||||||
|
|
||||||
|
`Date: 2024-01-15` and `2024-01-15 (verified)` extract to `2024-01-15`. The cleaner uses regex extraction for date-shaped substrings before parsing. **Risk**: false positives from random number sequences. Mitigation: require an unambiguous date pattern (4-digit year + valid month + valid day with explicit separator).
|
||||||
|
|
||||||
|
### 3.5 What's not tested
|
||||||
|
|
||||||
|
- Calendar systems other than Gregorian (Hijri, Hebrew, Japanese era). Out of scope.
|
||||||
|
- Recurring date strings (`every 1st of month`). Not a date.
|
||||||
|
- Date ranges (`2024-01-01 to 2024-01-15`). Out of scope; would require a different cell semantic.
|
||||||
|
- Sub-millisecond precision. Pandas/datetime tolerate but aren't tested here.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. PHONES (`25_format_phones.csv`)
|
||||||
|
|
||||||
|
### 4.1 Use cases by buyer persona
|
||||||
|
|
||||||
|
- **Shopify**: Customer phone list normalization before Klaviyo/Mailchimp import. SMS campaigns require E.164.
|
||||||
|
- **Bookkeeper**: Vendor phone deduplication where same vendor has multiple format variants in QuickBooks vs. spreadsheets.
|
||||||
|
- **Freelancer**: Lead lists from clients in arbitrary formats.
|
||||||
|
- **Marketing agency**: Multi-platform audience reconciliation; ad platforms increasingly require E.164 for matching.
|
||||||
|
|
||||||
|
### 4.2 Test categories
|
||||||
|
|
||||||
|
| Category | Cases | What it tests |
|
||||||
|
|---|---|---|
|
||||||
|
| us | FP01-FP08 | Common US format variants — plain digits, parens-dash, dots, spaces, country code prefixes |
|
||||||
|
| ext | FP09-FP11 | Extensions in three syntactic forms (`ext`, `x`, `#`) |
|
||||||
|
| vanity | FP12-FP13 | Letter-to-digit conversion (1-800-FLOWERS) |
|
||||||
|
| intl | FP14-FP19 | UK, Germany, France, Japan, Australia |
|
||||||
|
| e164 | FP20 | Already-E.164 idempotency |
|
||||||
|
| edge | FP21-FP31 | Insufficient/excess digits, placeholders, multiple numbers per cell, NBSP, smart-quote contamination |
|
||||||
|
|
||||||
|
### 4.3 Critical policy decisions
|
||||||
|
|
||||||
|
**Default output: E.164** (`+<country><digits>`). Universal storage format. Reverses cleanly to any presentation format if the buyer wants display formatting later.
|
||||||
|
|
||||||
|
**Default country**: US, configurable via `--default-country=GB|DE|...`. For mixed-country columns, cleaner needs explicit country detection per-row, which is hard without context. Real-world advice for the buyer: split phone columns by country before normalizing.
|
||||||
|
|
||||||
|
**Vanity numbers** (FP12, FP13): Letters convert via standard phone keypad: 2=ABC, 3=DEF, ..., 9=WXYZ. `FLOWERS` → `3569377`. Loses some information (you can't reverse 3569377 to FLOWERS). Acceptable tradeoff for storage normalization.
|
||||||
|
|
||||||
|
**Trunk prefix dropping**: UK domestic format `020 7946 0958` (FP15) has a leading `0` that's a domestic trunk prefix, not part of the actual number. E.164 strips it: `+442079460958`. Same logic for other countries with trunk prefixes.
|
||||||
|
|
||||||
|
**Placeholders** (FP23, FP24): All-zeros `000-000-0000` and all-nines `999-999-9999` are conventional "no phone" sentinels in some CRMs. Emit error rather than silently producing a syntactically valid E.164 that's semantically meaningless. **Tradeoff**: a real number that happens to be `999-999-9999` (which doesn't exist in NANP, by the way; 999 is reserved) would error too. Acceptable.
|
||||||
|
|
||||||
|
**Multiple numbers** (FP25): Cell containing `555-123-4567 / 555-987-6543`. Don't silently pick one; emit error and tell the user to split first. Splitting is a structural change, not a format change, so it belongs upstream of 03.
|
||||||
|
|
||||||
|
**NBSP and smart-quote contamination** (FP27, FP31): Should not reach 03 if 02 ran first. Defensive cleanup is fine; emit a debug log noting the upstream pollution.
|
||||||
|
|
||||||
|
### 4.4 What's not tested
|
||||||
|
|
||||||
|
- SMS-vs-voice number distinction.
|
||||||
|
- Carrier lookup. Out of scope; would require a paid service.
|
||||||
|
- Number portability validation.
|
||||||
|
- Toll-free number recognition (888, 877, 866, 855, 844, 833) beyond accepting them as valid digits.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. EMAILS (`26_format_emails.csv`) — see Section 0.1 for scope caveat
|
||||||
|
|
||||||
|
### 5.1 Use cases by buyer persona
|
||||||
|
|
||||||
|
- **Shopify**: Customer list cleanup before email-marketing platform import (every duplicate costs money on per-contact pricing). Pre-flight check on order export before re-engagement campaigns.
|
||||||
|
- **Bookkeeper**: Vendor email list consolidation.
|
||||||
|
- **Freelancer**: Client communication list normalization.
|
||||||
|
- **Marketing agency**: List hygiene across multiple lead sources before campaign send.
|
||||||
|
|
||||||
|
### 5.2 Test categories
|
||||||
|
|
||||||
|
| Category | Cases | What it tests |
|
||||||
|
|---|---|---|
|
||||||
|
| basic | FE01-FE04 | Plain ASCII, mixed case, whitespace |
|
||||||
|
| displayname | FE05-FE07 | RFC display-name forms `Name <email>`, with and without quotes |
|
||||||
|
| prefix | FE08-FE09 | mailto: prefix |
|
||||||
|
| gmail | FE10-FE14 | Gmail-specific dot-equivalence and +tag handling. Includes negative cases (non-Gmail domains) that must NOT be touched |
|
||||||
|
| idn | FE15-FE16 | Internationalized domain names; Unicode in local part |
|
||||||
|
| trailing | FE17-FE20 | Punctuation contamination from copy-paste contexts |
|
||||||
|
| smartquote | FE21 | Word-paste damage |
|
||||||
|
| invalid | FE22-FE26 | Missing @, double @, multiple @, internal whitespace, no TLD |
|
||||||
|
| multiple | FE27-FE28 | Multiple emails in one cell |
|
||||||
|
| edge | FE29-FE31 | Empty, whitespace-only, already-perfect |
|
||||||
|
|
||||||
|
### 5.3 Critical policy decisions
|
||||||
|
|
||||||
|
**Default behavior**: lowercase, trim, strip `mailto:`, strip wrapping `<>`, extract from `Display Name <email>` form. **Does NOT strip Gmail dots or +tags by default.** Those normalizations are destructive (`alice` and `a.l.i.c.e` aren't the same email per RFC; only Gmail's specific provider policy treats them as equivalent).
|
||||||
|
|
||||||
|
**Aggressive mode (`--gmail-canonical`)**: Strip dots and +tags for `@gmail.com` only. Preserve them for all other domains, even if those domains have similar policies (some custom Google Workspace domains, some other providers). Don't second-guess provider policy.
|
||||||
|
|
||||||
|
**FE13 and FE14 are critical negative tests**: a non-Gmail domain with dots or +tag must NOT be touched even in `--gmail-canonical` mode. Many cleaners get this wrong — they apply Gmail's policy to all domains, which corrupts data.
|
||||||
|
|
||||||
|
**IDN handling** (FE15, FE16): Don't punycode-convert by default. Buyers who need ASCII-only output for legacy systems can opt in via `--punycode`. Default is to preserve Unicode in domain and local parts.
|
||||||
|
|
||||||
|
**Display-name extraction** (FE05, FE06): Drop the display name. The cleaner extracts the email and discards `Alice Smith`. **Tradeoff**: information loss. Alternative would be to preserve display name in a separate column, but that violates schema preservation (Section 0.2). Buyers who want to keep display names should split the column upstream.
|
||||||
|
|
||||||
|
**Multiple emails per cell** (FE27, FE28): Error, don't pick one. Same rationale as multiple phones.
|
||||||
|
|
||||||
|
### 5.4 What's not tested
|
||||||
|
|
||||||
|
- Email syntax validation per full RFC 5321/5322 (which permits all sorts of legitimately weird inputs like quoted-string locals). The cleaner uses a "good enough for 99% of real data" regex, not a full RFC parser.
|
||||||
|
- Disposable-email-domain detection. Out of scope for format cleaning; that's data quality.
|
||||||
|
- DNS / MX validation. Out of scope; requires network access.
|
||||||
|
- Email-address-as-username (where domain is a hostname not an internet domain). Errors as TLD-less.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. ADDRESSES (`27_format_addresses.csv`)
|
||||||
|
|
||||||
|
### 6.1 Use cases by buyer persona
|
||||||
|
|
||||||
|
- **Shopify**: Customer address normalization for shipping label generation; reduces failed deliveries.
|
||||||
|
- **Bookkeeper**: Vendor master record cleanup; consistent format for bookkeeping software import.
|
||||||
|
- **Freelancer**: Client address book consolidation.
|
||||||
|
- **Marketing agency**: Direct mail audience cleanup.
|
||||||
|
|
||||||
|
### 6.2 Test categories
|
||||||
|
|
||||||
|
| Category | Cases | What it tests |
|
||||||
|
|---|---|---|
|
||||||
|
| clean | FA01 | Already-USPS-formatted idempotency |
|
||||||
|
| case | FA02-FA04 | All-caps, all-lowercase, mixed-case (preserve) |
|
||||||
|
| abbrev | FA05-FA08 | Street type expansion/abbreviation, periods after abbreviations |
|
||||||
|
| directional | FA09-FA11 | North/N, NORTH/N, NE compounds |
|
||||||
|
| unit | FA12-FA14 | Apartment/Apt, # / Apt, Suite/Ste |
|
||||||
|
| state | FA15-FA16 | State name → 2-letter code |
|
||||||
|
| zip | FA17-FA18 | ZIP+4, leading-zero ZIPs (Massachusetts 02xxx) |
|
||||||
|
| multiline | FA19 | `\n`-separated address fields |
|
||||||
|
| pobox | FA20-FA22 | Post Office Box variants |
|
||||||
|
| housenum | FA23-FA25 | Letter suffix, hyphen, half-number |
|
||||||
|
| non_us | FA26-FA28 | UK, Canada, Japan (minimal handling) |
|
||||||
|
| edge | FA29-FA31 | Empty, partial, trailing comma |
|
||||||
|
|
||||||
|
### 6.3 Critical policy decisions
|
||||||
|
|
||||||
|
**US-first scope**: USPS abbreviations and state codes are the default. International addresses get whitespace + capitalization only. Document this clearly; buyers with significant non-US data should expect format drift.
|
||||||
|
|
||||||
|
**USPS abbreviations as the default** (St, Ave, Blvd) rather than spelled-out forms. Reasoning: USPS recommends abbreviations; most CRMs expect them; they save space in tabular display. The `--expand-abbrev` flag inverts this for buyers whose downstream system requires full forms.
|
||||||
|
|
||||||
|
**Multi-line collapse** (FA19): `123 Main St\nApt 4B\nNew York, NY 10001` becomes `123 Main St, Apt 4B, New York, NY 10001`. Consistent comma-separated single-line format. **Reverse direction not supported** — the cleaner doesn't take a single-line address and split into multi-line (that's structural).
|
||||||
|
|
||||||
|
**State expansion vs abbreviation** (FA15, FA16): Default is 2-letter code (`NY`). The `--expand-abbrev` flag expands to full state name. Note: this is the SAME direction as street type abbreviations — abbreviated by default, expanded by the flag. State codes are universally expected in tabular data; full state names are only preferred in some downstream systems' "pretty" formats.
|
||||||
|
|
||||||
|
**ZIP leading zeros** (FA18): If the column is already a ZIP-shaped string with leading zeros, preserve them. **Cannot restore lost leading zeros** — Excel-stripped `2101` (Massachusetts) cannot be confidently recovered to `02101` because a bare `2101` is not necessarily a zero-stripped ZIP: it could equally be a truncated code, a house number, or some other numeric value, and the cleaner cannot tell which. Mention this as a known limitation; recommend the buyer fix at the source.
|
||||||
|
|
||||||
|
**Canada handling** (FA27): Canadian addresses use the same street-type conventions as US, so `Street` → `St` works. Postal code format is preserved as-is.
|
||||||
|
|
||||||
|
**Japan / non-Western** (FA28): Field order is reversed (postal code first, then large-to-small geography). Default cleaner doesn't try to restructure; minimal handling only.
|
||||||
|
|
||||||
|
### 6.4 What's not tested
|
||||||
|
|
||||||
|
- Address verification against USPS database. Out of scope; would require a paid service or local USPS data.
|
||||||
|
- Geocoding to lat/long. Out of scope.
|
||||||
|
- Unit number parsing for buildings with non-standard nomenclatures.
|
||||||
|
- Military addresses (APO, FPO, DPO) beyond accepting them.
|
||||||
|
- Rural Route, Highway Contract, General Delivery formats.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. NAMES (`28_format_names.csv`)
|
||||||
|
|
||||||
|
### 7.1 Use cases by buyer persona
|
||||||
|
|
||||||
|
- **Shopify**: Customer list display normalization. ALL-CAPS imports from older systems become readable.
|
||||||
|
- **Bookkeeper**: Vendor name consistency across QuickBooks and spreadsheets.
|
||||||
|
- **Freelancer**: Client list capitalization cleanup.
|
||||||
|
- **Marketing agency**: First-name personalization in email campaigns (`Hi alice` vs `Hi Alice`).
|
||||||
|
|
||||||
|
### 7.2 Test categories
|
||||||
|
|
||||||
|
| Category | Cases | What it tests |
|
||||||
|
|---|---|---|
|
||||||
|
| case | FN01-FN04 | All-caps, all-lowercase, already-correct, random-case |
|
||||||
|
| scots | FN05-FN08 | Mc and Mac prefixes |
|
||||||
|
| irish | FN09-FN11 | O' prefix |
|
||||||
|
| hyphen | FN12-FN13 | Hyphenated names |
|
||||||
|
| particle | FN14-FN17 | von, van, de, da (Germanic, Dutch, French, Italian) |
|
||||||
|
| title | FN18-FN20 | Mr, Dr, Prof |
|
||||||
|
| suffix | FN21-FN23 | Jr, III, PhD |
|
||||||
|
| comma | FN24-FN26 | "Last, First" reversal to "First Last" |
|
||||||
|
| initial | FN27-FN28 | Middle initial, multi-initial |
|
||||||
|
| nonlatin | FN29-FN31 | Korean, Japanese, Russian (preserve) |
|
||||||
|
| edge | FN32-FN34 | Single name, empty, whitespace-only |
|
||||||
|
|
||||||
|
### 7.3 Critical policy decisions
|
||||||
|
|
||||||
|
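**Conservative by default**: Title-case ONLY when input is ALL CAPS or all lowercase. Mixed-case input is preserved as-is (FN04: `aLiCe SmItH` → `aLiCe SmItH`). Reasoning: people have idiosyncratic spellings that the cleaner should never overwrite. Caveat: deliberately all-lowercase names (`danah boyd`, `bell hooks`) cannot be told apart from unformatted lowercase input, so they WILL be title-cased under this rule; the conservative default only protects mixed-case spellings. If the buyer wants aggressive title-casing, that's `--name-aggressive`.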
|
||||||
|
|
||||||
|
**Mc vs Mac** (FN05-FN08): Default convention is `McDonald` (cap after Mc) and `MacDonald` (cap after Mac). Some Mac-prefixed names should be `Macdonald` (cap only on Mac). Without a names dictionary, the cleaner can't distinguish. Default to capitalizing — produces `MacDonald` for ambiguous cases. Buyers with significant Scottish/Irish customer bases may need a custom override list.
|
||||||
|
|
||||||
|
**Particles** (FN14-FN17): Particles like `von`, `van`, `de`, `da` stay lowercase. This is the convention for people with surnames containing these words (`Vincent van Gogh`, `Charles de Gaulle`). **Note**: at the start of a sentence or in last-name-first contexts (`De Gaulle, Charles`), capitalization rules invert. This corpus tests the natural-order case only.
|
||||||
|
|
||||||
|
**Comma format reversal** (FN24-FN26): `Smith, John` → `John Smith`. **Tradeoff**: irreversibly destroys the comma-format. If the buyer's downstream system expects "Last, First" format, they need `--name-format=last-first`. Default is natural reading order.
|
||||||
|
|
||||||
|
**Titles and suffixes**:
|
||||||
|
- Title period stripping: `Mr.` → `Mr`. Some style guides keep the period; this corpus drops it for consistency. `--keep-title-periods` flag if buyers prefer.
|
||||||
|
- Roman numerals (`II`, `III`, `IV`) stay all-caps. They aren't names; they're numerals.
|
||||||
|
- `PhD`, `MD`, `Esq` keep their conventional case. Don't lower-case them.
|
||||||
|
|
||||||
|
**Non-Latin scripts** (FN29-FN31): Pass through unchanged. Title-casing rules don't apply to scripts without case (Korean, Japanese, Chinese, Arabic, Hebrew, etc.). Cyrillic does have case, but the conservative-by-default rule applies — only ALL-CAPS or all-lowercase input gets title-cased.
|
||||||
|
|
||||||
|
**Single names** (FN32): Madonna, Cher, Pelé. Pass through unchanged when input is already title-case.
|
||||||
|
|
||||||
|
### 7.4 What's not tested
|
||||||
|
|
||||||
|
- Honorific stacking (`Dr. Mr. Jane Smith` — pathological, rare, hard).
|
||||||
|
- Cultural name-order detection (East Asian family-first vs Western given-first). Without a column-level signal the cleaner can't guess.
|
||||||
|
- Nickname expansion (`Bob` → `Robert`). Out of scope; that's data enrichment, not standardization.
|
||||||
|
- Name part identification (which token is given, family, middle). Belongs to a parser, not a standardizer.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. CURRENCIES (`29_format_currencies.csv`)
|
||||||
|
|
||||||
|
### 8.1 Use cases by buyer persona
|
||||||
|
|
||||||
|
- **Shopify**: Order amount normalization across multi-currency stores.
|
||||||
|
- **Bookkeeper**: Bank export reconciliation; mixed bank formats produce different currency representations.
|
||||||
|
- **Freelancer**: Invoice data normalization.
|
||||||
|
- **Marketing agency**: Campaign spend normalization across ad platforms.
|
||||||
|
|
||||||
|
### 8.2 Test categories
|
||||||
|
|
||||||
|
| Category | Cases | What it tests |
|
||||||
|
|---|---|---|
|
||||||
|
| us | FC01-FC07 | $ prefix/suffix, comma thousands, dot decimal, USD code prefix/suffix |
|
||||||
|
| eu | FC08-FC11 | € prefix, dot thousands and comma decimal, space thousands, Swiss apostrophe |
|
||||||
|
| intl | FC12-FC14 | £, ¥ (no decimal), ₹ (lakhs grouping) |
|
||||||
|
| negative | FC15-FC17 | Leading minus, accounting parens, sign after symbol |
|
||||||
|
| edge | FC18-FC25 | Zero, scientific, percentage, range, word values, empty, idempotency |
|
||||||
|
| ambig | FC26-FC27 | Locale-ambiguous separator (`1,234` could be 1234 or 1.234) |
|
||||||
|
|
||||||
|
### 8.3 Critical policy decisions
|
||||||
|
|
||||||
|
**Output format**: the normalized number plus the currency symbol or code, kept in whatever position (prefix or suffix) it had in the input. Number uses dot decimal, no thousand separators, leading minus for negative. Currency symbol or code preserved if present in input; if no currency indicator, output is just the number.
|
||||||
|
|
||||||
|
**Locale ambiguity** (FC26, FC27): `1,234` is `1234` in US English and `1.234` in German. `1.234` is `1.234` in US English and `1234` in German. Per-column inspection: any value with both `,` and `.` (like `1,234.56`) locks the locale unambiguously; otherwise the cleaner errors and demands `--currency-locale=us|eu`. **Do not silently guess.**
|
||||||
|
|
||||||
|
**Accounting parens** (FC16): `($100.00)` → `-$100.00`. Standard accounting convention. The leading minus is more universally readable than the parens.
|
||||||
|
|
||||||
|
**Currency symbol position**: Preserved. `$100` stays prefix-symbol; `100$` (rare but seen) stays suffix-symbol; `100 USD` keeps the suffix-code form. Reasoning: changing position is destructive and the buyer can do it themselves with a simple find-replace if they want.
|
||||||
|
|
||||||
|
**Indian lakhs grouping** (FC14): `₹1,23,456.78` flattens to `₹123456.78`. Lakhs grouping (groups of 2 after the first 3) is unusual outside India and breaks downstream tools that expect Western thousand-grouping.
|
||||||
|
|
||||||
|
**JPY no decimal** (FC13): Japanese yen conventionally has no fractional part. `¥1,234` → `¥1234`. The cleaner doesn't add a decimal that wasn't there.
|
||||||
|
|
||||||
|
**Scientific notation** (FC19): `1.5e6` → `1500000`. Expand to plain notation for spreadsheet compatibility. Loses the "this was scientific" information; acceptable tradeoff.
|
||||||
|
|
||||||
|
**Percentages** (FC20): Error. Percentage and currency are different domains. If the column is meant for percentages, that's not currency.
|
||||||
|
|
||||||
|
**Ranges** (FC21): Error. Same reasoning as multi-emails; structural split needed.
|
||||||
|
|
||||||
|
**Word values** (FC22, FC23): `Free`, `TBD`, `N/A`. Error. The buyer might want these mapped to `0` (Free) or empty (TBD/N/A), but those are domain decisions the cleaner can't make safely.
|
||||||
|
|
||||||
|
### 8.4 What's not tested
|
||||||
|
|
||||||
|
- Cross-currency conversion (USD to EUR via exchange rate). Massively out of scope.
|
||||||
|
- Cryptocurrency formats (BTC, ETH amounts with high decimal precision). Out of scope.
|
||||||
|
- Historical currency notation (pre-decimalization £.s.d). Out of scope.
|
||||||
|
- Currency code standardization (USD vs US$ vs $US). Default: pass through whatever's there.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. INTEGRATION (`30_format_integration.csv`)
|
||||||
|
|
||||||
|
### 9.1 Purpose
|
||||||
|
|
||||||
|
Five rows, each a complete record with one or more format issues across multiple columns. Tests that running 03 across multiple columns in one pass produces consistent output and doesn't drop or scramble fields.
|
||||||
|
|
||||||
|
### 9.2 Per-row test goals
|
||||||
|
|
||||||
|
| Row | What it tests |
|
||||||
|
|---|---|
|
||||||
|
| FI01 | Standard messy-but-cleanable record. All six format types in one row. Tests that no domain's normalizer interferes with another's. |
|
||||||
|
| FI02 | International record (UK address, EUR currency, German-format date, mailto-prefixed Gmail address, comma-format Mc-name). Tests cross-domain locale handling. |
|
||||||
|
| FI03 | Errors (insufficient phone digits) and complex name (`DR` + `JANE DOE` + `PHD`, i.e. title + name + suffix). Tests error handling and complex name parsing. |
|
||||||
|
| FI04 | All empty. Tests that empty cells pass through without errors. |
|
||||||
|
| FI05 | Already-clean record. Idempotency check — the entire row should round-trip unchanged. |
|
||||||
|
|
||||||
|
### 9.3 What this fixture catches that single-domain fixtures don't
|
||||||
|
|
||||||
|
- **Cross-column interference**: a name normalizer that reaches into the email column, or vice versa.
|
||||||
|
- **Schema drift**: a normalizer that adds, removes, or reorders columns.
|
||||||
|
- **Error-handling consistency**: when one column errors (FI03's phone), other columns in the same row still process correctly.
|
||||||
|
- **Idempotency at the row level**: FI05 must produce byte-identical output.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Suggested test workflow
|
||||||
|
|
||||||
|
```python
|
||||||
|
import csv
|
||||||
|
from pathlib import Path
|
||||||
|
from src.core.format_standardizer import standardize # your impl
|
||||||
|
|
||||||
|
# Root directories for the input fixtures and their expected outputs.
FORMATS = Path("test_data") / "formats"
EXPECTED = Path("expected") / "formats"
|
||||||
|
|
||||||
|
def test_single_column_domain(domain):
    """Test FD/FP/FE/FA/FN/FC fixtures with single-column expected output.

    ``domain`` is a fixture stem such as ``"24_format_dates"``. Inputs are
    read from ``FORMATS/<domain>.csv`` and expectations from
    ``EXPECTED/<domain>_expected.csv``; both files are keyed by ``case_id``.

    Returns a list of ``(case_id, input, got, want)`` tuples, one per
    mismatching case (empty when everything passes).
    """
    # "24_format_dates" -> "dates". A plain split("_")[1] always yields the
    # literal "format", so keep everything after the second underscore.
    kind = domain.split("_", 2)[-1]

    inp = FORMATS / f"{domain}.csv"
    exp = EXPECTED / f"{domain}_expected.csv"

    # newline="" is what the csv module requires so that newlines embedded
    # in quoted fields survive intact. UTF-8 is assumed for the fixtures
    # (they contain non-ASCII symbols) -- TODO confirm against the generator.
    with inp.open(newline="", encoding="utf-8") as f:
        cases = {r["case_id"]: r for r in csv.DictReader(f)}
    with exp.open(newline="", encoding="utf-8") as f:
        expected = {r["case_id"]: r for r in csv.DictReader(f)}

    failures = []
    for case_id, case in cases.items():
        got = standardize(case["input"], domain=kind)
        want = expected[case_id]["output"]
        if got != want:
            failures.append((case_id, case["input"], got, want))
    return failures
|
||||||
|
|
||||||
|
# Run every single-policy fixture and report its failure count.
for domain in (
    "24_format_dates",
    "25_format_phones",
    "28_format_names",
    "29_format_currencies",
):
    failures = test_single_column_domain(domain)
    print(f"{domain}: {len(failures)} failures")
|
||||||
|
|
||||||
|
# Email and address have two-policy expected output
|
||||||
|
def test_two_policy(domain, policy_columns):
    """Check a fixture whose expected file has one output column per policy.

    ``domain`` is a fixture stem such as ``"26_format_emails"``.
    ``policy_columns`` is an iterable of policy names; each name is passed
    to ``standardize`` as ``mode`` and selects the ``output_<policy>``
    column of the expected file.

    Prints one failure-count line per policy and returns ``None``.
    """
    # "26_format_emails" -> "emails". A plain split("_")[1] always yields
    # the literal "format", so keep everything after the second underscore.
    kind = domain.split("_", 2)[-1]

    inp = FORMATS / f"{domain}.csv"
    exp = EXPECTED / f"{domain}_expected.csv"

    # newline="" is what the csv module requires so that newlines embedded
    # in quoted fields (e.g. multiline addresses) survive intact. UTF-8 is
    # assumed for the fixtures -- TODO confirm against the generator.
    with inp.open(newline="", encoding="utf-8") as f:
        cases = {r["case_id"]: r for r in csv.DictReader(f)}
    with exp.open(newline="", encoding="utf-8") as f:
        expected = {r["case_id"]: r for r in csv.DictReader(f)}

    for policy in policy_columns:
        failures = []
        for case_id, case in cases.items():
            got = standardize(case["input"], domain=kind, mode=policy)
            want = expected[case_id][f"output_{policy}"]
            if got != want:
                failures.append((case_id, case["input"], got, want))
        print(f"{domain} ({policy}): {len(failures)} failures")
|
||||||
|
|
||||||
|
# Email and address fixtures each carry a default and an opt-in policy.
for fixture, policies in (
    ("26_format_emails", ["default", "gmail_canonical"]),
    ("27_format_addresses", ["default", "expand_abbrev"]),
):
    test_two_policy(fixture, policies)
|
||||||
|
|
||||||
|
# Idempotency property test: standardizing an already-standardized value
# must be a no-op for every input in every domain fixture.
import random  # NOTE(review): not used by the code below

all_inputs = []
for domain in ["24_format_dates", "25_format_phones", "26_format_emails",
               "27_format_addresses", "28_format_names", "29_format_currencies"]:
    # newline="" keeps newlines embedded in quoted CSV fields intact; UTF-8
    # is assumed for the fixtures -- TODO confirm against the generator.
    with (FORMATS / f"{domain}.csv").open(newline="", encoding="utf-8") as f:
        all_inputs.extend((domain, r["input"]) for r in csv.DictReader(f))

for domain, inp in all_inputs:
    # "24_format_dates" -> "dates". A plain split("_")[1] always yields the
    # literal "format", so keep everything after the second underscore.
    kind = domain.split("_", 2)[-1]
    once = standardize(inp, domain=kind)
    twice = standardize(once, domain=kind)
    assert once == twice, f"non-idempotent: {domain} {inp!r} -> {once!r} -> {twice!r}"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. What this corpus does NOT cover
|
||||||
|
|
||||||
|
Listed so the gaps are explicit:
|
||||||
|
|
||||||
|
1. **Performance**. All fixtures are small. Format standardization on a 500MB customer file may have memory or speed issues; benchmark separately.
|
||||||
|
2. **Cross-script integration with 02 and 04**. This corpus tests 03 in isolation. Running 02 → 03 → 04 in pipeline is a separate integration concern.
|
||||||
|
3. **GUI behavior**. Single-cell preview, per-row preview, domain auto-detection from column headers. Each is a Streamlit-layer test, not a transformation test.
|
||||||
|
4. **Custom locale dictionaries**. The fixtures assume the cleaner ships with English month names and US-default phone country. Customers who buy this product and then complain that German months aren't recognized are flagging a feature request, not a bug.
|
||||||
|
5. **URLs**. Listed in BUSINESS.md's adjacent territory but not in 03's scope. If you want URL standardization, that's a feature request.
|
||||||
|
6. **Booleans / yes-no normalization**. `Y` / `Yes` / `1` / `True` → `true`. Borderline 03 territory but explicitly excluded; can be added as a 7th domain if buyers ask for it.
|
||||||
|
7. **Postal codes outside US/UK/Canada**. ZIP-style validation only for US.
|
||||||
|
8. **Identifiers (SKU, SSN, EIN)**. Out of scope; too domain-specific.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. How to extend the corpus
|
||||||
|
|
||||||
|
**Add a new test case in an existing domain**:
|
||||||
|
1. Edit the relevant fixture's row list in `generate_format_test_files.py`.
|
||||||
|
2. Add the corresponding expected output entry.
|
||||||
|
3. Re-run the generator.
|
||||||
|
4. If the new case is a category not yet listed, update the per-domain category table in this document.
|
||||||
|
|
||||||
|
**Add a new domain (e.g., URLs)**:
|
||||||
|
1. Define use cases by persona.
|
||||||
|
2. Define policy decisions and which require a flag vs. being default.
|
||||||
|
3. Build the input fixture as `31_format_<domain>.csv` and the expected output as `31_format_<domain>_expected.csv`.
|
||||||
|
4. Add a Section 13 to this document covering the domain.
|
||||||
|
5. Update the index table in Section 2.
|
||||||
|
|
||||||
|
**Add a new policy variant to an existing domain**:
|
||||||
|
1. Add a new column to the expected output file (e.g., `output_strict`).
|
||||||
|
2. Document the new policy and what triggers it (which flag) in the domain's Section 5.3 (or equivalent).
|
||||||
|
3. The two-policy test in Section 10's workflow generalizes to N-policy.
|
||||||
303
tests/test_audit_fixes.py
Normal file
303
tests/test_audit_fixes.py
Normal file
@@ -0,0 +1,303 @@
|
|||||||
|
"""Regression tests for bugs surfaced by the cross-tool audit.
|
||||||
|
|
||||||
|
Each test pins a specific behavioral bug or gap that an audit
|
||||||
|
identified. Test names match the BUG-N / GAP-N tags in the audit
|
||||||
|
notes so a future reader can trace why each test exists.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.analyze import _NULL_LIKE, _detect_mixed_case_email
|
||||||
|
import src.core.fixes as f
|
||||||
|
from src.core.config import (
|
||||||
|
ColumnStrategyConfig,
|
||||||
|
DeduplicationConfig,
|
||||||
|
StrategyConfig,
|
||||||
|
)
|
||||||
|
from src.core.dedup import (
|
||||||
|
Algorithm,
|
||||||
|
ColumnMatchStrategy,
|
||||||
|
MatchStrategy,
|
||||||
|
deduplicate,
|
||||||
|
)
|
||||||
|
from src.core.io import detect_header_row
|
||||||
|
from src.core.text_clean import sentence_case, smart_title_case, strip_bom
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-1: dedup NaN values must not match as duplicates
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDedupNaNHandling:
    """Missing values must not be treated as duplicates of anything (BUG-1)."""

    def test_two_nan_emails_do_not_match(self):
        # The only match column is NaN on both rows. If NaN were compared
        # via str(NaN) == "nan", the two rows would match exactly and be
        # merged silently.
        email_rule = ColumnMatchStrategy(
            column="email",
            algorithm=Algorithm.EXACT,
            threshold=100.0,
        )
        frame = pd.DataFrame({
            "id": [1, 2],
            "email": [np.nan, np.nan],
        })
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[email_rule])],
        )
        assert len(outcome.deduplicated_df) == 2
        assert len(outcome.match_groups) == 0

    def test_one_nan_one_real_does_not_match(self):
        email_rule = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        frame = pd.DataFrame({"email": [np.nan, "alice@example.com"]})
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[email_rule])],
        )
        assert len(outcome.deduplicated_df) == 2

    def test_none_does_not_match_string_none(self):
        # A real None and the literal text "None" are different values.
        name_rule = ColumnMatchStrategy(column="name", algorithm=Algorithm.EXACT)
        frame = pd.DataFrame({"name": [None, "None"]})
        outcome = deduplicate(
            frame,
            strategies=[MatchStrategy(column_strategies=[name_rule])],
        )
        assert len(outcome.deduplicated_df) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-2: removed_df must preserve column schema even when empty
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDedupRemovedDfSchema:
    """An empty ``removed_df`` must still expose the full column set (BUG-2)."""

    def test_empty_removed_df_has_same_columns(self):
        unique_rows = pd.DataFrame({
            "name": ["alice", "bob", "carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        })
        by_email = ColumnMatchStrategy(column="email", algorithm=Algorithm.EXACT)
        outcome = deduplicate(
            unique_rows,
            strategies=[MatchStrategy(column_strategies=[by_email])],
        )
        # All emails are distinct, so nothing is removed -- but the empty
        # frame must keep the same columns as the deduplicated output.
        assert len(outcome.removed_df) == 0
        assert list(outcome.removed_df.columns) == list(outcome.deduplicated_df.columns)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# GAP-3: missing column reference should raise
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDedupMissingColumn:
    """A strategy naming a column absent from the frame must raise (GAP-3)."""

    def test_missing_column_raises(self):
        frame = pd.DataFrame({"email": ["a@x.com"]})
        # "e_mail" is deliberately not a column of ``frame``.
        typo_rule = ColumnMatchStrategy(column="e_mail", algorithm=Algorithm.EXACT)
        strategies = [MatchStrategy(column_strategies=[typo_rule])]
        with pytest.raises(ValueError, match="not present in the input"):
            deduplicate(frame, strategies=strategies)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# GAP-4: threshold must be in [0, 100]
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestThresholdValidation:
    """``ColumnMatchStrategy`` must validate its ``threshold`` (GAP-4)."""

    def test_negative_threshold_rejected(self):
        with pytest.raises(ValueError, match=r"\[0, 100\]"):
            ColumnMatchStrategy(column="x", threshold=-1)

    def test_over_hundred_rejected(self):
        with pytest.raises(ValueError, match=r"\[0, 100\]"):
            ColumnMatchStrategy(column="x", threshold=101)

    def test_zero_and_hundred_allowed(self):
        # Both endpoints of the range are valid; construction must not raise.
        for bound in (0, 100):
            ColumnMatchStrategy(column="x", threshold=bound)

    def test_non_numeric_rejected(self):
        with pytest.raises(TypeError):
            ColumnMatchStrategy(column="x", threshold="high")  # type: ignore[arg-type]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-9: replace_null_sentinels must coerce non-string sentinels
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestReplaceNullSentinelsTypes:
    """``replace_null_sentinels`` must tolerate non-string sentinels (BUG-9)."""

    def test_int_sentinels_do_not_crash(self):
        frame = pd.DataFrame({"x": ["0", "5", ""]})
        cleaned, _ = f.replace_null_sentinels(frame, {"sentinels": [0, "5"]})
        # Row 0: "0" matches the int sentinel 0 once stringified.
        # Row 1: "5" matches the string sentinel.
        # Row 2: was already empty.
        for row in (0, 1, 2):
            assert cleaned.loc[row, "x"] == ""

    def test_none_sentinel_skipped(self):
        frame = pd.DataFrame({"x": ["a", "b"]})
        # A None entry in the sentinel list must not cause a crash.
        cleaned, _ = f.replace_null_sentinels(frame, {"sentinels": ["a", None]})
        assert cleaned.loc[0, "x"] == ""
        assert cleaned.loc[1, "x"] == "b"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-10: malformed regex should raise ValueError, not re.error
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestVectorizedRegexErrorHandling:
    """A malformed pattern must surface as ``ValueError`` (BUG-10)."""

    def test_malformed_pattern_raises_valueerror(self):
        frame = pd.DataFrame({"x": ["abc"]})
        broken_pattern = "[invalid"  # unterminated character class
        with pytest.raises(ValueError, match="Invalid regex pattern"):
            f._vectorized_regex_sub(frame, broken_pattern, "")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# NIT-12: strip_bom strips at most one BOM
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestStripBomSingleChar:
    """``strip_bom`` removes at most one leading BOM (NIT-12).

    The BOM (U+FEFF) is written as an explicit ``\\ufeff`` escape rather
    than a literal character: a literal BOM is invisible in source and is
    easily dropped by editors and copy/paste, which would quietly turn
    these checks into ``"hello" == "hello"``.
    """

    def test_strips_one_leading_bom(self):
        assert strip_bom("\ufeffhello") == "hello"

    def test_does_not_strip_multiple_consecutive_boms(self):
        # Per docstring: "at most one BOM". Second BOM stays so the
        # caller can see something odd happened.
        assert strip_bom("\ufeff\ufeffhello") == "\ufeffhello"

    def test_no_bom_unchanged(self):
        assert strip_bom("hello") == "hello"

    def test_non_string_passthrough(self):
        # Non-string input is returned as-is instead of raising.
        assert strip_bom(None) is None  # type: ignore[arg-type]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Smart title case — particle behavior at boundaries (regression / docs)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSmartTitleCaseBoundaries:
    """Particles are lowercased mid-title but capitalized at either end."""

    def test_first_word_particle_capitalized(self):
        # "a" is a particle, yet as the first word it is capitalized.
        titled = smart_title_case("a story")
        assert titled == "A Story"

    def test_last_word_particle_capitalized(self):
        # "to" is a particle, yet as the last word it is capitalized.
        titled = smart_title_case("things to")
        assert titled == "Things To"

    def test_mid_string_particles_lowercase(self):
        titled = smart_title_case("the cat in the hat")
        assert titled == "The Cat in the Hat"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# NIT-14: sentence_case dead branch removed — regression guard
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSentenceCaseUnchanged:
    """Regression guard for ``sentence_case`` after dead-branch removal (NIT-14)."""

    def test_basic(self):
        cased = sentence_case("hello. world.")
        assert cased == "Hello. World."

    def test_open_paren_does_not_consume_trigger(self):
        # Opening punctuation is not itself capitalized, and it must not
        # cancel the pending capitalization of the letter that follows it.
        cased = sentence_case('hello. "world"')
        assert cased == 'Hello. "World"'
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-18: detect_header_row must not pick all-empty rows
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDetectHeaderRowEmptyRows:
    """``detect_header_row`` must skip rows whose cells are all empty (BUG-18)."""

    def test_all_empty_first_row_skipped(self, tmp_path: Path):
        # Row 0 is entirely empty cells; the real header sits on row 1.
        csv_file = tmp_path / "blank_first.csv"
        csv_file.write_text(",,\nname,email,phone\nalice,a@x.com,555\n")
        header_index = detect_header_row(csv_file)
        assert header_index == 1

    def test_pure_header_at_row_zero(self, tmp_path: Path):
        csv_file = tmp_path / "normal.csv"
        csv_file.write_text("name,email,phone\nalice,a@x.com,555\n")
        header_index = detect_header_row(csv_file)
        assert header_index == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-20: config.from_dict must accept unknown fields (forward compat)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestConfigForwardCompat:
    """Configs with unknown fields must still load (BUG-20)."""

    def test_extra_field_in_column_config_ignored(self, tmp_path: Path):
        # Shape of a config written by a hypothetical future version that
        # added a ``priority`` field to each column entry.
        column_entry = {
            "column": "email",
            "algorithm": "exact",
            "threshold": 100.0,
            "normalizer": None,
            "priority": 5,  # future field — must not crash
        }
        config_dict = {
            "strategies": [{"columns": [column_entry]}],
            "survivor_rule": "first",
            "merge": False,
        }
        loaded = DeduplicationConfig.from_dict(config_dict)
        assert len(loaded.strategies) == 1
        assert loaded.strategies[0].columns[0].column == "email"

    def test_roundtrip_then_reload_with_extra(self, tmp_path: Path):
        email_column = ColumnStrategyConfig(column="email")
        cfg = DeduplicationConfig(
            strategies=[StrategyConfig(columns=[email_column])],
        )
        path = tmp_path / "cfg.json"
        cfg.to_file(path)

        # Inject an unknown key into the saved JSON, then reload from disk.
        payload = json.loads(path.read_text())
        payload["strategies"][0]["columns"][0]["future_thing"] = "abc"
        path.write_text(json.dumps(payload))

        loaded = DeduplicationConfig.from_file(path)
        assert loaded.strategies[0].columns[0].column == "email"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# BUG-22: mixed-case email detector must not flag all-None columns
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestMixedCaseEmailFalsePositive:
    """The mixed-case email detector must ignore columns with no values."""

    def test_all_none_email_column_no_finding(self):
        """An email column holding only ``None`` yields no findings."""
        frame = pd.DataFrame({"email": [None, None, None]})
        assert _detect_mixed_case_email(frame) == []

    def test_real_mixed_case_still_flagged(self):
        """A genuinely mixed-case column is still reported exactly once."""
        frame = pd.DataFrame({"email": ["Alice@X.com", "bob@y.com"]})
        findings = _detect_mixed_case_email(frame)
        assert len(findings) == 1
        assert findings[0].column == "email"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# NIT-24: <NA> recognized as a null-like sentinel
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestNullLikeIncludesPandasNA:
    """``_NULL_LIKE`` must contain the string form of ``pd.NA``."""

    def test_pd_na_string_repr_recognized(self):
        # str(pd.NA) → "<NA>". When a DataFrame is loaded with
        # keep_default_na=False, pandas NA values show up as that literal
        # string, and the analyzer should flag them. The lookup here uses
        # the lowercase spelling.
        sentinel = "<na>"
        assert sentinel in _NULL_LIKE
|
||||||
238
tests/test_fixes_unit.py
Normal file
238
tests/test_fixes_unit.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
"""Isolated unit tests for individual fix functions in src.core.fixes.
|
||||||
|
|
||||||
|
The integration tests at tests/test_normalize.py exercise these
|
||||||
|
functions through the full analyze→fix pipeline. These tests pin each
|
||||||
|
function's behavior in isolation so a regression surfaces close to the
|
||||||
|
broken function rather than at the pipeline output.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.fixes import (
|
||||||
|
clean_headers,
|
||||||
|
normalize_line_endings,
|
||||||
|
repair_mojibake,
|
||||||
|
strip_nbsp,
|
||||||
|
strip_zero_width,
|
||||||
|
trim_whitespace,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# trim_whitespace
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestTrimWhitespace:
    """Unit tests for ``trim_whitespace``.

    The function is called with a DataFrame and unpacked as
    ``(frame, changed_cell_count)``.
    """

    def test_strips_leading_trailing(self):
        """Both cells lose their outer padding and both are counted."""
        df = pd.DataFrame({"x": [" hello ", " world "]})
        out, changed = trim_whitespace(df)
        assert list(out["x"]) == ["hello", "world"]
        assert changed == 2

    def test_collapses_internal_runs(self):
        """Runs of several internal spaces collapse to a single space."""
        # The input must contain real multi-space runs; with single spaces
        # the input already equals the expected output and nothing is tested.
        df = pd.DataFrame({"x": ["a   b    c"]})
        out, _ = trim_whitespace(df)
        assert out.loc[0, "x"] == "a b c"

    def test_preserves_internal_in_structured(self):
        """A phone-shaped value is returned untouched."""
        # Phone-shaped strings keep internal spacing (often semantic).
        df = pd.DataFrame({"x": ["(555) 123-4567"]})
        out, changed = trim_whitespace(df)
        assert out.loc[0, "x"] == "(555) 123-4567"
        assert changed == 0

    def test_empty_df(self):
        """A frame with zero rows comes back empty with no changes."""
        df = pd.DataFrame({"x": []})
        out, changed = trim_whitespace(df)
        assert len(out) == 0
        assert changed == 0

    def test_no_string_columns(self):
        """Numeric-only frames are passed through unchanged."""
        df = pd.DataFrame({"n": [1, 2, 3]})
        out, changed = trim_whitespace(df)
        assert changed == 0
        assert list(out["n"]) == [1, 2, 3]

    def test_nan_preserved(self):
        """A missing cell stays missing, or becomes an empty string."""
        df = pd.DataFrame({"x": [" ok ", None]})
        out, _ = trim_whitespace(df)
        assert out.loc[0, "x"] == "ok"
        # Accept any missing-value marker (None, NaN, pd.NA), not only the
        # ``None`` singleton, since pandas may normalise the representation.
        missing = out.loc[1, "x"]
        assert pd.isna(missing) or missing == ""

    def test_idempotent(self):
        """A second pass over cleaned output changes nothing."""
        # Include outer padding and an internal run so the first pass has
        # real work to do.
        df = pd.DataFrame({"x": ["  hello   world  "]})
        out1, _ = trim_whitespace(df)
        out2, changed2 = trim_whitespace(out1)
        assert changed2 == 0
        assert list(out2["x"]) == list(out1["x"])
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# strip_nbsp
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestStripNbsp:
    """Unit tests for ``strip_nbsp``.

    Every non-ASCII space is written as an explicit ``\\u`` escape. A raw
    NBSP is visually identical to an ASCII space and is easily lost by
    editors or copy/paste, which would turn these tests into no-ops.
    """

    def test_replaces_nbsp_with_ascii_space(self):
        """U+00A0 becomes a plain space and the cell is counted."""
        df = pd.DataFrame({"x": ["a\u00a0b"]})
        out, changed = strip_nbsp(df)
        assert out.loc[0, "x"] == "a b"
        assert changed == 1

    def test_no_change_when_clean(self):
        """ASCII-only text reports zero changes."""
        df = pd.DataFrame({"x": ["a b c"]})
        out, changed = strip_nbsp(df)
        assert changed == 0

    def test_other_unicode_spaces(self):
        """Em space and thin space are normalised as well."""
        # Em space (U+2003), thin space (U+2009)
        df = pd.DataFrame({"x": ["a\u2003b\u2009c"]})
        out, _ = strip_nbsp(df)
        assert out.loc[0, "x"] == "a b c"

    def test_idempotent(self):
        """A second pass over cleaned output changes nothing."""
        df = pd.DataFrame({"x": ["a\u00a0b"]})
        out1, _ = strip_nbsp(df)
        out2, changed2 = strip_nbsp(out1)
        assert changed2 == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# strip_zero_width
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestStripZeroWidth:
    """Unit tests for ``strip_zero_width``.

    Zero-width characters are written as explicit ``\\u`` escapes. They
    render as nothing, so a raw literal cannot be told apart from the
    already-clean string and is easily dropped by tooling.
    """

    def test_removes_zero_width_space(self):
        """U+200B (zero-width space) is removed and the cell is counted."""
        df = pd.DataFrame({"x": ["a\u200bb"]})
        out, changed = strip_zero_width(df)
        assert out.loc[0, "x"] == "ab"
        assert changed == 1

    def test_removes_zero_width_joiner(self):
        """U+200D (zero-width joiner) is removed."""
        df = pd.DataFrame({"x": ["a\u200db"]})
        out, _ = strip_zero_width(df)
        assert out.loc[0, "x"] == "ab"

    def test_clean_passthrough(self):
        """Text without zero-width characters reports zero changes."""
        df = pd.DataFrame({"x": ["clean"]})
        out, changed = strip_zero_width(df)
        assert changed == 0

    def test_idempotent(self):
        """A second pass over cleaned output changes nothing."""
        # NOTE(review): the exact invisible characters of the original
        # literal are not recoverable; ZWSP + ZWJ are used here because the
        # tests above exercise both — confirm against the source repository.
        df = pd.DataFrame({"x": ["a\u200bb\u200dc"]})
        out1, _ = strip_zero_width(df)
        out2, changed2 = strip_zero_width(out1)
        assert changed2 == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# normalize_line_endings
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestNormalizeLineEndings:
    """Unit tests for ``normalize_line_endings`` (target ending is LF)."""

    def test_crlf_to_lf(self):
        """A Windows-style CRLF pair becomes a single LF."""
        frame = pd.DataFrame({"x": ["line1\r\nline2"]})
        fixed, n_changed = normalize_line_endings(frame)
        assert fixed.loc[0, "x"] == "line1\nline2"
        assert n_changed == 1

    def test_bare_cr_to_lf(self):
        """A lone CR is also rewritten to LF."""
        frame = pd.DataFrame({"x": ["line1\rline2"]})
        fixed, _ = normalize_line_endings(frame)
        assert fixed.loc[0, "x"] == "line1\nline2"

    def test_already_lf_unchanged(self):
        """Text already using LF reports zero changes."""
        frame = pd.DataFrame({"x": ["line1\nline2"]})
        fixed, n_changed = normalize_line_endings(frame)
        assert n_changed == 0

    def test_idempotent(self):
        """A second pass over normalised output changes nothing."""
        frame = pd.DataFrame({"x": ["a\r\nb\rc"]})
        first_pass, _ = normalize_line_endings(frame)
        second_pass, n_changed_again = normalize_line_endings(first_pass)
        assert n_changed_again == 0
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# clean_headers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCleanHeaders:
    """Unit tests for ``clean_headers``.

    Invisible header characters (BOM, NBSP) are written as explicit ``\\u``
    escapes. With raw invisible characters the dirty and clean labels look
    identical, and the "in" / "not in" assertions contradict each other as
    soon as tooling drops the character.
    """

    def test_strips_bom_from_header(self):
        """A leading U+FEFF byte-order mark is removed from the label."""
        df = pd.DataFrame({"\ufeffname": [1], "email": [2]})
        out, changed = clean_headers(df)
        assert "name" in out.columns
        assert "\ufeffname" not in out.columns
        assert changed >= 1

    def test_strips_nbsp_from_header(self):
        """An NBSP inside a label becomes a plain space."""
        df = pd.DataFrame({"first\u00a0name": [1]})
        out, _ = clean_headers(df)
        assert "first name" in out.columns

    def test_strips_trailing_whitespace_from_header(self):
        """Trailing ASCII whitespace is trimmed from the label."""
        df = pd.DataFrame({"Email ": [1]})
        out, _ = clean_headers(df)
        assert "Email" in out.columns
        assert "Email " not in out.columns

    def test_non_string_label_preserved(self):
        """Integer column labels are left alone and not counted."""
        df = pd.DataFrame({0: [1], 1: [2]})
        out, changed = clean_headers(df)
        assert list(out.columns) == [0, 1]
        assert changed == 0

    def test_clean_headers_idempotent(self):
        """A second pass over cleaned headers changes nothing."""
        # NOTE(review): the original literal's invisible prefix is not
        # recoverable; a BOM is used so the first pass does real work —
        # confirm against the source repository.
        df = pd.DataFrame({"\ufeffname": [1]})
        out1, _ = clean_headers(df)
        out2, changed2 = clean_headers(out1)
        assert changed2 == 0
        assert list(out2.columns) == list(out1.columns)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# repair_mojibake
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Probe for the optional ``ftfy`` dependency once at import time; the
# mojibake test classes below are skipped or enabled based on this flag.
try:
    import ftfy  # noqa: F401
except ImportError:
    _HAS_FTFY = False
else:
    _HAS_FTFY = True
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(not _HAS_FTFY, reason="ftfy library not installed — fix is a no-op")
class TestRepairMojibake:
    """Unit tests for ``repair_mojibake`` when ``ftfy`` is available.

    The garbled input is written with explicit escapes. ``"caf\\u00c3\\u00a9"``
    is "café" encoded as UTF-8 and wrongly decoded as Latin-1. Spelling it
    out keeps the literal from being silently "repaired" by editors or
    extraction tools, which would make the ``changed == 1`` assertion
    unsatisfiable.
    """

    def test_classic_cafe_repair(self):
        """The classic UTF-8-as-Latin-1 garble is repaired to "café"."""
        df = pd.DataFrame({"x": ["caf\u00c3\u00a9"]})  # café miscoded
        out, changed = repair_mojibake(df)
        assert out.loc[0, "x"] == "caf\u00e9"
        assert changed == 1

    def test_clean_text_unchanged(self):
        """Correctly encoded text reports zero changes."""
        df = pd.DataFrame({"x": ["caf\u00e9"]})
        out, changed = repair_mojibake(df)
        assert changed == 0

    def test_no_string_columns(self):
        """Numeric-only frames report zero changes."""
        df = pd.DataFrame({"n": [1, 2]})
        out, changed = repair_mojibake(df)
        assert changed == 0

    def test_idempotent(self):
        """A second pass over repaired output changes nothing."""
        df = pd.DataFrame({"x": ["caf\u00c3\u00a9"]})
        out1, _ = repair_mojibake(df)
        out2, changed2 = repair_mojibake(out1)
        assert changed2 == 0
|
||||||
|
|
||||||
|
|
||||||
|
class TestRepairMojibakeNoFtfy:
    """Behaviour of ``repair_mojibake`` when ``ftfy`` is not installed."""

    @pytest.mark.skipif(_HAS_FTFY, reason="ftfy installed — exercises the no-op path")
    def test_returns_input_unchanged_without_ftfy(self):
        """Garbled text is handed back untouched and nothing is counted."""
        # Explicit escapes: "café" encoded as UTF-8 and wrongly decoded as
        # Latin-1. A genuinely garbled input is required here; an
        # already-clean string would pass even if a repair had been attempted.
        garbled = "caf\u00c3\u00a9"
        df = pd.DataFrame({"x": [garbled]})
        out, changed = repair_mojibake(df)
        assert changed == 0
        assert out.loc[0, "x"] == garbled
|
||||||
630
tests/test_format_standardize.py
Normal file
630
tests/test_format_standardize.py
Normal file
@@ -0,0 +1,630 @@
|
|||||||
|
"""Tests for src.core.format_standardize."""
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.format_standardize import (
|
||||||
|
PRESETS,
|
||||||
|
FieldType,
|
||||||
|
StandardizeOptions,
|
||||||
|
detect_currency_code,
|
||||||
|
standardize_address,
|
||||||
|
standardize_boolean,
|
||||||
|
standardize_currency,
|
||||||
|
standardize_dataframe,
|
||||||
|
standardize_date,
|
||||||
|
standardize_name,
|
||||||
|
standardize_phone,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestStandardizeDate:
    """Scalar tests for ``standardize_date``.

    Each call is unpacked as ``(text, changed)``; with default arguments the
    expected output in these tests is ``YYYY-MM-DD``.
    """

    def test_iso_passthrough(self):
        """An ISO date is returned as-is and flagged unchanged."""
        text, was_changed = standardize_date("2024-01-15")
        assert text == "2024-01-15"
        assert was_changed is False

    def test_us_slash(self):
        """``MM/DD/YYYY`` is rewritten and flagged as changed."""
        text, was_changed = standardize_date("01/15/2024")
        expected = ("2024-01-15", True)
        assert (text, was_changed) == expected

    def test_us_dash(self):
        """Dash-separated US dates without zero padding parse."""
        text, _ = standardize_date("1-15-2024")
        assert text == "2024-01-15"

    def test_two_digit_year(self):
        """A two-digit year ``24`` resolves to 2024."""
        text, _ = standardize_date("01/15/24")
        assert text == "2024-01-15"

    def test_long_month_name(self):
        """Full month names with a comma parse."""
        text, _ = standardize_date("January 15, 2024")
        assert text == "2024-01-15"

    def test_short_month_name(self):
        """Abbreviated month names without a comma parse."""
        text, _ = standardize_date("Jan 15 2024")
        assert text == "2024-01-15"

    def test_dmy_order(self):
        """``date_order="DMY"`` reads the first field as the day."""
        text, _ = standardize_date("15/01/2024", date_order="DMY")
        assert text == "2024-01-15"

    def test_strip_time_tail(self):
        """A trailing time-of-day is dropped."""
        text, _ = standardize_date("2024-01-15 13:45:00")
        assert text == "2024-01-15"

    def test_iso_with_t_separator(self):
        """ISO timestamps with ``T`` and ``Z`` reduce to the date part."""
        text, _ = standardize_date("2024-01-15T08:30:00Z")
        assert text == "2024-01-15"

    def test_compact(self):
        """Compact ``YYYYMMDD`` parses."""
        text, _ = standardize_date("20240115")
        assert text == "2024-01-15"

    def test_custom_output(self):
        """``output_format`` controls the rendered string."""
        text, _ = standardize_date("01/15/2024", output_format="%d %b %Y")
        assert text == "15 Jan 2024"

    def test_unparseable_passthrough(self):
        """Non-date text is returned verbatim and flagged unchanged."""
        text, was_changed = standardize_date("hello")
        expected = ("hello", False)
        assert (text, was_changed) == expected

    def test_empty(self):
        """Empty string and ``None`` both map to ``("", False)``."""
        for blank in ("", None):
            assert standardize_date(blank) == ("", False)

    def test_idempotent(self):
        """Re-running on the output is a no-op."""
        first, _ = standardize_date("01/15/2024")
        second, changed_again = standardize_date(first)
        assert second == first
        assert changed_again is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestStandardizePhone:
    """Scalar tests for ``standardize_phone`` and its output formats."""

    def test_e164_default(self):
        """With no format argument the expected output is E.164."""
        text, _ = standardize_phone("(555) 123-4567")
        assert text == "+15551234567"

    def test_national(self):
        """``NATIONAL`` renders ``(AAA) PPP-NNNN``."""
        text, _ = standardize_phone("5551234567", output_format="NATIONAL")
        assert text == "(555) 123-4567"

    def test_international(self):
        """``INTERNATIONAL`` renders with a ``+1`` prefix and dashes."""
        text, _ = standardize_phone("5551234567", output_format="INTERNATIONAL")
        assert text == "+1 555-123-4567"

    def test_digits_only(self):
        """``DIGITS`` strips all punctuation and flags the change."""
        text, was_changed = standardize_phone("(555) 123-4567", output_format="DIGITS")
        assert text == "5551234567"
        assert was_changed is True

    def test_invalid_passthrough(self):
        """Non-phone text is returned verbatim and flagged unchanged."""
        text, was_changed = standardize_phone("call me maybe")
        expected = ("call me maybe", False)
        assert (text, was_changed) == expected

    def test_empty(self):
        """Empty string and ``None`` both map to ``("", False)``."""
        for blank in ("", None):
            assert standardize_phone(blank) == ("", False)

    def test_idempotent(self):
        """Re-running on the output is a no-op."""
        first, _ = standardize_phone("(555) 123-4567")
        second, changed_again = standardize_phone(first)
        assert second == first
        assert changed_again is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestStandardizeCurrency:
    """Scalar tests for ``standardize_currency``."""

    def test_dollar_with_cents(self):
        """Symbol and thousands separator are stripped."""
        text, _ = standardize_currency("$1,234.56")
        assert text == "1234.56"

    def test_no_decimals_arg(self):
        """``decimals=None`` keeps the input's own precision."""
        text, _ = standardize_currency("$1,234.56", decimals=None)
        assert text == "1234.56"

    def test_round_to_two(self):
        """``decimals=2`` rounds a three-decimal amount."""
        text, _ = standardize_currency("$1,234.567", decimals=2)
        assert text == "1234.57"

    def test_integer_input(self):
        """A whole amount stays whole when ``decimals=None``."""
        text, _ = standardize_currency("$1,000", decimals=None)
        assert text == "1000"

    def test_negative_parens(self):
        """Accounting-style parentheses become a leading minus."""
        text, _ = standardize_currency("($50.00)", decimals=2)
        assert text == "-50.00"

    def test_negative_sign(self):
        """A minus before the symbol is preserved."""
        text, _ = standardize_currency("-$50.00", decimals=2)
        assert text == "-50.00"

    def test_iso_code_prefix(self):
        """A leading ISO code is dropped from the output."""
        text, _ = standardize_currency("USD 1,234.56")
        assert text == "1234.56"

    def test_iso_code_suffix(self):
        """A trailing ISO code is dropped from the output."""
        text, _ = standardize_currency("1234.56 EUR")
        assert text == "1234.56"

    def test_european_decimal(self):
        """``decimal="comma"`` reads ``1.234,56`` as 1234.56."""
        text, _ = standardize_currency("1.234,56 €", decimal="comma")
        assert text == "1234.56"

    def test_unparseable_passthrough(self):
        """Non-numeric text is returned verbatim and flagged unchanged."""
        text, was_changed = standardize_currency("free!")
        expected = ("free!", False)
        assert (text, was_changed) == expected

    def test_ambiguous_short_comma_rejected(self):
        """``"1,5"`` is left alone under the default decimal mode."""
        # "1,5" under dot-decimal mode would be a comma decimal — reject.
        text, was_changed = standardize_currency("1,5")
        assert was_changed is False
        assert text == "1,5"

    def test_thousands_grouped_no_decimal(self):
        """A properly grouped ``1,234`` is read as 1234."""
        text, _ = standardize_currency("1,234", decimals=None)
        assert text == "1234"

    def test_empty(self):
        """Empty string and ``None`` both map to ``("", False)``."""
        for blank in ("", None):
            assert standardize_currency(blank) == ("", False)

    def test_idempotent(self):
        """Re-running on the output with the same settings is a no-op."""
        first, _ = standardize_currency("$1,234.56", decimals=2)
        second, changed_again = standardize_currency(first, decimals=2)
        assert second == first
        assert changed_again is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestStandardizeName:
    """Scalar tests for ``standardize_name`` casing rules."""

    def test_shouting_to_title(self):
        """All-caps input is title-cased."""
        text, _ = standardize_name("JOHN DOE")
        assert text == "John Doe"

    def test_lowercase_to_title(self):
        """All-lowercase input is title-cased."""
        text, _ = standardize_name("john doe")
        assert text == "John Doe"

    def test_already_title(self):
        """Title-cased input is returned as-is and flagged unchanged."""
        text, was_changed = standardize_name("Jane Smith")
        assert text == "Jane Smith"
        assert was_changed is False

    def test_apostrophe_inner_cap(self):
        """The letter after an ``O'`` prefix is capitalised."""
        # Surnames with O'/D' apostrophe prefixes get the inner letter
        # capitalized regardless of input case (corpus § 7.3 Irish names).
        mixed_case, _ = standardize_name("o'Connor")
        assert mixed_case == "O'Connor"
        lower_case, _ = standardize_name("o'connor")
        assert lower_case == "O'Connor"

    def test_acronym_preserved(self):
        """An all-caps token inside a mixed-case name is kept."""
        text, _ = standardize_name("Mary USA Smith")
        assert text == "Mary USA Smith"

    def test_upper_mode(self):
        """``case="upper"`` upper-cases the whole name."""
        text, _ = standardize_name("john doe", case="upper")
        assert text == "JOHN DOE"

    def test_lower_mode(self):
        """``case="lower"`` lower-cases the whole name."""
        text, _ = standardize_name("JOHN DOE", case="lower")
        assert text == "john doe"

    def test_empty(self):
        """Empty string and ``None`` both map to ``("", False)``."""
        for blank in ("", None):
            assert standardize_name(blank) == ("", False)

    def test_idempotent(self):
        """Re-running on the output is a no-op."""
        first, _ = standardize_name("JOHN DOE")
        second, changed_again = standardize_name(first)
        assert second == first
        assert changed_again is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestStandardizeAddress:
    """Scalar tests for ``standardize_address`` abbreviation expansion."""

    def test_street(self):
        """``St`` expands to ``Street``."""
        text, _ = standardize_address("123 Main St")
        assert text == "123 Main Street"

    def test_avenue_with_period(self):
        """``Ave.`` expands and the period is dropped."""
        text, _ = standardize_address("456 Oak Ave.")
        assert text == "456 Oak Avenue"

    def test_apartment(self):
        """``Apt`` expands alongside the street suffix."""
        text, _ = standardize_address("123 Main St Apt 4")
        assert text == "123 Main Street Apartment 4"

    def test_direction(self):
        """A single-letter compass direction expands."""
        text, _ = standardize_address("100 N Main St")
        assert text == "100 North Main Street"

    def test_combined(self):
        """Lowercase input is title-cased and fully expanded."""
        text, _ = standardize_address("789 pine blvd ste 200")
        assert text == "789 Pine Boulevard Suite 200"

    def test_already_expanded(self):
        """A fully expanded address is flagged unchanged."""
        text, was_changed = standardize_address("123 Main Street")
        assert text == "123 Main Street"
        assert was_changed is False

    def test_empty(self):
        """Empty string and ``None`` both map to ``("", False)``."""
        for blank in ("", None):
            assert standardize_address(blank) == ("", False)

    def test_idempotent(self):
        """Re-running on the output is a no-op."""
        first, _ = standardize_address("123 main st apt 4")
        second, changed_again = standardize_address(first)
        assert second == first
        assert changed_again is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestStandardizeBoolean:
    """Scalar tests for ``standardize_boolean``."""

    @pytest.mark.parametrize("inp", ["yes", "Yes", "YES", "y", "Y", "true", "1", "on"])
    def test_truthy(self, inp):
        """Every truthy spelling maps to ``"True"`` and counts as changed."""
        text, was_changed = standardize_boolean(inp)
        assert text == "True"
        assert was_changed is True

    @pytest.mark.parametrize("inp", ["no", "No", "NO", "n", "N", "false", "0", "off"])
    def test_falsy(self, inp):
        """Every falsy spelling maps to ``"False"`` and counts as changed."""
        text, was_changed = standardize_boolean(inp)
        assert text == "False"
        assert was_changed is True

    def test_already_canonical(self):
        """The canonical string is flagged unchanged."""
        text, was_changed = standardize_boolean("True")
        assert text == "True"
        assert was_changed is False

    def test_python_bool(self):
        """Real ``bool`` inputs are stringified and counted as changed."""
        for flag, rendered in ((True, "True"), (False, "False")):
            assert standardize_boolean(flag) == (rendered, True)

    def test_int_zero_one(self):
        """Integers 1 and 0 are treated as booleans."""
        for number, rendered in ((1, "True"), (0, "False")):
            assert standardize_boolean(number) == (rendered, True)

    def test_yes_no_style(self):
        """``style="Yes/No"`` selects the alternative vocabulary."""
        for raw, rendered in (("y", "Yes"), ("0", "No")):
            assert standardize_boolean(raw, style="Yes/No") == (rendered, True)

    def test_unrecognized_passthrough(self):
        """Unknown text is returned verbatim and flagged unchanged."""
        text, was_changed = standardize_boolean("maybe")
        expected = ("maybe", False)
        assert (text, was_changed) == expected

    def test_empty(self):
        """Empty string and ``None`` both map to ``("", False)``."""
        for blank in ("", None):
            assert standardize_boolean(blank) == ("", False)

    def test_idempotent(self):
        """Re-running on the output is a no-op."""
        first, _ = standardize_boolean("yes")
        second, changed_again = standardize_boolean(first)
        assert second == first
        assert changed_again is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# DataFrame entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestStandardizeDataframe:
    """Tests for the ``standardize_dataframe`` entry point."""

    def test_mixed_columns(self):
        """Each typed column is standardized; untyped columns pass through."""
        df = pd.DataFrame({
            "name": ["JOHN SMITH", "alice jones"],
            "phone": ["(555) 123-4567", "555.987.6543"],
            "amount": ["$1,234.56", "$50"],
            "joined": ["01/15/2024", "March 5 2023"],
            "active": ["yes", "0"],
            "address": ["123 Main St", "456 Oak Ave"],
            "skip_me": ["leave", "alone"],
        })
        field_map = {
            "name": FieldType.NAME,
            "phone": FieldType.PHONE,
            "amount": FieldType.CURRENCY,
            "joined": FieldType.DATE,
            "active": FieldType.BOOLEAN,
            "address": FieldType.ADDRESS,
        }
        opts = StandardizeOptions(column_types=field_map)

        cleaned = standardize_dataframe(df, opts).standardized_df

        # Expected cell values, column by column, in row order.
        expected = {
            "name": ["John Smith", "Alice Jones"],
            "phone": ["+15551234567", "+15559876543"],
            "amount": ["1234.56", "50.00"],
            "joined": ["2024-01-15", "2023-03-05"],
            "active": ["True", "False"],
            "address": ["123 Main Street", "456 Oak Avenue"],
        }
        for column, wanted_values in expected.items():
            for row, wanted in enumerate(wanted_values):
                assert cleaned.loc[row, column] == wanted
        # Untouched column passes through verbatim.
        assert list(cleaned["skip_me"]) == ["leave", "alone"]

    def test_changes_audit(self):
        """The audit frame records exactly the one cell that changed."""
        df = pd.DataFrame({"d": ["01/15/2024", "2023-03-05"]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(df, opts)

        # Only the first row changed; the second was already canonical.
        assert result.cells_changed == 1
        assert len(result.changes) == 1
        entry = result.changes.iloc[0]
        assert entry["row"] == 0
        assert entry["column"] == "d"
        assert entry["old"] == "01/15/2024"
        assert entry["new"] == "2024-01-15"

    def test_unparseable_count(self):
        """Unparseable cells are counted separately from the total."""
        df = pd.DataFrame({"d": ["01/15/2024", "not a date", "2024-01-15"]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        result = standardize_dataframe(df, opts)
        assert result.cells_unparseable == 1
        assert result.cells_total == 3

    def test_unknown_column_raises(self):
        """Typing a column that is absent from the frame is an error."""
        df = pd.DataFrame({"a": ["1"]})
        opts = StandardizeOptions(column_types={"missing": FieldType.DATE})
        with pytest.raises(ValueError, match="not found"):
            standardize_dataframe(df, opts)

    def test_input_not_mutated(self):
        """The caller's frame keeps its original values."""
        df = pd.DataFrame({"d": ["01/15/2024"]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        standardize_dataframe(df, opts)
        assert df.loc[0, "d"] == "01/15/2024"

    def test_options_serialization_roundtrip(self, tmp_path):
        """Options survive a ``to_file`` / ``from_file`` round trip."""
        field_map = {"a": FieldType.DATE, "b": FieldType.PHONE}
        opts = StandardizeOptions(
            column_types=field_map,
            date_output_format="%d-%b-%Y",
            phone_format="NATIONAL",
        )
        path = tmp_path / "opts.json"
        opts.to_file(path)

        loaded = StandardizeOptions.from_file(path)

        assert loaded.column_types == {"a": FieldType.DATE, "b": FieldType.PHONE}
        assert loaded.date_output_format == "%d-%b-%Y"
        assert loaded.phone_format == "NATIONAL"

    def test_nan_passthrough(self):
        """A ``None`` cell in a typed column stays ``None``."""
        df = pd.DataFrame({"d": ["01/15/2024", None]})
        opts = StandardizeOptions(column_types={"d": FieldType.DATE})
        cleaned = standardize_dataframe(df, opts).standardized_df
        assert cleaned.loc[0, "d"] == "2024-01-15"
        assert cleaned.loc[1, "d"] is None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preset bundles
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestPresets:
    """Tests for ``StandardizeOptions.from_preset`` and the preset table."""

    def test_us_default_iso_dates(self):
        """``us-default``: ISO dates, MDY input, E.164, True/False."""
        preset = StandardizeOptions.from_preset("us-default")
        assert preset.date_output_format == "%Y-%m-%d"
        assert preset.date_order == "MDY"
        assert preset.phone_format == "E164"
        assert preset.boolean_style == "True/False"

    def test_european_dmy_comma(self):
        """``european``: DMY input, comma decimals, currency code kept."""
        preset = StandardizeOptions.from_preset("european")
        assert preset.date_order == "DMY"
        assert preset.currency_decimal == "comma"
        assert preset.currency_preserve_code is True

    def test_uk_ddmmyyyy_yes_no(self):
        """``uk``: DD/MM/YYYY output, GB phone region, Yes/No."""
        preset = StandardizeOptions.from_preset("uk")
        assert preset.date_output_format == "%d/%m/%Y"
        assert preset.phone_region == "GB"
        assert preset.boolean_style == "Yes/No"

    def test_iso_strict_lowercase_bools_no_rounding(self):
        """``iso-strict``: lowercase booleans, no rounding, code kept."""
        preset = StandardizeOptions.from_preset("iso-strict")
        assert preset.boolean_style == "true/false"
        assert preset.currency_decimals is None
        assert preset.currency_preserve_code is True

    def test_legacy_us_national_phones(self):
        """``legacy-us``: MM/DD/YYYY output, national phones, Yes/No."""
        preset = StandardizeOptions.from_preset("legacy-us")
        assert preset.date_output_format == "%m/%d/%Y"
        assert preset.phone_format == "NATIONAL"
        assert preset.boolean_style == "Yes/No"

    def test_overrides_layer_on_top(self):
        """Keyword overrides replace preset values; the rest survive."""
        preset = StandardizeOptions.from_preset(
            "uk",
            column_types={"name": FieldType.NAME},
            currency_decimals=4,
        )
        assert preset.column_types == {"name": FieldType.NAME}
        assert preset.currency_decimals == 4
        # UK-specific defaults survive what we didn't override.
        assert preset.phone_region == "GB"

    def test_unknown_preset_raises(self):
        """An unrecognised preset name is a ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown preset"):
            StandardizeOptions.from_preset("not-a-real-preset")

    def test_all_presets_loadable(self):
        """Every name in ``PRESETS`` builds a ``StandardizeOptions``."""
        # Smoke test: every advertised preset constructs cleanly.
        for preset_name in PRESETS:
            built = StandardizeOptions.from_preset(preset_name)
            assert isinstance(built, StandardizeOptions)

    def test_preset_drives_dataframe_pipeline(self):
        """A preset plus column types runs through the DataFrame pipeline."""
        df = pd.DataFrame({
            "joined": ["15/01/2024"],
            "active": ["yes"],
            "amount": ["1.234,56 €"],
        })
        field_map = {
            "joined": FieldType.DATE,
            "active": FieldType.BOOLEAN,
            "amount": FieldType.CURRENCY,
        }
        opts = StandardizeOptions.from_preset("european", column_types=field_map)

        cleaned = standardize_dataframe(df, opts).standardized_df

        assert cleaned.loc[0, "joined"] == "2024-01-15"  # ISO output for european
        assert cleaned.loc[0, "active"] == "True"
        assert cleaned.loc[0, "amount"] == "EUR 1234.56"  # preserve_code on
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Currency code detection / preservation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestCurrencyCodeDetection:
    """Tests for ``detect_currency_code``."""

    @pytest.mark.parametrize("inp,code", [
        ("$1,234.56", "USD"),
        ("€1.234,56", "EUR"),
        ("£99.00", "GBP"),
        ("¥5000", "JPY"),
        ("₹500", "INR"),
        ("USD 1234", "USD"),
        ("1234 EUR", "EUR"),
        ("eur 50", "EUR"),
    ])
    def test_detects(self, inp, code):
        """Symbols and ISO codes (any case, prefix or suffix) are detected."""
        detected = detect_currency_code(inp)
        assert detected == code

    def test_no_marker_returns_none(self):
        """A bare number carries no currency marker."""
        detected = detect_currency_code("1234.56")
        assert detected is None

    def test_non_string_returns_none(self):
        """Non-string inputs yield ``None``."""
        assert detect_currency_code(None) is None  # type: ignore[arg-type]
        assert detect_currency_code(1234) is None  # type: ignore[arg-type]
|
||||||
|
|
||||||
|
|
||||||
|
class TestCurrencyPreserveCode:
    """Tests for the ``preserve_code`` option of currency standardization."""

    def test_dollar_preserved(self):
        """A ``$`` input is rendered with a ``USD`` prefix."""
        text, was_changed = standardize_currency("$1,234.56", decimals=2, preserve_code=True)
        assert text == "USD 1234.56"
        assert was_changed is True

    def test_euro_preserved_comma_decimal(self):
        """A trailing ``€`` with comma decimals becomes an ``EUR`` prefix."""
        text, _ = standardize_currency(
            "1.234,56 €",
            decimal="comma",
            decimals=2,
            preserve_code=True,
        )
        assert text == "EUR 1234.56"

    def test_iso_code_input_preserved(self):
        """An input that already has the ISO prefix keeps it."""
        text, _ = standardize_currency("USD 1234.56", decimals=2, preserve_code=True)
        assert text == "USD 1234.56"

    def test_no_marker_no_prefix(self):
        """No currency marker in the input means no prefix in the output."""
        text, _ = standardize_currency("1234.56", decimals=2, preserve_code=True)
        assert text == "1234.56"

    def test_off_by_default(self):
        """Without ``preserve_code`` the code is not emitted."""
        text, _ = standardize_currency("$1,234.56", decimals=2)
        assert text == "1234.56"

    def test_pipeline_preserve_code(self):
        """``currency_preserve_code`` is honoured by the DataFrame pipeline."""
        df = pd.DataFrame({"price": ["$50.00", "€30,00", "100", "USD 12.34"]})
        opts = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_decimals=2,
            currency_preserve_code=True,
            currency_decimal="dot",  # mixed input — euro will need its own
        )
        # Note: comma-decimal euro won't parse under dot mode; treat that
        # as a known limitation — this test exercises the dot-input path.
        # Row 1 (the euro value) is therefore deliberately not asserted.
        cleaned = standardize_dataframe(df, opts).standardized_df
        assert cleaned.loc[0, "price"] == "USD 50.00"
        assert cleaned.loc[2, "price"] == "100.00"
        assert cleaned.loc[3, "price"] == "USD 12.34"

    def test_canonical_check_recognizes_code_prefix(self):
        """An already-canonical ``USD 50.00`` is neither changed nor unparseable."""
        # "USD 50.00" should pass through unchanged when preserve_code is on
        # — and NOT count as unparseable.
        df = pd.DataFrame({"price": ["USD 50.00", "garbage"]})
        opts = StandardizeOptions(
            column_types={"price": FieldType.CURRENCY},
            currency_decimals=2,
            currency_preserve_code=True,
        )
        result = standardize_dataframe(df, opts)
        assert result.cells_changed == 0
        # Only "garbage" counts as unparseable.
        assert result.cells_unparseable == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# User-editable abbreviations
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestExtraAbbreviations:
    """Checks user-supplied ``extra_abbreviations`` for address standardization."""

    def test_extra_expansion(self):
        # "Bahnhofstrasse" is one token (no embedded space), so the
        # abbreviation lookup never sees "strasse" on its own. The
        # separated form is covered by the next test.
        user_table = {"strasse": "Straße"}
        text, _ = standardize_address("Bahnhofstrasse 12", extra_abbreviations=user_table)
        assert "Bahnhofstrasse" in text  # not split → not expanded

    def test_extra_expansion_separated_token(self):
        user_table = {"strasse": "Straße"}
        text, _ = standardize_address("Haupt strasse 12", extra_abbreviations=user_table)
        assert "Straße" in text

    def test_override_existing_entry(self):
        # A user entry for "ave" replaces the built-in expansion.
        user_table = {"ave": "Avenida"}
        text, _ = standardize_address("456 Oak Ave", extra_abbreviations=user_table)
        assert "Avenida" in text
        assert "Avenue" not in text

    def test_period_form_works(self):
        # Lookup is casefold + period-stripped, so ``Ave.`` still matches.
        user_table = {"ave": "Avenida"}
        text, _ = standardize_address("456 Oak Ave.", extra_abbreviations=user_table)
        assert "Avenida" in text

    def test_empty_value_skipped(self):
        # Blank keys/values in the user table are ignored rather than raising.
        user_table = {"ave": "", " ": "Drive"}
        text, _ = standardize_address("456 Oak Ave", extra_abbreviations=user_table)
        # Built-in expansion still applies.
        assert "Avenue" in text

    def test_no_extras_unchanged_behavior(self):
        # Omitted, empty, and None tables all behave the same.
        without_arg, _ = standardize_address("123 Main St")
        with_empty, _ = standardize_address("123 Main St", extra_abbreviations={})
        with_none, _ = standardize_address("123 Main St", extra_abbreviations=None)
        assert without_arg == with_empty == with_none == "123 Main Street"

    def test_pipeline_uses_extras(self):
        frame = pd.DataFrame({"addr": ["456 Oak Ave"]})
        options = StandardizeOptions(
            column_types={"addr": FieldType.ADDRESS},
            extra_abbreviations={"ave": "Avenida"},
        )
        outcome = standardize_dataframe(frame, options)
        assert "Avenida" in outcome.standardized_df.loc[0, "addr"]

    def test_serialization_roundtrip_with_extras(self, tmp_path):
        options = StandardizeOptions(
            column_types={"addr": FieldType.ADDRESS},
            extra_abbreviations={"strasse": "Straße", "platz": "Platz"},
            currency_preserve_code=True,
        )
        options_file = tmp_path / "opts.json"
        options.to_file(options_file)
        restored = StandardizeOptions.from_file(options_file)
        assert restored.extra_abbreviations == {"strasse": "Straße", "platz": "Platz"}
        assert restored.currency_preserve_code is True
|
||||||
573
tests/test_format_standardize_corpus.py
Normal file
573
tests/test_format_standardize_corpus.py
Normal file
@@ -0,0 +1,573 @@
|
|||||||
|
"""Corpus-driven tests for ``src.core.format_standardize``.
|
||||||
|
|
||||||
|
Drives every row of the FORMATS test corpus
|
||||||
|
(``test-cases/format-cleaner-corpus/*.csv``) through the per-cell
|
||||||
|
standardizers and asserts the canonical output the corpus expects.
|
||||||
|
|
||||||
|
The corpus itself (``FORMATS-CASES.md`` in the same directory)
|
||||||
|
documents per-domain policy decisions; the per-case ``id`` strings
|
||||||
|
below (FD01, FP14, FA09, …) match its row keys exactly.
|
||||||
|
|
||||||
|
Two sentinels are used in the per-domain expected dicts:
|
||||||
|
|
||||||
|
- A literal string is the corpus's expected canonical output.
|
||||||
|
- ``PASSTHROUGH`` means "corpus accepts no transformation" — usually
|
||||||
|
empty, whitespace-only, or already-clean input.
|
||||||
|
|
||||||
|
The per-domain ``xfail`` tables in this module are currently empty.
Rows whose fix needs heavier machinery (Excel serial parsing, Unix
timestamps, non-English month dictionaries, IDN / non-ASCII email
validation) can be parked there; each entry carries a one-line reason.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import csv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.format_standardize import (
|
||||||
|
FieldType,
|
||||||
|
StandardizeOptions,
|
||||||
|
standardize_address,
|
||||||
|
standardize_currency,
|
||||||
|
standardize_dataframe,
|
||||||
|
standardize_date,
|
||||||
|
standardize_email,
|
||||||
|
standardize_name,
|
||||||
|
standardize_phone,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Corpus CSVs live in <two levels above this file>/test-cases/format-cleaner-corpus.
CORPUS = Path(__file__).resolve().parents[1].joinpath(
    "test-cases", "format-cleaner-corpus"
)

# Sentinel meaning "the function must return its input unchanged".
PASSTHROUGH = object()
|
||||||
|
|
||||||
|
|
||||||
|
def _load(filename: str) -> list[dict[str, str]]:
    """Read one corpus CSV and return its rows as dicts keyed by header name.

    Args:
        filename: File name relative to ``CORPUS``.

    Returns:
        One dict per data row, in file order.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding: the expectation tables in this module contain
    non-ASCII text (accented, CJK and Cyrillic), so a locale-dependent
    decode would make the suite fail or mis-read rows on non-UTF-8 systems.
    NOTE(review): assumes the corpus files are stored as UTF-8 — confirm.
    """
    # newline="" is what the csv module requires so quoted embedded
    # newlines are parsed correctly.
    with (CORPUS / filename).open(newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))
|
||||||
|
|
||||||
|
|
||||||
|
def _params(fixture: str, expected: dict[str, object], xfails: dict[str, str]):
    """Return one ``pytest.param`` per row of the corpus file *fixture*.

    Each param carries ``(row input, expected value)`` and uses the row's
    ``case_id`` as its test id. Case ids missing from *expected* default to
    ``PASSTHROUGH``. Case ids present in *xfails* get a non-strict xfail
    mark with the mapped reason, so the suite passes whether such a row
    fails or starts passing (xpass).
    """

    def _entry(row):
        case_id = row["case_id"]
        wanted = expected.get(case_id, PASSTHROUGH)
        if case_id in xfails:
            marks = [pytest.mark.xfail(reason=xfails[case_id], strict=False)]
        else:
            marks = []
        return pytest.param(row["input"], wanted, id=case_id, marks=marks)

    return [_entry(row) for row in _load(fixture)]
|
||||||
|
|
||||||
|
|
||||||
|
def _assert(got: str, want: object, original: str) -> None:
    """Assert *got* against a corpus expectation.

    When *want* is the ``PASSTHROUGH`` sentinel, *got* must equal the
    *original* input; otherwise *got* must equal *want*.
    """
    if want is not PASSTHROUGH:
        assert got == want
        return
    assert got == original, f"expected pass-through, got {got!r}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dates — 24_format_dates.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Expected output per case id for the date corpus under the default (MDY)
# date order. Case ids not listed fall back to PASSTHROUGH in _params.
_DATE_EXPECTED_MDY: dict[str, object] = {
    # ISO baseline + datetime variants → ISO date
    "FD01": "2024-01-15",
    "FD02": "2024-01-15",
    "FD03": "2024-01-15",
    "FD04": "2024-01-15",
    "FD05": "2024-01-15",
    "FD06": "2024-01-15",
    # US M/D/Y variants
    "FD07": "2024-01-15",
    "FD08": "2024-01-15",
    "FD09": "2024-01-05",
    "FD10": "2024-05-30",
    # FD11-FD15 are DMY-shaped EU dates. Under the MDY default they pass
    # through unchanged; the DMY rerun covers the actual parse path. They
    # are listed explicitly so a future locale auto-detect can swap in the
    # correct ISO output.
    "FD11": PASSTHROUGH,
    "FD12": PASSTHROUGH,
    "FD13": PASSTHROUGH,
    "FD14": PASSTHROUGH,
    "FD15": PASSTHROUGH,
    # Long-form month names
    "FD16": "2024-01-15",
    "FD17": "2024-01-15",
    "FD18": "2024-01-15",
    "FD19": "2024-01-15",
    "FD20": "2024-01-15",  # weekday-prefixed
    "FD21": "2024-01-15",
    # Excel serial numbers
    "FD22": "2024-01-15",
    "FD23": "2024-01-15",
    # Unix timestamps (seconds / milliseconds)
    "FD24": "2024-01-15",
    "FD25": "2024-01-15",
    # Partial precision — the corpus preserves it
    "FD26": "2024-01",
    "FD27": "2024-01",  # textual month precision
    "FD28": "2024-Q1",  # quarter
    "FD29": "2024",
    # Two-digit year cutoff (per docs: 1969 wins over 2069)
    "FD30": "1969-01-15",
    # Valid leap day
    "FD31": "2024-02-29",
    # Invalid dates → corpus expects an error sentinel
    "FD32": "<error: invalid leap day>",
    "FD33": "<error: Excel 1900 leap year bug>",
    "FD34": "<error: invalid month>",
    "FD35": "<error: invalid day>",
    # Date buried in surrounding text
    "FD36": "2024-01-15",
    "FD37": "2024-01-15",
    # FD38/39/40 are garbage and use the PASSTHROUGH default
    # (corpus 0.3 boundary table).
    # Locale-specific month names
    "FD41": "2024-01-15",
    "FD42": "2024-01-15",
    # Timezones — corpus 3.3 says fixed-offset only
    "FD43": "2024-01-15",
    "FD44": "2024-03-10",
    # Already-clean idempotency
    "FD45": "2024-01-15",
}

# No date row is currently marked xfail; add ``case_id: reason`` entries
# here to park a row.
_DATE_XFAILS_MDY: dict[str, str] = {}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "inp,want",
    _params("24_format_dates.csv", _DATE_EXPECTED_MDY, _DATE_XFAILS_MDY),
)
def test_corpus_dates_mdy(inp, want):
    """Run every date corpus row with sentinel errors and en/fr/de month names."""
    month_locales = ["en", "fr", "de"]
    standardized, _changed = standardize_date(
        inp, month_locales=month_locales, error_policy="sentinel",
    )
    _assert(standardized, want, inp)
|
||||||
|
|
||||||
|
|
||||||
|
# DMY locale rerun for the EU rows that need it.
|
||||||
|
# Expected ISO output for the DMY-shaped rows when date_order="DMY".
_DATE_EXPECTED_DMY: dict[str, str] = {
    "FD11": "2024-01-15",
    "FD12": "2024-01-15",
    "FD13": "2024-01-15",
    "FD14": "2024-05-30",
    "FD15": "2024-01-15",
}


def _dmy_params():
    """Build one param per ``_DATE_EXPECTED_DMY`` entry from a single corpus read.

    Rows are selected by ``case_id`` rather than by position in the file,
    so inserting or reordering corpus rows cannot silently pair an
    expectation with the wrong input. A missing case id fails at
    collection time with a ``KeyError`` naming it.
    """
    inputs_by_id: dict[str, str] = {}
    for row in _load("24_format_dates.csv"):
        # Keep the first occurrence if a case id were ever duplicated.
        inputs_by_id.setdefault(row["case_id"], row["input"])
    return [
        pytest.param(inputs_by_id[cid], want, id=f"{cid}-dmy")
        for cid, want in _DATE_EXPECTED_DMY.items()
    ]


@pytest.mark.parametrize("inp,want", _dmy_params())
def test_corpus_dates_dmy(inp, want):
    """Parse the DMY-shaped corpus rows with ``date_order="DMY"``."""
    got, _ = standardize_date(inp, date_order="DMY")
    assert got == want
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Phones — 25_format_phones.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Expected output per case id for the phone corpus. Case ids not listed
# (FP15, FP29) fall back to PASSTHROUGH in _params.
_PHONE_EXPECTED: dict[str, object] = {
    # Formatting variants of one number
    "FP01": "+15551234567",
    "FP02": "+15551234567",
    "FP03": "+15551234567",
    "FP04": "+15551234567",
    "FP05": "+15551234567",
    "FP06": "+15551234567",
    "FP07": "+15551234567",
    "FP08": "+15551234567",
    # Extensions
    "FP09": "+15551234567;ext=123",
    "FP10": "+15551234567;ext=123",
    "FP11": "+15551234567;ext=123",
    # Vanity numbers
    "FP12": "+18003569377",
    "FP13": "+15552255669",
    # International. FP15 needs --default-country=GB and has its own test.
    "FP14": "+442079460958",
    "FP16": "+493012345678",
    "FP17": "+33123456789",
    "FP18": "+81312345678",
    "FP19": "+61212345678",
    "FP20": "+15551234567",
    # Placeholders / junk → corpus says error
    "FP21": "<error: insufficient digits>",
    "FP22": "<error: too many digits>",
    "FP23": "<error: placeholder number>",
    "FP24": "<error: placeholder number>",
    "FP25": "<error: multiple numbers in cell>",
    # NBSP / smart-quote contamination — defensive cleanup acceptable
    "FP26": "+15551234567",
    "FP27": "+15551234567",
    "FP28": "+15551234567",
    # FP29 is empty and uses the PASSTHROUGH default
    "FP30": "<error: not a phone number>",
    "FP31": "<error: smart-quote contamination>",
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "inp,want",
    _params("25_format_phones.csv", _PHONE_EXPECTED, {}),
)
def test_corpus_phones(inp, want):
    """Run every phone corpus row with sentinel error reporting."""
    standardized, _changed = standardize_phone(inp, error_policy="sentinel")
    _assert(standardized, want, inp)
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_phones_uk_domestic_with_gb_region():
    """FP15: a UK trunk-prefixed number resolves once the region is GB."""
    # "020 7946 0958" only resolves with default_region="GB"; this checks
    # the cleaner's international path.
    uk_domestic = "020 7946 0958"
    standardized, _changed = standardize_phone(uk_domestic, default_region="GB")
    assert standardized == "+442079460958"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Emails — 26_format_emails.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Expected output per case id for the email corpus. Case ids not listed
# (FE29, FE30) fall back to PASSTHROUGH in _params.
_EMAIL_EXPECTED: dict[str, object] = {
    "FE01": "alice@example.com",
    "FE02": "alice@example.com",
    "FE03": "alice@example.com",
    "FE04": "alice@example.com",
    "FE05": "alice@example.com",
    "FE06": "alice@example.com",
    "FE07": "alice@example.com",
    "FE08": "alice@example.com",
    "FE09": "alice@example.com",
    # Gmail dots / +tags are left alone by default
    "FE10": "a.l.i.c.e@gmail.com",
    "FE11": "alice+newsletter@gmail.com",
    "FE12": "a.l.i.c.e+work@gmail.com",
    # Non-Gmail addresses are never touched
    "FE13": "a.l.i.c.e@example.com",
    "FE14": "alice+newsletter@example.com",
    # Non-ASCII domain / local part
    "FE15": "alice@münchen.de",
    "FE16": "アリス@example.jp",
    "FE17": "alice@example.com",
    "FE18": "alice@example.com",
    "FE19": "alice@example.com",
    "FE20": "alice@example.com",
    "FE21": "alice@example.com",
    # Malformed input → error sentinel
    "FE22": "<error: missing @>",
    "FE23": "<error: double @>",
    "FE24": "<error: multiple @>",
    "FE25": "<error: internal whitespace>",
    "FE26": "<error: no TLD>",
    "FE27": "<error: multiple emails>",
    "FE28": "<error: multiple emails>",
    # FE29 / FE30 are empty / whitespace and use the PASSTHROUGH default
    "FE31": "alice@example.com",
}

# No email row is currently marked xfail.
_EMAIL_XFAILS: dict[str, str] = {}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "inp,want",
    _params("26_format_emails.csv", _EMAIL_EXPECTED, _EMAIL_XFAILS),
)
def test_corpus_emails(inp, want):
    """Run every email corpus row with sentinel error reporting."""
    standardized, _changed = standardize_email(inp, error_policy="sentinel")
    _assert(standardized, want, inp)
|
||||||
|
|
||||||
|
|
||||||
|
# Expected output for the Gmail-shaped rows when gmail_canonical=True.
_EMAIL_GMAIL_CANONICAL: dict[str, str] = {
    "FE10": "alice@gmail.com",
    "FE11": "alice@gmail.com",
    "FE12": "alice@gmail.com",
    "FE13": "a.l.i.c.e@example.com",  # negative test: don't touch non-Gmail
    "FE14": "alice+newsletter@example.com",  # negative test
}


def _gmail_canonical_params():
    """Build one param per ``_EMAIL_GMAIL_CANONICAL`` entry.

    The corpus file is read once instead of once per case. A case id that
    is missing from the corpus fails at collection time with a
    ``KeyError`` naming it, rather than a bare ``StopIteration``.
    """
    inputs_by_id: dict[str, str] = {}
    for row in _load("26_format_emails.csv"):
        # First occurrence wins, matching a first-match search.
        inputs_by_id.setdefault(row["case_id"], row["input"])
    return [
        pytest.param(inputs_by_id[cid], want, id=f"{cid}-gmail-canonical")
        for cid, want in _EMAIL_GMAIL_CANONICAL.items()
    ]


@pytest.mark.parametrize("inp,want", _gmail_canonical_params())
def test_corpus_emails_gmail_canonical(inp, want):
    """Check the Gmail rows, and two non-Gmail negatives, with ``gmail_canonical=True``."""
    got, _ = standardize_email(inp, gmail_canonical=True)
    assert got == want
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Addresses — 27_format_addresses.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Expected output per case id for the address corpus. Case ids not listed
# fall back to PASSTHROUGH in _params.
#
# Annotated ``dict[str, object]`` like the other expectation tables:
# ``_params`` takes ``dict[str, object]`` and ``dict`` is invariant in its
# value type, so ``dict[str, str]`` does not type-check at the call site.
_ADDRESS_EXPECTED: dict[str, object] = {
    "FA01": "123 Main St, New York, NY 10001",
    "FA02": "123 Main St, New York, NY 10001",
    "FA03": "123 Main St, New York, NY 10001",
    "FA04": "123 Main St, New York, NY 10001",
    "FA05": "123 Main St, New York, NY 10001",
    "FA06": "456 Park Ave, New York, NY 10001",
    "FA07": "789 Sunset Blvd, Los Angeles, CA 90028",
    "FA08": "123 Main St, New York, NY 10001",
    "FA09": "123 N Main St, City, ST 12345",
    "FA10": "123 N Main St, City, ST 12345",
    "FA11": "123 NE Main St, City, ST 12345",
    "FA12": "123 Main St, Apt 4B, City, ST 12345",
    "FA13": "123 Main St, # 4B, City, ST 12345",
    "FA14": "123 Main St, Ste 200, City, ST 12345",
    "FA15": "123 Main St, New York, NY 10001",
    "FA16": "123 Main St, New York, NY 10001",
    "FA17": "123 Main St, New York, NY 10001-1234",
    "FA18": "123 Main St, Boston, MA 02101",
    "FA19": "123 Main St, Apt 4B, New York, NY 10001",
    "FA20": "PO Box 123, City, ST 12345",
    "FA21": "PO Box 123, City, ST 12345",
    "FA22": "PO Box 123, City, ST 12345",
    "FA23": "123A Main St, City, ST 12345",
    "FA24": "123-1 Main St, City, ST 12345",
    "FA25": "123 1/2 Main St, City, ST 12345",
    "FA26": "10 Downing Street, London, SW1A 2AA",
    "FA27": "1 Yonge St, Toronto, ON M5E 1W7",
    "FA28": "100-0001, Tokyo, Chiyoda, Marunouchi 1-1",
    "FA31": "123 Main St, New York, NY 10001",
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "inp,want",
    _params("27_format_addresses.csv", _ADDRESS_EXPECTED, {}),
)
def test_corpus_addresses(inp, want):
    """Run every address corpus row with abbreviation expansion turned off."""
    standardized, _changed = standardize_address(inp, expand=False)
    _assert(standardized, want, inp)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Names — 28_format_names.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Expected output per case id for the name corpus. Case ids not listed
# (FN33, FN34) fall back to PASSTHROUGH in _params.
_NAME_EXPECTED: dict[str, object] = {
    # Basic casing
    "FN01": "Alice Smith",
    "FN02": "Alice Smith",
    "FN03": "Alice Smith",
    "FN04": "aLiCe SmItH",  # corpus 7.3 conservative: preserve mixed
    # Mc / Mac prefixes
    "FN05": "McDonald",
    "FN06": "McDonald",
    "FN07": "MacDonald",
    "FN08": "McTaggart",
    # Apostrophes
    "FN09": "O'Connor",
    "FN10": "O'Connor",
    "FN11": "O'Brien",
    # Hyphenated names
    "FN12": "Mary-Jane Smith",
    "FN13": "Smith-Jones",
    # Lower-case particles
    "FN14": "von Trapp",
    "FN15": "Vincent van Gogh",
    "FN16": "Charles de Gaulle",
    "FN17": "Leonardo da Vinci",
    # Titles — corpus 7.3: drop the title period
    "FN18": "Mr John Smith",
    "FN19": "Dr Jane Doe",
    "FN20": "Prof Alice Williams",
    # Suffixes
    "FN21": "John Smith Jr",
    "FN22": "John Smith III",
    "FN23": "Jane Doe PhD",
    # Comma format ("Last, First") reversed
    "FN24": "John Smith",
    "FN25": "John Smith",
    "FN26": "John Andrew Smith",
    "FN27": "John A Smith",  # corpus 7.3: drop initial period
    "FN28": "J.K. Rowling",
    # Non-Latin scripts
    "FN29": "김철수",
    "FN30": "田中太郎",
    "FN31": "Иван Иванов",
    # Single-word name
    "FN32": "Madonna",
    # FN33 / FN34 use the PASSTHROUGH default
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "inp,want",
    _params("28_format_names.csv", _NAME_EXPECTED, {}),
)
def test_corpus_names(inp, want):
    """Run every name corpus row; only the mixed-case FN04 input is conservative."""
    # FN04 needs conservative=True; every other row uses the default
    # (aggressive) mode.
    use_conservative = (inp == "aLiCe SmItH")
    standardized, _changed = standardize_name(inp, conservative=use_conservative)
    _assert(standardized, want, inp)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Currencies — 29_format_currencies.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Expected output per case id for the currency corpus. Case ids not
# listed (FC24) fall back to PASSTHROUGH in _params.
_CURRENCY_EXPECTED: dict[str, object] = {
    # Formatting variants of one amount
    "FC01": "1234.56",
    "FC02": "1234.56",
    "FC03": "1234.56",
    "FC04": "1234.56",
    "FC05": "1234.56",
    "FC06": "1234.56",
    "FC07": "1234.56",
    "FC08": "1234.56",
    "FC09": "1234.56",
    "FC10": "1234.56",
    "FC11": "1234.56",
    "FC12": "1234.56",
    "FC13": "1234",
    "FC14": "123456.78",
    # Negative amounts
    "FC15": "-100",
    "FC16": "-100",
    "FC17": "-100",
    "FC18": "0",
    "FC19": "1500000",
    # Not normalizable → error sentinel
    "FC20": "<error: percentage not currency>",
    "FC21": "<error: range not normalizable>",
    "FC22": "<error: word value>",
    "FC23": "<error: word value>",
    # FC24 is empty and uses the PASSTHROUGH default
    "FC25": "1234.56",
    "FC26": "1234",
    "FC27": "<error: ambiguous separator, set --currency-locale>",
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "inp,want",
    _params("29_format_currencies.csv", _CURRENCY_EXPECTED, {}),
)
def test_corpus_currencies(inp, want):
    """Run every currency corpus row with sentinel error reporting."""
    standardized, _changed = standardize_currency(inp, error_policy="sentinel")
    _assert(standardized, want, inp)
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_currencies_eu_with_comma_decimal():
    """FC08 / FC10: EU-formatted amounts also parse under ``decimal="comma"``."""
    for eu_amount in ("€1.234,56", "1.234,56 EUR"):
        standardized, _changed = standardize_currency(eu_amount, decimal="comma")
        assert standardized == "1234.56"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Integration — 30_format_integration.csv
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _integration_opts(**overrides) -> StandardizeOptions:
    """Return the ``StandardizeOptions`` used by the integration tests.

    Maps the six integration columns to their field types, leaves currency
    decimals unset, disables address expansion, and uses the
    ``"passthrough"`` error policy for dates and phones. Each keyword in
    *overrides* is then set as an attribute on the options object.
    """
    field_types = {
        "name": FieldType.NAME,
        "email": FieldType.EMAIL,
        "phone": FieldType.PHONE,
        "date": FieldType.DATE,
        "amount": FieldType.CURRENCY,
        "address": FieldType.ADDRESS,
    }
    options = StandardizeOptions(
        column_types=field_types,
        currency_decimals=None,
        address_expand=False,
        date_error_policy="passthrough",
        phone_error_policy="passthrough",
    )
    for attr_name in overrides:
        setattr(options, attr_name, overrides[attr_name])
    return options
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_integration_pipeline_preserves_schema():
    """The pipeline keeps the row count, column set and column order."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df

    # Schema preservation (corpus § 0.2): no rows or columns added,
    # column order intact.
    assert list(cleaned.columns) == list(source.columns)
    assert len(cleaned) == len(source)
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_integration_FI01_messy_record():
    """Row 0 (FI01): a messy but cleanable record comes out fully canonical."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    record = standardize_dataframe(source, _integration_opts()).standardized_df.iloc[0]
    expected_by_column = {
        "name": "Alice Smith",
        "email": "alice@example.com",
        "phone": "+15551234567",
        "date": "2024-01-15",
        "amount": "1234.56",
        "address": "123 Main St, New York, NY 10001",
    }
    for column, expected in expected_by_column.items():
        assert record[column] == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_integration_FI04_all_empty_passthrough():
    """Row 3 (FI04): all-empty cells pass through as empty strings."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    record = standardize_dataframe(source, _integration_opts()).standardized_df.iloc[3]
    columns = ("name", "email", "phone", "date", "amount", "address")
    for col in columns:
        assert record[col] == "", f"FI04.{col} expected empty, got {record[col]!r}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_corpus_integration_FI05_idempotent_on_clean_input():
    """Row 4 (FI05): an already-clean record round-trips unchanged."""
    source = pd.read_csv(
        CORPUS / "30_format_integration.csv", dtype=str, keep_default_na=False,
    )
    cleaned = standardize_dataframe(source, _integration_opts()).standardized_df
    before, after = source.iloc[4], cleaned.iloc[4]
    columns = ("name", "email", "phone", "date", "amount", "address")
    for col in columns:
        assert after[col] == before[col], (
            f"FI05.{col} non-idempotent: {before[col]!r} -> {after[col]!r}"
        )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Idempotency property
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Every per-cell standardizer must satisfy ``f(f(x)) == f(x)`` (corpus
|
||||||
|
# § 1, "Idempotency requirement"). We exercise it across every corpus
|
||||||
|
# input under the same flag set the per-domain tests use.
|
||||||
|
|
||||||
|
def _idempotency_runner(fn, fixture, **kwargs):
    """Apply *fn* twice to every input in *fixture* and collect mismatches.

    *fn* is called as ``fn(value, **kwargs)`` and must return a 2-tuple
    whose first element is the standardized value. Returns a list of
    ``(case_id, input, first_pass, second_pass)`` tuples for the rows where
    the second pass differs from the first; the list is empty when *fn* is
    idempotent over the whole fixture.
    """
    mismatches = []
    for record in _load(fixture):
        raw = record["input"]
        first_pass, _ = fn(raw, **kwargs)
        second_pass, _ = fn(first_pass, **kwargs)
        if first_pass != second_pass:
            mismatches.append((record["case_id"], raw, first_pass, second_pass))
    return mismatches
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    "fn,fixture,kwargs",
    [
        (standardize_date, "24_format_dates.csv", {}),
        (standardize_phone, "25_format_phones.csv", {}),
        (standardize_address, "27_format_addresses.csv", {"expand": False}),
        (standardize_name, "28_format_names.csv", {}),
        (standardize_currency, "29_format_currencies.csv", {}),
        (standardize_email, "26_format_emails.csv", {}),
    ],
)
def test_corpus_idempotency(fn, fixture, kwargs):
    """Each standardizer satisfies ``f(f(x)) == f(x)`` on every corpus input."""
    mismatches = _idempotency_runner(fn, fixture, **kwargs)
    assert not mismatches, (
        f"non-idempotent transformations in {fixture}:\n"
        + "\n".join(
            f" {cid}: {inp!r} -> {once!r} -> {twice!r}"
            for cid, inp, once, twice in mismatches
        )
    )
|
||||||
@@ -261,3 +261,78 @@ class TestReadCsvRepaired:
|
|||||||
df, repair = read_csv_repaired(f)
|
df, repair = read_csv_repaired(f)
|
||||||
assert len(df) == 2
|
assert len(df) == 2
|
||||||
assert repair.changed is False
|
assert repair.changed is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Round-trip integrity (audit GAP-19, GAP-21)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRoundTrip:
    """``write_file`` followed by ``read_file`` returns the data that was written."""

    def test_csv_roundtrip_preserves_values(self, tmp_path):
        original = pd.DataFrame(
            {
                "id": ["1", "2", "3"],
                "name": ["Alice", "Bob", "Carol"],
                "amount": ["10.50", "20.25", "30.00"],
            }
        )
        target = tmp_path / "rt.csv"
        write_file(original, target)
        reloaded = read_file(target)
        assert list(reloaded.columns) == list(original.columns)
        assert len(reloaded) == len(original)
        for column in original.columns:
            assert list(reloaded[column]) == list(original[column])

    def test_tsv_roundtrip_via_extension(self, tmp_path):
        original = pd.DataFrame({"a": ["1", "2"], "b": ["x", "y, z"]})
        target = tmp_path / "rt.tsv"
        write_file(original, target)
        # The embedded comma in column 'b' must survive the round trip
        # through the .tsv file.
        reloaded = read_file(target)
        assert list(reloaded.columns) == ["a", "b"]
        assert reloaded.iloc[1]["b"] == "y, z"

    def test_semicolon_roundtrip_via_explicit_delimiter(self, tmp_path):
        original = pd.DataFrame({"a": ["1", "2"], "b": ["x", "y"]})
        target = tmp_path / "rt.csv"
        write_file(original, target, delimiter=";")
        reloaded = read_file(target)
        assert list(reloaded.columns) == ["a", "b"]
        assert reloaded.iloc[0]["a"] == "1"

    def test_utf8_bom_non_ascii_roundtrip(self, tmp_path):
        accented = ["café", "naïve", "résumé"]
        target = tmp_path / "utf8.csv"
        write_file(pd.DataFrame({"name": accented}), target)
        reloaded = read_file(target)
        assert list(reloaded["name"]) == ["café", "naïve", "résumé"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestExcelHeaderDetection:
    """``read_file`` finds the header row of an Excel sheet."""

    def test_excel_with_metadata_rows(self, tmp_path):
        from openpyxl import Workbook

        workbook = Workbook()
        sheet = workbook.active
        # One metadata row and one blank row precede the real header,
        # which is followed by two data rows.
        sheet_rows = [
            ["Report generated 2024-01-15", None, None],
            [None, None, None],
            ["name", "email", "phone"],
            ["alice", "a@x.com", "555-1234"],
            ["bob", "b@x.com", "555-5678"],
        ]
        for cells in sheet_rows:
            sheet.append(cells)
        target = tmp_path / "report.xlsx"
        workbook.save(target)
        frame = read_file(target)
        # Header auto-detected at row index 2 → columns are name/email/phone.
        assert list(frame.columns) == ["name", "email", "phone"]
        assert len(frame) == 2

    def test_excel_normal_header_row_zero(self, tmp_path):
        from openpyxl import Workbook

        workbook = Workbook()
        sheet = workbook.active
        # Header on the first row, followed by a single data row.
        for cells in (["name", "email"], ["alice", "a@x.com"]):
            sheet.append(cells)
        target = tmp_path / "normal.xlsx"
        workbook.save(target)
        frame = read_file(target)
        assert list(frame.columns) == ["name", "email"]
        assert len(frame) == 1
|
||||||
|
|||||||
@@ -156,3 +156,51 @@ class TestGetNormalizer:
|
|||||||
def test_unknown_raises(self):
    """An unrecognised normalizer name is rejected with ``ValueError``."""
    with pytest.raises(ValueError):
        get_normalizer("unknown_type")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Alignment with format_standardize: extension preservation, state codes,
|
||||||
|
# particle handling. See audit GAPs 15/16/17.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestNormalizerAudit:
    """Matching-key checks for phone extensions, state codes and name particles."""

    def test_phone_extension_preserved(self):
        """Numbers that differ only by extension yield different keys."""
        # Different extensions on one trunk number are different people at
        # the same business, so the keys must NOT collapse together.
        ext_100 = normalize_phone("+15551234567 ext 100")
        ext_200 = normalize_phone("+15551234567 ext 200")
        assert ext_100 != ext_200
        assert ext_100 == "+15551234567;ext=100"

    def test_phone_no_extension_unchanged(self):
        """A number without an extension is returned as given."""
        plain = "+15551234567"
        assert normalize_phone(plain) == "+15551234567"

    def test_address_state_name_to_code(self):
        """A spelled-out state and its two-letter code give one matching key."""
        spelled_out = normalize_address("123 Main St, Los Angeles, California 90001")
        abbreviated = normalize_address("123 Main St, Los Angeles, CA 90001")
        assert spelled_out == abbreviated

    def test_address_multiword_state_name(self):
        """Same equivalence as above, exercised with Massachusetts / MA."""
        spelled_out = normalize_address("100 Beacon St, Boston, Massachusetts 02101")
        abbreviated = normalize_address("100 Beacon St, Boston, MA 02101")
        assert spelled_out == abbreviated

    def test_address_does_not_butcher_city_named_after_state(self):
        """A city that shares a state's name is still folded in the matching key."""
        # Folding the city "New York" to "ny" is intentional for matching
        # keys: ``New York, NY`` and ``NY, NY`` should be the same record,
        # even though the standardizer (display) would keep the city name.
        # NOTE(review): the input also carries the state code "NY", so this
        # substring check may pass on that alone — confirm whether a
        # stronger assertion is wanted.
        key = normalize_address("123 Main St, New York, NY 10001")
        assert "ny" in key

    def test_name_particle_dropped(self):
        """The particle "de" does not contribute to the matching key."""
        with_particle = normalize_name("Charles de Gaulle")
        without_particle = normalize_name("Charles Gaulle")
        assert with_particle == without_particle

    def test_name_van_dropped(self):
        """The particle "van" does not contribute to the matching key."""
        with_particle = normalize_name("Vincent van Gogh")
        without_particle = normalize_name("Vincent Gogh")
        assert with_particle == without_particle

    def test_name_particle_idempotent(self):
        """Normalizing an already-normalized name changes nothing."""
        once = normalize_name("Vincent van Gogh")
        twice = normalize_name(once)
        assert twice == once
|
||||||
|
|||||||
@@ -537,8 +537,10 @@ class TestVisualizeHidden:
|
|||||||
|
|
||||||
def test_non_string_passthrough(self):
    """``None`` is handed back unchanged by both hidden-character visualizers."""
    from src.core.text_clean import visualize_hidden_html, visualize_hidden_text

    # Both functions now consistently pass non-strings through
    # unchanged (audit NIT-13). The superseded expectation that the HTML
    # variant returns "" for None contradicted this and is dropped.
    assert visualize_hidden_text(None) is None  # type: ignore[arg-type]
    assert visualize_hidden_html(None) is None  # type: ignore[arg-type]
|
||||||
def test_html_marks_leading_trailing_ascii_space(self):
|
def test_html_marks_leading_trailing_ascii_space(self):
|
||||||
from src.core.text_clean import visualize_hidden_html
|
from src.core.text_clean import visualize_hidden_html
|
||||||
out = visualize_hidden_html(" Alice ", mark_outer_whitespace=True)
|
out = visualize_hidden_html(" Alice ", mark_outer_whitespace=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user