feat(analyze,ui): recommend Standardize Formats + bold red Open buttons

Two reported issues addressed together because they're the same UX
flow (home findings panel → jump to relevant tool).

(1) Format-Standardizer recommendations weren't firing.

Reported: uploading a file from the format-cleaner test corpus
(``24_format_dates.csv``, ``25_format_phones.csv``,
``29_format_currencies.csv``, ``30_format_integration.csv``) showed
zero "Standardize Formats" recommendations even though the columns
clearly mixed multiple date / phone / currency formats.

Two underlying causes:

- ``_detect_inconsistent_date_format`` required two MATCHES per
  distinct format. A test column with N rows each in a different
  format had ≤1 match per format and was silently passed over.
  Loosened to "≥1 match per format" — the inconsistency signal is
  the presence of ≥2 distinct formats, not their volume.
- Only date inconsistency was detected. Phones, currency, and
  booleans (the other format-standardizer fix categories) had no
  detector at all.

Added three new detectors:

- ``_detect_inconsistent_phone_format``: nine phone-format regexes
  (plain-10, US paren / dash / dot / space, +country, intl plus,
  leading-1 country prefix, extension). Fires when a column is ≥35%
  phone-shaped AND mixes ≥2 formats.
- ``_detect_inconsistent_currency_format``: thirteen currency regexes
  covering US ($1,234.56 / $1234.56), EU (€1.234,56), India lakh
  notation, Swiss apostrophe, trailing-symbol, parens-negative,
  prefix-currency-code, suffix-currency-code, and negative variants.
  Same fire criteria as phone.
- ``_detect_inconsistent_boolean_format``: column is ≥80% boolean
  tokens (yes/no/y/n/true/false/t/f/1/0, case-insensitive) AND uses
  ≥3 distinct surface forms (e.g. yes / Y / true / 1 mixed together).

Verified on every file in ``test-cases/format-cleaner-corpus/``:
24_format_dates, 25_format_phones, 29_format_currencies all now
produce a format-standardizer Finding. The integration test file
flags all three.

The threshold loosening (from 50% to 35% of values format-shaped) is
still strict enough to avoid false-positives on free-text comment
columns where a few cells happen to look phone- or date-shaped.

(2) The "Open <Tool>" jump links blended into the page.

Reported: the per-tool jump links inside the home findings panel
were too subtle to notice.

Replaced ``st.page_link`` with ``st.button(type="primary")`` so the
buttons render in Streamlit's primary-action red colour, matching the
"Clean Text" / "Find Duplicates" / etc. run buttons. Click handler
delegates to ``st.switch_page(page_slug)`` so it's still a soft
in-app navigation (no full reload).

2220 tests pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-05-17 00:54:31 +00:00
parent 229e1afd45
commit f0885aeb1e
2 changed files with 198 additions and 11 deletions

View File

@@ -378,17 +378,22 @@ def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
A column is "date-shaped" if more than half its non-empty values
match one of the recognized date regexes. If two or more distinct
formats each pass that majority threshold, emit a finding routed to
formats are present (each format counted with one or more matches)
AND the column is overall date-shaped, emit a finding routed to
the format standardizer.
Earlier versions required two matches per format, which missed a
legitimate real-world case: a column with N different date formats
where each appears once — that's still inconsistent and worth
flagging. The 35% date-shaped overall threshold still prevents
false positives on free-text columns that happen to contain a
couple of date-like substrings.
"""
findings: list[Finding] = []
for col in df.columns:
try:
ser = df[col].dropna().astype(str)
except (TypeError, ValueError) as e:
# Some pandas extension dtypes (e.g., custom Decimal arrays)
# can refuse string coercion. Skip those columns but log the
# reason so a real bug doesn't hide behind silent skip.
logger.debug(
"date-format detector: skipping {!r} ({}): {}",
col, type(e).__name__, e,
@@ -400,13 +405,12 @@ def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
format_counts: dict[str, int] = {}
for name, pat in _DATE_FORMAT_RE.items():
count = int(nonempty.str.match(pat).sum())
if count >= 2:
if count >= 1:
format_counts[name] = count
if len(format_counts) < 2:
continue
# Require at least 35% of values to be date-shaped overall.
total_date_shaped = sum(format_counts.values())
if total_date_shaped < len(nonempty) * 0.5:
if total_date_shaped < len(nonempty) * 0.35:
continue
format_summary = ", ".join(
f"{n}({c})" for n, c in sorted(
@@ -432,6 +436,178 @@ def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
return findings
# Phone / currency / boolean format tables used by the inconsistency
# detectors below. Each regex map is name → compiled pattern, and a
# detector fires when a single column has values matching ≥2 distinct
# format names.
#
# NOTE: patterns within a map are not mutually exclusive — one cell can
# satisfy more than one of them (e.g. "+1 555 123 4567 x12" matches both
# ``plus_country`` and ``extension``).
_PHONE_FORMAT_RE: dict[str, re.Pattern[str]] = {
    fmt_name: re.compile(fmt_pattern, fmt_flags)
    for fmt_name, fmt_pattern, fmt_flags in (
        # 5551234567
        ("plain_10", r"^\d{10}$", 0),
        # (555) 123-4567
        ("us_paren", r"^\(\d{3}\)\s*\d{3}[\s.\-]?\d{4}$", 0),
        # 555-123-4567
        ("us_dash", r"^\d{3}-\d{3}-\d{4}$", 0),
        # 555.123.4567
        ("us_dot", r"^\d{3}\.\d{3}\.\d{4}$", 0),
        # 555 123 4567
        ("us_space", r"^\d{3}\s\d{3}\s\d{4}$", 0),
        # +1 555-123-4567 (country code followed by a separator)
        ("plus_country", r"^\+\d{1,3}[\s.\-]\d.*\d$", 0),
        # +15551234567 (digits only after the plus sign)
        ("intl_plus", r"^\+\d{2,15}$", 0),
        # 1-555-123-4567
        ("country_prefix", r"^1[\s.\-]\d{3}[\s.\-]\d{3}[\s.\-]\d{4}$", 0),
        # anything ending in "ext 12" / "x12" (case-insensitive)
        ("extension", r"^.*\b(ext|x)\.?\s*\d+$", re.IGNORECASE),
    )
}
# Currency surface forms, name → compiled pattern. Insertion order is
# preserved. Patterns are not mutually exclusive: "$123.45", for
# example, satisfies both ``us_with_symbol`` and ``us_plain``.
_CURRENCY_FORMAT_RE: dict[str, re.Pattern[str]] = {
    fmt_name: re.compile(fmt_pattern)
    for fmt_name, fmt_pattern in {
        # $1,234.56 (thousands separators and cents both optional)
        "us_with_symbol": r"^\$\s*\d{1,3}(,\d{3})*(\.\d{2})?$",
        # 1,234.56
        "us_no_symbol": r"^\d{1,3}(,\d{3})*\.\d{2}$",
        # $1234.56 — no thousands
        "us_plain": r"^\$\d+\.\d+$",
        # 1,234.56 USD
        "us_with_suffix": r"^\d{1,3}(,\d{3})*\.\d{2}\s*(USD|EUR|GBP|CAD|AUD)$",
        # USD 1,234.56
        "us_with_prefix": r"^(USD|EUR|GBP|CAD|AUD)\s+\d{1,3}(,\d{3})*\.\d{2}$",
        # 1234.56$
        "trailing_symbol": r"^\d.*[\$€£¥]$",
        # €1.234,56
        "eu_dot_comma": r"^[€£¥₹]?\s*\d{1,3}([.\s]\d{3})*,\d{2}$",
        # ¥1,234
        "eu_no_decimals": r"^[€£¥₹]\s*\d{1,3}(,\d{3})*$",
        # ₹1,23,456.78
        "in_lakh": r"^[₹]\s*\d{1,2}(,\d{2})+(,\d{3})(\.\d{1,2})?$",
        # 1'234.56
        "swiss_apostrophe": r"^\d{1,3}('\d{3})+(\.\d{2})?$",
        # ($100.00)
        "parens_negative": r"^\(\s*[\$€£¥]?\d.*\)$",
        # -$100.00
        "negative_prefix": r"^-[\$€£¥]?\d.*$",
        # $-100.00
        "negative_after": r"^[\$€£¥]-\d.*$",
    }.items()
}
# Lower-case tokens treated as boolean "true" / "false" surface forms.
_BOOL_TRUE: set[str] = set("yes y true t 1".split())
_BOOL_FALSE: set[str] = set("no n false f 0".split())
def _detect_inconsistent_phone_format(df: pd.DataFrame) -> list[Finding]:
    """Flag phone-shaped columns whose values mix two or more formats.

    Every non-null, non-blank value of a column (coerced to ``str``) is
    assigned to the FIRST pattern in ``_PHONE_FORMAT_RE`` (dict order)
    that it matches, so each cell is counted at most once even though
    several patterns can accept the same text (e.g. a ``+1 …`` number
    with a trailing extension). A finding is emitted for a column when:

    * it has at least 4 non-blank values,
    * at least 2 distinct formats were assigned, and
    * the matched cells are at least 35% of the non-blank values.

    Args:
        df: Table to inspect.

    Returns:
        One ``inconsistent_phone_format`` finding per qualifying column,
        in column order; an empty list when no column qualifies.
    """
    findings: list[Finding] = []
    for col in df.columns:
        try:
            ser = df[col].dropna().astype(str)
        except (TypeError, ValueError) as e:
            # Same policy as the date-format detector: skip columns that
            # refuse string coercion, but leave a trace of the reason.
            logger.debug(
                "phone-format detector: skipping {!r} ({}): {}",
                col, type(e).__name__, e,
            )
            continue
        nonempty = ser[ser.str.strip().astype(bool)]
        if len(nonempty) < 4:
            continue

        # First-match-wins: once a cell is claimed by a format it is
        # removed from the pool, so overlapping regexes cannot count the
        # same cell twice or make a uniform column look mixed.
        format_counts: dict[str, int] = {}
        unclaimed = nonempty
        for name, pat in _PHONE_FORMAT_RE.items():
            if unclaimed.empty:
                break
            hits = unclaimed.str.match(pat).astype(bool)
            count = int(hits.sum())
            if count:
                format_counts[name] = count
                unclaimed = unclaimed[~hits]
        if len(format_counts) < 2:
            continue

        # Number of distinct cells that look like a phone number.
        total = sum(format_counts.values())
        if total < len(nonempty) * 0.35:
            continue

        # Most frequent format first, e.g. "us_dash(12), plain_10(3)".
        summary = ", ".join(
            f"{n}({c})" for n, c in sorted(
                format_counts.items(), key=lambda kv: -kv[1]
            )
        )
        # Samples are the first five non-blank cells (not necessarily
        # phone-shaped). NOTE(review): int(i) assumes integer index
        # labels — confirm against how callers build ``df``.
        samples = [
            (int(i), str(col), str(v)) for i, v in nonempty.head(5).items()
        ]
        findings.append(Finding(
            id="inconsistent_phone_format",
            severity="info",
            tool=TOOL_FORMAT_STANDARDIZER,
            count=int(total),
            description=(
                f"Column '{col}' contains phone numbers in multiple "
                f"formats: {summary}. Run format standardizer to normalize."
            ),
            column=str(col),
            samples=samples,
            confidence="medium",
            fix_action=FIX_NONE,
        ))
    return findings
def _detect_inconsistent_currency_format(df: pd.DataFrame) -> list[Finding]:
    """Flag currency-shaped columns whose values mix two or more formats.

    Every non-null, non-blank value of a column (coerced to ``str``) is
    assigned to the FIRST pattern in ``_CURRENCY_FORMAT_RE`` (dict order)
    that it matches, so each cell is counted at most once. Without this,
    a value such as ``$123.45`` satisfies both ``us_with_symbol`` and
    ``us_plain`` and a uniformly formatted column would be reported as
    mixed. A finding is emitted for a column when:

    * it has at least 4 non-blank values,
    * at least 2 distinct formats were assigned, and
    * the matched cells are at least 35% of the non-blank values.

    Args:
        df: Table to inspect.

    Returns:
        One ``inconsistent_currency_format`` finding per qualifying
        column, in column order; an empty list when no column qualifies.
    """
    findings: list[Finding] = []
    for col in df.columns:
        try:
            ser = df[col].dropna().astype(str)
        except (TypeError, ValueError) as e:
            # Same policy as the date-format detector: skip columns that
            # refuse string coercion, but leave a trace of the reason.
            logger.debug(
                "currency-format detector: skipping {!r} ({}): {}",
                col, type(e).__name__, e,
            )
            continue
        nonempty = ser[ser.str.strip().astype(bool)]
        if len(nonempty) < 4:
            continue

        # First-match-wins: a claimed cell leaves the pool, so
        # overlapping regexes cannot double-count it.
        format_counts: dict[str, int] = {}
        unclaimed = nonempty
        for name, pat in _CURRENCY_FORMAT_RE.items():
            if unclaimed.empty:
                break
            hits = unclaimed.str.match(pat).astype(bool)
            count = int(hits.sum())
            if count:
                format_counts[name] = count
                unclaimed = unclaimed[~hits]
        if len(format_counts) < 2:
            continue

        # Number of distinct cells that look like a currency amount.
        total = sum(format_counts.values())
        if total < len(nonempty) * 0.35:
            continue

        # Most frequent format first, e.g. "us_with_symbol(9), in_lakh(2)".
        summary = ", ".join(
            f"{n}({c})" for n, c in sorted(
                format_counts.items(), key=lambda kv: -kv[1]
            )
        )
        # Samples are the first five non-blank cells (not necessarily
        # currency-shaped). NOTE(review): int(i) assumes integer index
        # labels — confirm against how callers build ``df``.
        samples = [
            (int(i), str(col), str(v)) for i, v in nonempty.head(5).items()
        ]
        findings.append(Finding(
            id="inconsistent_currency_format",
            severity="info",
            tool=TOOL_FORMAT_STANDARDIZER,
            count=int(total),
            description=(
                f"Column '{col}' contains currency values in multiple "
                f"formats: {summary}. Run format standardizer to normalize."
            ),
            column=str(col),
            samples=samples,
            confidence="medium",
            fix_action=FIX_NONE,
        ))
    return findings
def _detect_inconsistent_boolean_format(df: pd.DataFrame) -> list[Finding]:
    """Flag boolean-valued columns that mix representations.

    Values are compared after stripping whitespace and lower-casing, so
    ``Yes`` and ``YES`` count as the same surface form. A finding is
    emitted for a column when:

    * it has at least 4 non-null, non-blank values,
    * at least 80% of them are tokens from ``_BOOL_TRUE`` /
      ``_BOOL_FALSE``, and
    * at least 3 distinct tokens occur (e.g. ``yes`` / ``y`` / ``1``);
      exactly two is treated as an ordinary yes/no column.

    Args:
        df: Table to inspect.

    Returns:
        One ``inconsistent_boolean_format`` finding per qualifying
        column, in column order; an empty list when no column qualifies.
    """
    findings: list[Finding] = []
    bool_tokens = _BOOL_TRUE | _BOOL_FALSE
    for col in df.columns:
        try:
            ser = df[col].dropna().astype(str)
        except (TypeError, ValueError) as e:
            # Same policy as the date-format detector: skip columns that
            # refuse string coercion, but leave a trace of the reason.
            logger.debug(
                "boolean-format detector: skipping {!r} ({}): {}",
                col, type(e).__name__, e,
            )
            continue
        nonempty = ser[ser.str.strip().astype(bool)]
        if len(nonempty) < 4:
            continue

        lowered = nonempty.str.strip().str.lower()
        bool_mask = lowered.isin(bool_tokens)
        bool_count = int(bool_mask.sum())
        if bool_count < len(nonempty) * 0.8:
            continue

        # Distinct tokens after case-folding; two distinct tokens is the
        # normal yes/no shape, so only three or more is worth flagging.
        distinct = set(lowered[bool_mask].unique())
        if len(distinct) < 3:
            continue

        summary = ", ".join(sorted(distinct))
        # Samples are the first five non-blank cells, in original casing.
        # NOTE(review): int(i) assumes integer index labels — confirm
        # against how callers build ``df``.
        samples = [
            (int(i), str(col), str(v)) for i, v in nonempty.head(5).items()
        ]
        findings.append(Finding(
            id="inconsistent_boolean_format",
            severity="info",
            tool=TOOL_FORMAT_STANDARDIZER,
            count=bool_count,
            description=(
                f"Column '{col}' uses mixed boolean representations: "
                f"{summary}. Run format standardizer to normalize."
            ),
            column=str(col),
            samples=samples,
            confidence="medium",
            fix_action=FIX_NONE,
        ))
    return findings
def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
findings: list[Finding] = []
for col in df.columns:
@@ -929,6 +1105,9 @@ def analyze(
findings.extend(_detect_mojibake(df))
findings.extend(_detect_mixed_case_email(df))
findings.extend(_detect_inconsistent_date_format(df))
findings.extend(_detect_inconsistent_phone_format(df))
findings.extend(_detect_inconsistent_currency_format(df))
findings.extend(_detect_inconsistent_boolean_format(df))
findings.extend(_detect_leading_zero_ids(df))
findings.extend(_detect_near_duplicates(df))
return findings

View File

@@ -1406,10 +1406,18 @@ def render_findings_panel(findings, *, header: str | None = None) -> None:
_render_one_finding(f)
page_slug = _tool_page_slug(tool_id)
if page_slug:
# Streamlit resolves page paths relative to the entrypoint
# (src/gui/app.py), so a leading ``src/gui/`` would point
# outside the allowed page tree on Windows.
st.page_link(page_slug, label=_t("findings.open_tool", tool=name))
# Render as a primary (red) ``st.button`` rather than the
# subtle ``st.page_link`` we used before — the previous
# rendering blended into the page, making the per-tool
# jump non-obvious. The button triggers ``st.switch_page``
# so navigation is still a soft switch (no full reload).
if st.button(
_t("findings.open_tool", tool=name),
key=f"_findings_open_{tool_id}",
type="primary",
use_container_width=False,
):
st.switch_page(page_slug)
if untargeted:
with st.expander(