diff --git a/src/core/analyze.py b/src/core/analyze.py
index c51a761..713957e 100644
--- a/src/core/analyze.py
+++ b/src/core/analyze.py
@@ -378,17 +378,25 @@ def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
 
-    A column is "date-shaped" if more than half its non-empty values
+    A column is "date-shaped" if at least 35% of its non-empty values
     match one of the recognized date regexes. If two or more distinct
-    formats each pass that majority threshold, emit a finding routed to
+    formats are present (each format counted with one or more matches)
+    AND the column is overall date-shaped, emit a finding routed to
     the format standardizer.
+
+    Earlier versions required two matches per format, which missed a
+    legitimate real-world case: a column with N different date formats
+    where each appears once — that's still inconsistent and worth
+    flagging. The overall date-shaped threshold (35% of non-empty
+    values) still limits false positives on free-text columns that
+    happen to contain a couple of date-like substrings.
     """
     findings: list[Finding] = []
     for col in df.columns:
         try:
             ser = df[col].dropna().astype(str)
         except (TypeError, ValueError) as e:
             # Some pandas extension dtypes (e.g., custom Decimal arrays)
             # can refuse string coercion. Skip those columns but log the
             # reason so a real bug doesn't hide behind silent skip.
             logger.debug(
                 "date-format detector: skipping {!r} ({}): {}",
                 col, type(e).__name__, e,
@@ -400,13 +408,13 @@ def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
         format_counts: dict[str, int] = {}
         for name, pat in _DATE_FORMAT_RE.items():
             count = int(nonempty.str.match(pat).sum())
-            if count >= 2:
+            if count >= 1:
                 format_counts[name] = count
         if len(format_counts) < 2:
             continue
-        # Require at least 50% of values to be date-shaped overall.
+        # Require at least 35% of values to be date-shaped overall.
         total_date_shaped = sum(format_counts.values())
-        if total_date_shaped < len(nonempty) * 0.5:
+        if total_date_shaped < len(nonempty) * 0.35:
             continue
         format_summary = ", ".join(
             f"{n}({c})" for n, c in sorted(
@@ -432,6 +440,209 @@ def _detect_inconsistent_date_format(df: pd.DataFrame) -> list[Finding]:
     return findings
 
 
+# Phone / currency / boolean format regexes used by the inconsistency
+# detectors below. Each map is name → regex pattern. Some patterns
+# overlap (``$12.50`` satisfies both ``us_with_symbol`` and
+# ``us_plain``), so the detectors credit every cell to the FIRST pattern
+# it matches, in declaration order: one cell counts toward only one
+# format. A detector fires when a single column has values credited to
+# ≥2 distinct format names.
+_PHONE_FORMAT_RE = {
+    "plain_10": re.compile(r"^\d{10}$"),
+    "us_paren": re.compile(r"^\(\d{3}\)\s*\d{3}[\s.\-]?\d{4}$"),
+    "us_dash": re.compile(r"^\d{3}-\d{3}-\d{4}$"),
+    "us_dot": re.compile(r"^\d{3}\.\d{3}\.\d{4}$"),
+    "us_space": re.compile(r"^\d{3}\s\d{3}\s\d{4}$"),
+    "plus_country": re.compile(r"^\+\d{1,3}[\s.\-]\d.*\d$"),
+    "intl_plus": re.compile(r"^\+\d{2,15}$"),
+    "country_prefix": re.compile(r"^1[\s.\-]\d{3}[\s.\-]\d{3}[\s.\-]\d{4}$"),
+    "extension": re.compile(r"^.*\b(ext|x)\.?\s*\d+$", re.IGNORECASE),
+}
+
+_CURRENCY_FORMAT_RE = {
+    "us_with_symbol": re.compile(r"^\$\s*\d{1,3}(,\d{3})*(\.\d{2})?$"),
+    "us_no_symbol": re.compile(r"^\d{1,3}(,\d{3})*\.\d{2}$"),
+    "us_plain": re.compile(r"^\$\d+\.\d+$"),  # $1234.56 — no thousands
+    "us_with_suffix": re.compile(r"^\d{1,3}(,\d{3})*\.\d{2}\s*(USD|EUR|GBP|CAD|AUD)$"),
+    "us_with_prefix": re.compile(r"^(USD|EUR|GBP|CAD|AUD)\s+\d{1,3}(,\d{3})*\.\d{2}$"),
+    "trailing_symbol": re.compile(r"^\d.*[\$€£¥]$"),  # 1234.56$
+    "eu_dot_comma": re.compile(r"^[€£¥₹]?\s*\d{1,3}([.\s]\d{3})*,\d{2}$"),
+    "eu_no_decimals": re.compile(r"^[€£¥₹]\s*\d{1,3}(,\d{3})*$"),  # ¥1,234
+    "in_lakh": re.compile(r"^[₹]\s*\d{1,2}(,\d{2})+(,\d{3})(\.\d{1,2})?$"),  # ₹1,23,456.78
+    "swiss_apostrophe": re.compile(r"^\d{1,3}('\d{3})+(\.\d{2})?$"),  # 1'234.56
+    "parens_negative": re.compile(r"^\(\s*[\$€£¥]?\d.*\)$"),
+    "negative_prefix": re.compile(r"^-[\$€£¥]?\d.*$"),
+    "negative_after": re.compile(r"^[\$€£¥]-\d.*$"),  # $-100.00
+}
+
+_BOOL_TRUE = {"yes", "y", "true", "t", "1"}
+_BOOL_FALSE = {"no", "n", "false", "f", "0"}
+
+# Minimum share of a column's non-empty cells that must match a
+# recognized pattern before the phone / currency detectors treat the
+# column as phone- or currency-shaped.
+_MIN_FORMAT_SHAPED_FRACTION = 0.35
+
+
+def _count_exclusive_formats(
+    values: pd.Series, patterns: dict[str, re.Pattern[str]]
+) -> dict[str, int]:
+    """Count cells per format, crediting each cell to one format only.
+
+    Each value is tested against ``patterns`` in declaration order and
+    counted under the first name whose regex matches; values matching
+    no pattern are ignored. Counting this way keeps the per-format
+    totals from double-counting a cell that satisfies several
+    overlapping patterns, so their sum never exceeds ``len(values)``.
+    """
+    counts: dict[str, int] = {}
+    for value in values:
+        for name, pat in patterns.items():
+            if pat.match(value):
+                counts[name] = counts.get(name, 0) + 1
+                break
+    return counts
+
+
+def _detect_inconsistent_regex_format(
+    df: pd.DataFrame,
+    *,
+    patterns: dict[str, re.Pattern[str]],
+    finding_id: str,
+    noun: str,
+) -> list[Finding]:
+    """Shared engine for the phone and currency inconsistency detectors.
+
+    For every column: stringify the non-null cells, drop blank ones, and
+    skip the column when fewer than 4 remain. The column is reported
+    when its cells fall into ≥2 distinct formats from ``patterns`` and
+    the matched cells make up at least ``_MIN_FORMAT_SHAPED_FRACTION``
+    of the non-empty cells.
+
+    ``finding_id`` becomes the finding's ``id``; ``noun`` is the phrase
+    used in the description (e.g. ``"phone numbers"``).
+    """
+    findings: list[Finding] = []
+    for col in df.columns:
+        try:
+            ser = df[col].dropna().astype(str)
+        except (TypeError, ValueError) as e:
+            # Mirror the date detector: skip columns that refuse string
+            # coercion, but log why so a real bug is not hidden.
+            logger.debug(
+                "{} detector: skipping {!r} ({}): {}",
+                finding_id, col, type(e).__name__, e,
+            )
+            continue
+        nonempty = ser[ser.str.strip().astype(bool)]
+        if len(nonempty) < 4:
+            continue
+        format_counts = _count_exclusive_formats(nonempty, patterns)
+        if len(format_counts) < 2:
+            continue
+        total = sum(format_counts.values())
+        if total < len(nonempty) * _MIN_FORMAT_SHAPED_FRACTION:
+            continue
+        summary = ", ".join(
+            f"{n}({c})" for n, c in sorted(
+                format_counts.items(), key=lambda kv: -kv[1]
+            )
+        )
+        samples = [
+            (int(i), str(col), str(v)) for i, v in nonempty.head(5).items()
+        ]
+        findings.append(Finding(
+            id=finding_id,
+            severity="info",
+            tool=TOOL_FORMAT_STANDARDIZER,
+            count=int(total),
+            description=(
+                f"Column '{col}' contains {noun} in multiple "
+                f"formats: {summary}. Run format standardizer to normalize."
+            ),
+            column=str(col),
+            samples=samples,
+            confidence="medium",
+            fix_action=FIX_NONE,
+        ))
+    return findings
+
+
+def _detect_inconsistent_phone_format(df: pd.DataFrame) -> list[Finding]:
+    """Flag phone-shaped columns that mix ≥2 distinct phone formats."""
+    return _detect_inconsistent_regex_format(
+        df,
+        patterns=_PHONE_FORMAT_RE,
+        finding_id="inconsistent_phone_format",
+        noun="phone numbers",
+    )
+
+
+def _detect_inconsistent_currency_format(df: pd.DataFrame) -> list[Finding]:
+    """Flag currency-shaped columns that mix ≥2 distinct formats."""
+    return _detect_inconsistent_regex_format(
+        df,
+        patterns=_CURRENCY_FORMAT_RE,
+        finding_id="inconsistent_currency_format",
+        noun="currency values",
+    )
+
+
+def _detect_inconsistent_boolean_format(df: pd.DataFrame) -> list[Finding]:
+    """Fire when a boolean-valued column mixes representations
+    (e.g. ``Yes`` / ``Y`` / ``true`` / ``1`` in the same column).
+
+    A column is boolean-valued when at least 80% of its non-empty cells
+    are, ignoring case and surrounding whitespace, one of the tokens in
+    ``_BOOL_TRUE`` / ``_BOOL_FALSE``. It is reported when those cells
+    use 3 or more distinct tokens.
+    """
+    findings: list[Finding] = []
+    bool_tokens = _BOOL_TRUE | _BOOL_FALSE
+    for col in df.columns:
+        try:
+            ser = df[col].dropna().astype(str)
+        except (TypeError, ValueError) as e:
+            logger.debug(
+                "inconsistent_boolean_format detector: skipping {!r} ({}): {}",
+                col, type(e).__name__, e,
+            )
+            continue
+        nonempty = ser[ser.str.strip().astype(bool)]
+        if len(nonempty) < 4:
+            continue
+        lowered = nonempty.str.strip().str.lower()
+        bool_mask = lowered.isin(bool_tokens)
+        bool_count = int(bool_mask.sum())
+        if bool_count < len(nonempty) * 0.8:
+            continue
+        # Distinct underlying tokens — case-insensitive count of the
+        # different surface forms used in the column.
+        distinct = set(lowered[bool_mask].unique())
+        if len(distinct) < 3:
+            # 2 distinct tokens is the normal yes/no shape; only flag
+            # when there are at least 3 distinct surface forms.
+            continue
+        summary = ", ".join(sorted(distinct))
+        samples_idx = nonempty.head(5)
+        samples = [(int(i), str(col), str(v)) for i, v in samples_idx.items()]
+        findings.append(Finding(
+            id="inconsistent_boolean_format",
+            severity="info",
+            tool=TOOL_FORMAT_STANDARDIZER,
+            count=bool_count,
+            description=(
+                f"Column '{col}' uses mixed boolean representations: "
+                f"{summary}. Run format standardizer to normalize."
+            ),
+            column=str(col),
+            samples=samples,
+            confidence="medium",
+            fix_action=FIX_NONE,
+        ))
+    return findings
+
+
 def _detect_mixed_case_email(df: pd.DataFrame) -> list[Finding]:
     findings: list[Finding] = []
     for col in df.columns:
@@ -929,6 +1140,9 @@ def analyze(
     findings.extend(_detect_mojibake(df))
     findings.extend(_detect_mixed_case_email(df))
     findings.extend(_detect_inconsistent_date_format(df))
+    findings.extend(_detect_inconsistent_phone_format(df))
+    findings.extend(_detect_inconsistent_currency_format(df))
+    findings.extend(_detect_inconsistent_boolean_format(df))
     findings.extend(_detect_leading_zero_ids(df))
     findings.extend(_detect_near_duplicates(df))
     return findings
diff --git a/src/gui/components/_legacy.py b/src/gui/components/_legacy.py
index e31f483..246e850 100644
--- a/src/gui/components/_legacy.py
+++ b/src/gui/components/_legacy.py
@@ -1406,10 +1406,18 @@ def render_findings_panel(findings, *, header: str | None = None) -> None:
                 _render_one_finding(f)
             page_slug = _tool_page_slug(tool_id)
             if page_slug:
-                # Streamlit resolves page paths relative to the entrypoint
-                # (src/gui/app.py), so a leading ``src/gui/`` would point
-                # outside the allowed page tree on Windows.
-                st.page_link(page_slug, label=_t("findings.open_tool", tool=name))
+                # Render as a primary (red) ``st.button`` rather than the
+                # subtle ``st.page_link`` we used before — the previous
+                # rendering blended into the page, making the per-tool
+                # jump non-obvious. The button triggers ``st.switch_page``
+                # so navigation is still a soft switch (no full reload).
+                if st.button(
+                    _t("findings.open_tool", tool=name),
+                    key=f"_findings_open_{tool_id}",
+                    type="primary",
+                    use_container_width=False,
+                ):
+                    st.switch_page(page_slug)
 
     if untargeted:
         with st.expander(