feat: 3 new tools, format streaming, distribution-ready demo + landing pages

Tools shipped this batch (4 → 6 of 9 Ready): 04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI 05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI 09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI with soft tool-dependency graph (recommended, not enforced) and JSON save/load for repeatable weekly cleanups. Format Standardizer reworked for 1 GB international files: • Vectorised dispatch + LRU cache over phone/date/currency/boolean/email • Per-row country / address columns drive parsing • Audit cap (default 10 k rows, ~50 MB RAM) • standardize_file(): chunked streaming entry point (~165 k rows/sec) • currency_decimal="auto" for EU comma-decimal locales • R$ / kr / zł multi-char currency prefixes • cli_format.py with auto-stream above 100 MB inputs Encoding detection arbiter + language-aware probe: Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM) via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes. Distribution-readiness assets: • streamlit_app.py — Streamlit Community Cloud entry shim • src/gui/app_demo.py — single-page demo, ?p=<persona> routing, 100-row cap + watermark, free-vs-paid boundary enforced at surface • samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs • landing/ — 4 static HTML pages (apex chooser + 3 niche), shared CSS, deploy.py URL-substitution script, auto-generated robots.txt + sitemap.xml + 404.html + favicon • docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md — full strategy + measurement + deployment + master checklist Test counts: before: 1,520 passed · 4 skipped · 17 xfailed after: 1,729 passed · 0 skipped · 0 xfailed Tier-1 corpora added: • missing-corpus 3 use cases + 16 edge cases • column-mapper-corpus 3 use cases + 5 edge cases • format-cleaner intl 20-row 13-country stress fixture Engine hardening flushed out by the corpora: • interpolate guards against object-dtype columns • mean/median skip all-NaN columns 
(silences numpy warning) • fillna runs under future.no_silent_downcasting (silences pandas warning) • mojibake test no longer skips when ftfy installed (monkeypatch path) • drop-row threshold semantics: strict-greater (consistent across rows / cols) • currency_decimal validator allow-set updated for "auto" Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 22:31:26 +00:00
parent d18b95880d
commit 966af8ef94
89 changed files with 12039 additions and 284 deletions
--- a/src/core/io.py
+++ b/src/core/io.py
@@ -18,6 +18,207 @@ from loguru import logger
 # Encoding detection
 # ---------------------------------------------------------------------------

+# charset-normalizer often picks an Eastern-European or Vietnamese code
+# page (cp1250, cp1258) for byte-equivalent Western content, mac_iceland
+# over mac_roman in the Mac family, and shift_jis_2004 for short Cyrillic
+# samples. The arbiter below resolves these specific false positives
+# without overruling the detector when its top pick is genuinely the
+# right answer.
+#
+# Mapping is *over-picked encoding* → *more plausible substitutes (in
+# priority order)*. A substitute matches a candidate via either its
+# primary encoding name or any of its ``could_be_from_charset`` aliases.
+_ENCODING_FALLBACKS: dict[str, tuple[str, ...]] = {
+    "cp1250":         ("cp1252", "latin_1", "iso8859_15", "iso8859_2"),
+    "cp1258":         ("iso8859_2", "cp1250", "cp1252"),
+    "mac_iceland":    ("mac_roman",),
+    "shift_jis_2004": ("koi8_r", "cp1251", "cp1252", "iso8859_2"),
+    "shift_jisx0213": ("koi8_r", "cp1251", "cp1252", "iso8859_2"),
+}
+
+
+def _arbitrate_charset_match(matches) -> Optional[str]:
+    """Pick the most plausible encoding from a charset-normalizer match list.
+
+    Returns None for an empty list, and the top match's lowercased
+    encoding when it has no entry in ``_ENCODING_FALLBACKS``. Otherwise
+    two signals, checked in this order, can replace an over-picked top:
+
+    * Tie: the run of matches directly below the top whose *chaos* and
+      *coherence* both equal the top's. The first fallback (in priority
+      order) found among a tied match's encoding name or its
+      ``could_be_from_charset`` aliases is returned.
+    * Alias: failing that, the first fallback listed in the top match's
+      own ``could_be_from_charset`` aliases is returned, unless
+      ``_same_byte_family`` pairs it with the top encoding.
+
+    When neither signal fires (real cp1250 / cp1258 content where
+    charset-normalizer is genuinely confident), the top pick is
+    returned unchanged.
+    """
+    ranked = list(matches)
+    if not ranked:
+        return None
+    top = ranked[0]
+    top_enc = top.encoding.lower()
+    fallbacks = _ENCODING_FALLBACKS.get(top_enc)
+    if not fallbacks:
+        return top_enc
+
+    # The decisive signal: a lower-ranked candidate that ties the top
+    # pick on both chaos and coherence is treated as an equally good
+    # decoding, so the choice between them comes down to preference. When
+    # one of those tied candidates is a preferred substitute, return it.
+    # We walk the fallbacks in priority order so the earliest-listed
+    # alternative wins (for cp1250: cp1252, latin_1, iso8859_15, iso8859_2).
+    #
+    # When no tied candidate matches, we fall through to the alias check
+    # below; only if that also fails is the top pick left alone — the
+    # "real cp1250 / cp1258 content" path.
+    top_chaos = getattr(top, "chaos", None)
+    top_coherence = getattr(top, "coherence", None)
+    tied: list = []
+    for m in ranked[1:]:
+        if m.chaos != top_chaos or m.coherence != top_coherence:
+            break  # assumes matches arrive sorted best-first, so ties are adjacent
+        tied.append(m)
+
+    if tied:
+        for preferred in fallbacks:
+            for m in tied:
+                candidates = {
+                    m.encoding.lower(),
+                    *(a.lower() for a in m.could_be_from_charset),
+                }
+                if preferred in candidates:
+                    return preferred
+
+    # No tied alternative — but charset-normalizer occasionally folds
+    # the more popular Western alias into the *top pick's own* alias
+    # list (cp1250 with cp1252 listed alongside). When that happens,
+    # prefer the canonical Western form.
+    top_aliases = {a.lower() for a in top.could_be_from_charset}
+    for preferred in fallbacks:
+        # Only honour an in-alias swap if the pair is not registered in
+        # _SAME_FAMILY (cp1252 swap from cp1250 is legitimate; iso8859_2
+        # swap from cp1250 is not — they differ bytewise on accented
+        # Eastern letters).
+        if preferred in top_aliases and not _same_byte_family(top_enc, preferred):
+            return preferred
+
+    return top_enc
+
+
+# ---------------------------------------------------------------------------
+# Language-aware probe: distinguish KOI8-R from Shift_JIS, ISO-8859-2 from
+# cp1258 when charset-normalizer cannot.
+# ---------------------------------------------------------------------------
+
+# Unicode range / letter set used to recognise each language family. A
+# probe encoding is scored by the *coverage ratio* of its decoding of the
+# raw bytes: non-ASCII characters in the target range or set divided by
+# all non-ASCII characters (letters or not).
+_CYRILLIC_RANGE = (0x0400, 0x04FF)
+_EE_LATIN_LETTERS = frozenset(
+    "ąćęłńóśźżĄĆĘŁŃÓŚŹŻ"          # Polish
+    "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"   # Czech
+    "áéíóöőúüűÁÉÍÓÖŐÚÜŰ"         # Hungarian
+    "äčďéíĺľňóôŕšťúýžÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ"  # Slovak
+)
+
+# Encodings to probe when charset-normalizer fingerprints the file as
+# Japanese (a frequent misfire on short Cyrillic samples) or as
+# cp1258 / iso8859_16 (tried against the EE-Latin probes instead).
+_CYRILLIC_PROBES: tuple[str, ...] = ("koi8_r", "cp1251", "iso8859_5")
+_EE_LATIN_PROBES: tuple[str, ...] = ("iso8859_2", "cp1250")
+
+
+def _cyrillic_coverage(text: str) -> float:
+    """Fraction of *all non-ASCII characters* in *text* that are Cyrillic letters.
+
+    Returns 0.0 when *text* has no non-ASCII characters. Dividing by all
+    non-ASCII (rather than only letters) penalises decodings that are
+    mostly symbols/box-drawing with a sprinkle of incidental Cyrillic
+    glyphs — real KOI8-R Russian text is expected to score >0.7, whereas
+    Japanese shift_jis bytes decoded as koi8_r should score low.
+    """
+    non_ascii = [c for c in text if ord(c) >= 0x80]
+    if not non_ascii:
+        return 0.0
+    cyr = sum(
+        1 for c in non_ascii
+        if c.isalpha() and _CYRILLIC_RANGE[0] <= ord(c) <= _CYRILLIC_RANGE[1]
+    )
+    return cyr / len(non_ascii)
+
+
+def _ee_latin_coverage(text: str) -> float:
+    """Fraction of non-ASCII characters in *text* found in ``_EE_LATIN_LETTERS`` (0.0 if none)."""
+    non_ascii = [c for c in text if ord(c) >= 0x80]
+    if not non_ascii:
+        return 0.0
+    ee = sum(1 for c in non_ascii if c in _EE_LATIN_LETTERS)
+    return ee / len(non_ascii)
+
+
+def _probe_language(raw: bytes, top_enc: str) -> Optional[str]:
+    """Try language-specific decodings when charset-normalizer guessed wrong.
+
+    Only runs for Shift_JIS-family picks (Cyrillic probes, coverage ≥ 70 %)
+    and cp1258 / iso8859_16 picks (EE-Latin probes, coverage ≥ 50 %). Returns
+    the best probe if it also beats *top_enc*'s own score by 0.30, else None.
+    """
+    if top_enc in {"shift_jis_2004", "shift_jisx0213", "shift_jis", "cp932"}:
+        probes, scorer, threshold = _CYRILLIC_PROBES, _cyrillic_coverage, 0.70
+    elif top_enc in {"cp1258", "iso8859_16"}:
+        probes, scorer, threshold = _EE_LATIN_PROBES, _ee_latin_coverage, 0.50
+    else:
+        return None
+
+    # Score the top pick first: if it already decodes to mostly Cyrillic /
+    # EE-Latin text, the margin test below will refuse to override it.
+    try:
+        top_decoded = raw.decode(top_enc, errors="replace")
+        top_score = scorer(top_decoded)
+    except LookupError:
+        top_score = 0.0
+
+    # NOTE(review): koi8_r/cp1251 coverage can tie; first probe wins then.
+    best_enc: Optional[str] = None
+    best_score = 0.0
+    for enc in probes:
+        try:
+            decoded = raw.decode(enc)
+        except (UnicodeDecodeError, LookupError):
+            continue
+        score = scorer(decoded)
+        if score > best_score:
+            best_score = score
+            best_enc = enc
+
+    # Require both an absolute coverage threshold AND a clear margin over
+    # the top pick — otherwise we risk hijacking real Japanese / Vietnamese
+    # content whose decode happens to produce a few Cyrillic / EE-Latin
+    # glyphs by coincidence.
+    if best_enc and best_score >= threshold and best_score >= top_score + 0.30:
+        return best_enc
+    return None
+
+
+# Pairs of encodings from the same family that nonetheless map accented
+# letters to DIFFERENT bytes. Used to refuse spurious in-alias swaps (e.g.
+# cp1250 vs iso8859_2, which charset-normalizer may list as siblings).
+_SAME_FAMILY: set[frozenset[str]] = {
+    frozenset({"cp1250", "iso8859_2"}),
+    frozenset({"mac_iceland", "mac_turkish"}),  # NOTE(review): no _ENCODING_FALLBACKS entry pairs these
+    frozenset({"shift_jis_2004", "shift_jisx0213"}),  # NOTE(review): likewise not paired by any fallback
+}
+
+
+def _same_byte_family(a: str, b: str) -> bool:
+    return frozenset({a, b}) in _SAME_FAMILY  # unordered pair lookup: (a, b) == (b, a)
+
+
 def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
    """Detect file encoding by reading the first *sample_bytes*.

@@ -34,8 +235,21 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:

    # Check BOM first
    if raw[:3] == b"\xef\xbb\xbf":
-        return "utf-8-sig"
-    if raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
+        # A "lying" BOM: file claims utf-8 but the body bytes don't decode
+        # as utf-8. Fall through to charset detection on the BOM-stripped
+        # body so we don't hand back utf-8-sig that will then fail to read.
+        body = raw[3:]
+        try:
+            body.decode("utf-8")
+            return "utf-8-sig"
+        except UnicodeDecodeError:
+            logger.debug(
+                "detect_encoding({}): file has UTF-8 BOM but body is not "
+                "valid UTF-8 — falling through to charset detection",
+                Path(path).name,
+            )
+            raw = body
+    elif raw[:2] in (b"\xff\xfe", b"\xfe\xff"):
        return "utf-16"

    # Strict UTF-8 wins. charset_normalizer fingerprints small files
@@ -48,11 +262,21 @@ def detect_encoding(path: Path, sample_bytes: int = 65_536) -> str:
    except UnicodeDecodeError:
        pass

-    result = from_bytes(raw).best()
-    if result is None:
+    matches = from_bytes(raw)
+    enc = _arbitrate_charset_match(matches)
+    if enc is None:
        return "utf-8"
-    enc = result.encoding.lower()
-    # Normalise common aliases
+    # Language-aware probe runs after the arbiter so we only spend cycles
+    # on the cases where charset-normalizer fingerprinted the bytes as a
+    # codepage that doesn't match the apparent script. Returns a better
+    # encoding only when the probe finds a high-coverage match.
+    probed = _probe_language(raw, enc)
+    if probed:
+        logger.debug(
+            "detect_encoding({}): language probe overrode {} → {}",
+            Path(path).name, enc, probed,
+        )
+        enc = probed
    if enc in ("ascii", "us-ascii"):
        enc = "utf-8"
    return enc