From db5ec084dad64733d9e1c7d36e89a07e30de9abe Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 16 May 2026 19:50:09 +0000 Subject: [PATCH] docs+code: rename tool labels everywhere Sweep follow-up to 93e43fc. Display labels now consistent across docs, landing pages, CLI output, code comments, docstrings, and test prose. Five parallel surfaces touched: - docs (EN + ES): README, USER-GUIDE, CLI-REFERENCE, and 11 internal design/planning docs - landing pages: index + bookkeeper/revops/shopify-pet - src: CLI module docstrings, _TOOL_DISPLAY dicts in cli_analyze.py and gui/components/_legacy.py, core module headers, every tool page's module docstring - tests: class/method/module docstrings and section-header comments - test-cases READMEs Page slugs (1_Deduplicator etc.), tool_id strings (01_deduplicator etc.), Python class names (TestDeduplicatorWorkflow, FeatureFlag.*), URL paths, anchor IDs, CSS classes, and asset filenames were left intact since they're code identifiers / structural references. All 2033 tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.es.md | 18 +++++------ README.md | 18 +++++------ build/README.md | 2 +- docs/ADMIN.md | 2 +- docs/BUSINESS.md | 4 +-- docs/CLI-REFERENCE.es.md | 10 +++--- docs/CLI-REFERENCE.md | 10 +++--- docs/DECISIONS.md | 12 +++---- docs/DEVELOPER.md | 2 +- docs/NEXT-STEPS.md | 2 +- docs/PLAN.md | 24 +++++++------- docs/POST-LAUNCH.md | 2 +- docs/RECOVERY.md | 4 +-- docs/REQUIREMENTS.md | 32 +++++++++---------- docs/TECHNICAL.md | 12 +++---- docs/USER-GUIDE.es.md | 38 +++++++++++------------ docs/USER-GUIDE.md | 38 +++++++++++------------ landing/bookkeeper/index.html | 12 +++---- landing/index.html | 6 ++-- landing/revops/index.html | 16 +++++----- landing/shopify-pet/index.html | 14 ++++----- src/cli_analyze.py | 18 +++++------ src/cli_column_map.py | 2 +- src/cli_format.py | 2 +- src/cli_missing.py | 2 +- src/cli_pipeline.py | 2 +- src/core/column_mapper.py | 2 +- src/core/missing.py | 2 +- src/core/pipeline.py | 2 +- src/gui/__init__.py | 2 +- src/gui/components/__init__.py | 2 +- src/gui/components/_legacy.py | 20 ++++++------ src/gui/pages/1_Deduplicator.py | 2 +- src/gui/pages/2_Text_Cleaner.py | 2 +- src/gui/pages/3_Format_Standardizer.py | 2 +- src/gui/pages/4_Missing_Values.py | 2 +- src/gui/pages/5_Column_Mapper.py | 2 +- src/gui/pages/6_Outlier_Detector.py | 2 +- src/gui/pages/7_Multi_File_Merger.py | 2 +- src/gui/pages/8_Validator_Reporter.py | 2 +- src/gui/pages/9_Pipeline_Runner.py | 2 +- test-cases/column-mapper-corpus/README.md | 2 +- test-cases/missing-corpus/README.md | 2 +- test-cases/text-cleaner-corpus/README.md | 2 +- tests/gui/test_activation.py | 2 +- tests/gui/test_advanced_panels.py | 2 +- tests/gui/test_dedup_review.py | 2 +- tests/gui/test_errors.py | 2 +- tests/gui/test_gate.py | 4 +-- tests/gui/test_lite_tier.py | 10 +++--- tests/gui/test_workflows.py | 12 +++---- tests/test_cli_analyze.py | 4 +-- tests/test_column_mapper_corpus.py | 2 +- tests/test_e2e.py | 2 +- tests/test_lite_tier.py | 4 
+-- tests/test_missing_corpus.py | 2 +- tests/test_perf_regressions.py | 4 +-- 57 files changed, 205 insertions(+), 205 deletions(-) diff --git a/README.es.md b/README.es.md index 472eebd..a6587f3 100644 --- a/README.es.md +++ b/README.es.md @@ -8,15 +8,15 @@ Limpieza local de CSV / Excel. CLI + GUI en el navegador, sin nube, sin ceremoni | # | Herramienta | Estado | |---|------|--------| -| 01 | **Eliminador de duplicados** — coincidencia exacta + difusa, 5 normalizadores, reglas de superviviente, auditoría | Listo | -| 02 | **Limpiador de texto** — espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo | -| 03 | **Estandarizador de formatos** — fechas, teléfonos, correos, direcciones, nombres, monedas, booleanos | Listo | -| 04 | **Gestor de valores faltantes** — detección de nulos disfrazados, perfil, media/mediana/moda/ffill/bfill/interpolación, estrategias de descarte | Listo | -| 05 | **Mapeador de columnas** — autodetección difusa de renombrados, esquema objetivo con coerción de tipos, campos requeridos con valores por defecto, descartar/reordenar | Listo | -| 06 | Detector de valores atípicos | Próximamente | -| 07 | Combinador de varios archivos | Próximamente | -| 08 | Validador e informes | Próximamente | -| 09 | **Ejecutor de canalizaciones** — encadena herramientas en un orden recomendado (no forzado), guarda/carga JSON, automatiza limpiezas semanales | Listo | +| 01 | **Buscar duplicados** — coincidencia exacta + difusa, 5 normalizadores, reglas de superviviente, auditoría | Listo | +| 02 | **Limpiar texto** — espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo | +| 03 | **Estandarizar formatos** — fechas, teléfonos, correos, direcciones, nombres, monedas, booleanos | Listo | +| 04 | **Corregir valores faltantes** — detección de nulos disfrazados, perfil, media/mediana/moda/ffill/bfill/interpolación, estrategias de descarte | Listo | +| 05 | **Mapear columnas** — autodetección difusa 
de renombrados, esquema objetivo con coerción de tipos, campos requeridos con valores por defecto, descartar/reordenar | Listo | +| 06 | Detectar valores atípicos | Próximamente | +| 07 | Combinar archivos | Próximamente | +| 08 | Verificación de calidad | Próximamente | +| 09 | **Flujos automatizados** — encadena herramientas en un orden recomendado (no forzado), guarda/carga JSON, automatiza limpiezas semanales | Listo | ## Descarga (usuarios no técnicos) diff --git a/README.md b/README.md index 48ea9ee..b55356c 100644 --- a/README.md +++ b/README.md @@ -8,15 +8,15 @@ Local CSV / Excel cleaning. CLI + browser GUI, no cloud, no install ceremony. GU | # | Tool | Status | |---|------|--------| -| 01 | **Deduplicator** — exact + fuzzy match, 5 normalizers, survivor rules, audit | Ready | -| 02 | **Text Cleaner** — whitespace, smart chars, BOM, line endings, case ops | Ready | -| 03 | **Format Standardizer** — dates, phones, emails, addresses, names, currencies, booleans | Ready | -| 04 | **Missing Value Handler** — disguised-null detection, profile, mean/median/mode/ffill/bfill/interpolate, drop strategies | Ready | -| 05 | **Column Mapper** — fuzzy auto-rename, target schema with type coercion, required fields with defaults, drop/reorder | Ready | -| 06 | Outlier Detector | Coming Soon | -| 07 | Multi-File Merger | Coming Soon | -| 08 | Validator & Reporter | Coming Soon | -| 09 | **Pipeline Runner** — chain tools with recommended (not forced) order, save/load JSON, automate weekly cleanups | Ready | +| 01 | **Find Duplicates** — exact + fuzzy match, 5 normalizers, survivor rules, audit | Ready | +| 02 | **Clean Text** — whitespace, smart chars, BOM, line endings, case ops | Ready | +| 03 | **Standardize Formats** — dates, phones, emails, addresses, names, currencies, booleans | Ready | +| 04 | **Fix Missing Values** — disguised-null detection, profile, mean/median/mode/ffill/bfill/interpolate, drop strategies | Ready | +| 05 | **Map Columns** — fuzzy auto-rename, 
target schema with type coercion, required fields with defaults, drop/reorder | Ready | +| 06 | Find Unusual Values | Coming Soon | +| 07 | Combine Files | Coming Soon | +| 08 | Quality Check | Coming Soon | +| 09 | **Automated Workflows** — chain tools with recommended (not forced) order, save/load JSON, automate weekly cleanups | Ready | ## Download (non-technical users) diff --git a/build/README.md b/build/README.md index e0cee78..aae03cc 100644 --- a/build/README.md +++ b/build/README.md @@ -246,7 +246,7 @@ much state to trust: 4. Double-click the app icon. 5. Browser should open to http://127.0.0.1:850x within 5 seconds. 6. Drop samples/demo/shopify_pet_customers.csv into the - Pipeline Runner page; click Run; AFTER preview should appear. + Automated Workflows page; click Run; AFTER preview should appear. 7. Confirm in the network tab: zero outbound calls except to 127.0.0.1 and the Streamlit static asset paths (also local). ``` diff --git a/docs/ADMIN.md b/docs/ADMIN.md index 25fb36f..9fe0bd8 100644 --- a/docs/ADMIN.md +++ b/docs/ADMIN.md @@ -333,7 +333,7 @@ the attached `.dtlic` file. | Tier | Features | |------|---------| -| **lite** | Deduplicator, Text Cleaner, Format Standardizer | +| **lite** | Find Duplicates, Clean Text, Standardize Formats | | **core** | All 9 tools | | **pro** | All 9 tools + future Pro-only features | diff --git a/docs/BUSINESS.md b/docs/BUSINESS.md index b44008e..1ffd0ea 100644 --- a/docs/BUSINESS.md +++ b/docs/BUSINESS.md @@ -47,7 +47,7 @@ Sell niche Python automation tools as one-time downloadable digital products. Ta **Surface**: desktop install per OS (PyInstaller) with Streamlit GUI + CLI. Constrained demo on Streamlit Community Cloud. -## 4a. Lead bundle — Deduplicator +## 4a. Lead bundle — Find Duplicates Highest pain density across all 4 personas. Feeds landing copy, demo design, feature priority. Tech spec: TECHNICAL.md §11.1. 
@@ -208,7 +208,7 @@ Headroom enables optional ad spend ($100-200/mo) once a bundle has proven conver ## 13. Honest status (2026-05-01) -- 3 of 9 tools shipped (Dedup, Text Cleaner, Format Standardizer). +- 3 of 9 tools shipped (Find Duplicates, Clean Text, Standardize Formats). - Cross-platform build pipeline designed, not yet built. - macOS code signing not yet set up. - Streamlit GUI shipped for the 3 ready tools. diff --git a/docs/CLI-REFERENCE.es.md b/docs/CLI-REFERENCE.es.md index 1cf5bb4..734fe95 100644 --- a/docs/CLI-REFERENCE.es.md +++ b/docs/CLI-REFERENCE.es.md @@ -8,15 +8,15 @@ Tres módulos de CLI, uno por cada herramienta Lista: | Módulo | Comando | Propósito | |--------|---------|---------| -| `src.cli` | `python -m src.cli FILE` | Eliminador de duplicados | -| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Limpiador de texto | +| `src.cli` | `python -m src.cli FILE` | Buscar duplicados | +| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Limpiar texto | | `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analizador (escaneo de solo lectura) | Cada comando es **previsualización por defecto** — añade `--apply` para escribir la salida. --- -# Eliminador de duplicados +# Buscar duplicados ``` python -m src.cli ARCHIVO_ENTRADA [OPCIONES] @@ -125,7 +125,7 @@ Registro: `logs/dedup_YYYYMMDD_HHMMSS.log`. --- -# Limpiador de texto +# Limpiar texto ``` python -m src.cli_text_clean ARCHIVO_ENTRADA [OPCIONES] @@ -156,7 +156,7 @@ Higiene a nivel de carácter. Ver [TECHNICAL.md §10.2](TECHNICAL.md) (solo en i - `--config RUTA` / `--save-config RUTA`. ### Archivo -- `--sheet`, `--encoding`, `--header-row` — iguales que en el Eliminador de duplicados. +- `--sheet`, `--encoding`, `--header-row` — iguales que en Buscar duplicados. 
## Presets diff --git a/docs/CLI-REFERENCE.md b/docs/CLI-REFERENCE.md index f3cdc27..e7f157c 100644 --- a/docs/CLI-REFERENCE.md +++ b/docs/CLI-REFERENCE.md @@ -6,15 +6,15 @@ Three CLI modules, one per Ready tool: | Module | Command | Purpose | |--------|---------|---------| -| `src.cli` | `python -m src.cli FILE` | Deduplicator | -| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Text Cleaner | +| `src.cli` | `python -m src.cli FILE` | Find Duplicates | +| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Clean Text | | `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analyzer (read-only scan) | Every command is **preview-only by default** — add `--apply` to write output. --- -# Deduplicator +# Find Duplicates ``` python -m src.cli INPUT_FILE [OPTIONS] @@ -123,7 +123,7 @@ Log: `logs/dedup_YYYYMMDD_HHMMSS.log`. --- -# Text Cleaner +# Clean Text ``` python -m src.cli_text_clean INPUT_FILE [OPTIONS] @@ -154,7 +154,7 @@ Character-level hygiene. See [TECHNICAL.md §10.2](TECHNICAL.md) for the spec. - `--config PATH` / `--save-config PATH`. ### File -- `--sheet`, `--encoding`, `--header-row` — same as Deduplicator. +- `--sheet`, `--encoding`, `--header-row` — same as Find Duplicates. ## Presets diff --git a/docs/DECISIONS.md b/docs/DECISIONS.md index a9d8564..911dbf6 100644 --- a/docs/DECISIONS.md +++ b/docs/DECISIONS.md @@ -67,7 +67,7 @@ Each candidate scored 1-5 on 6 dimensions. Total /30 → verdict. **v1.2 rationale**: - Buyer persona ("hates Excel work but can't code") won't learn a CLI. Refunds at this price. -- Deduplicator needs interactive review — not viable in pure CLI. +- Find Duplicates needs interactive review — not viable in pure CLI. - Dual interface keeps CLI for automation without sacrificing primary buyer surface. ## 4a. Functional scope principle (v1.2) @@ -170,13 +170,13 @@ $49-79/bundle · $149 full suite (when 3+ exist). | Apr 28 (v1.3) | Add hosted browser demo as conversion lever | Direct consequence of Streamlit choice. 
See §5. | | Apr 28 (v1.4) | Re-apply 04/06 boundary work (silent-drift recovery) | Stream B v1.2 content overwritten in parallel v1.3 work. Restored per no-silent-drift rule. | | Apr 28 (v1.5) | Add `02_text_cleaner.py`; renumber 02-08 → 03-09 | Character-level hygiene had no clear owner. See TECHNICAL §10. | -| Apr 29 (v1.7) | Adopt Text Cleaner Tier 1/2/3 spec; lock `excel-hygiene` default | Promotes from stub to buildable v1 target. Full spec in TECHNICAL §11.2. | +| Apr 29 (v1.7) | Adopt Clean Text Tier 1/2/3 spec; lock `excel-hygiene` default | Promotes from stub to buildable v1 target. Full spec in TECHNICAL §11.2. | | Apr 28 (v1.6) | Fold conversation-history content into docs (deduplicator spec, lead bundle use cases, full GUI matrix, 04/06 examples, Streamlit-to-SaaS reasoning) | No new decisions; promote at-risk analysis from chat history per no-silent-drift rule. | -| May 1 (v1.6) | Mark Format Standardizer **Ready** | 199-row buyer corpus passing; Tier 1 + most Tier 2 built. | +| May 1 (v1.6) | Mark Standardize Formats **Ready** | 199-row buyer corpus passing; Tier 1 + most Tier 2 built. | | May 1 (v1.6) | Add `src/core/errors.py` structured hierarchy | Uniform helpful messages across CLI + GUI. See TECHNICAL §7. | | May 13 (v1.6) | Ship in-house JSON i18n + EN/ES packs | Expand addressable market (Spanish-first buyers, LatAm bookkeepers) without a `gettext` build step. JSON packs editable by non-devs; parity test prevents drift. See TECHNICAL §10b. | | May 13 (v1.6) | Ship licensing: 1-year HMAC-signed blobs, name+email registration, offline verification, tier-scaffolded for future SKUs | Unlock the lifetime-update business model without recurring infra. Honor-system DRM (HMAC + 30-day refund) — sufficient at $49. See §9b below. | -| May 13 (v1.6) | Add Lite SKU (Dedup + Text Cleaner + Format Standardizer) | Lower-priced entry point for buyers who only need the three universal tools. 
Per-tool feature gating + lock badges on the home grid surface the upgrade path. See §9b. | +| May 13 (v1.6) | Add Lite SKU (Find Duplicates + Clean Text + Standardize Formats) | Lower-priced entry point for buyers who only need the three universal tools. Per-tool feature gating + lock badges on the home grid surface the upgrade path. See §9b. | | May 13 (v1.6) | Remove user-facing free trial | A 1-year all-features trial undercut the paid Lite SKU. Paid-only keeps tier economics clean. Internal ``_mint`` API still exists for tests and the seller's key generator. See §9b. | | May 13 (v1.6) | Upgrade license crypto: HMAC → Ed25519 (asymmetric) | HMAC's symmetric secret was extractable from the shipped binary — anyone with the binary could mint blobs. Ed25519 splits sign (seller) from verify (binary), so binary compromise doesn't let an attacker forge licenses. Blob prefix bumped DTLIC1 → DTLIC2. See §9b. | | May 13 (v1.6) | Add ``assert_production_safe`` tripwire | A shipped build with ``DATATOOLS_DEV_MODE=1`` or the in-source dev pubkey would silently defeat licensing. The tripwire refuses to boot such a build. No-op in source / pytest runs. See §9b. | @@ -211,13 +211,13 @@ The 30-day refund window covers casual blob sharing from a different angle (anyo - Number of devices the same blob is used on (no concurrent-use detection). - Reverse-engineered re-signing of expired blobs (would require RSA / online check). -**Future SKUs**: the ``FEATURES_BY_TIER`` table in ``src/license/features.py`` is the single source of truth for "which tools each tier unlocks". Adding a PRO SKU that excludes the pipeline runner is a 1-line edit there + a 1-line edit at the gate site. No consumer-code churn. +**Future SKUs**: the ``FEATURES_BY_TIER`` table in ``src/license/features.py`` is the single source of truth for "which tools each tier unlocks". Adding a PRO SKU that excludes Automated Workflows is a 1-line edit there + a 1-line edit at the gate site. No consumer-code churn. 
**v1.6 SKU lineup**: | Tier | Tools unlocked | Notes | |---|---|---| -| LITE | Deduplicator, Text Cleaner, Format Standardizer | Entry SKU. Three universal tools that handle the most common bookkeeping / RevOps / Klaviyo prep workflows. | +| LITE | Find Duplicates, Clean Text, Standardize Formats | Entry SKU. Three universal tools that handle the most common bookkeeping / RevOps / Klaviyo prep workflows. | | CORE | All 9 tools | Full v1 suite. | | PRO | All 9 tools (scaffolded) | Reserved for future per-feature carve-outs (e.g., scheduled pipelines, API access). | | ENTERPRISE | All 9 tools (scaffolded) | Reserved for future bulk / multi-seat SKUs. | diff --git a/docs/DEVELOPER.md b/docs/DEVELOPER.md index 3cbb88e..82cfabd 100644 --- a/docs/DEVELOPER.md +++ b/docs/DEVELOPER.md @@ -33,7 +33,7 @@ CLI (src/cli*.py) GUI (src/gui/app.py + pages/) | `core.errors` | `DataToolsError` hierarchy, `ensure_dataframe()`, `ensure_choice()`, `wrap_file_read/write()`, `format_for_user()` | | `core._constants` | `US_STATE_NAMES`, `US_STATE_CODES`, `USPS_EXPANSIONS`, `USPS_COMPRESSIONS` | -## Data flow — Deduplicator +## Data flow — Find Duplicates ``` read_file() # auto-detect encoding, delimiter, header diff --git a/docs/NEXT-STEPS.md b/docs/NEXT-STEPS.md index efcc6d3..e5f2122 100644 --- a/docs/NEXT-STEPS.md +++ b/docs/NEXT-STEPS.md @@ -30,7 +30,7 @@ Status legend: | ✓ | Item | Where it lives | |---|------|----------------| | 🟢 | 6 of 9 tools shipped (Dedup, Text, Format, Missing, Column-Map, Pipeline) | `src/core/`, `src/cli_*.py`, `src/gui/pages/` | -| 🟢 | Pipeline Runner (the retention multiplier per `PLAN.md` §2.6) | `src/core/pipeline.py`, `src/cli_pipeline.py`, `src/gui/pages/9_Pipeline_Runner.py` | +| 🟢 | Automated Workflows (the retention multiplier per `PLAN.md` §2.6) | `src/core/pipeline.py`, `src/cli_pipeline.py`, `src/gui/pages/9_Pipeline_Runner.py` | | 🟢 | 1,729 passing tests · 0 skipped · 0 xfailed | `tests/` | | 🟢 | 3 niche demo datasets + pre-tuned pipeline JSONs 
| `samples/demo/` | | 🟢 | Streamlit demo app + Cloud entry shim | `streamlit_app.py`, `src/gui/app_demo.py` | diff --git a/docs/PLAN.md b/docs/PLAN.md index 86276d2..c2f4e57 100644 --- a/docs/PLAN.md +++ b/docs/PLAN.md @@ -29,8 +29,8 @@ win. | Asset | State | |---|---| -| Tools 1–5 (Dedup, Text Clean, Format Standardize, Missing, Column Mapper) | Ready · 1,691 tests passing · 0 xfailed | -| Tools 6–9 (Outlier, Multi-File Merge, Validator, Pipeline) | Coming Soon | +| Tools 1–5 (Find Duplicates, Clean Text, Standardize Formats, Fix Missing Values, Map Columns) | Ready · 1,691 tests passing · 0 xfailed | +| Tools 6–9 (Find Unusual Values, Combine Files, Quality Check, Automated Workflows) | Coming Soon | | PyInstaller installer pipeline | Not started | | macOS code signing (Apple Dev Program) | Not started | | Hosted browser demo (Streamlit Cloud) | Not deployed | @@ -52,7 +52,7 @@ Tools 6–8 are blocked behind a **distribution gate**: no work on them until the existing 5 tools have a paying customer + one external review (BUSINESS.md §4 sequence rule, applied recursively inside the bundle). -**Exception granted 2026-05-01**: Tool 09 Pipeline Runner is built +**Exception granted 2026-05-01**: Tool 09 Automated Workflows is built *now*. Rationale: the pipeline transforms the bundle from "5 tools you buy" into "an automatable workflow you depend on." That conversion is what produces retention and word-of-mouth — the only marketing channel @@ -104,10 +104,10 @@ demo dataset. | # | Pain | $ / time impact | Tools that fix it | |---|------|-----------------|---| | S1 | **Klaviyo / Mailchimp / Omnisend per-contact billing.** Subscriber list with 10–18 % duplicate rate (case drift, plus signs in Gmail addresses, multiple devices) → recurring overpay forever. 
| $30–300/mo per percent of dupes on a 50 k list — recurring | Dedup + Format Standardize (email canonicalization) + Pipeline (re-run weekly) | -| S2 | **Product feed rejected by Google Merchant Center / Meta Catalog.** Smart quotes in titles, NBSP in SKU, inconsistent attributes; campaign launch delayed 24–72 h while feed gets fixed. | 1–3 days delayed launch × campaign value | Text Cleaner + Format Standardize | -| S3 | **Multi-channel order consolidation.** Shopify + Etsy + Amazon + Faire + wholesale spreadsheet, each with a different column for "customer email" / "order total" / "ship country". | 4–8 hr / month manually merging | Column Mapper + Dedup + Pipeline | +| S2 | **Product feed rejected by Google Merchant Center / Meta Catalog.** Smart quotes in titles, NBSP in SKU, inconsistent attributes; campaign launch delayed 24–72 h while feed gets fixed. | 1–3 days delayed launch × campaign value | Clean Text + Standardize Formats | +| S3 | **Multi-channel order consolidation.** Shopify + Etsy + Amazon + Faire + wholesale spreadsheet, each with a different column for "customer email" / "order total" / "ship country". | 4–8 hr / month manually merging | Map Columns + Find Duplicates + Automated Workflows | | S4 | **Subscription identity fragmentation.** Pet-box subscribers cancel and re-sub under a different email; cohort analysis says churn is 20 % when it's actually 12 % — pricing decisions wrong. | Mis-priced LTV → over- or under-paid acquisition | Dedup with `merge=true` survivor | -| S5 | **International tax / VAT MOSS compliance.** Country column is `UK` / `U.K.` / `United Kingdom` / `GB` in the same export; VAT report breaks. Phone formats per region break call-center routing. | Compliance penalty risk + ops friction | Format Standardize (per-row country) + Column Mapper | +| S5 | **International tax / VAT MOSS compliance.** Country column is `UK` / `U.K.` / `United Kingdom` / `GB` in the same export; VAT report breaks. 
Phone formats per region break call-center routing. | Compliance penalty risk + ops friction | Standardize Formats (per-row country) + Map Columns | #### Bookkeeper / freelance accountant @@ -126,7 +126,7 @@ demo dataset. | R1 | **HubSpot / Marketo / Iterable per-contact tier pricing.** 10 k contacts → enterprise tier at $4–8 k/mo. Every duplicate is a recurring tax. | $200–800 / month per 1 k duplicate contacts — recurring | Dedup with cross-source merge + Pipeline | | R2 | **Email-deliverability / sender reputation.** Sending to invalid or duplicate addresses tanks reputation; recovery takes weeks. | Catastrophic — entire email programme degraded | Format Standardize (email canonicalization) + Missing (sentinel detection) | | R3 | **GDPR / contact-data privacy.** Uploading lead data to a third-party cleaning SaaS is itself a GDPR concern; legal review blocks adoption. | Compliance risk + 4–8 wk legal-review delay | Local-only desktop app, zero outbound calls | -| R4 | **Multi-vendor lead-source unification.** Apollo, ZoomInfo, LinkedIn Sales Nav, manual scrapes — each export has different headers, scoring, country format. | 1–3 days per campaign of manual unification | Column Mapper (alias matching) + Format Standardize (per-row country) + Dedup | +| R4 | **Multi-vendor lead-source unification.** Apollo, ZoomInfo, LinkedIn Sales Nav, manual scrapes — each export has different headers, scoring, country format. | 1–3 days per campaign of manual unification | Map Columns (alias matching) + Standardize Formats (per-row country) + Find Duplicates | | R5 | **Suppression-list management across 5+ platforms.** Each platform has its own format; un-deduped suppression lists let opt-outs slip through, triggering CAN-SPAM / GDPR exposure. | Compliance risk + churn-back cost | Pipeline saved as JSON, re-run on each new suppression batch | ### 2.4 Operationalize the moat the docs already name. @@ -154,7 +154,7 @@ right after "runs locally." Copy seed: *"Every change auditable. 
Hand the audit CSV to your client with the cleaned file."* -### 2.6 The Pipeline Runner is the retention multiplier. +### 2.6 Automated Workflows is the retention multiplier. A buyer with a saved pipeline isn't a one-off purchase — they're a recurring user who recommends the product. This is exactly the @@ -172,8 +172,8 @@ trigger DECISIONS.md §8 already names). ### 2.8 Dependency-aware pipeline UX. Tools have soft execution-order preferences (Text Clean before Format -Standardize, Format before Dedup, Missing before Dedup). The Pipeline -Runner *recommends* the order, *warns* on reversals, and **never +Standardize, Format before Dedup, Missing before Dedup). Automated +Workflows *recommends* the order, *warns* on reversals, and **never forces** — the user owns their workflow. Implementation: see `src/core/pipeline.py` `SOFT_DEPENDENCIES`. @@ -184,7 +184,7 @@ forces** — the user owns their workflow. Implementation: see | 1 | PyInstaller pipeline · Mac/Win unsigned installers · Apple Dev Program enrollment (1–2 wk lead) | `dist/datatools-mac.dmg` and `dist/datatools-win.exe` install on a clean machine | | 2 | Demo deployed to Streamlit Cloud · landing page v1 with embedded demo · 3 persona datasets in the demo | Public URL serves a working pipeline run on a sample dataset in < 30 s | | 3 | Gumroad listing live · share value-first in 3 niche communities (no pitch) · 1 long-tail SEO post for the lead persona | First listing impression captured · post not removed for self-promotion | -| 4 | Pipeline Runner v1.0 shipped (this week, 2026-05-01 — exception per §2.1) · v1.1 patch announced with Tool 09 + intl improvements | Pipeline saves/loads JSON · 3 demo pipelines preloaded | +| 4 | Automated Workflows v1.0 shipped (this week, 2026-05-01 — exception per §2.1) · v1.1 patch announced with Tool 09 + intl improvements | Pipeline saves/loads JSON · 3 demo pipelines preloaded | | 5–8 | Bookkeeper landing page · agency landing page · second tool's promo cycle · 
priority-support tier added (defer purchase until §2.7 trigger) | Three live landing pages with distinct H1, demo dataset, conversion target | | 9–13 | Tool 06–08 only **if** revenue trajectory supports continued investment · otherwise more market work on the existing 5 + 09 | Decision made on 13 Aug 2026 with revenue data, not feature ambition | @@ -202,7 +202,7 @@ These flip the plan, not the underlying criteria: ## 5. Anti-temptations (things the plan refuses) -- **More tools before more buyers.** Locked. Exception only for Pipeline Runner per §2.1. +- **More tools before more buyers.** Locked. Exception only for Automated Workflows per §2.1. - **SaaS pivot.** Recurring infra conflicts with the lifestyle constraint (DECISIONS.md §4). - **Live chat / sales calls.** Conflicts with no-touch (DECISIONS.md §1 #8). - **Custom integrations / one-off consulting.** $300/hr looks tempting; breaks the "build once, sell many" model that justifies the entire strategy. diff --git a/docs/POST-LAUNCH.md b/docs/POST-LAUNCH.md index f8efa4c..77679c2 100644 --- a/docs/POST-LAUNCH.md +++ b/docs/POST-LAUNCH.md @@ -144,7 +144,7 @@ Reading PLAN.md §3 + this doc together, the rough script: | **M1** (June) | Installers · demo · 3 landing pages · Gumroad live | Whether the funnel mechanically works. Numbers will be noisy; just look for one purchase. | | **M2** (July) | M1 + community posts in 3 niches + 1 SEO post | Which persona converts. Re-allocate effort to the highest-converting niche. | | **M3** (August) | M2 + landing-page changes from M2 review | Whether intent-rate moved on the change. Decide tools 06–08 go/no-go. | -| **M4** (September) | M3 + first repeat-buyer signals | Whether the Pipeline Runner is producing retention as designed. | +| **M4** (September) | M3 + first repeat-buyer signals | Whether Automated Workflows is producing retention as designed. 
| By end of M4, the data tells you whether the plan is producing $1k–3k/mo (BUSINESS.md §6 6-month target) — extrapolated from the diff --git a/docs/RECOVERY.md b/docs/RECOVERY.md index 92817f0..80e366d 100644 --- a/docs/RECOVERY.md +++ b/docs/RECOVERY.md @@ -21,8 +21,8 @@ project-root/ │ └── CLI-REFERENCE.md ├── src/ │ ├── core/ # shared logic — both CLI + GUI call into this -│ ├── cli.py # Deduplicator CLI -│ ├── cli_text_clean.py # Text Cleaner CLI +│ ├── cli.py # Find Duplicates CLI +│ ├── cli_text_clean.py # Clean Text CLI │ ├── cli_analyze.py # Analyzer CLI │ └── gui/ │ ├── app.py # Streamlit entry diff --git a/docs/REQUIREMENTS.md b/docs/REQUIREMENTS.md index 064bec0..0078908 100644 --- a/docs/REQUIREMENTS.md +++ b/docs/REQUIREMENTS.md @@ -76,7 +76,7 @@ Sample size: 1,000 rows (configurable). - Full-DataFrame `auto_fix`: ~5 min (~30 µs/cell). - Output write: ~10 s. - Recommended RAM: 3–4× input size for the full-Apply path. -- **Format standardizer** (`standardize_dataframe`): ~2.7M rows/sec on +- **Standardize Formats** (`standardize_dataframe`): ~2.7M rows/sec on cache-warm repetition-heavy columns (synthetic 1M-row in-memory benchmark, 2 typed columns); the fused single-pass loop replaced a 3-pass ``.tolist()`` cycle, so per-call overhead is now dominated by @@ -87,20 +87,20 @@ Sample size: 1,000 rows (configurable). thread-pool scaffolding; on CPython 3.12 with the GIL it's roughly neutral, but the API is ready for the free-threaded (PEP 703) Python 3.13+ build where it will help. -- **Text cleaner** (`clean_dataframe`): ~1M rows/sec on +- **Clean Text** (`clean_dataframe`): ~1M rows/sec on repetition-heavy columns (per-call string cache: the pipeline runs once per *unique* cell value, not once per row). -- **Missing handler** (`handle_missing`): lazy-copy — when sentinel +- **Fix Missing Values** (`handle_missing`): lazy-copy — when sentinel standardization runs but finds nothing, AND no drops AND no fills apply, the input frame is returned as-is. 
On a clean 1 GB file this saves the 1 GB allocation that the unconditional upfront copy used to take. -- **Column mapper** (`map_columns`): rename + drop both already +- **Map Columns** (`map_columns`): rename + drop both already return fresh frames; the explicit upfront `df.copy()` is now removed and downstream mutating steps (schema-add, coerce) copy on demand via `_ensure_owned()`. Rename-only and identity-mapping paths run with zero explicit copies. -- **Deduplicator**: +- **Find Duplicates**: - **Exact-only strategies** (every column uses `Algorithm.EXACT` at threshold 100 — covers strong-key dedup like email/phone, the fallback drop-duplicates path, and explicit "match on this exact @@ -117,19 +117,19 @@ Sample size: 1,000 rows (configurable). (the common dedup workload) skip re-parsing. ## 11. Tools -1. Deduplicator — Ready -2. Text Cleaner — Ready -3. Format Standardizer — Ready -4. Missing Value Handler — Ready -5. Column Mapper — Ready -6. Outlier Detector — Coming Soon -7. Multi-File Merger — Coming Soon -8. Validator & Reporter — Coming Soon -9. Pipeline Runner — Ready +1. Find Duplicates — Ready +2. Clean Text — Ready +3. Standardize Formats — Ready +4. Fix Missing Values — Ready +5. Map Columns — Ready +6. Find Unusual Values — Coming Soon +7. Combine Files — Coming Soon +8. Quality Check — Coming Soon +9. Automated Workflows — Ready ### 11.a Recommended pipeline order (soft, not enforced) -The Pipeline Runner ships with a `SOFT_DEPENDENCIES` table; the +Automated Workflows ships with a `SOFT_DEPENDENCIES` table; the following ordering is the default and the basis of the warning surface. Re-ordering is allowed; the runner emits a warning string and proceeds. @@ -214,7 +214,7 @@ and proceeds. fresh blob without losing the embedded buyer identity. Tier may change during renewal (Lite → Core upgrade path). - **Tiers**: - - ``lite`` — Deduplicator + Text Cleaner + Format Standardizer. + - ``lite`` — Find Duplicates + Clean Text + Standardize Formats. 
Buyer pays once, gets the three universally-useful tools. - ``core`` — every Ready tool (all 9 in v1.6). - ``pro``, ``enterprise`` — scaffolded for future SKUs; currently diff --git a/docs/TECHNICAL.md b/docs/TECHNICAL.md index 6e44072..e2e4ff3 100644 --- a/docs/TECHNICAL.md +++ b/docs/TECHNICAL.md @@ -34,8 +34,8 @@ src/ normalizers.py # Per-column normalizers for dedup matching text_clean.py # clean_dataframe + smart_title_case _constants.py # Shared USPS abbrevs + state names - cli.py # Deduplicator CLI (Typer) - cli_text_clean.py # Text Cleaner CLI + cli.py # Find Duplicates CLI (Typer) + cli_text_clean.py # Clean Text CLI cli_analyze.py # Analyzer CLI (--json) gui/ app.py # Streamlit entry point @@ -192,7 +192,7 @@ GUI / CLI handlers use `format_for_user()` so the user always sees: file path, o | Bundle | Status | |--------|--------| -| Data Cleaning Mastery | 3/9 tools Ready (Dedup, Text Cleaner, Format Standardizer); 6 stubs | +| Data Cleaning Mastery | 3/9 tools Ready (Find Duplicates, Clean Text, Standardize Formats); 6 stubs | | Automated Business Reporting | Not started | | Ecommerce Data Pipeline | Not started | | Small Business Finance | Not started | @@ -214,12 +214,12 @@ Deliberately separate. Confluent original spec was wrong. | Script | Owns | |--------|------| -| 04 Missing Value Handler | "What's not there." Disguised nulls (`N/A`, `-`, sentinel codes), missingness patterns, imputation, drop-by-threshold. | -| 06 Outlier Detector | "What shouldn't be there." z-score / IQR / modified-z, multivariate (Isolation Forest, Mahalanobis), domain rules, winsorization. | +| 04 Fix Missing Values | "What's not there." Disguised nulls (`N/A`, `-`, sentinel codes), missingness patterns, imputation, drop-by-threshold. | +| 06 Find Unusual Values | "What shouldn't be there." z-score / IQR / modified-z, multivariate (Isolation Forest, Mahalanobis), domain rules, winsorization. | **Run order**: 04 before 06. 
Outlier stats on data with `NaN` / sentinels are mathematically poisoned (means dragged, IQR widens, false negatives). -**Pipeline order** (Pipeline Runner enforces): 02 → 03 → 04 → 05 → 06 → 07 → 08. 01 is order-flexible. +**Pipeline order** (Automated Workflows enforces): 02 → 03 → 04 → 05 → 06 → 07 → 08. 01 is order-flexible. **Contested cases**: - Whitespace-only cell — 02 trims to empty; 04 then flags empty as null. diff --git a/docs/USER-GUIDE.es.md b/docs/USER-GUIDE.es.md index 963019e..4aedd43 100644 --- a/docs/USER-GUIDE.es.md +++ b/docs/USER-GUIDE.es.md @@ -14,7 +14,7 @@ Introduce tu nombre completo y correo, pega el código de licencia del correo de | Nivel | Herramientas | |---|---| -| **Lite** | Eliminador de duplicados · Limpiador de texto · Estandarizador de formatos | +| **Lite** | Buscar duplicados · Limpiar texto · Estandarizar formatos | | **Core** | Las 9 herramientas | Un usuario Lite que abra una herramienta exclusiva de Core verá un mensaje "Actualiza tu licencia". La página de inicio también muestra una marca 🔒 Bloqueado en las tarjetas de las herramientas que tu nivel no incluye. Para actualizar, pega un código Core en la página Activar. 
@@ -53,15 +53,15 @@ Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés) | # | Herramienta | Propósito | Estado | |---|------|---------|--------| -| 01 | Eliminador de duplicados | Coincidencia exacta + difusa, 5 normalizadores, auditoría | Listo | -| 02 | Limpiador de texto | Espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo | -| 03 | Estandarizador de formatos | Fechas / teléfonos / correos / direcciones / nombres / monedas / booleanos | Listo | -| 04 | Gestor de valores faltantes | Nulos disfrazados, imputación, descarte por umbral | Próximamente | -| 05 | Mapeador de columnas | Renombrar + aplicar esquema | Próximamente | -| 06 | Detector de valores atípicos | z-score, IQR, multivariante | Próximamente | -| 07 | Combinador de varios archivos | Combina varios archivos | Próximamente | -| 08 | Validador e informes | Reglas + informe PDF/Excel | Próximamente | -| 09 | Ejecutor de canalizaciones | Lanzador multi-herramienta de un clic | Próximamente | +| 01 | Buscar duplicados | Coincidencia exacta + difusa, 5 normalizadores, auditoría | Listo | +| 02 | Limpiar texto | Espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo | +| 03 | Estandarizar formatos | Fechas / teléfonos / correos / direcciones / nombres / monedas / booleanos | Listo | +| 04 | Corregir valores faltantes | Nulos disfrazados, imputación, descarte por umbral | Próximamente | +| 05 | Mapear columnas | Renombrar + aplicar esquema | Próximamente | +| 06 | Detectar valores atípicos | z-score, IQR, multivariante | Próximamente | +| 07 | Combinar archivos | Combina varios archivos | Próximamente | +| 08 | Verificación de calidad | Reglas + informe PDF/Excel | Próximamente | +| 09 | Flujos automatizados | Lanzador multi-herramienta de un clic | Próximamente | **Datos de muestra** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`. @@ -89,17 +89,17 @@ Ayuda: `deduplicator --help`. 
Referencia completa: [CLI-REFERENCE.es.md](CLI-REF ### 3.3 Orden de ejecución (cuando uses las herramientas manualmente) -Si no usas el Ejecutor de canalizaciones, sigue este orden: +Si no usas Flujos automatizados, sigue este orden: -1. **02 Limpiador de texto** primero — normaliza espacios y caracteres especiales. -2. **03 Estandarizador de formatos** — fechas, teléfonos, etc. necesitan texto limpio. -3. **04 Gestor de valores faltantes** — códigos centinela se ocultan como números. -4. **05 Mapeador de columnas** — esquema antes que estadísticas de atípicos. -5. **06 Detector de valores atípicos** — necesita datos numéricos limpios. Calcular estadísticas con `NaN` o `-999` envenena los resultados. -6. **07 Combinador de varios archivos**, **08 Validador** según sea necesario. -7. **01 Eliminador de duplicados** es flexible en cuanto al orden (normaliza internamente para la coincidencia). +1. **02 Limpiar texto** primero — normaliza espacios y caracteres especiales. +2. **03 Estandarizar formatos** — fechas, teléfonos, etc. necesitan texto limpio. +3. **04 Corregir valores faltantes** — códigos centinela se ocultan como números. +4. **05 Mapear columnas** — esquema antes que estadísticas de atípicos. +5. **06 Detectar valores atípicos** — necesita datos numéricos limpios. Calcular estadísticas con `NaN` o `-999` envenena los resultados. +6. **07 Combinar archivos**, **08 Verificación de calidad** según sea necesario. +7. **01 Buscar duplicados** es flexible en cuanto al orden (normaliza internamente para la coincidencia). -El Ejecutor de canalizaciones aplica este orden automáticamente. +Flujos automatizados aplica este orden automáticamente. 
### 3.4 Idioma diff --git a/docs/USER-GUIDE.md b/docs/USER-GUIDE.md index d82954a..ff317e8 100644 --- a/docs/USER-GUIDE.md +++ b/docs/USER-GUIDE.md @@ -14,7 +14,7 @@ Enter your full name + email, paste the license blob from your purchase email (s | Tier | Tools | |---|---| -| **Lite** | Deduplicator · Text Cleaner · Format Standardizer | +| **Lite** | Find Duplicates · Clean Text · Standardize Formats | | **Core** | All 9 tools | A Lite user opening a Core-only tool sees an "Upgrade your license" prompt. The home page also shows a 🔒 Locked badge on tool cards your tier doesn't unlock. To upgrade, paste a Core blob on the Activate page. @@ -53,15 +53,15 @@ Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md). | # | Tool | Purpose | Status | |---|------|---------|--------| -| 01 | Deduplicator | Exact + fuzzy match, 5 normalizers, audit | Ready | -| 02 | Text Cleaner | Whitespace, smart chars, BOM, line endings, case ops | Ready | -| 03 | Format Standardizer | Dates / phones / emails / addresses / names / currencies / booleans | Ready | -| 04 | Missing Value Handler | Disguised nulls, imputation, drop-by-threshold | Coming Soon | -| 05 | Column Mapper | Rename + enforce schema | Coming Soon | -| 06 | Outlier Detector | z-score, IQR, multivariate | Coming Soon | -| 07 | Multi-File Merger | Combine multiple files | Coming Soon | -| 08 | Validator & Reporter | Rules + PDF/Excel report | Coming Soon | -| 09 | Pipeline Runner | One-click multi-tool launcher | Coming Soon | +| 01 | Find Duplicates | Exact + fuzzy match, 5 normalizers, audit | Ready | +| 02 | Clean Text | Whitespace, smart chars, BOM, line endings, case ops | Ready | +| 03 | Standardize Formats | Dates / phones / emails / addresses / names / currencies / booleans | Ready | +| 04 | Fix Missing Values | Disguised nulls, imputation, drop-by-threshold | Coming Soon | +| 05 | Map Columns | Rename + enforce schema | Coming Soon | +| 06 | Find Unusual Values | z-score, IQR, multivariate | Coming Soon | 
+| 07 | Combine Files | Combine multiple files | Coming Soon | +| 08 | Quality Check | Rules + PDF/Excel report | Coming Soon | +| 09 | Automated Workflows | One-click multi-tool launcher | Coming Soon | **Sample data** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`. @@ -89,17 +89,17 @@ Get help: `deduplicator --help`. Full reference: [CLI-REFERENCE.md](CLI-REFERENC ### 3.3 Run order (when running tools manually) -If you skip the Pipeline Runner, follow this order: +If you skip Automated Workflows, follow this order: -1. **02 Text Cleaner** first — normalizes whitespace + special chars. -2. **03 Format Standardizer** — dates, phones, etc. need cleaned text. -3. **04 Missing Value Handler** — sentinel codes hide as numbers. -4. **05 Column Mapper** — schema before outlier stats. -5. **06 Outlier Detector** — needs clean numerics. Stats on data with `NaN` or `-999` are mathematically poisoned. -6. **07 Multi-File Merger**, **08 Validator** as needed. -7. **01 Deduplicator** is order-flexible (normalizes internally for matching). +1. **02 Clean Text** first — normalizes whitespace + special chars. +2. **03 Standardize Formats** — dates, phones, etc. need cleaned text. +3. **04 Fix Missing Values** — sentinel codes hide as numbers. +4. **05 Map Columns** — schema before outlier stats. +5. **06 Find Unusual Values** — needs clean numerics. Stats on data with `NaN` or `-999` are mathematically poisoned. +6. **07 Combine Files**, **08 Quality Check** as needed. +7. **01 Find Duplicates** is order-flexible (normalizes internally for matching). -The Pipeline Runner enforces this automatically. +Automated Workflows enforces this automatically. ### 3.4 Language diff --git a/landing/bookkeeper/index.html b/landing/bookkeeper/index.html index 6fba4cc..8e9551a 100644 --- a/landing/bookkeeper/index.html +++ b/landing/bookkeeper/index.html @@ -251,12 +251,12 @@ row,column,field_type,old,new
In the bundle

Six tools. One pipeline. One $49 download.

-

1 · Deduplicator

Fuzzy match (Jaro-Winkler), explicit strategies for Date+Amount+Vendor, survivor rules.

-

2 · Text Cleaner

Header whitespace, smart quotes from copy-paste, em-dash sentinels.

-

3 · Format Standardizer

ISO dates, numeric amounts (parens-negative), vendor casing, multi-currency.

-

4 · Missing Value Handler

Disguised-null detection: , N/A, (blank), ?.

-

5 · Column Mapper

Project to your accounting tool's required schema, coerce types, drop extras.

-

6 · Pipeline Runner

Save the cleanup. Run it on next month's export with one command. Same audit, automated.

+

1 · Find Duplicates

Fuzzy match (Jaro-Winkler), explicit strategies for Date+Amount+Vendor, survivor rules.

+

2 · Clean Text

Header whitespace, smart quotes from copy-paste, em-dash sentinels.

+

3 · Standardize Formats

ISO dates, numeric amounts (parens-negative), vendor casing, multi-currency.

+

4 · Fix Missing Values

Disguised-null detection: —, N/A, (blank), ?.

+

5 · Map Columns

Project to your accounting tool's required schema, coerce types, drop extras.

+

6 · Automated Workflows

Save the cleanup. Run it on next month's export with one command. Same audit, automated.

diff --git a/landing/index.html b/landing/index.html index fc38f10..7696a46 100644 --- a/landing/index.html +++ b/landing/index.html @@ -168,9 +168,9 @@

One engine. Same six tools. Same $49.

The persona pages above are positioning, not different products. - Whichever you buy, you get the full bundle: Deduplicator, Text - Cleaner, Format Standardizer, Missing-Value Handler, Column - Mapper, and Pipeline Runner — pre-tuned with a saved pipeline + Whichever you buy, you get the full bundle: Find Duplicates, Clean + Text, Standardize Formats, Fix Missing Values, Map Columns, + and Automated Workflows — pre-tuned with a saved pipeline that matches your workflow.

diff --git a/landing/revops/index.html b/landing/revops/index.html index 7ef3da5..e51cf69 100644 --- a/landing/revops/index.html +++ b/landing/revops/index.html @@ -165,7 +165,7 @@
🌍

Multi-platform audience reconciliation

-

Build one canonical audience from Meta, Google Ads, LinkedIn, and your CRM. Each platform exports a different shape; column-mapper aligns them all, dedup merges the survivors with their most-complete fields.

+

Build one canonical audience from Meta, Google Ads, LinkedIn, and your CRM. Each platform exports a different shape; Map Columns aligns them all, dedup merges the survivors with their most-complete fields.

🛡️ @@ -192,7 +192,7 @@
  • Per-row country column drives the parser — no global default that buckets UK numbers as malformed US.
  • Country-name normalization: USA / US / United States all resolve to the same ISO-2 code.
  • 50+ country support via Google's libphonenumber, including KR, CN, IN, MX, BR, IL, TR, PL, DK, SE.
  • -
  • Schema enforcement via the column-mapper: project to your CRM's required shape, coerce score columns to integers, reorder fields to match the import contract.
  • +
  • Schema enforcement via Map Columns: project to your CRM's required shape, coerce score columns to integers, reorder fields to match the import contract.
  • @@ -249,12 +249,12 @@ Total elapsed: 6.7 s
    In the bundle

    Six tools. One pipeline. One $49 download.

    -

    1 · Deduplicator

    Fuzzy match across email + phone + name + company; merge survivors with most-complete fields.

    -

    2 · Text Cleaner

    Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.

    -

    3 · Format Standardizer

    E.164 phones with per-row country, canonical emails, name casing, ISO dates.

    -

    4 · Missing Value Handler

    Detect TBD, (unknown), across vendor exports.

    -

    5 · Column Mapper

    Project to your CRM's required schema, coerce score to integer, reorder for import.

    -

    6 · Pipeline Runner

    Save the cleanup as JSON. Drop next campaign's combined export on it. Same dedup, automated.

    +

    1 · Find Duplicates

    Fuzzy match across email + phone + name + company; merge survivors with most-complete fields.

    +

    2 · Clean Text

    Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.

    +

    3 · Standardize Formats

    E.164 phones with per-row country, canonical emails, name casing, ISO dates.

    +

    4 · Fix Missing Values

    Detect TBD, (unknown), — across vendor exports.

    +

    5 · Map Columns

    Project to your CRM's required schema, coerce score to integer, reorder for import.

    +

    6 · Automated Workflows

    Save the cleanup as JSON. Drop next campaign's combined export on it. Same dedup, automated.

    diff --git a/landing/shopify-pet/index.html b/landing/shopify-pet/index.html index 6058d5d..424075f 100644 --- a/landing/shopify-pet/index.html +++ b/landing/shopify-pet/index.html @@ -178,7 +178,7 @@
    🔗

    Multi-channel order consolidation

    -

    Orders from Shopify + Etsy + a wholesale spreadsheet, each with a different column for "customer email." Column-mapper aligns them; dedup merges across channels.

    +

    Orders from Shopify + Etsy + a wholesale spreadsheet, each with a different column for "customer email." Map Columns aligns them; dedup merges across channels.

    ⚙️ @@ -270,12 +270,12 @@ Total elapsed: 4.2 s
    In the bundle

    Six tools. One pipeline. One $49 download.

    -

    1 · Deduplicator

    Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, interactive review.

    -

    2 · Text Cleaner

    Whitespace, smart chars, NBSP, BOM, line endings, case ops.

    -

    3 · Format Standardizer

    Dates, phones, emails, addresses, names, currencies, booleans.

    -

    4 · Missing Value Handler

    Disguised-null detection, profile, mean/median/mode/ffill, drop strategies.

    -

    5 · Column Mapper

    Fuzzy auto-rename, target schema, type coercion, required-field defaults.

    -

    6 · Pipeline Runner

    Chain tools in recommended order, save/load JSON, automate weekly cleanups.

    +

    1 · Find Duplicates

    Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, interactive review.

    +

    2 · Clean Text

    Whitespace, smart chars, NBSP, BOM, line endings, case ops.

    +

    3 · Standardize Formats

    Dates, phones, emails, addresses, names, currencies, booleans.

    +

    4 · Fix Missing Values

    Disguised-null detection, profile, mean/median/mode/ffill, drop strategies.

    +

    5 · Map Columns

    Fuzzy auto-rename, target schema, type coercion, required-field defaults.

    +

    6 · Automated Workflows

    Chain tools in recommended order, save/load JSON, automate weekly cleanups.

    diff --git a/src/cli_analyze.py b/src/cli_analyze.py index c2dafb3..9ed625a 100644 --- a/src/cli_analyze.py +++ b/src/cli_analyze.py @@ -45,15 +45,15 @@ app = typer.Typer( # Tool id -> friendly display name. Kept in the CLI module since the GUI has # its own version; both stay in lockstep with the actual script lineup. _TOOL_DISPLAY = { - "01_deduplicator": "Deduplicator", - "02_text_cleaner": "Text Cleaner", - "03_format_standardizer": "Format Standardizer", - "04_missing_handler": "Missing Value Handler", - "05_column_mapper": "Column Mapper", - "06_outlier_detector": "Outlier Detector", - "07_multi_file_merger": "Multi-File Merger", - "08_validator_reporter": "Validator & Reporter", - "09_pipeline_runner": "Pipeline Runner", + "01_deduplicator": "Find Duplicates", + "02_text_cleaner": "Clean Text", + "03_format_standardizer": "Standardize Formats", + "04_missing_handler": "Fix Missing Values", + "05_column_mapper": "Map Columns", + "06_outlier_detector": "Find Unusual Values", + "07_multi_file_merger": "Combine Files", + "08_validator_reporter": "Quality Check", + "09_pipeline_runner": "Automated Workflows", } diff --git a/src/cli_column_map.py b/src/cli_column_map.py index 05cb6a5..68334d8 100644 --- a/src/cli_column_map.py +++ b/src/cli_column_map.py @@ -1,4 +1,4 @@ -"""CLI for the DataTools Column Mapper (script 05). +"""CLI for the DataTools Map Columns tool (script 05). Usage: python -m src.cli_column_map input.csv # auto-mapping preview diff --git a/src/cli_format.py b/src/cli_format.py index 766be06..c8eea06 100644 --- a/src/cli_format.py +++ b/src/cli_format.py @@ -1,4 +1,4 @@ -"""CLI for the DataTools Format Standardizer (script 03). +"""CLI for the DataTools Standardize Formats tool (script 03). Usage: python -m src.cli_format input.csv \\ diff --git a/src/cli_missing.py b/src/cli_missing.py index e33a315..64f013b 100644 --- a/src/cli_missing.py +++ b/src/cli_missing.py @@ -1,4 +1,4 @@ -"""CLI for the DataTools Missing Value Handler (script 04). 
+"""CLI for the DataTools Fix Missing Values tool (script 04). Usage: python -m src.cli_missing input.csv # profile only diff --git a/src/cli_pipeline.py b/src/cli_pipeline.py index e1b40a7..8cc4cb0 100644 --- a/src/cli_pipeline.py +++ b/src/cli_pipeline.py @@ -1,4 +1,4 @@ -"""CLI for the DataTools Pipeline Runner (script 09). +"""CLI for the DataTools Automated Workflows tool (script 09). Usage: # Run the recommended default pipeline (text → format → missing → dedup): diff --git a/src/core/column_mapper.py b/src/core/column_mapper.py index 41e2abc..c8d8339 100644 --- a/src/core/column_mapper.py +++ b/src/core/column_mapper.py @@ -1,4 +1,4 @@ -"""DataTools Column Mapper. +"""DataTools Map Columns. Rename columns, enforce a target schema, coerce types, drop / add / reorder columns. Designed for the three buyer profiles the toolkit diff --git a/src/core/missing.py b/src/core/missing.py index e0a32a2..ddf5b5e 100644 --- a/src/core/missing.py +++ b/src/core/missing.py @@ -1,4 +1,4 @@ -"""DataTools Missing Value Handler. +"""DataTools Fix Missing Values. Detects disguised nulls, profiles missingness per column, and applies imputation or drop strategies with a full audit trail. diff --git a/src/core/pipeline.py b/src/core/pipeline.py index 9ec2f9b..52c9133 100644 --- a/src/core/pipeline.py +++ b/src/core/pipeline.py @@ -1,4 +1,4 @@ -"""DataTools Pipeline Runner. +"""DataTools Automated Workflows. Chain the cleaning tools (text-clean, format-standardize, missing, column-map, dedup) into a single orchestrated workflow. 
The pipeline diff --git a/src/gui/__init__.py b/src/gui/__init__.py index f6097fe..544493c 100644 --- a/src/gui/__init__.py +++ b/src/gui/__init__.py @@ -1 +1 @@ -"""Streamlit GUI for the DataTools Deduplicator.""" +"""Streamlit GUI for DataTools.""" diff --git a/src/gui/components/__init__.py b/src/gui/components/__init__.py index 4b590b8..c239ccf 100644 --- a/src/gui/components/__init__.py +++ b/src/gui/components/__init__.py @@ -16,7 +16,7 @@ they need without dragging the entire kitchen-sink module: dedup_review.py ← dedup match-group cards + review pipeline shared.py ← chrome / file-pickup helpers used by every tool -A standalone Deduplicator build, for example, can ship without +A standalone Find Duplicates build, for example, can ship without ``findings.py`` and ``gate.py`` — those modules import the analyzer / gate code that the Lite SKU does not include. diff --git a/src/gui/components/_legacy.py b/src/gui/components/_legacy.py index 1e52f07..0c59965 100644 --- a/src/gui/components/_legacy.py +++ b/src/gui/components/_legacy.py @@ -847,15 +847,15 @@ def _build_match_groups_csv( # Tool id -> friendly display name. Single source of truth for the GUI; the # CLI keeps its own copy so each entrypoint stays self-contained. 
TOOL_DISPLAY_NAMES: dict[str, str] = { - "01_deduplicator": "Deduplicator", - "02_text_cleaner": "Text Cleaner", - "03_format_standardizer": "Format Standardizer", - "04_missing_handler": "Missing Value Handler", - "05_column_mapper": "Column Mapper", - "06_outlier_detector": "Outlier Detector", - "07_multi_file_merger": "Multi-File Merger", - "08_validator_reporter": "Validator & Reporter", - "09_pipeline_runner": "Pipeline Runner", + "01_deduplicator": "Find Duplicates", + "02_text_cleaner": "Clean Text", + "03_format_standardizer": "Standardize Formats", + "04_missing_handler": "Fix Missing Values", + "05_column_mapper": "Map Columns", + "06_outlier_detector": "Find Unusual Values", + "07_multi_file_merger": "Combine Files", + "08_validator_reporter": "Quality Check", + "09_pipeline_runner": "Automated Workflows", } _SEVERITY_ICON: dict[str, str] = { @@ -1016,7 +1016,7 @@ def render_hidden_aware_preview( ) -> None: """Render a DataFrame preview that shows hidden characters in every cell. - Used for the Text Cleaner's "before" and "after" previews so the user + Used for the Clean Text tool's "before" and "after" previews so the user can actually see the leading/trailing whitespace, NBSP padding, zero-width characters, and smart punctuation that the cleaner is going to remove (or just removed). 
A plain ``st.dataframe`` collapses outer diff --git a/src/gui/pages/1_Deduplicator.py b/src/gui/pages/1_Deduplicator.py index 48b98f6..dd0a1f9 100644 --- a/src/gui/pages/1_Deduplicator.py +++ b/src/gui/pages/1_Deduplicator.py @@ -1,4 +1,4 @@ -"""DataTools Deduplicator — full working tool page.""" +"""DataTools Find Duplicates — full working tool page.""" from __future__ import annotations diff --git a/src/gui/pages/2_Text_Cleaner.py b/src/gui/pages/2_Text_Cleaner.py index dae734f..60c34ce 100644 --- a/src/gui/pages/2_Text_Cleaner.py +++ b/src/gui/pages/2_Text_Cleaner.py @@ -1,4 +1,4 @@ -"""DataTools Text Cleaner — Streamlit page.""" +"""DataTools Clean Text — Streamlit page.""" from __future__ import annotations diff --git a/src/gui/pages/3_Format_Standardizer.py b/src/gui/pages/3_Format_Standardizer.py index 46f91b8..76f9ae0 100644 --- a/src/gui/pages/3_Format_Standardizer.py +++ b/src/gui/pages/3_Format_Standardizer.py @@ -1,4 +1,4 @@ -"""DataTools Format Standardizer — Streamlit page.""" +"""DataTools Standardize Formats — Streamlit page.""" from __future__ import annotations diff --git a/src/gui/pages/4_Missing_Values.py b/src/gui/pages/4_Missing_Values.py index 6b5de24..08baf0d 100644 --- a/src/gui/pages/4_Missing_Values.py +++ b/src/gui/pages/4_Missing_Values.py @@ -1,4 +1,4 @@ -"""DataTools Missing Value Handler — Streamlit page.""" +"""DataTools Fix Missing Values — Streamlit page.""" from __future__ import annotations diff --git a/src/gui/pages/5_Column_Mapper.py b/src/gui/pages/5_Column_Mapper.py index 818332d..9ba1e68 100644 --- a/src/gui/pages/5_Column_Mapper.py +++ b/src/gui/pages/5_Column_Mapper.py @@ -1,4 +1,4 @@ -"""DataTools Column Mapper — Streamlit page.""" +"""DataTools Map Columns — Streamlit page.""" from __future__ import annotations diff --git a/src/gui/pages/6_Outlier_Detector.py b/src/gui/pages/6_Outlier_Detector.py index b26cf4b..57b01f7 100644 --- a/src/gui/pages/6_Outlier_Detector.py +++ b/src/gui/pages/6_Outlier_Detector.py @@ -1,4 
+1,4 @@ -"""DataTools Outlier Detector — stub page.""" +"""DataTools Find Unusual Values — stub page.""" from __future__ import annotations diff --git a/src/gui/pages/7_Multi_File_Merger.py b/src/gui/pages/7_Multi_File_Merger.py index c58870f..50f9fff 100644 --- a/src/gui/pages/7_Multi_File_Merger.py +++ b/src/gui/pages/7_Multi_File_Merger.py @@ -1,4 +1,4 @@ -"""DataTools Multi-File Merger — stub page.""" +"""DataTools Combine Files — stub page.""" from __future__ import annotations diff --git a/src/gui/pages/8_Validator_Reporter.py b/src/gui/pages/8_Validator_Reporter.py index f3d2333..f94ce3d 100644 --- a/src/gui/pages/8_Validator_Reporter.py +++ b/src/gui/pages/8_Validator_Reporter.py @@ -1,4 +1,4 @@ -"""DataTools Validator & Reporter — stub page.""" +"""DataTools Quality Check — stub page.""" from __future__ import annotations diff --git a/src/gui/pages/9_Pipeline_Runner.py b/src/gui/pages/9_Pipeline_Runner.py index 95b78fe..9f58e04 100644 --- a/src/gui/pages/9_Pipeline_Runner.py +++ b/src/gui/pages/9_Pipeline_Runner.py @@ -1,4 +1,4 @@ -"""DataTools Pipeline Runner — Streamlit page.""" +"""DataTools Automated Workflows — Streamlit page.""" from __future__ import annotations diff --git a/test-cases/column-mapper-corpus/README.md b/test-cases/column-mapper-corpus/README.md index 956032a..c12f6e5 100644 --- a/test-cases/column-mapper-corpus/README.md +++ b/test-cases/column-mapper-corpus/README.md @@ -1,4 +1,4 @@ -# Column Mapper — corpus +# Map Columns — corpus Acceptance fixtures for `src/core/column_mapper.py`. Each `.csv` under `test_data/` is paired with assertions in diff --git a/test-cases/missing-corpus/README.md b/test-cases/missing-corpus/README.md index d76e9dd..a7f314b 100644 --- a/test-cases/missing-corpus/README.md +++ b/test-cases/missing-corpus/README.md @@ -1,4 +1,4 @@ -# Missing Value Handler — corpus +# Fix Missing Values — corpus Acceptance fixtures for `src/core/missing.py`. 
Each `.csv` under `test_data/` is paired with assertions in `tests/test_missing_corpus.py`. diff --git a/test-cases/text-cleaner-corpus/README.md b/test-cases/text-cleaner-corpus/README.md index 59416e1..1315e62 100644 --- a/test-cases/text-cleaner-corpus/README.md +++ b/test-cases/text-cleaner-corpus/README.md @@ -1,4 +1,4 @@ -# Text Cleaner Test Corpus +# Clean Text Test Corpus Test fixtures for `02_text_cleaner.py` (Excel & CSV Data Cleaning Mastery Bundle). diff --git a/tests/gui/test_activation.py b/tests/gui/test_activation.py index f3a484d..e53c1c0 100644 --- a/tests/gui/test_activation.py +++ b/tests/gui/test_activation.py @@ -3,7 +3,7 @@ These exercise the chrome-level gate that ``hide_streamlit_chrome`` installs: when no valid license is on disk, every page renders the activation form instead of the page body, and tool widgets do NOT -appear. We test against the Deduplicator page since it's the smallest +appear. We test against the Find Duplicates page since it's the smallest real-world tool that depends on chrome. The autouse fixture in ``tests/conftest.py`` sets diff --git a/tests/gui/test_advanced_panels.py b/tests/gui/test_advanced_panels.py index ae0b200..be5c30a 100644 --- a/tests/gui/test_advanced_panels.py +++ b/tests/gui/test_advanced_panels.py @@ -5,7 +5,7 @@ expander that houses every per-column / per-strategy knob. It's the densest single widget surface in the GUI, so a session-state key drift in there cascades into every dedup session. -We exercise it via the Deduplicator page (rendering ``config_panel`` +We exercise it via the Find Duplicates page (rendering ``config_panel`` in isolation requires a fake Streamlit context). The page provides the surrounding state; we poke widgets and verify their effects. 
""" diff --git a/tests/gui/test_dedup_review.py b/tests/gui/test_dedup_review.py index bcb4ae0..c11338d 100644 --- a/tests/gui/test_dedup_review.py +++ b/tests/gui/test_dedup_review.py @@ -2,7 +2,7 @@ ``match_group_card`` from ``src.gui.components`` has two modes (decided / undecided) and a Confirm/Undo flow keyed by session_state. We test -each state by exercising the parent Deduplicator page end to end and +each state by exercising the parent Find Duplicates page end to end and then poking at ``review_decisions`` directly. Why not unit-test ``match_group_card`` in isolation? AppTest needs a diff --git a/tests/gui/test_errors.py b/tests/gui/test_errors.py index 22abde4..9a95fce 100644 --- a/tests/gui/test_errors.py +++ b/tests/gui/test_errors.py @@ -21,7 +21,7 @@ from .conftest import collected_text, stash_upload # --------------------------------------------------------------------------- class TestMalformedUploadErrors: - """Bytes that look like a CSV but aren't parseable. The Deduplicator + """Bytes that look like a CSV but aren't parseable. The Find Duplicates page wraps ``read_file`` failures in an ``st.error`` with the file name and the structured ``format_for_user`` output.""" diff --git a/tests/gui/test_gate.py b/tests/gui/test_gate.py index 3aeac14..d1a9339 100644 --- a/tests/gui/test_gate.py +++ b/tests/gui/test_gate.py @@ -11,7 +11,7 @@ exist, each pinned here: 3. **Upload + matching passed normalization** — gate is a no-op; the page proceeds. -We exercise the gate via the Deduplicator page (any tool page would +We exercise the gate via the Find Duplicates page (any tool page would work; dedup is the smallest one that doesn't depend on heavy widgets). """ @@ -27,7 +27,7 @@ from .conftest import ( ) -# Deduplicator is our canary — it calls ``require_normalization_gate`` +# Find Duplicates is our canary — it calls ``require_normalization_gate`` # on the second line of the module. If the gate blocks, the dedup- # specific title shouldn't even render. 
GATED_PAGE = "1_Deduplicator" diff --git a/tests/gui/test_lite_tier.py b/tests/gui/test_lite_tier.py index f05dc24..7e08301 100644 --- a/tests/gui/test_lite_tier.py +++ b/tests/gui/test_lite_tier.py @@ -1,9 +1,9 @@ """GUI tests for the Lite tier. -A Lite license unlocks Deduplicator, Text Cleaner, Format -Standardizer. Opening any other tool page (Missing Values, Column -Mapper, Pipeline Runner, etc.) must render an upgrade prompt and -short-circuit the page body. +A Lite license unlocks Find Duplicates, Clean Text, Standardize +Formats. Opening any other tool page (Fix Missing Values, Map +Columns, Automated Workflows, etc.) must render an upgrade prompt +and short-circuit the page body. The home grid shows a 🔒 Locked badge on the cards for tools the user's tier doesn't unlock. @@ -104,7 +104,7 @@ class TestLiteHomeGridBadges: ): home_app.run() text = collected_text(home_app) - # Missing Value Handler is locked under Lite — its card should + # Fix Missing Values is locked under Lite — its card should # have a 🔒 Locked badge. # We assert the lock glyph appears alongside the locked tool's # display name. 
Streamlit renders the markdown verbatim so the diff --git a/tests/gui/test_workflows.py b/tests/gui/test_workflows.py index c8e91d0..77384c4 100644 --- a/tests/gui/test_workflows.py +++ b/tests/gui/test_workflows.py @@ -19,7 +19,7 @@ from .conftest import collected_text, stash_upload # --------------------------------------------------------------------------- -# Deduplicator +# Find Duplicates # --------------------------------------------------------------------------- class TestDeduplicatorWorkflow: @@ -64,7 +64,7 @@ class TestDeduplicatorWorkflow: # --------------------------------------------------------------------------- -# Text Cleaner +# Clean Text # --------------------------------------------------------------------------- class TestTextCleanerWorkflow: @@ -96,7 +96,7 @@ class TestTextCleanerWorkflow: # --------------------------------------------------------------------------- -# Format Standardizer +# Standardize Formats # --------------------------------------------------------------------------- class TestFormatStandardizerWorkflow: @@ -110,7 +110,7 @@ class TestFormatStandardizerWorkflow: # --------------------------------------------------------------------------- -# Missing Value Handler +# Fix Missing Values # --------------------------------------------------------------------------- class TestMissingValuesWorkflow: @@ -124,7 +124,7 @@ class TestMissingValuesWorkflow: # --------------------------------------------------------------------------- -# Column Mapper +# Map Columns # --------------------------------------------------------------------------- class TestColumnMapperWorkflow: @@ -138,7 +138,7 @@ class TestColumnMapperWorkflow: # --------------------------------------------------------------------------- -# Pipeline Runner +# Automated Workflows # --------------------------------------------------------------------------- class TestPipelineRunnerWorkflow: diff --git a/tests/test_cli_analyze.py b/tests/test_cli_analyze.py index 
6ca24b3..fb3796c 100644 --- a/tests/test_cli_analyze.py +++ b/tests/test_cli_analyze.py @@ -41,8 +41,8 @@ class TestAnalyzeCli: assert result.exit_code == 0 # The Rich table breaks lines; assert on stable substrings instead of # full finding ids. - assert "Text Cleaner" in result.stdout - assert "Missing Value" in result.stdout + assert "Clean Text" in result.stdout + assert "Fix Missing Values" in result.stdout # Severity column is rendered. assert "warn" in result.stdout diff --git a/tests/test_column_mapper_corpus.py b/tests/test_column_mapper_corpus.py index f505084..65dcd7a 100644 --- a/tests/test_column_mapper_corpus.py +++ b/tests/test_column_mapper_corpus.py @@ -1,4 +1,4 @@ -"""Acceptance corpus for the Column Mapper. +"""Acceptance corpus for the Map Columns tool. Loads every fixture in ``test-cases/column-mapper-corpus/test_data/`` and asserts the documented behaviour against the documented schema. diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 59c2a3f..e7abdc4 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -48,7 +48,7 @@ class TestAnalyzeCliE2E: proc = _run("-m", "src.cli_analyze", str(CORPUS_KITCHEN_SINK)) assert proc.returncode == 0, proc.stderr # Rich tables wrap; assert on stable substrings. - assert "Text Cleaner" in proc.stdout + assert "Clean Text" in proc.stdout assert "csv_bom_stripped" in proc.stdout or "smart_quotes" in proc.stdout def test_json_output_parses(self): diff --git a/tests/test_lite_tier.py b/tests/test_lite_tier.py index 979f905..9aabfe5 100644 --- a/tests/test_lite_tier.py +++ b/tests/test_lite_tier.py @@ -1,7 +1,7 @@ """Tier-specific tests: Lite tier feature set + gating. -Lite unlocks exactly three tools — Deduplicator, Text Cleaner, -Format Standardizer — and locks the other six. We test: +Lite unlocks exactly three tools — Find Duplicates, Clean Text, +Standardize Formats — and locks the other six. We test: - The features map for Lite returns the right three flags (and only those three). 
diff --git a/tests/test_missing_corpus.py b/tests/test_missing_corpus.py index 6a5af83..7b942cd 100644 --- a/tests/test_missing_corpus.py +++ b/tests/test_missing_corpus.py @@ -1,4 +1,4 @@ -"""Acceptance corpus for the Missing Value Handler. +"""Acceptance corpus for the Fix Missing Values tool. Loads every fixture in ``test-cases/missing-corpus/test_data/`` and asserts the documented behaviour. The fixtures are split into: diff --git a/tests/test_perf_regressions.py b/tests/test_perf_regressions.py index cc05742..f3d5715 100644 --- a/tests/test_perf_regressions.py +++ b/tests/test_perf_regressions.py @@ -25,7 +25,7 @@ from src.core import ( # --------------------------------------------------------------------------- -# Format Standardizer: single-tolist hot loop +# Standardize Formats: single-tolist hot loop # --------------------------------------------------------------------------- class TestStandardizerHotLoop: @@ -93,7 +93,7 @@ class TestStandardizerHotLoop: # --------------------------------------------------------------------------- -# Deduplicator: per-call normalizer cache +# Find Duplicates: per-call normalizer cache # --------------------------------------------------------------------------- class TestDedupNormalizerCache: