Compare commits
8 Commits
28ab51a869
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 41ab2166ef | |||
| 9943e6e537 | |||
| e7ec79b9b5 | |||
| 6df726e69e | |||
| 38616d69e2 | |||
| 00d3f28865 | |||
| 837f4b88b5 | |||
| fd9606c67b |
141
.github/workflows/build.yml
vendored
141
.github/workflows/build.yml
vendored
@@ -1,18 +1,17 @@
|
|||||||
name: Build installers
|
name: Build installers
|
||||||
|
|
||||||
# Triggers:
|
# Triggers:
|
||||||
# * Tag push (v*) → produces installers + portable zips, attaches them
|
# * Tag push (v*) → produces installers, attaches them to a GitHub Release.
|
||||||
# to a GitHub Release.
|
# * Manual dispatch → uploads the installers as workflow artifacts only.
|
||||||
# * Manual dispatch → uploads everything as workflow artifacts only.
|
|
||||||
#
|
#
|
||||||
# Outputs per platform (downloadable by buyers):
|
# Outputs per platform (downloadable by buyers):
|
||||||
# * macOS: .dmg installer + portable .zip (signed .app inside).
|
# * macOS: .dmg installer
|
||||||
# * Windows: .exe installer + portable .zip (no-install).
|
# * Windows: .exe installer
|
||||||
# * Linux: .AppImage (already portable; no separate zip).
|
# * Linux: .AppImage (already portable; no separate installer step)
|
||||||
#
|
#
|
||||||
# Self-contained: every artifact ships its own Python interpreter + every
|
# Self-contained: every artifact ships its own Python interpreter + every
|
||||||
# runtime dep through PyInstaller. No pre/post install steps on the
|
# runtime dep (including bundled Tesseract OCR) through PyInstaller. No
|
||||||
# buyer's machine.
|
# pre/post install steps on the buyer's machine.
|
||||||
#
|
#
|
||||||
# What this workflow doesn't do (yet):
|
# What this workflow doesn't do (yet):
|
||||||
# * Code signing (Mac Developer ID, Windows code-signing cert).
|
# * Windows code signing (code-signing cert). macOS Developer ID signing + notarization runs in the "Sign & notarize macOS app" step when its secrets are set.
|
||||||
@@ -40,16 +39,16 @@ jobs:
|
|||||||
include:
|
include:
|
||||||
- os: macos-latest
|
- os: macos-latest
|
||||||
platform: mac
|
platform: mac
|
||||||
installer_glob: dist/DataTools-*-mac.dmg
|
artifact_name: DataTools-mac.dmg
|
||||||
portable_glob: dist/DataTools-*-mac-portable.zip
|
artifact_path: dist/DataTools-*-mac.dmg
|
||||||
- os: windows-latest
|
- os: windows-latest
|
||||||
platform: win
|
platform: win
|
||||||
installer_glob: dist/DataTools-*-win-setup.exe
|
artifact_name: DataTools-win.exe
|
||||||
portable_glob: dist/DataTools-*-win-portable.zip
|
artifact_path: dist/DataTools-*-win-setup.exe
|
||||||
- os: ubuntu-latest
|
- os: ubuntu-latest
|
||||||
platform: linux
|
platform: linux
|
||||||
installer_glob: dist/DataTools-*-linux-x86_64.AppImage
|
artifact_name: DataTools-linux.AppImage
|
||||||
portable_glob: '' # AppImage is already a portable single file
|
artifact_path: dist/DataTools-*-linux-x86_64.AppImage
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
@@ -66,7 +65,7 @@ jobs:
|
|||||||
pip install pyinstaller pillow
|
pip install pyinstaller pillow
|
||||||
|
|
||||||
# ---- Tesseract bundling cache --------------------------------
|
# ---- Tesseract bundling cache --------------------------------
|
||||||
# The fetch logic inside build/make_release.py downloads:
|
# The fetch logic inside build/tesseract.py downloads:
|
||||||
# * build/vendor/tessdata/eng.traineddata (~16 MB, shared)
|
# * build/vendor/tessdata/eng.traineddata (~16 MB, shared)
|
||||||
# * build/_tesseract/<platform>/ (binary + libs, 30-120 MB)
|
# * build/_tesseract/<platform>/ (binary + libs, 30-120 MB)
|
||||||
# Cache both so iterative CI runs don't re-download. The
|
# Cache both so iterative CI runs don't re-download. The
|
||||||
@@ -80,9 +79,9 @@ jobs:
|
|||||||
build/vendor/tessdata
|
build/vendor/tessdata
|
||||||
key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1
|
key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1
|
||||||
|
|
||||||
# ---- Linux: install patchelf so make_release.py can rewrite
|
# ---- Linux: install patchelf so tesseract.py can rewrite
|
||||||
# RPATH on the bundled tesseract binary. apt-get install
|
# RPATH on the bundled tesseract binary. apt-get install
|
||||||
# tesseract-ocr is handled inside make_release.py itself. -----
|
# tesseract-ocr is handled inside tesseract.py itself. --------
|
||||||
- name: Install Linux build prereqs for Tesseract bundling
|
- name: Install Linux build prereqs for Tesseract bundling
|
||||||
if: matrix.os == 'ubuntu-latest'
|
if: matrix.os == 'ubuntu-latest'
|
||||||
run: |
|
run: |
|
||||||
@@ -99,9 +98,9 @@ jobs:
|
|||||||
- name: Generate platform icons
|
- name: Generate platform icons
|
||||||
run: python build/generate_icons.py
|
run: python build/generate_icons.py
|
||||||
|
|
||||||
# Stage Tesseract before PyInstaller. The make_release.py
|
# Stage Tesseract before PyInstaller. The tesseract.py helpers
|
||||||
# helpers handle the per-platform fetch (UB-Mannheim on Win,
|
# handle the per-platform fetch (UB-Mannheim on Win, brew on
|
||||||
# brew on Mac, apt on Linux) and stage the binary + libs into
|
# Mac, apt on Linux) and stage the binary + libs into
|
||||||
# build/_tesseract/<platform>/ where the spec picks them up.
|
# build/_tesseract/<platform>/ where the spec picks them up.
|
||||||
# We invoke a tiny inline Python so the workflow doesn't have
|
# We invoke a tiny inline Python so the workflow doesn't have
|
||||||
# to know the per-platform target string.
|
# to know the per-platform target string.
|
||||||
@@ -113,7 +112,7 @@ jobs:
|
|||||||
python - <<'PY'
|
python - <<'PY'
|
||||||
import os, sys
|
import os, sys
|
||||||
sys.path.insert(0, "build")
|
sys.path.insert(0, "build")
|
||||||
from make_release import fetch_tessdata, fetch_tesseract_for_platform
|
from tesseract import fetch_tessdata, fetch_tesseract_for_platform
|
||||||
target = os.environ["DATATOOLS_PLATFORM"]
|
target = os.environ["DATATOOLS_PLATFORM"]
|
||||||
fetch_tessdata()
|
fetch_tessdata()
|
||||||
fetch_tesseract_for_platform(target)
|
fetch_tesseract_for_platform(target)
|
||||||
@@ -127,16 +126,81 @@ jobs:
|
|||||||
DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }}
|
DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }}
|
||||||
run: pyinstaller build/datatools.spec --clean --noconfirm
|
run: pyinstaller build/datatools.spec --clean --noconfirm
|
||||||
|
|
||||||
|
# ---- macOS code signing + notarization (before DMG packaging) -
|
||||||
|
# Signs dist/DataTools.app with the Developer ID, notarizes it,
|
||||||
|
# and staples the ticket so Gatekeeper passes offline. Wrapped in
|
||||||
|
# a guard: if the cert secret is absent the step prints a warning
|
||||||
|
# and exits 0, so dry-run dispatches still produce an (unsigned)
|
||||||
|
# build. Secret names match build/README.md "Signing".
|
||||||
|
- name: Sign & notarize macOS app
|
||||||
|
if: matrix.os == 'macos-latest'
|
||||||
|
env:
|
||||||
|
CERT_P12_BASE64: ${{ secrets.MACOS_DEVELOPER_ID_CERT_P12_BASE64 }}
|
||||||
|
CERT_PASSWORD: ${{ secrets.MACOS_DEVELOPER_ID_CERT_PASSWORD }}
|
||||||
|
NOTARY_APPLE_ID: ${{ secrets.MACOS_NOTARY_APPLE_ID }}
|
||||||
|
NOTARY_TEAM_ID: ${{ secrets.MACOS_NOTARY_TEAM_ID }}
|
||||||
|
NOTARY_PASSWORD: ${{ secrets.MACOS_NOTARY_PASSWORD }}
|
||||||
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
if [ -z "${CERT_P12_BASE64:-}" ]; then
|
||||||
|
echo "::warning::MACOS_DEVELOPER_ID_CERT_P12_BASE64 not set — shipping an UNSIGNED build (Gatekeeper will warn buyers)."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
APP="dist/DataTools.app"
|
||||||
|
|
||||||
|
# 1. Import the Developer ID cert into an ephemeral keychain.
|
||||||
|
KEYCHAIN="$RUNNER_TEMP/build.keychain-db"
|
||||||
|
KEYCHAIN_PW="$(uuidgen)"
|
||||||
|
security create-keychain -p "$KEYCHAIN_PW" "$KEYCHAIN"
|
||||||
|
security set-keychain-settings -lut 3600 "$KEYCHAIN"
|
||||||
|
security unlock-keychain -p "$KEYCHAIN_PW" "$KEYCHAIN"
|
||||||
|
echo "$CERT_P12_BASE64" | base64 --decode > "$RUNNER_TEMP/cert.p12"
|
||||||
|
security import "$RUNNER_TEMP/cert.p12" -k "$KEYCHAIN" -P "$CERT_PASSWORD" \
|
||||||
|
-T /usr/bin/codesign
|
||||||
|
security set-key-partition-list -S apple-tool:,apple: -s -k "$KEYCHAIN_PW" "$KEYCHAIN" >/dev/null
|
||||||
|
# Make the ephemeral keychain searchable (preserve the login keychain).
|
||||||
|
security list-keychains -d user -s "$KEYCHAIN" \
|
||||||
|
$(security list-keychains -d user | sed 's/"//g')
|
||||||
|
|
||||||
|
IDENTITY="$(security find-identity -v -p codesigning "$KEYCHAIN" \
|
||||||
|
| grep 'Developer ID Application' | head -1 | awk -F'"' '{print $2}')"
|
||||||
|
if [ -z "$IDENTITY" ]; then
|
||||||
|
echo "::error::No 'Developer ID Application' identity found in the imported cert."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Signing with: $IDENTITY"
|
||||||
|
|
||||||
|
# 2. Sign the bundle (hardened runtime + secure timestamp + entitlements).
|
||||||
|
# --deep also signs the nested dylibs/.so files that the PyInstaller bundle carries.
|
||||||
|
codesign --deep --force --options runtime --timestamp \
|
||||||
|
--entitlements build/macos/entitlements.plist \
|
||||||
|
--sign "$IDENTITY" "$APP"
|
||||||
|
codesign --verify --strict --verbose=2 "$APP"
|
||||||
|
|
||||||
|
# 3. Notarize the .app (notarytool needs a zip/dmg/pkg, not a bare .app),
|
||||||
|
# then staple so Gatekeeper validates offline.
|
||||||
|
if [ -n "${NOTARY_APPLE_ID:-}" ]; then
|
||||||
|
ditto -c -k --keepParent "$APP" "$RUNNER_TEMP/DataTools.zip"
|
||||||
|
xcrun notarytool submit "$RUNNER_TEMP/DataTools.zip" \
|
||||||
|
--apple-id "$NOTARY_APPLE_ID" \
|
||||||
|
--team-id "$NOTARY_TEAM_ID" \
|
||||||
|
--password "$NOTARY_PASSWORD" \
|
||||||
|
--wait
|
||||||
|
xcrun stapler staple "$APP"
|
||||||
|
xcrun stapler validate "$APP"
|
||||||
|
else
|
||||||
|
echo "::warning::Notary credentials not set — app is signed but NOT notarized (Gatekeeper will still warn)."
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f "$RUNNER_TEMP/cert.p12"
|
||||||
|
|
||||||
# ---- Per-platform installer packaging ------------------------
|
# ---- Per-platform installer packaging ------------------------
|
||||||
|
|
||||||
- name: Package macOS DMG (installer)
|
- name: Package macOS DMG (installer)
|
||||||
if: matrix.os == 'macos-latest'
|
if: matrix.os == 'macos-latest'
|
||||||
run: bash build/macos/build_dmg.sh "${{ steps.version.outputs.version }}"
|
run: bash build/macos/build_dmg.sh "${{ steps.version.outputs.version }}"
|
||||||
|
|
||||||
- name: Package macOS portable .zip
|
|
||||||
if: matrix.os == 'macos-latest'
|
|
||||||
run: bash build/macos/build_zip.sh "${{ steps.version.outputs.version }}"
|
|
||||||
|
|
||||||
- name: Install Inno Setup (Windows)
|
- name: Install Inno Setup (Windows)
|
||||||
if: matrix.os == 'windows-latest'
|
if: matrix.os == 'windows-latest'
|
||||||
run: choco install innosetup --no-progress -y
|
run: choco install innosetup --no-progress -y
|
||||||
@@ -147,10 +211,6 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
iscc /DAppVersion=${{ steps.version.outputs.version }} build\installer.iss
|
iscc /DAppVersion=${{ steps.version.outputs.version }} build\installer.iss
|
||||||
|
|
||||||
- name: Package Windows portable .zip
|
|
||||||
if: matrix.os == 'windows-latest'
|
|
||||||
run: python build/build_portable_zip.py win ${{ steps.version.outputs.version }}
|
|
||||||
|
|
||||||
- name: Install AppImage tooling (Linux)
|
- name: Install AppImage tooling (Linux)
|
||||||
if: matrix.os == 'ubuntu-latest'
|
if: matrix.os == 'ubuntu-latest'
|
||||||
run: |
|
run: |
|
||||||
@@ -168,29 +228,14 @@ jobs:
|
|||||||
- name: Upload installer artifact
|
- name: Upload installer artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: DataTools-${{ matrix.platform }}-installer
|
name: ${{ matrix.artifact_name }}
|
||||||
path: ${{ matrix.installer_glob }}
|
path: ${{ matrix.artifact_path }}
|
||||||
if-no-files-found: error
|
if-no-files-found: error
|
||||||
|
|
||||||
- name: Upload portable artifact
|
- name: Attach to Release (tag push only)
|
||||||
if: matrix.portable_glob != ''
|
|
||||||
uses: actions/upload-artifact@v4
|
|
||||||
with:
|
|
||||||
name: DataTools-${{ matrix.platform }}-portable
|
|
||||||
path: ${{ matrix.portable_glob }}
|
|
||||||
if-no-files-found: error
|
|
||||||
|
|
||||||
- name: Attach installer to Release (tag push only)
|
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
uses: softprops/action-gh-release@v2
|
uses: softprops/action-gh-release@v2
|
||||||
with:
|
with:
|
||||||
files: ${{ matrix.installer_glob }}
|
files: ${{ matrix.artifact_path }}
|
||||||
fail_on_unmatched_files: true
|
fail_on_unmatched_files: true
|
||||||
generate_release_notes: true
|
generate_release_notes: true
|
||||||
|
|
||||||
- name: Attach portable to Release (tag push only)
|
|
||||||
if: startsWith(github.ref, 'refs/tags/v') && matrix.portable_glob != ''
|
|
||||||
uses: softprops/action-gh-release@v2
|
|
||||||
with:
|
|
||||||
files: ${{ matrix.portable_glob }}
|
|
||||||
fail_on_unmatched_files: true
|
|
||||||
|
|||||||
14
README.es.md
14
README.es.md
@@ -22,15 +22,15 @@ Cada página de herramienta incluye una ventana emergente de **Help** (a la dere
|
|||||||
|
|
||||||
## Descarga (usuarios no técnicos)
|
## Descarga (usuarios no técnicos)
|
||||||
|
|
||||||
Paquetes precompilados — sin instalar Python, sin permisos de administrador, sin internet en ejecución. Cada versión ofrece dos formatos por sistema operativo: un **instalador** que crea accesos directos en el escritorio + menú Inicio / Launchpad, y un **.zip portable** que descomprimes y haces doble clic. Elige el que te permita tu política de TI.
|
Paquetes precompilados — sin instalar Python, sin permisos de administrador, sin internet en ejecución. Cada versión ofrece un **instalador** por sistema operativo que crea accesos directos en el escritorio + menú Inicio / Launchpad.
|
||||||
|
|
||||||
| Plataforma | Instalador (recomendado) | Portable (sin instalar) |
|
| Plataforma | Instalador |
|
||||||
|---|---|---|
|
|---|---|
|
||||||
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — ábrelo, arrastra DataTools.app a /Applications, ejecútalo desde Launchpad. | `DataTools-X.Y.Z-mac-portable.zip` — descomprime donde quieras, doble clic en `DataTools.app`. |
|
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — ábrelo, arrastra DataTools.app a /Applications, ejecútalo desde Launchpad. |
|
||||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. | `DataTools-X.Y.Z-win-portable.zip` — descomprime donde quieras, doble clic en `DataTools.exe`. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. |
|
||||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. | El AppImage ya es portable. |
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. El AppImage ya es portable. |
|
||||||
|
|
||||||
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~300 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~300 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo.
|
||||||
|
|
||||||
**Tesseract OCR viene incluido.** El soporte para PDFs escaneados del Extractor de PDF funciona sin configuración adicional en las tres plataformas — no hace falta instalar Tesseract por separado. Atribución de licencia: ver [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
**Tesseract OCR viene incluido.** El soporte para PDFs escaneados del Extractor de PDF funciona sin configuración adicional en las tres plataformas — no hace falta instalar Tesseract por separado. Atribución de licencia: ver [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
|
|||||||
14
README.md
14
README.md
@@ -22,15 +22,15 @@ Every tool page has an in-tool **Help** popover (right of the title) with a comp
|
|||||||
|
|
||||||
## Download (non-technical users)
|
## Download (non-technical users)
|
||||||
|
|
||||||
Pre-built bundles — no Python install, no admin rights, no internet at runtime. Each release ships two flavors per OS: an **installer** that wires up Desktop + Start Menu / Launchpad shortcuts, and a **portable .zip** you unzip and double-click. Pick whichever your IT policy allows.
|
Pre-built bundles — no Python install, no admin rights, no internet at runtime. Each release ships an **installer** per OS that wires up Desktop + Start Menu / Launchpad shortcuts.
|
||||||
|
|
||||||
| Platform | Installer (recommended) | Portable (no install) |
|
| Platform | Installer |
|
||||||
|---|---|---|
|
|---|---|
|
||||||
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — open, drag DataTools.app into /Applications, launch from Launchpad. | `DataTools-X.Y.Z-mac-portable.zip` — unzip anywhere, double-click `DataTools.app`. |
|
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — open, drag DataTools.app into /Applications, launch from Launchpad. |
|
||||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. | `DataTools-X.Y.Z-win-portable.zip` — unzip anywhere, double-click `DataTools.exe`. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. |
|
||||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. | The AppImage is already portable. |
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. The AppImage is already portable. |
|
||||||
|
|
||||||
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~300 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~300 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine.
|
||||||
|
|
||||||
**Tesseract OCR is bundled.** Scanned-PDF support in the PDF Extractor works out of the box on all three platforms — no separate Tesseract install required. License attribution: see [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
**Tesseract OCR is bundled.** Scanned-PDF support in the PDF Extractor works out of the box on all three platforms — no separate Tesseract install required. License attribution: see [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
|
|||||||
111
build/README.md
111
build/README.md
@@ -23,14 +23,12 @@ build/
|
|||||||
├── generate_icons.py Builds icon.ico / icon.icns / icon.png from
|
├── generate_icons.py Builds icon.ico / icon.icns / icon.png from
|
||||||
│ src/gui/assets/datatools_icon_256.png. Run
|
│ src/gui/assets/datatools_icon_256.png. Run
|
||||||
│ once before pyinstaller (CI does this).
|
│ once before pyinstaller (CI does this).
|
||||||
├── build_portable_zip.py Cross-platform: zips dist/DataTools/ into a
|
├── tesseract.py Fetches the per-platform Tesseract binary +
|
||||||
│ no-install portable download. Used by the
|
│ eng.traineddata at build time. CI imports
|
||||||
│ Windows + Linux portable artifacts.
|
│ fetch_tessdata + fetch_tesseract_for_platform.
|
||||||
├── macos/
|
├── macos/
|
||||||
│ ├── build_dmg.sh Wraps dist/DataTools.app into a .dmg with a
|
│ └── build_dmg.sh Wraps dist/DataTools.app into a .dmg with a
|
||||||
│ │ drag-to-/Applications layout (installer).
|
│ drag-to-/Applications layout (installer).
|
||||||
│ └── build_zip.sh Wraps dist/DataTools.app into a portable
|
|
||||||
│ .zip via ditto (preserves bundle metadata).
|
|
||||||
├── appimage/
|
├── appimage/
|
||||||
│ ├── AppRun Entry point invoked when the AppImage runs.
|
│ ├── AppRun Entry point invoked when the AppImage runs.
|
||||||
│ ├── datatools.desktop Linux desktop-entry metadata.
|
│ ├── datatools.desktop Linux desktop-entry metadata.
|
||||||
@@ -43,17 +41,15 @@ build/
|
|||||||
|
|
||||||
## Distribution outputs per platform
|
## Distribution outputs per platform
|
||||||
|
|
||||||
Each CI run produces two downloads per platform — an installer for
|
Each CI run produces one installer per platform:
|
||||||
buyers who want shortcuts wired automatically, and a portable .zip
|
|
||||||
for buyers (or IT-locked-down machines) that can't run installers:
|
|
||||||
|
|
||||||
| Platform | Installer | Portable |
|
| Platform | Installer |
|
||||||
|----------|----------------------------------------|------------------------------------------------|
|
|----------|----------------------------------------|
|
||||||
| macOS | `DataTools-<ver>-mac.dmg` | `DataTools-<ver>-mac-portable.zip` (ditto .app)|
|
| macOS | `DataTools-<ver>-mac.dmg` |
|
||||||
| Windows | `DataTools-<ver>-win-setup.exe` | `DataTools-<ver>-win-portable.zip` |
|
| Windows | `DataTools-<ver>-win-setup.exe` |
|
||||||
| Linux | `DataTools-<ver>-linux-x86_64.AppImage`| (the AppImage IS the portable) |
|
| Linux | `DataTools-<ver>-linux-x86_64.AppImage` (already portable) |
|
||||||
|
|
||||||
All six outputs are self-contained: every dependency (Python, pandas,
|
All three outputs are self-contained: every dependency (Python, pandas,
|
||||||
streamlit, pdfplumber, **Tesseract OCR + `eng.traineddata`**, the lot)
|
streamlit, pdfplumber, **Tesseract OCR + `eng.traineddata`**, the lot)
|
||||||
is frozen into the bundle. The buyer does not need to install Python,
|
is frozen into the bundle. The buyer does not need to install Python,
|
||||||
pip, Tesseract, or anything else first. With Tesseract bundled, each
|
pip, Tesseract, or anything else first. With Tesseract bundled, each
|
||||||
@@ -76,55 +72,55 @@ the resulting installers to a GitHub Release. Manual
|
|||||||
|
|
||||||
## Releasing
|
## Releasing
|
||||||
|
|
||||||
### Single-command local build (recommended for one-developer workflow)
|
### CI build (push tag → GitHub Release) — the release process
|
||||||
|
|
||||||
PyInstaller can't cross-compile, so a single machine produces one
|
Releases are built by GitHub Actions (`.github/workflows/build.yml`),
|
||||||
platform's packages. Run this on each target OS:
|
not on a developer's machine. The matrix runs on
|
||||||
|
macos-latest / windows-latest / ubuntu-latest, stages Tesseract
|
||||||
```bash
|
(`build/tesseract.py`), runs PyInstaller, packages the per-platform
|
||||||
# One-time setup per machine:
|
installer, and attaches it to a GitHub Release on tag push:
|
||||||
pip install -r requirements.txt
|
|
||||||
pip install pyinstaller pillow
|
|
||||||
# Windows only: install Inno Setup from https://jrsoftware.org/isdl.php
|
|
||||||
# Linux only: drop appimagetool onto PATH (see preflight output)
|
|
||||||
|
|
||||||
# Build everything for the current OS:
|
|
||||||
python build/make_release.py
|
|
||||||
```
|
|
||||||
|
|
||||||
Outputs land in `dist/`:
|
|
||||||
- Windows host → `DataTools-<ver>-win-setup.exe` + `DataTools-<ver>-win-portable.zip`
|
|
||||||
- macOS host → `DataTools-<ver>-mac.dmg` + `DataTools-<ver>-mac-portable.zip`
|
|
||||||
- Linux host → `DataTools-<ver>-linux-x86_64.AppImage`
|
|
||||||
|
|
||||||
Useful flags:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python build/make_release.py --preflight # check tooling, build nothing
|
|
||||||
python build/make_release.py --clean # wipe dist/ first
|
|
||||||
python build/make_release.py --skip-installer # just the portable zip
|
|
||||||
python build/make_release.py --skip-portable # just the installer
|
|
||||||
```
|
|
||||||
|
|
||||||
### CI build (push tag → GitHub Release)
|
|
||||||
|
|
||||||
If you have CI runners for all three OSes:
|
|
||||||
|
|
||||||
1. Bump `__version__` in `src/__init__.py`.
|
1. Bump `__version__` in `src/__init__.py`.
|
||||||
2. `git commit -am "release: vX.Y.Z" && git tag vX.Y.Z`.
|
2. `git commit -am "release: vX.Y.Z" && git tag vX.Y.Z`.
|
||||||
3. `git push && git push --tags`.
|
3. `git push && git push --tags`.
|
||||||
4. CI builds all three platforms and creates a Release with the
|
4. CI builds all three platforms and creates a Release with the
|
||||||
installers + portable zips attached.
|
installers attached.
|
||||||
5. Mirror the Release assets to Gumroad (manual until v2).
|
5. Mirror the Release assets to Gumroad (manual until v2).
|
||||||
|
|
||||||
|
A manual `workflow_dispatch` run does the same build but uploads the
|
||||||
|
installers as workflow artifacts instead of creating a Release —
|
||||||
|
useful for smoke-testing a build without cutting a tag.
|
||||||
|
|
||||||
|
### Local build (single platform, for testing)
|
||||||
|
|
||||||
|
PyInstaller can't cross-compile, so a local build produces only the
|
||||||
|
current OS's installer. This mirrors what CI does, by hand — use it to
|
||||||
|
debug the bundle before tagging. See the per-platform recipes below for
|
||||||
|
the exact commands; the short version is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller pillow
|
||||||
|
python build/generate_icons.py
|
||||||
|
python -c "import sys; sys.path.insert(0,'build'); \
|
||||||
|
from tesseract import fetch_tessdata, fetch_tesseract_for_platform; \
|
||||||
|
fetch_tessdata(); fetch_tesseract_for_platform('mac')" # win / mac / linux
|
||||||
|
pyinstaller build/datatools.spec --clean --noconfirm
|
||||||
|
# then run the matching packager: build/macos/build_dmg.sh,
|
||||||
|
# build/installer.iss (iscc), or build/appimage/build.sh
|
||||||
|
```
|
||||||
|
|
||||||
## Signing (Phase 2 — needs accounts/credentials)
|
## Signing (Phase 2 — needs accounts/credentials)
|
||||||
|
|
||||||
Both code-signing steps are intentionally not in CI yet because they
|
**macOS signing + notarization is now wired into `build.yml`** (the
|
||||||
require credentials the owner sets up first.
|
"Sign & notarize macOS app" step, with `build/macos/entitlements.plist`).
|
||||||
|
It is guarded: if `MACOS_DEVELOPER_ID_CERT_P12_BASE64` is absent the step
|
||||||
|
warns and exits 0, so dry-run dispatches still produce an unsigned build.
|
||||||
|
To activate it, just add the secrets below — no code change needed.
|
||||||
|
**Windows** code-signing is still not wired (accepted v1 friction).
|
||||||
|
|
||||||
**macOS** — Apple Developer Program enrollment ($99/yr). Once enrolled,
|
**macOS** — Apple Developer Program enrollment ($99/yr). Once enrolled,
|
||||||
add these GitHub Secrets and uncomment the `codesign` + `notarytool`
|
add these GitHub Secrets to activate the signing step in `build.yml`:
|
||||||
steps in `build.yml`:
|
|
||||||
|
|
||||||
| Secret | Value |
|
| Secret | Value |
|
||||||
|---|---|
|
|---|---|
|
||||||
@@ -321,17 +317,18 @@ The runtime resolver (in `src/`, owned by the runtime team) walks:
|
|||||||
(sourced from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best)).
|
(sourced from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best)).
|
||||||
`datatools.spec` copies it into `tesseract/tessdata/`.
|
`datatools.spec` copies it into `tesseract/tessdata/`.
|
||||||
- **Binary** — fetched per-platform at build time by
|
- **Binary** — fetched per-platform at build time by
|
||||||
`build/make_release.py` from pinned upstream URLs. Current pin:
|
`build/tesseract.py` from pinned upstream URLs. Current pin:
|
||||||
**Tesseract 5.5.0**.
|
**Tesseract 5.5.0**. CI imports `fetch_tessdata` +
|
||||||
|
`fetch_tesseract_for_platform` from this module before PyInstaller.
|
||||||
|
|
||||||
**Updating Tesseract**:
|
**Updating Tesseract**:
|
||||||
|
|
||||||
1. Bump the version pin and the per-platform fetch URLs in
|
1. Bump the version pin and the per-platform fetch URLs in
|
||||||
`build/make_release.py`.
|
`build/tesseract.py`.
|
||||||
2. If the model schema changed upstream, refresh
|
2. If the model schema changed upstream, refresh
|
||||||
`build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the
|
`build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the
|
||||||
matching tag.
|
matching tag.
|
||||||
3. Rebuild on each platform (`python build/make_release.py`) and
|
3. Trigger a manual `workflow_dispatch` run (or push a `v*` tag) so CI rebuilds all three platforms, then
|
||||||
smoke-test a scanned PDF through the PDF Extractor.
|
smoke-test a scanned PDF through the PDF Extractor.
|
||||||
4. Update `LICENSE_TESSERACT.txt` at the repo root if upstream license
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if upstream license
|
||||||
terms change (Apache-2.0 today).
|
terms change (Apache-2.0 today).
|
||||||
|
|||||||
@@ -1,69 +0,0 @@
|
|||||||
"""Wrap the PyInstaller folder build into a portable .zip.
|
|
||||||
|
|
||||||
Self-contained download: unzip → double-click the launcher → app runs.
|
|
||||||
No installer, no Python install, no admin rights required.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python build/build_portable_zip.py <platform> <version>
|
|
||||||
|
|
||||||
Where ``platform`` is one of ``win`` / ``mac`` / ``linux``. The
|
|
||||||
script just produces a generic ``dist/DataTools/`` zip; on macOS the
|
|
||||||
preferred portable format is the ``ditto``-wrapped .app — see
|
|
||||||
``build/macos/build_zip.sh`` for that flow. This helper exists mainly
|
|
||||||
for Windows + Linux, where there's no .app bundle to wrap.
|
|
||||||
|
|
||||||
Output:
|
|
||||||
dist/DataTools-<version>-<platform>-portable.zip
|
|
||||||
|
|
||||||
The zip root is the ``DataTools/`` folder so an unzip produces a
|
|
||||||
self-contained dir the user can drop anywhere (Desktop, USB stick,
|
|
||||||
network share). On Windows, the launcher is ``DataTools.exe`` inside
|
|
||||||
that folder; on Linux, ``DataTools``.
|
|
||||||
"""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
REPO = Path(__file__).resolve().parent.parent
|
|
||||||
DIST_DIR = REPO / "dist"
|
|
||||||
BUNDLE_DIR = DIST_DIR / "DataTools"
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
|
|
||||||
if len(sys.argv) < 3:
|
|
||||||
sys.stderr.write(
|
|
||||||
"usage: python build/build_portable_zip.py <platform> <version>\n"
|
|
||||||
)
|
|
||||||
return 2
|
|
||||||
platform = sys.argv[1]
|
|
||||||
version = sys.argv[2]
|
|
||||||
|
|
||||||
if not BUNDLE_DIR.is_dir():
|
|
||||||
sys.stderr.write(
|
|
||||||
f"Bundle dir not found at {BUNDLE_DIR}.\n"
|
|
||||||
"Run ``pyinstaller build/datatools.spec --clean --noconfirm`` first.\n"
|
|
||||||
)
|
|
||||||
return 1
|
|
||||||
|
|
||||||
out_stem = DIST_DIR / f"DataTools-{version}-{platform}-portable"
|
|
||||||
# ``make_archive`` takes a base name (no extension) and produces
|
|
||||||
# ``<base>.zip``. ``root_dir`` = parent of what we want compressed,
|
|
||||||
# ``base_dir`` = the folder name inside the archive root. This
|
|
||||||
# combo yields a single top-level ``DataTools/`` directory inside
|
|
||||||
# the .zip rather than dumping its contents loose.
|
|
||||||
archive = shutil.make_archive(
|
|
||||||
base_name=str(out_stem),
|
|
||||||
format="zip",
|
|
||||||
root_dir=str(DIST_DIR),
|
|
||||||
base_dir="DataTools",
|
|
||||||
)
|
|
||||||
size_mb = Path(archive).stat().st_size / (1024 * 1024)
|
|
||||||
print(f"wrote {archive} ({size_mb:.1f} MB)")
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
sys.exit(main())
|
|
||||||
@@ -105,7 +105,7 @@ datas += [
|
|||||||
]
|
]
|
||||||
|
|
||||||
# ----- Tesseract OCR bundle ----------------------------------------
|
# ----- Tesseract OCR bundle ----------------------------------------
|
||||||
# ``build/make_release.py`` stages the per-platform Tesseract binary
|
# ``build/tesseract.py`` stages the per-platform Tesseract binary
|
||||||
# + its runtime libs (DLLs/dylibs/sos) into
|
# + its runtime libs (DLLs/dylibs/sos) into
|
||||||
# ``build/_tesseract/<target>/`` and the shared eng.traineddata into
|
# ``build/_tesseract/<target>/`` and the shared eng.traineddata into
|
||||||
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
|
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
|
||||||
@@ -119,16 +119,16 @@ datas += [
|
|||||||
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
|
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
|
||||||
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
|
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
|
||||||
#
|
#
|
||||||
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
|
# CI (.github/workflows/build.yml) sets DATATOOLS_TESS_STAGING to the
|
||||||
# the right per-platform dir before invoking PyInstaller. For ad-hoc
|
# right per-platform dir before invoking PyInstaller. For ad-hoc
|
||||||
# `pyinstaller build/datatools.spec` runs without the orchestrator,
|
# `pyinstaller build/datatools.spec` runs without that env var, fall
|
||||||
# fall back to the canonical staging path.
|
# back to the canonical staging path.
|
||||||
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
|
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
|
||||||
if _tess_staging_env:
|
if _tess_staging_env:
|
||||||
_tess_staging = Path(_tess_staging_env)
|
_tess_staging = Path(_tess_staging_env)
|
||||||
else:
|
else:
|
||||||
# Pick the obvious per-host staging dir as a fallback so spec-only
|
# Pick the obvious per-host staging dir as a fallback so spec-only
|
||||||
# builds (without the orchestrator) still work in dev.
|
# builds (without the CI env var) still work in dev.
|
||||||
import sys as _sys_for_target
|
import sys as _sys_for_target
|
||||||
_target_guess = (
|
_target_guess = (
|
||||||
"win" if _sys_for_target.platform.startswith("win")
|
"win" if _sys_for_target.platform.startswith("win")
|
||||||
@@ -149,8 +149,8 @@ else:
|
|||||||
# though, since the OCR feature will silently fail at runtime.
|
# though, since the OCR feature will silently fail at runtime.
|
||||||
print(
|
print(
|
||||||
f"WARNING: {_tess_staging} is empty or missing — OCR will be "
|
f"WARNING: {_tess_staging} is empty or missing — OCR will be "
|
||||||
"disabled in the bundle. Run build/make_release.py (which "
|
"disabled in the bundle. Run build/tesseract.py's "
|
||||||
"calls fetch_tesseract_for_platform) before pyinstaller, or "
|
"fetch_tesseract_for_platform before pyinstaller, or "
|
||||||
"pre-stage the binary manually."
|
"pre-stage the binary manually."
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -159,8 +159,8 @@ if (_tessdata / "eng.traineddata").exists():
|
|||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
|
f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
|
||||||
"have no language data at runtime. Run build/make_release.py "
|
"have no language data at runtime. Run build/tesseract.py's "
|
||||||
"or fetch manually per build/vendor/README.md."
|
"fetch_tessdata or fetch manually per build/vendor/README.md."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Bundle the Apache-2.0 LICENSE text alongside the binary. The docs
|
# Bundle the Apache-2.0 LICENSE text alongside the binary. The docs
|
||||||
|
|||||||
@@ -1,43 +0,0 @@
|
|||||||
#!/usr/bin/env bash
# Package dist/DataTools.app as a portable, no-install .zip archive.
#
# Usage:
#   bash build/macos/build_zip.sh <version>
#
# <version> falls back to 0.0.0-dev when omitted. The archive lands at
# dist/DataTools-<version>-mac-portable.zip.
#
# Prerequisite: ``pyinstaller build/datatools.spec --clean --noconfirm``
# must already have produced dist/DataTools.app.
#
# Why ship a .zip next to the .dmg:
#   * Buyers who would rather skip an installer can unzip and double-click
#     the .app — no drag-to-/Applications step. The .app is self-contained
#     (Python + every dependency).
#   * Locked-down corporate machines often block .dmg auto-mount but still
#     allow downloading and extracting a .zip.
#
# Tesseract: nothing to do here. The binary, its dylibs and tessdata already
# live in DataTools.app/Contents/Resources/tesseract/ (put there by
# PyInstaller's BUNDLE/datas mechanism), and ditto archives the whole tree.

set -euo pipefail

version="${1:-0.0.0-dev}"
app_bundle="dist/DataTools.app"
archive="dist/DataTools-${version}-mac-portable.zip"

# Refuse to continue without the PyInstaller output.
if ! [[ -d "$app_bundle" ]]; then
    printf 'Error: %s not found. Run pyinstaller build/datatools.spec first.\n' "$app_bundle" >&2
    exit 1
fi

# Use ditto rather than plain zip: it preserves the bundle's extended
# attributes and resource forks, which a plain zip strips (that can break
# code signatures and Info.plist resolution on the buyer's machine).
# --sequesterRsrc stores the AppleDouble metadata inside the archive instead
# of as parallel ._ files on disk after extraction.
rm -f "$archive"
ditto -c -k --sequesterRsrc --keepParent "$app_bundle" "$archive"

printf 'Built %s (%s)\n' "$archive" "$(du -h "$archive" | cut -f1)"
|
|
||||||
28
build/macos/entitlements.plist
Normal file
28
build/macos/entitlements.plist
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||||
|
<!--
|
||||||
|
Hardened-runtime entitlements for the notarized DataTools.app.
|
||||||
|
|
||||||
|
PyInstaller freezes a CPython interpreter that maps writable+executable
|
||||||
|
memory and loads many unsigned .so/.dylib modules at runtime. Without
|
||||||
|
these entitlements the hardened runtime kills the process on launch
|
||||||
|
(or notarization rejects the bundle). Keep this list minimal — the app
|
||||||
|
is a local-only Streamlit server, so no network-server/device/camera
|
||||||
|
entitlements are needed.
|
||||||
|
-->
|
||||||
|
<plist version="1.0">
|
||||||
|
<dict>
|
||||||
|
<!-- CPython JIT-style writable/executable memory + ctypes trampolines -->
|
||||||
|
<key>com.apple.security.cs.allow-jit</key>
|
||||||
|
<true/>
|
||||||
|
<key>com.apple.security.cs.allow-unsigned-executable-memory</key>
|
||||||
|
<true/>
|
||||||
|
<!-- Load the bundled C-extension .so / .dylib modules (pandas, pdfplumber,
|
||||||
|
Pillow, the bundled Tesseract dylibs) that aren't Team-ID signed -->
|
||||||
|
<key>com.apple.security.cs.disable-library-validation</key>
|
||||||
|
<true/>
|
||||||
|
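<!-- Launcher sets DATATOOLS_*/TESSDATA_PREFIX/PYTHON* before exec.
NOTE(review): this entitlement governs only DYLD_* dynamic-linker variables;
confirm the launcher actually relies on one, otherwise it may be droppable. -->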
|
||||||
|
<key>com.apple.security.cs.allow-dyld-environment-variables</key>
|
||||||
|
<true/>
|
||||||
|
</dict>
|
||||||
|
</plist>
|
||||||
@@ -1,40 +1,23 @@
|
|||||||
"""Single-command release builder for DataTools.
|
"""Tesseract bundling helpers for the release build.
|
||||||
|
|
||||||
PyInstaller can't cross-compile — to produce a Windows .exe you run
|
PDF Extractor OCR ships a per-platform Tesseract binary plus the English
|
||||||
this on Windows, for a Mac .dmg you run it on macOS, for a Linux
|
``eng.traineddata`` model inside the frozen PyInstaller bundle so scanned
|
||||||
AppImage you run it on Linux. One script, one OS at a time.
|
PDFs work without a separate user install. These helpers fetch the binary
|
||||||
|
and tessdata at build time; the GitHub Actions workflow
|
||||||
|
(``.github/workflows/build.yml``) imports ``fetch_tessdata`` and
|
||||||
|
``fetch_tesseract_for_platform`` and runs them before PyInstaller.
|
||||||
|
|
||||||
What this script does (in order):
|
Everything is staged under ``build/_tesseract/<platform>/`` (gitignored).
|
||||||
1. Preflight — checks PyInstaller, Pillow, and the platform's
|
The PyInstaller spec (``build/datatools.spec``) reads that staging dir plus
|
||||||
packager (Inno Setup on Win / hdiutil + ditto on Mac /
|
``build/vendor/tessdata/`` and bundles them under ``<bundle>/tesseract/``,
|
||||||
appimagetool on Linux) are reachable. Bails with install
|
where the runtime discovery code in ``src/pdf_extract.py`` expects:
|
||||||
instructions if anything is missing.
|
Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"
|
||||||
2. Generates icon.ico / icon.icns / icon.png from the PNG asset.
|
Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"
|
||||||
3. Runs PyInstaller against build/datatools.spec.
|
|
||||||
4. Wraps the PyInstaller output into:
|
|
||||||
* Windows: DataTools-<ver>-win-setup.exe (Inno Setup)
|
|
||||||
+ DataTools-<ver>-win-portable.zip
|
|
||||||
* macOS: DataTools-<ver>-mac.dmg
|
|
||||||
+ DataTools-<ver>-mac-portable.zip
|
|
||||||
* Linux: DataTools-<ver>-linux-x86_64.AppImage
|
|
||||||
5. Prints what landed in dist/ and the byte sizes.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python build/make_release.py # build everything for this OS
|
|
||||||
python build/make_release.py --preflight # check tooling, don't build
|
|
||||||
python build/make_release.py --skip-installer # only the portable zip
|
|
||||||
python build/make_release.py --skip-portable # only the installer
|
|
||||||
python build/make_release.py --clean # wipe dist/ first
|
|
||||||
|
|
||||||
Run from the repo root or from build/ — either works.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
|
||||||
import os
|
import os
|
||||||
import platform
|
|
||||||
import re
|
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
@@ -43,7 +26,6 @@ from pathlib import Path
|
|||||||
|
|
||||||
REPO = Path(__file__).resolve().parent.parent
|
REPO = Path(__file__).resolve().parent.parent
|
||||||
BUILD = REPO / "build"
|
BUILD = REPO / "build"
|
||||||
DIST = REPO / "dist"
|
|
||||||
|
|
||||||
# Tesseract bundling. The runtime discovery code in
|
# Tesseract bundling. The runtime discovery code in
|
||||||
# ``src/pdf_extract.py`` looks for the binary at
|
# ``src/pdf_extract.py`` looks for the binary at
|
||||||
@@ -95,119 +77,6 @@ def _run(cmd: list[str], cwd: Path | None = None, env: dict | None = None) -> No
|
|||||||
sys.exit(127)
|
sys.exit(127)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Platform detection
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _detect_platform() -> str:
|
|
||||||
"""Return ``win`` / ``mac`` / ``linux`` based on sys.platform."""
|
|
||||||
p = sys.platform
|
|
||||||
if p.startswith("win"):
|
|
||||||
return "win"
|
|
||||||
if p == "darwin":
|
|
||||||
return "mac"
|
|
||||||
if p.startswith("linux"):
|
|
||||||
return "linux"
|
|
||||||
_err(f"unsupported platform {p!r}; this script handles win/mac/linux only.")
|
|
||||||
sys.exit(2)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Version — single source of truth in src/__init__.py
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _read_version() -> str:
    """Return the ``__version__`` string declared in ``src/__init__.py``.

    The file is read as UTF-8 text and searched with a regex rather than
    imported. If no ``__version__ = "..."`` assignment is found, the error is
    reported through ``_err`` and the process exits with status 1.
    """
    init_source = REPO.joinpath("src", "__init__.py").read_text(encoding="utf-8")
    found = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', init_source)
    if found is None:
        _err("could not parse __version__ from src/__init__.py")
        sys.exit(1)
    return found[1]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Preflight — check tooling before doing anything destructive
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _have_module(name: str) -> bool:
|
|
||||||
try:
|
|
||||||
__import__(name)
|
|
||||||
return True
|
|
||||||
except ImportError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def _have_command(name: str) -> bool:
|
|
||||||
return shutil.which(name) is not None
|
|
||||||
|
|
||||||
|
|
||||||
# Install hints, keyed by lowercase tool/module name. ``preflight`` quotes
# these in its "missing prerequisites" report so someone building from source
# is told exactly what to install next.
_INSTALL_HINTS = {
    # Python packages.
    "pyinstaller": "pip install pyinstaller",
    "pil": "pip install pillow",
    # Windows packager.
    "iscc": (
        "Inno Setup (Windows): https://jrsoftware.org/isdl.php — install, "
        "then re-open the shell so iscc lands on PATH."
    ),
    # macOS packagers.
    "hdiutil": "ships with macOS — if it's missing your Mac install is broken.",
    "ditto": "ships with macOS — if it's missing your Mac install is broken.",
    # Linux packager.
    "appimagetool": (
        "Linux: download appimagetool-x86_64.AppImage from "
        "https://github.com/AppImage/AppImageKit/releases, chmod +x, "
        "drop on PATH."
    ),
}
|
|
||||||
|
|
||||||
|
|
||||||
def preflight(target: str) -> None:
    """Check that everything the ``target`` build needs is available.

    Probes the Python modules every platform needs (PyInstaller, PIL), the
    ``pyinstaller`` command (a warning only), and the packager tools for
    ``target`` (``"win"``, ``"mac"`` or ``"linux"``; any other value checks no
    packager). If anything required is missing, each item is listed on stderr
    with its install hint and the process exits with status 1.
    """
    _step(f"preflight ({target})")

    missing: list[tuple[str, str]] = []

    # Python-side requirements are identical on every platform. Hints are
    # stored under lowercase keys, so the module name is lowered for lookup
    # and for the report.
    for module in ("PyInstaller", "PIL"):
        if _have_module(module):
            _ok(f"{module} importable")
            continue
        key = module.lower()
        missing.append((key, _INSTALL_HINTS.get(key, f"pip install {module}")))

    # A missing CLI binary is only a warning: ``python -m PyInstaller`` is an
    # acceptable fallback as long as the module itself imports.
    if not _have_command("pyinstaller"):
        _warn("pyinstaller binary not on PATH — will fall back to `python -m PyInstaller`")
    else:
        _ok("pyinstaller on PATH")

    # Packager executables per target, paired with their success message.
    packagers = {
        "win": (("iscc", "Inno Setup (iscc) on PATH"),),
        "mac": (("hdiutil", "hdiutil on PATH"), ("ditto", "ditto on PATH")),
        "linux": (("appimagetool", "appimagetool on PATH"),),
    }
    for tool, ok_message in packagers.get(target, ()):
        if _have_command(tool):
            _ok(ok_message)
        else:
            missing.append((tool, _INSTALL_HINTS[tool]))

    if not missing:
        _ok("all prerequisites present")
        return

    _err("missing prerequisites:")
    for name, hint in missing:
        print(f" - {name}: {hint}", file=sys.stderr)
    sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Tesseract bundling — fetch the binary + tessdata at build time.
|
# Tesseract bundling — fetch the binary + tessdata at build time.
|
||||||
#
|
#
|
||||||
@@ -582,176 +451,3 @@ def fetch_tesseract_for_platform(target: str) -> Path:
|
|||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
return staging
|
return staging
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Build steps
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def step_generate_icons() -> None:
    """Run ``build/generate_icons.py`` with the current interpreter."""
    _step("generate icons")
    icon_script = BUILD / "generate_icons.py"
    _run([sys.executable, str(icon_script)])
|
|
||||||
|
|
||||||
|
|
||||||
def step_pyinstaller(clean: bool, *, target: str | None = None) -> None:
    """Run PyInstaller against ``build/datatools.spec``.

    Args:
        clean: When true, pass ``--clean`` to PyInstaller.
        target: Platform tag. When truthy, ``DATATOOLS_TESS_STAGING`` is set
            in the child environment to ``TESSERACT_STAGING / target`` so the
            spec can find the staged Tesseract files without detecting the
            platform itself.
    """
    _step("pyinstaller bundle")

    # Go through ``python -m PyInstaller`` so the build does not depend on the
    # ``pyinstaller`` script being on PATH (pip's Scripts/ dir frequently is
    # not on Windows).
    spec_file = BUILD / "datatools.spec"
    cmd = [
        sys.executable,
        "-m",
        "PyInstaller",
        str(spec_file),
        "--noconfirm",
        *(["--clean"] if clean else []),
    ]

    # Work on a copy of the environment; the parent process is left untouched.
    child_env = os.environ.copy()
    if target:
        child_env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target)

    _run(cmd, env=child_env)
|
|
||||||
|
|
||||||
|
|
||||||
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested Windows artifacts and return their expected paths.

    Args:
        version: Version string embedded in the output file names and passed
            to Inno Setup as ``/DAppVersion``.
        do_installer: Build the Inno Setup installer.
        do_portable: Build the portable zip via ``build_portable_zip.py``.

    Returns:
        The ``dist/`` paths the requested steps are expected to have written,
        installer first. The paths are not checked for existence here.
    """
    produced: list[Path] = []

    if do_installer:
        _step("Windows installer (Inno Setup)")
        iss_script = BUILD / "installer.iss"
        _run(["iscc", f"/DAppVersion={version}", str(iss_script)])
        produced.append(DIST / f"DataTools-{version}-win-setup.exe")

    if do_portable:
        _step("Windows portable .zip")
        zip_script = BUILD / "build_portable_zip.py"
        _run([sys.executable, str(zip_script), "win", version])
        produced.append(DIST / f"DataTools-{version}-win-portable.zip")

    return produced
|
|
||||||
|
|
||||||
|
|
||||||
def step_package_mac(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested macOS artifacts and return their expected paths.

    Args:
        version: Version string passed to the shell scripts and embedded in
            the output file names.
        do_installer: Build the .dmg via ``build/macos/build_dmg.sh``.
        do_portable: Build the portable zip via ``build/macos/build_zip.sh``.

    Returns:
        The ``dist/`` paths the requested steps are expected to have written,
        .dmg first. The paths are not checked for existence here.
    """
    scripts_dir = BUILD / "macos"
    produced: list[Path] = []

    if do_installer:
        _step("macOS DMG (installer)")
        _run(["bash", str(scripts_dir / "build_dmg.sh"), version])
        produced.append(DIST / f"DataTools-{version}-mac.dmg")

    if do_portable:
        _step("macOS portable .zip")
        _run(["bash", str(scripts_dir / "build_zip.sh"), version])
        produced.append(DIST / f"DataTools-{version}-mac-portable.zip")

    return produced
|
|
||||||
|
|
||||||
|
|
||||||
def step_package_linux(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the Linux AppImage and return its expected path.

    On Linux the AppImage serves as both installer and portable build, so the
    two flags are not distinguished: if either is set the single AppImage is
    built; if both are false nothing is built and an empty list is returned.
    """
    if do_installer or do_portable:
        _step("Linux AppImage")
        build_script = BUILD / "appimage" / "build.sh"
        _run(["bash", str(build_script), version])
        return [DIST / f"DataTools-{version}-linux-x86_64.AppImage"]
    return []
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Orchestration
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _summarise(outputs: list[Path]) -> None:
    """Print each expected artifact with its size, warning about absent ones.

    Paths are shown relative to ``REPO``. An empty ``outputs`` list produces a
    single warning that every packaging step was skipped.
    """
    _step("done — outputs")
    if not outputs:
        _warn("no files produced (everything skipped via flags)")
        return

    bytes_per_mb = 1024 * 1024
    for artifact in outputs:
        if not artifact.exists():
            _warn(f"expected output missing: {artifact.relative_to(REPO)}")
            continue
        size_mb = artifact.stat().st_size / bytes_per_mb
        print(f" {artifact.relative_to(REPO)} ({size_mb:.1f} MB)")
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> int:
    """Command-line entry point: build the release artifacts for one platform.

    Sequence: parse flags, resolve target and version, print a banner, run
    ``preflight`` (stopping there when ``--preflight`` is given), optionally
    wipe ``dist/``, generate icons, stage tessdata and the Tesseract binary,
    run PyInstaller, run the platform packager, then summarise the outputs.

    Returns:
        ``0`` when the sequence completes; the helpers it calls exit the
        process themselves on failure.
    """
    parser = argparse.ArgumentParser(
        prog="make_release.py",
        description=(
            "Build the installer + portable zip for the current OS. "
            "Cross-compilation isn't supported by PyInstaller — run "
            "this once per platform you want to target."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--platform",
        choices=("auto", "win", "mac", "linux"),
        default="auto",
        help="Override OS detection (mostly for testing). Default: auto.",
    )
    # The remaining options are plain on/off switches, registered in the
    # order they should appear in ``--help``.
    for flag, help_text in (
        ("--preflight", "Check tooling and exit without building."),
        ("--clean", "Wipe dist/ before building."),
        ("--skip-installer", "Don't build the OS installer (.exe / .dmg)."),
        ("--skip-portable", "Don't build the portable .zip."),
    ):
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()

    target = args.platform if args.platform != "auto" else _detect_platform()
    version = _read_version()
    do_installer = not args.skip_installer
    do_portable = not args.skip_portable

    print("DataTools release builder")
    print(f" target: {target} (host: {platform.platform()})")
    print(f" version: {version}")
    print(f" installer: {'yes' if do_installer else 'no'}")
    print(f" portable: {'yes' if do_portable else 'no'}")
    print(f" dist dir: {DIST}")

    # An explicit --platform that disagrees with the host only redirects the
    # packaging step; PyInstaller still bundles for the host.
    host = _detect_platform()
    if host != target:
        _warn(
            f"--platform {target} but host is {host}. "
            "PyInstaller can't cross-compile — the bundle will be for "
            "the HOST, only the packaging step will follow your override. "
            "Useful only for testing the packager paths."
        )

    preflight(target)
    if args.preflight:
        return 0

    if args.clean and DIST.exists():
        _step(f"cleaning {DIST}")
        shutil.rmtree(DIST)

    step_generate_icons()

    # Tesseract has to be staged before PyInstaller runs: the spec bundles
    # ``build/_tesseract/<target>/`` and ``build/vendor/tessdata/`` under
    # ``<bundle>/tesseract/``, where src/pdf_extract.py looks for
    #   Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"
    #   Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"
    fetch_tessdata()
    fetch_tesseract_for_platform(target)

    step_pyinstaller(clean=args.clean, target=target)

    # Anything that is not win/mac falls through to the Linux packager.
    packagers = {"win": step_package_win, "mac": step_package_mac}
    package = packagers.get(target, step_package_linux)
    outputs = package(version, do_installer, do_portable)

    _summarise(outputs)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
||||||
11
build/vendor/README.md
vendored
11
build/vendor/README.md
vendored
@@ -4,9 +4,10 @@ This tree holds the third-party assets that get bundled into the
|
|||||||
PyInstaller artifacts but that we deliberately do **not** keep in git
|
PyInstaller artifacts but that we deliberately do **not** keep in git
|
||||||
(too large / license-encumbered / re-fetchable on demand).
|
(too large / license-encumbered / re-fetchable on demand).
|
||||||
|
|
||||||
The build pipeline (`build/make_release.py`) populates everything in
|
The build's Tesseract helper (`build/tesseract.py`) populates
|
||||||
here before the PyInstaller step. The contents are git-ignored except
|
everything in here before the PyInstaller step — CI
|
||||||
for this README.
|
(`.github/workflows/build.yml`) calls it ahead of the build. The
|
||||||
|
contents are git-ignored except for this README.
|
||||||
|
|
||||||
## tessdata/
|
## tessdata/
|
||||||
|
|
||||||
@@ -40,9 +41,9 @@ statements (the only OCR use case so far), the extra accuracy of the
|
|||||||
|
|
||||||
### How it gets populated
|
### How it gets populated
|
||||||
|
|
||||||
`build/make_release.py::fetch_tessdata()` checks for
|
`build/tesseract.py::fetch_tessdata()` checks for
|
||||||
`build/vendor/tessdata/eng.traineddata` on every run. If it's
|
`build/vendor/tessdata/eng.traineddata` on every run. If it's
|
||||||
missing, the script downloads it from the canonical URL above and
|
missing, it downloads the file from the canonical URL above and
|
||||||
caches it here. Subsequent builds reuse the cached file.
|
caches it here. Subsequent builds reuse the cached file.
|
||||||
|
|
||||||
On CI, the directory is restored from the GitHub Actions cache so we
|
On CI, the directory is restored from the GitHub Actions cache so we
|
||||||
|
|||||||
@@ -32,17 +32,22 @@ rebuilds it from a stale headline.
|
|||||||
| Friction kills conversion | BUSINESS.md §7 | Demo dataset preloaded; no "select a file" first-step |
|
| Friction kills conversion | BUSINESS.md §7 | Demo dataset preloaded; no "select a file" first-step |
|
||||||
| < $1,200/mo recurring | BUSINESS.md §9 | Migration plan to $5/mo VPS only after rate-limit signal |
|
| < $1,200/mo recurring | BUSINESS.md §9 | Migration plan to $5/mo VPS only after rate-limit signal |
|
||||||
|
|
||||||
## 3. The three personas (per PLAN.md §2.3)
|
## 3. The three personas — one audience: accounting (per PLAN.md §2.3)
|
||||||
|
|
||||||
|
We niche to **accounting** and enter through the three workflows where a
|
||||||
|
messy export costs real money. Same engine, three landing pages — each
|
||||||
|
is the same buyer at a different desk (bookkeeping, payables, receivables).
|
||||||
|
|
||||||
| Tag | Persona | Top-of-funnel keyword | Demo dataset | Pre-saved pipeline |
|
| Tag | Persona | Top-of-funnel keyword | Demo dataset | Pre-saved pipeline |
|
||||||
|---|---|---|---|---|
|
|---|---|---|---|---|
|
||||||
| `shopify-pet` | Shopify operator (priority: pet supplies) | "shopify customer cleanup" | `samples/demo/shopify_pet_customers.csv` | `shopify_pet_pipeline.json` |
|
| `bookkeeper` | Bookkeeper — bank reconciliation | "reconcile bank export csv duplicates" | `samples/demo/bank_reconciliation.csv` | `bank_reconciliation_pipeline.json` |
|
||||||
| `bookkeeper` | Bookkeeper / freelance accountant | "reconcile bank export csv" | `samples/demo/bookkeeper_bank_reconcile.csv` | `bookkeeper_bank_pipeline.json` |
|
| `ap-1099` | Accounts payable — 1099 vendor prep | "clean 1099 vendor list missing EIN" | `samples/demo/vendor_1099.csv` | `vendor_1099_pipeline.json` |
|
||||||
| `revops` | Marketing / RevOps agency | "dedupe lead list across vendors" | `samples/demo/agency_combined_leads.csv` | `agency_leads_pipeline.json` |
|
| `ar-aging` | Accounts receivable — open invoices | "remove duplicate invoices aging report" | `samples/demo/ar_open_invoices.csv` | `ar_open_invoices_pipeline.json` |
|
||||||
|
|
||||||
Each persona gets its **own landing page URL**, its **own demo dataset
|
Each persona gets its **own landing page URL** (`?p=<tag>`), its **own
|
||||||
loaded by default**, and its **own H1 + below-the-fold copy.** The
|
demo dataset loaded by default**, and its **own H1 + below-the-fold
|
||||||
engine is identical; only positioning differs.
|
copy** — wired in `src/gui/app_demo.py::PERSONAS`. The engine is
|
||||||
|
identical; only positioning differs.
|
||||||
|
|
||||||
## 4. Demo dataset specifications
|
## 4. Demo dataset specifications
|
||||||
|
|
||||||
@@ -53,114 +58,77 @@ persona's tooling. Each contains every kind of pollution the bundle's
|
|||||||
five tools fix, so a single demo run shows every tool earning its
|
five tools fix, so a single demo run shows every tool earning its
|
||||||
keep.
|
keep.
|
||||||
|
|
||||||
### 4.0 Pain-point coverage map
|
### 4.0 Value-proof map
|
||||||
|
|
||||||
Each demo dataset is engineered so the buyer sees their **own top
|
Each demo dataset is engineered so the buyer sees their **own top pain**
|
||||||
pain** demonstrated in the AFTER preview. The mapping below pairs
|
fixed in the AFTER preview, with one unmistakable headline number. All
|
||||||
each pain from PLAN.md §2.3a with the rows / columns that exercise
|
three run the same saved 4-step pipeline (Clean Text → Standardize
|
||||||
it. Refresh the dataset only when this coverage drops.
|
Formats → Fix Missing Values → Find Duplicates). The numbers below are
|
||||||
|
**validated against the live engine** (`tests/test_demo_pipelines.py`
|
||||||
|
pins them) — refresh the dataset only if a number stops landing.
|
||||||
|
|
||||||
| Persona | Pain (from PLAN §2.3a) | Demo coverage |
|
| Persona | Headline proof | What the visitor watches happen |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| Shopify pet | S1 — Klaviyo per-contact dupes | 5 dup pairs across rows 1–15 (case + format + address-twin variants) |
|
| Bookkeeper | **26 → 20 rows · 6 phantom duplicates removed** | The same payment posted twice (different date + amount format) collapses to one; dates go ISO, parens-negatives become real negatives |
|
||||||
| Shopify pet | S2 — feed-rejection chars | smart-quote / NBSP / BOM in rows 1–6, 9, 11 |
|
| AP / 1099 | **24 records → 8 vendors · 7 missing EINs recovered** | Each vendor's scattered records merge into one complete row; `merge=true` backfills the EIN/address/phone that any single record was missing |
|
||||||
| Shopify pet | S3 — multi-channel | partner-style customer IDs (`SHOP-`); demonstration of column-level mapping covered in RevOps demo |
|
| AR aging | **26 → 21 rows · 5 double-entered invoices removed** | Duplicate invoice numbers collapse; a blank status is backfilled from its twin; invoice + due dates go ISO, amounts numeric |
|
||||||
| Shopify pet | S4 — subscription identity | rows 1+2, 7+8, 9+10 — same person, different format |
|
|
||||||
| Shopify pet | S5 — VAT-MOSS country drift | rows 16–18 (`United Kingdom` / `U.K.` / `UK`) + rows 19–20 (`Germany`/`Italia`) |
|
|
||||||
| Bookkeeper | B1 — month-overlap re-import | 7 dup pairs spanning Jan↔Feb and Mar boundaries |
|
|
||||||
| Bookkeeper | B2 — 1099 vendor consolidation | Amazon × 3 spellings, Verizon × 2, Acme Realty × 2, Adobe × 2, Costco × 2, Zoom × 2, Stripe × 4 |
|
|
||||||
| Bookkeeper | B3 — audit trail | every cell change in the run logged with old/new/rule — surface in the demo's audit tab |
|
|
||||||
| Bookkeeper | B4 — per-license economics | demonstrated by pricing copy, not data |
|
|
||||||
| Bookkeeper | B5 — multi-currency | rows 26 (EUR), 27 (GBP), 28 (BRL with comma decimal), 29 (parens-negative) |
|
|
||||||
| RevOps | R1 — per-contact tier | 6 cross-source dup pairs (HubSpot × LinkedIn × Manual Scrape) |
|
|
||||||
| RevOps | R2 — deliverability | rows 26–27 (`uma at uniform dot com`, `victor@@victorco.com` invalid emails) |
|
|
||||||
| RevOps | R3 — GDPR / privacy | demonstrated by the network-tab moat panel + zero-upload claim |
|
|
||||||
| RevOps | R4 — vendor unification | 3 source values (HubSpot / LinkedIn / Manual Scrape), 13 country codes, mixed-shape headers |
|
|
||||||
| RevOps | R5 — suppression list | rows 29–30 (`Suppressed`, `Opted Out` tags) |
|
|
||||||
|
|
||||||
### 4.1 `shopify_pet_customers.csv` (20 rows)
|
### 4.1 `bank_reconciliation.csv` (26 rows) — Bookkeeper
|
||||||
|
|
||||||
**Looks like**: a Shopify customer export filtered for "Pet Supplies"
|
**Looks like**: two months (Jan + Feb 2025) of business-checking activity
|
||||||
sales channel, 12 months activity.
|
from a bank portal, where the Feb re-export overlaps Jan so the same
|
||||||
|
transaction posts twice. Columns: `Date, Description, Vendor, Category,
|
||||||
|
Amount, Account`.
|
||||||
|
|
||||||
**Pollution included**:
|
**Pollution included**:
|
||||||
- Whitespace padding (" Alice ", "Sydney Opera House Drive ")
|
- Mixed date formats: `01/15/2025`, `2025-01-15`, `Jan 18 2025`, `1/27/25`, `Feb 5 2025`.
|
||||||
- Mixed phone formats: `(415) 555-1234`, `415.555.1234`, `5559876543`,
|
- Currency formats incl. negatives: `-$129.99`, `($89.50)` parens-negative, `+$3,450.00`, `- $599.88`, bare `-129.99`, `(50.00)`.
|
||||||
`+1 555-111-1111`
|
- Whitespace + NBSP padding; smart quotes and an em-dash inside descriptions.
|
||||||
- International phones: GB, ES, DE, AU, JP (15 demo rows span 6
|
- Vendor casing variety on *non-duplicate* rows: `Amazon` / `amazon.com` / `AMAZON.COM`, `Verizon` / `verizon`.
|
||||||
countries)
|
- Disguised nulls in Category: `—`, `(blank)`, `?`, `unknown`, `TBD`.
|
||||||
- Currency variants: `$1,240.50`, `£890.25`, `€2.410,75` (EU comma
|
- **6 duplicate transactions** — each pair shares the same vendor + real value but a different date *and* amount format, so they collapse only after standardization.
|
||||||
decimal), `A$ 1,299.00`, `¥75000`
|
|
||||||
- Date formats: `2025-12-04`, `12/15/2025`, `?`, `(blank)`, `(none)`,
|
|
||||||
`#N/A`
|
|
||||||
- Disguised nulls: `N/A`, blank, `(blank)`, `?`, `#N/A`, `(none)`,
|
|
||||||
`unknown`
|
|
||||||
- Name casing: `EVE MARTINEZ`, `henry`, `O'NEIL`, `noah`, mixed Title /
|
|
||||||
ALL CAPS / lower
|
|
||||||
- Email case variants that *should* dedup: `Bob@PetShop.com` vs
|
|
||||||
`alice@petshop.com`
|
|
||||||
- 4 fuzzy duplicates (Alice/Bob same address, Grace/Henry same phone,
|
|
||||||
Carlos/Olivia same address, Ivy/Jack same address)
|
|
||||||
|
|
||||||
**After running the pipeline**: 20 rows → 15, ~29 cells canonicalized,
|
**After running the pipeline** (validated): **26 → 20 rows, 6 duplicates
|
||||||
~45 sentinels standardised, 5 cross-row duplicates merged. The
|
removed**, 36 date/amount cells standardized (0 unparseable), all dates
|
||||||
customer table is now Klaviyo-import-ready and the country column
|
ISO, parens-negatives resolved (`($89.50)` → `-89.50`), disguised-null
|
||||||
(previously `UK` / `U.K.` / `United Kingdom` / `Germany` / `Italia`)
|
categories flagged. The reconciliation ties out.
|
||||||
is GB / DE / IT — VAT MOSS report won't break.
|
|
||||||
|
|
||||||
### 4.2 `bookkeeper_bank_reconcile.csv` (30 rows)
|
### 4.2 `vendor_1099.csv` (24 rows) — Accounts payable / 1099
|
||||||
|
|
||||||
**Looks like**: two months of business checking + credit-card activity
|
**Looks like**: a 1099-NEC vendor master list where the same vendor was
|
||||||
exported from a bank portal, with the Feb export accidentally
|
entered 2–3 times across the year by different staff, each record holding
|
||||||
overlapping the Jan export at the month boundary.
|
only *part* of the vendor's details. Columns: `Vendor, Contact, Email,
|
||||||
|
Phone, EIN, Address, Total_Paid`.
|
||||||
|
|
||||||
**Pollution included**:
|
**Pollution included**:
|
||||||
- Mixed date formats: `01/15/2025`, `2025-01-15`, `Jan 18 2025`,
|
- The duplicate records for a vendor share one email differing only by case/whitespace (the reliable dedup key, matched with the `email` normalizer).
|
||||||
`1/27/25`, `Feb 5 2025`
|
- EIN / Phone / Address scattered across the duplicate set so no single record is complete but the union is — gaps marked `—`, `(blank)`, `TBD`, `unknown`, `N/A`.
|
||||||
- Currency formats: `-$129.99`, `($89.50)` parens-negative,
|
- Vendor name casing/spelling variants, phone formats, EIN formats (`12-3456789` vs `123456789`), `Total_Paid` currency variants.
|
||||||
`+$3,450.00`, `- $599.88` (space after the sign), bare `-129.99`, `(50.00)`
|
|
||||||
- Header trailing whitespace: `"Date "`
|
|
||||||
- Smart quotes around descriptions: `"autopay"`
|
|
||||||
- Em-dash sentinels in Vendor: `—`
|
|
||||||
- Smart-em-dash inside descriptions: `STAPLES #4422 — paper, toner`
|
|
||||||
- Vendor casing inconsistency: `Amazon` / `amazon.com` / `AMAZON.COM`,
|
|
||||||
`Verizon` / `verizon`
|
|
||||||
- 6 duplicate transactions (same date+amount+vendor recorded twice
|
|
||||||
with different formats)
|
|
||||||
|
|
||||||
**After running the pipeline**: 30 rows → 23, ~84 cells normalized, 7
|
**After running the pipeline** (validated): **24 records → 8 vendors, 16
|
||||||
duplicates removed (month-overlap + VAT-MOSS dups). All dates
|
duplicates removed, 7 missing EINs recovered** by `merge=true` +
|
||||||
ISO-formatted, all amounts numeric (including EUR/GBP/BRL with comma
|
`most_complete` survivor, 35 disguised nulls caught, phones/emails/amounts
|
||||||
decimal), vendor casing canonical, parens-negative resolved.
|
standardized (0 unparseable). One vendor genuinely has no EIN in any
|
||||||
|
record — it survives with a blank EIN as the realistic "flag for
|
||||||
|
follow-up" case.
|
||||||
|
|
||||||
### 4.3 `agency_combined_leads.csv` (30 rows)
|
### 4.3 `ar_open_invoices.csv` (26 rows) — Accounts receivable
|
||||||
|
|
||||||
**Looks like**: a marketing-ops worksheet combining lead exports from
|
**Looks like**: an open-invoices (unpaid AR) export where some invoices
|
||||||
HubSpot + LinkedIn Sales Navigator + manual scraping, ready for
|
were double-entered in different formats and client contacts are messy.
|
||||||
campaign targeting.
|
Columns: `Invoice, Client, Email, Invoice_Date, Due_Date, Amount, Status`.
|
||||||
|
|
||||||
**Pollution included**:
|
**Pollution included**:
|
||||||
- Phone formats per region: US, UK, Spain, Germany, China, India,
|
- Two date columns with mixed formats; currency variants incl. a credit memo `($300.00)` → `-300.00`.
|
||||||
Australia, Mexico, Israel, Singapore, Hong Kong, Italy, South
|
- Client name casing variety; email case variants (`AP@Acme.com` vs `ap@acme.com`).
|
||||||
Korea — 13 country codes
|
- Status disguised nulls: `—`, `?`, `(blank)`, `TBD`, `unknown`, `(none)`.
|
||||||
- Country column inconsistent: `USA` / `US` / `United States`
|
- **5 double-entered invoices** — same invoice number twice, dates/amount in different formats, one copy with a blank status the other fills.
|
||||||
- Disguised nulls: `N/A`, `unknown`, `(unknown)`, `(blank)`, `(none)`,
|
|
||||||
`?`, `—`, `#N/A`, `TBD`
|
|
||||||
- Source column tags origin (`HubSpot` / `LinkedIn` / `Manual Scrape`)
|
|
||||||
- Email duplicates across sources with case variants: `alice@acme.com`
|
|
||||||
+ `Alice.Johnson@acme.com`, `bob@beta.com` + `Bob@Beta.com`,
|
|
||||||
`diana@delta.com` from two sources, `carlos@gamma.io` from two
|
|
||||||
sources, `Frank@Foxtrot.de` + `frank@foxtrot.de`
|
|
||||||
- Name casing: `DIANA LEE`, `henry`, `IVY CHEN`, mixed
|
|
||||||
- 6 fuzzy / cross-source duplicates designed to survive the dedup
|
|
||||||
- Score column with sentinel pollution that needs coercion to integer
|
|
||||||
|
|
||||||
**After running the pipeline**: 30 rows → 24, ~43 cells canonicalized,
|
**After running the pipeline** (validated): **26 → 21 rows, 5 duplicate
|
||||||
14 sentinels resolved, 6 cross-source duplicates merged with `merge=true`
|
invoices removed**, both date columns ISO + amounts numeric + emails
|
||||||
so each survivor inherits the most-complete picture. Invalid-email
|
lowercased (0 unparseable), 7 disguised-null statuses caught, and a blank
|
||||||
rows (deliverability stress) and `Suppressed`/`Opted Out` tags
|
status backfilled from its twin via `merge=true`. The aging report stops
|
||||||
(suppression-list use case) survive as flagged rows the operator
|
double-counting.
|
||||||
manually reviews.
|
|
||||||
|
|
||||||
## 5. UX flow (per persona)
|
## 5. UX flow (per persona)
|
||||||
|
|
||||||
@@ -174,26 +142,26 @@ dedicated `app_demo.py` for the cloud build).
|
|||||||
│ "{Persona-specific H1}" │
|
│ "{Persona-specific H1}" │
|
||||||
├──────────────────────────────────────────────────────────┤
|
├──────────────────────────────────────────────────────────┤
|
||||||
│ │
|
│ │
|
||||||
│ Sample dataset preloaded: shopify_pet_customers.csv │
|
│ Sample dataset preloaded: bank_reconciliation.csv │
|
||||||
│ [Replace with your own file (capped 100 rows)] │
|
│ [Replace with your own file (capped 100 rows)] │
|
||||||
│ │
|
│ │
|
||||||
│ ┌─ BEFORE preview (15 rows) ─────────────────────────┐ │
|
│ ┌─ BEFORE preview (26 rows) ─────────────────────────┐ │
|
||||||
│ │ Alice | (415) 555-1234 | $1,240.50 | … │ │
|
│ │ 01/15/2025 | Stripe | +$3,450.00 | … │ │
|
||||||
│ │ Bob | 415.555.1234 | $1,240.50 | … │ │
|
│ │ 2025-01-15 | Stripe | 3450.00 | … (dup) │ │
|
||||||
│ │ ... │ │
|
│ │ ... │ │
|
||||||
│ └──────────────────────────────────────────────────┘ │
|
│ └──────────────────────────────────────────────────┘ │
|
||||||
│ │
|
│ │
|
||||||
│ Pipeline (saved): │
|
│ Pipeline (saved): │
|
||||||
│ 1. Text Clean → 2. Format Standardize → │
|
│ 1. Clean Text → 2. Standardize Formats → │
|
||||||
│ 3. Missing → 4. Deduplicate │
|
│ 3. Fix Missing → 4. Find Duplicates │
|
||||||
│ │
|
│ │
|
||||||
│ [▶ Run pipeline] │
|
│ [▶ Run pipeline] │
|
||||||
│ │
|
│ │
|
||||||
│ ┌─ AFTER preview ───────────────────────────────────┐ │
|
│ ┌─ AFTER preview ───────────────────────────────────┐ │
|
||||||
│ │ 15 rows → 11 (4 duplicates merged) │ │
|
│ │ 26 rows → 20 (6 duplicate transactions removed) │ │
|
||||||
│ │ 27 cells canonicalized · 33 sentinels resolved │ │
|
│ │ 36 cells standardized · 4 disguised nulls flagged │ │
|
||||||
│ │ │ │
|
│ │ │ │
|
||||||
│ │ Alice Johnson | +14155551234 | 1240.50 | … │ │
|
│ │ 2025-01-15 | Stripe | 3450.00 | … │ │
|
||||||
│ │ ... │ │
|
│ │ ... │ │
|
||||||
│ └──────────────────────────────────────────────────┘ │
|
│ └──────────────────────────────────────────────────┘ │
|
||||||
│ │
|
│ │
|
||||||
@@ -244,27 +212,35 @@ not "demo crippled" data.
|
|||||||
|
|
||||||
## 7. CTA copy (per persona)
|
## 7. CTA copy (per persona)
|
||||||
|
|
||||||
### 7.1 Shopify pet operator
|
Copy lives in `src/gui/app_demo.py::PERSONAS` (H1 / sub / CTA per tag);
|
||||||
|
keep this section in sync with that dict.
|
||||||
|
|
||||||
- **H1**: *Clean your customer / vendor / subscriber exports — locally.*
|
### 7.1 Bookkeeper — bank reconciliation (`?p=bookkeeper`)
|
||||||
- **Sub**: *Klaviyo-import-ready in 30 seconds. Catches duplicates Excel
|
|
||||||
misses. Your data never leaves your computer.*
|
|
||||||
- **CTA**: *Get DataTools for Shopify — $49 →*
|
|
||||||
|
|
||||||
### 7.2 Bookkeeper / freelance accountant
|
- **H1**: *Catch the transactions your bank export posted twice. Locally.*
|
||||||
|
- **Sub**: *When the Jan and Feb exports overlap, the same payment posts
|
||||||
- **H1**: *Reconcile messy bank exports. Hand your client an audit
|
twice in two formats. DataTools standardizes every date and amount, then
|
||||||
trail.*
|
dedups on the real transaction so your reconciliation ties out — 26 rows
|
||||||
- **Sub**: *Catches the duplicate transaction QuickBooks imported twice.
|
→ 20, six phantom duplicates gone.*
|
||||||
Standardizes dates, amounts, vendor casing. Every change auditable.*
|
|
||||||
- **CTA**: *Get DataTools for Bookkeepers — $49 →*
|
- **CTA**: *Get DataTools for Bookkeepers — $49 →*
|
||||||
|
|
||||||
### 7.3 Marketing / RevOps agency
|
### 7.2 Accounts payable — 1099 prep (`?p=ap-1099`)
|
||||||
|
|
||||||
- **H1**: *Dedupe leads across HubSpot, LinkedIn, and manual scrapes.*
|
- **H1**: *Build a clean 1099 vendor list — with the missing EINs filled in.*
|
||||||
- **Sub**: *International phones, country normalization, fuzzy dedup
|
- **Sub**: *The same vendor entered three times, each record holding only
|
||||||
with merge — one tool, one schema, no upload.*
|
part of the details. DataTools consolidates to one row and backfills the
|
||||||
- **CTA**: *Get DataTools for RevOps — $49 →*
|
gaps from the duplicates — 24 records → 8 vendors, 7 missing EINs
|
||||||
|
recovered.*
|
||||||
|
- **CTA**: *Get DataTools for Accounting — $49 →*
|
||||||
|
|
||||||
|
### 7.3 Accounts receivable — open invoices (`?p=ar-aging`)
|
||||||
|
|
||||||
|
- **H1**: *Stop chasing the invoices your aging report counted twice. Locally.*
|
||||||
|
- **Sub**: *Double-entered invoices inflate your AR aging and your
|
||||||
|
follow-ups. DataTools standardizes dates and amounts, lowercases client
|
||||||
|
emails, and removes the duplicate invoice numbers — 26 rows → 21, five
|
||||||
|
phantom invoices off the books.*
|
||||||
|
- **CTA**: *Get DataTools for Accounting — $49 →*
|
||||||
|
|
||||||
## 8. Telemetry / conversion tracking
|
## 8. Telemetry / conversion tracking
|
||||||
|
|
||||||
|
|||||||
@@ -298,7 +298,7 @@ All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so exist
|
|||||||
|
|
||||||
## PDF Extractor — bundled Tesseract
|
## PDF Extractor — bundled Tesseract
|
||||||
|
|
||||||
Frozen builds (installer / portable .zip / AppImage) ship Tesseract OCR inside the bundle so scanned PDFs work without a separate system install. Source / `pip` developer environments still resolve Tesseract from `PATH`.
|
Frozen builds (installer / AppImage) ship Tesseract OCR inside the bundle so scanned PDFs work without a separate system install. Source / `pip` developer environments still resolve Tesseract from `PATH`.
|
||||||
|
|
||||||
**Runtime layout (frozen bundles)**:
|
**Runtime layout (frozen bundles)**:
|
||||||
|
|
||||||
@@ -318,13 +318,13 @@ Frozen builds (installer / portable .zip / AppImage) ship Tesseract OCR inside t
|
|||||||
**Where the bytes come from**:
|
**Where the bytes come from**:
|
||||||
|
|
||||||
- **Tessdata** is vendored at `build/vendor/tessdata/eng.traineddata` — the "best" English model from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best). PyInstaller's spec copies it into `tesseract/tessdata/` inside the bundle.
|
- **Tessdata** is vendored at `build/vendor/tessdata/eng.traineddata` — the "best" English model from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best). PyInstaller's spec copies it into `tesseract/tessdata/` inside the bundle.
|
||||||
- **Tesseract binary** is fetched at build time by `build/make_release.py` — per-platform download URLs are pinned in that script. The current pin is **Tesseract 5.5.0**.
|
- **Tesseract binary** is fetched at build time by `build/tesseract.py` — per-platform download URLs are pinned in that module. The current pin is **Tesseract 5.5.0**. CI (`.github/workflows/build.yml`) imports `fetch_tessdata` + `fetch_tesseract_for_platform` and runs them before PyInstaller.
|
||||||
|
|
||||||
**To update Tesseract**:
|
**To update Tesseract**:
|
||||||
|
|
||||||
1. Bump the version pin + the per-platform fetch URLs in `build/make_release.py`.
|
1. Bump the version pin + the per-platform fetch URLs in `build/tesseract.py`.
|
||||||
2. If upstream changed the `eng.traineddata` schema, refresh `build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the matching tag.
|
2. If upstream changed the `eng.traineddata` schema, refresh `build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the matching tag.
|
||||||
3. Rebuild on each platform (`python build/make_release.py`) and smoke-test a scanned-PDF run through the PDF Extractor before tagging the release.
|
3. Push a `v*` tag so CI rebuilds all three platforms, then smoke-test a scanned-PDF run through the PDF Extractor before publishing the release.
|
||||||
4. Update `LICENSE_TESSERACT.txt` at the repo root if the upstream license terms change (Tesseract is Apache-2.0 today).
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if the upstream license terms change (Tesseract is Apache-2.0 today).
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
@@ -352,6 +352,8 @@ tests/
|
|||||||
├── test_analyze.py · test_normalize.py · test_text_clean.py
|
├── test_analyze.py · test_normalize.py · test_text_clean.py
|
||||||
├── test_format_standardize.py
|
├── test_format_standardize.py
|
||||||
├── test_format_standardize_corpus.py # 199-row buyer corpus
|
├── test_format_standardize_corpus.py # 199-row buyer corpus
|
||||||
|
├── test_pipeline.py # pipeline engine: adapters, run, validate, serialize
|
||||||
|
├── test_cli_pipeline.py # pipeline CLI: recommend/apply/strict/audit
|
||||||
├── test_audit_fixes.py · test_errors.py · test_fixes_unit.py
|
├── test_audit_fixes.py · test_errors.py · test_fixes_unit.py
|
||||||
├── test_corpus.py · test_encodings_corpus.py · test_fixtures_sweep.py
|
├── test_corpus.py · test_encodings_corpus.py · test_fixtures_sweep.py
|
||||||
├── test_cli.py · test_cli_*.py · test_e2e.py · test_install.py
|
├── test_cli.py · test_cli_*.py · test_e2e.py · test_install.py
|
||||||
@@ -365,10 +367,27 @@ tests/
|
|||||||
├── test_workflows.py # happy path per Ready tool
|
├── test_workflows.py # happy path per Ready tool
|
||||||
├── test_dedup_review.py # match-group card interactions
|
├── test_dedup_review.py # match-group card interactions
|
||||||
├── test_advanced_panels.py # config_panel widgets
|
├── test_advanced_panels.py # config_panel widgets
|
||||||
|
├── test_pipeline_builder.py # module-card builder: cards, reorder, JSON, run
|
||||||
|
├── test_pipeline_phrasing.py # step_phrase/step_status + name bridge (pure fns)
|
||||||
├── test_errors.py # malformed-upload error paths
|
├── test_errors.py # malformed-upload error paths
|
||||||
└── test_findings_panel.py # analyzer findings rendering
|
└── test_findings_panel.py # analyzer findings rendering
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Pipeline (Automated Workflows) coverage
|
||||||
|
|
||||||
|
The pipeline feature is pinned end to end across four files (~115 tests):
|
||||||
|
`test_pipeline.py` (core engine — every adapter's summary numbers, step
|
||||||
|
data-flow, error stop/continue, empty/single-column/all-disabled edges,
|
||||||
|
dict + file serialization round-trips, `recommended_pipeline(include=…)`,
|
||||||
|
soft-dependency validation), `test_cli_pipeline.py` (CLI — `--recommend`,
|
||||||
|
dry-run-by-default, `--apply` output + audit JSON, `--steps`, `--strict`,
|
||||||
|
`--continue-on-error`, arg validation, save→load round-trip),
|
||||||
|
`test_pipeline_builder.py` (the visual builder via AppTest — card seeding,
|
||||||
|
toggle, reorder ▲/▼, add/remove, restore-recommended, Advanced JSON
|
||||||
|
import/export, per-tool Configure panels emitting the right option dicts),
|
||||||
|
and `test_pipeline_phrasing.py` (the plain-English `step_phrase`/`step_status`
|
||||||
|
helpers and the adapter-key→friendly-name bridge as pure functions).
|
||||||
|
|
||||||
### GUI test layer
|
### GUI test layer
|
||||||
|
|
||||||
GUI tests drive pages with `streamlit.testing.v1.AppTest` —
|
GUI tests drive pages with `streamlit.testing.v1.AppTest` —
|
||||||
|
|||||||
@@ -124,7 +124,7 @@ Tag a release → 3 platform artifacts upload to GitHub Releases. Manual: copy t
|
|||||||
|
|
||||||
### 3.10 Bundled Tesseract (PDF Extractor OCR)
|
### 3.10 Bundled Tesseract (PDF Extractor OCR)
|
||||||
|
|
||||||
Frozen builds ship Tesseract 5.5 + `eng.traineddata` inside the PyInstaller bundle so scanned PDFs work without a separate install. Per-platform binary URLs pinned in `build/make_release.py`; tessdata vendored at `build/vendor/tessdata/eng.traineddata`. License attribution in `LICENSE_TESSERACT.txt` at the repo root.
|
Frozen builds ship Tesseract 5.5 + `eng.traineddata` inside the PyInstaller bundle so scanned PDFs work without a separate install. Per-platform binary URLs pinned in `build/tesseract.py`; tessdata vendored at `build/vendor/tessdata/eng.traineddata`. License attribution in `LICENSE_TESSERACT.txt` at the repo root.
|
||||||
|
|
||||||
**Discovery order at runtime** (see `docs/DEVELOPER.md` for the full Path layout):
|
**Discovery order at runtime** (see `docs/DEVELOPER.md` for the full Path layout):
|
||||||
|
|
||||||
|
|||||||
@@ -25,16 +25,11 @@ Para usar la misma licencia en otro equipo: desactiva éste (página Activar →
|
|||||||
|
|
||||||
## 1. Instalación
|
## 1. Instalación
|
||||||
|
|
||||||
No necesitas tener Python ni permisos de administrador — el paquete trae su propio intérprete y todas las dependencias. Dos formatos por sistema operativo, elige el que tu política de TI permita:
|
No necesitas tener Python ni permisos de administrador — el paquete trae su propio intérprete y todas las dependencias. Cada sistema operativo tiene un único instalador que crea automáticamente el acceso directo en el escritorio + la entrada en el menú Inicio / Launchpad.
|
||||||
|
|
||||||
- **Instalador** — crea automáticamente acceso directo en el escritorio + entrada en el menú Inicio / Launchpad. Recomendado para la mayoría.
|
|
||||||
- **.zip portable** — descomprime y haz doble clic. No toca el registro, se ejecuta desde cualquier lugar (escritorio, USB, recurso de red). Úsalo si no puedes ejecutar instaladores, quieres una instalación de una sola carpeta que puedas copiar entre equipos, o estás evaluando antes de instalar.
|
|
||||||
|
|
||||||
Ambos formatos son idénticos por dentro: mismo Python, mismas dependencias, mismo comportamiento de arranque.
|
|
||||||
|
|
||||||
### 1.1 Windows
|
### 1.1 Windows
|
||||||
|
|
||||||
**Opción A — Instalador (`DataTools-<ver>-win-setup.exe`)**
|
**Instalador (`DataTools-<ver>-win-setup.exe`)**
|
||||||
|
|
||||||
1. Descarga `DataTools-<ver>-win-setup.exe` desde tu correo de licencia o GitHub Releases.
|
1. Descarga `DataTools-<ver>-win-setup.exe` desde tu correo de licencia o GitHub Releases.
|
||||||
2. Doble clic en el instalador. La primera vez, Windows SmartScreen mostrará **"Windows protegió tu PC"** — pulsa **Más información** → **Ejecutar de todas formas**. (Este aviso solo aparece una vez por compilación hasta que tengamos un certificado EV de firma de código.)
|
2. Doble clic en el instalador. La primera vez, Windows SmartScreen mostrará **"Windows protegió tu PC"** — pulsa **Más información** → **Ejecutar de todas formas**. (Este aviso solo aparece una vez por compilación hasta que tengamos un certificado EV de firma de código.)
|
||||||
@@ -44,18 +39,11 @@ Ambos formatos son idénticos por dentro: mismo Python, mismas dependencias, mis
|
|||||||
|
|
||||||
Para anclarlo a la barra de tareas, lanza la app una vez, clic derecho en su icono de la barra de tareas, y **Anclar a la barra de tareas**. Windows requiere este paso manual — ningún instalador puede anclar por programa.
|
Para anclarlo a la barra de tareas, lanza la app una vez, clic derecho en su icono de la barra de tareas, y **Anclar a la barra de tareas**. Windows requiere este paso manual — ningún instalador puede anclar por programa.
|
||||||
|
|
||||||
**Opción B — Portable (`DataTools-<ver>-win-portable.zip`)**
|
**Desinstalar**: Configuración → Aplicaciones → DataTools → Desinstalar.
|
||||||
|
|
||||||
1. Descarga `DataTools-<ver>-win-portable.zip`.
|
|
||||||
2. Clic derecho en el .zip → **Extraer todo…** → elige una carpeta (p. ej. `C:\Tools\DataTools`).
|
|
||||||
3. Abre la carpeta `DataTools\` extraída, doble clic en `DataTools.exe`. El aviso de SmartScreen aparece solo la primera vez.
|
|
||||||
4. Para crear tu propio acceso directo en el escritorio: clic derecho en `DataTools.exe` → **Enviar a → Escritorio (crear acceso directo)**.
|
|
||||||
|
|
||||||
**Desinstalar** (solo instalador): Configuración → Aplicaciones → DataTools → Desinstalar. Portable: borra la carpeta.
|
|
||||||
|
|
||||||
### 1.2 macOS
|
### 1.2 macOS
|
||||||
|
|
||||||
**Opción A — DMG instalador (`DataTools-<ver>-mac.dmg`)**
|
**DMG instalador (`DataTools-<ver>-mac.dmg`)**
|
||||||
|
|
||||||
1. Descarga `DataTools-<ver>-mac.dmg`.
|
1. Descarga `DataTools-<ver>-mac.dmg`.
|
||||||
2. Doble clic en el .dmg. Se abre una ventana de Finder con el icono **DataTools** y un alias **Aplicaciones**.
|
2. Doble clic en el .dmg. Se abre una ventana de Finder con el icono **DataTools** y un alias **Aplicaciones**.
|
||||||
@@ -65,12 +53,6 @@ Para anclarlo a la barra de tareas, lanza la app una vez, clic derecho en su ico
|
|||||||
|
|
||||||
Para mantener DataTools en el Dock: lanza la app, clic derecho en su icono del Dock → **Opciones → Mantener en el Dock**. macOS no permite que los instaladores fijen al Dock automáticamente.
|
Para mantener DataTools en el Dock: lanza la app, clic derecho en su icono del Dock → **Opciones → Mantener en el Dock**. macOS no permite que los instaladores fijen al Dock automáticamente.
|
||||||
|
|
||||||
**Opción B — Portable (`DataTools-<ver>-mac-portable.zip`)**
|
|
||||||
|
|
||||||
1. Descarga `DataTools-<ver>-mac-portable.zip`. Safari descomprime al descargar por defecto; en Finder verás `DataTools.app` directamente.
|
|
||||||
2. Mueve `DataTools.app` a **Aplicaciones** si quieres que aparezca en Launchpad — o déjalo en el escritorio, un USB o un recurso de red. La .app portable se ejecuta desde cualquier sitio.
|
|
||||||
3. Doble clic en `DataTools.app`. Clic derecho → **Abrir** la primera vez (misma rutina que con el DMG).
|
|
||||||
|
|
||||||
**Desinstalar**: arrastra `DataTools.app` a la Papelera. Tus archivos de datos siguen donde estén — la app no instala nada más.
|
**Desinstalar**: arrastra `DataTools.app` a la Papelera. Tus archivos de datos siguen donde estén — la app no instala nada más.
|
||||||
|
|
||||||
### 1.3 Linux
|
### 1.3 Linux
|
||||||
|
|||||||
@@ -25,16 +25,11 @@ To use the same license on a different machine: deactivate this one (Activate pa
|
|||||||
|
|
||||||
## 1. Install
|
## 1. Install
|
||||||
|
|
||||||
You don't need Python and you don't need admin rights — the bundle ships its own interpreter and every dependency. Two flavors per OS, pick whichever your IT policy allows:
|
You don't need Python and you don't need admin rights — the bundle ships its own interpreter and every dependency. Each OS gets a single installer that wires up the Desktop shortcut + Start Menu / Launchpad entry automatically.
|
||||||
|
|
||||||
- **Installer** — wires up Desktop shortcut + Start Menu / Launchpad entry automatically. Recommended for most users.
|
|
||||||
- **Portable .zip** — unzip and double-click. No registry writes, runs from anywhere (Desktop, USB stick, network share). Use this if you can't run installers, want a single-folder install you can copy between machines, or are evaluating before committing to install.
|
|
||||||
|
|
||||||
Both flavors are byte-identical inside: same Python, same dependencies, same launch behavior.
|
|
||||||
|
|
||||||
### 1.1 Windows
|
### 1.1 Windows
|
||||||
|
|
||||||
**Option A — Installer (`DataTools-<ver>-win-setup.exe`)**
|
**Installer (`DataTools-<ver>-win-setup.exe`)**
|
||||||
|
|
||||||
1. Download `DataTools-<ver>-win-setup.exe` from your release email or GitHub Releases.
|
1. Download `DataTools-<ver>-win-setup.exe` from your release email or GitHub Releases.
|
||||||
2. Double-click the installer. On the first run Windows SmartScreen will say **"Windows protected your PC"** — click **More info** → **Run anyway**. (This warning only appears once per build until we have an EV code-signing cert.)
|
2. Double-click the installer. On the first run Windows SmartScreen will say **"Windows protected your PC"** — click **More info** → **Run anyway**. (This warning only appears once per build until we have an EV code-signing cert.)
|
||||||
@@ -44,18 +39,11 @@ Both flavors are byte-identical inside: same Python, same dependencies, same lau
|
|||||||
|
|
||||||
To pin to the taskbar, launch the app once, right-click its icon in the taskbar, then **Pin to taskbar**. Windows requires this manual step — no installer is allowed to pin programmatically.
|
To pin to the taskbar, launch the app once, right-click its icon in the taskbar, then **Pin to taskbar**. Windows requires this manual step — no installer is allowed to pin programmatically.
|
||||||
|
|
||||||
**Option B — Portable (`DataTools-<ver>-win-portable.zip`)**
|
**Uninstall**: Settings → Apps → DataTools → Uninstall.
|
||||||
|
|
||||||
1. Download `DataTools-<ver>-win-portable.zip`.
|
|
||||||
2. Right-click the .zip → **Extract All…** → pick a folder (e.g. `C:\Tools\DataTools`).
|
|
||||||
3. Open the extracted `DataTools\` folder, double-click `DataTools.exe`. SmartScreen warning fires the first time only.
|
|
||||||
4. To create your own desktop shortcut later: right-click `DataTools.exe` → **Send to → Desktop (create shortcut)**.
|
|
||||||
|
|
||||||
**Uninstall** (installer only): Settings → Apps → DataTools → Uninstall. Portable: delete the folder.
|
|
||||||
|
|
||||||
### 1.2 macOS
|
### 1.2 macOS
|
||||||
|
|
||||||
**Option A — Installer DMG (`DataTools-<ver>-mac.dmg`)**
|
**Installer DMG (`DataTools-<ver>-mac.dmg`)**
|
||||||
|
|
||||||
1. Download `DataTools-<ver>-mac.dmg`.
|
1. Download `DataTools-<ver>-mac.dmg`.
|
||||||
2. Double-click the .dmg. A Finder window opens showing the **DataTools** icon and an **Applications** alias.
|
2. Double-click the .dmg. A Finder window opens showing the **DataTools** icon and an **Applications** alias.
|
||||||
@@ -65,12 +53,6 @@ To pin to the taskbar, launch the app once, right-click its icon in the taskbar,
|
|||||||
|
|
||||||
To keep DataTools in the Dock: launch the app, right-click its Dock icon → **Options → Keep in Dock**. macOS doesn't allow installers to pin to the Dock automatically.
|
To keep DataTools in the Dock: launch the app, right-click its Dock icon → **Options → Keep in Dock**. macOS doesn't allow installers to pin to the Dock automatically.
|
||||||
|
|
||||||
**Option B — Portable (`DataTools-<ver>-mac-portable.zip`)**
|
|
||||||
|
|
||||||
1. Download `DataTools-<ver>-mac-portable.zip`. Safari auto-unzips on download; in Finder you'll see `DataTools.app` directly.
|
|
||||||
2. Move `DataTools.app` to **Applications** if you want it discoverable via Launchpad — or keep it on your Desktop, a USB stick, or a network share. The portable .app runs from anywhere.
|
|
||||||
3. Double-click `DataTools.app`. Right-click → **Open** the first time (same unsigned-build dance as the DMG).
|
|
||||||
|
|
||||||
**Uninstall**: drag `DataTools.app` to the Trash. Your data files stay where you put them — nothing else is installed.
|
**Uninstall**: drag `DataTools.app` to the Trash. Your data files stay where you put them — nothing else is installed.
|
||||||
|
|
||||||
### 1.3 Linux
|
### 1.3 Linux
|
||||||
|
|||||||
@@ -9,9 +9,9 @@ Cloudflare Pages.
|
|||||||
```
|
```
|
||||||
landing/
|
landing/
|
||||||
├── _shared/styles.css shared CSS (system fonts, no externals)
|
├── _shared/styles.css shared CSS (system fonts, no externals)
|
||||||
├── shopify-pet/index.html Shopify operator (priority: pet supplies)
|
├── bookkeeper/index.html bookkeeper — bank reconciliation
|
||||||
├── bookkeeper/index.html bookkeeper / freelance accountant
|
├── ap-1099/index.html accounts payable — 1099 vendor prep
|
||||||
├── revops/index.html marketing / RevOps agency
|
├── ar-aging/index.html accounts receivable — open invoices
|
||||||
└── README.md this file
|
└── README.md this file
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -19,8 +19,8 @@ Each page:
|
|||||||
|
|
||||||
- Inherits `landing/_shared/styles.css`
|
- Inherits `landing/_shared/styles.css`
|
||||||
- Overrides the `--accent` colour variable in an inline `<style>` block
|
- Overrides the `--accent` colour variable in an inline `<style>` block
|
||||||
so each persona has its own visual identity (Shopify = mint green,
|
so each persona has its own visual identity (Bookkeeper = steel blue,
|
||||||
Bookkeeper = steel blue, RevOps = vivid violet)
|
AP / 1099 = amber/gold, AR = receivables green)
|
||||||
- Has a sticky buy bar with the Gumroad CTA tagged with `?from=<persona>`
|
- Has a sticky buy bar with the Gumroad CTA tagged with `?from=<persona>`
|
||||||
- Embeds the live demo (Streamlit) via `<iframe>` with a sandbox attribute
|
- Embeds the live demo (Streamlit) via `<iframe>` with a sandbox attribute
|
||||||
- Carries persona-specific H1, sub-copy, use cases, FAQ, and a
|
- Carries persona-specific H1, sub-copy, use cases, FAQ, and a
|
||||||
@@ -64,13 +64,13 @@ wrangler pages deploy landing/dist
|
|||||||
```
|
```
|
||||||
|
|
||||||
Configure the custom apex domain (`datatools.app`) in the Cloudflare
|
Configure the custom apex domain (`datatools.app`) in the Cloudflare
|
||||||
Pages project settings; sub-paths `/shopify-pet/`, `/bookkeeper/`,
|
Pages project settings; sub-paths `/bookkeeper/`, `/ap-1099/`,
|
||||||
`/revops/` are served automatically because the directory layout
|
`/ar-aging/` are served automatically because the directory layout
|
||||||
mirrors them. Cache rule defaults are fine (HTML 1 day, CSS 7 days).
|
mirrors them. Cache rule defaults are fine (HTML 1 day, CSS 7 days).
|
||||||
|
|
||||||
If you want **separate Pages projects** per persona for independent
|
If you want **separate Pages projects** per persona for independent
|
||||||
A/B testing, point three projects at the same `landing/dist/` and
|
A/B testing, point three projects at the same `landing/dist/` and
|
||||||
configure each with its own sub-domain (`shopify.datatools.app`, etc.)
|
configure each with its own sub-domain (`bookkeeper.datatools.app`, etc.)
|
||||||
and a Pages rule that rewrites the root to that persona's
|
and a Pages rule that rewrites the root to that persona's
|
||||||
sub-directory.
|
sub-directory.
|
||||||
|
|
||||||
@@ -110,7 +110,7 @@ Refresh the page when:
|
|||||||
| `page_view → run_completed < 30%` for 4 weeks | The demo iframe isn't loading or visitors aren't engaging. Check the iframe URL. Move the demo above the fold if it's currently below. |
|
| `page_view → run_completed < 30%` for 4 weeks | The demo iframe isn't loading or visitors aren't engaging. Check the iframe URL. Move the demo above the fold if it's currently below. |
|
||||||
| New tool ships (06–09) | Add it to the persona's saved pipeline only if it fits — don't bloat the demo with every tool. |
|
| New tool ships (06–09) | Add it to the persona's saved pipeline only if it fits — don't bloat the demo with every tool. |
|
||||||
| Pricing change | Update `<meta>` schema, the buybar `.price-tag`, the pricing card, and the FAQ. Search-and-replace `$49` across the file. |
|
| Pricing change | Update `<meta>` schema, the buybar `.price-tag`, the pricing card, and the FAQ. Search-and-replace `$49` across the file. |
|
||||||
| New persona added (4th, 5th) | Copy `shopify-pet/index.html`, replace persona-specific copy, add to the `footer` cross-link block on the existing pages. |
|
| New persona added (4th, 5th) | Copy `bookkeeper/index.html`, replace persona-specific copy, add to the `footer` cross-link block on the existing pages. |
|
||||||
|
|
||||||
## Why static HTML
|
## Why static HTML
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
* with zero build step, no privacy banner needed).
|
* with zero build step, no privacy banner needed).
|
||||||
* • Mobile-first; layout reflows below 720 px.
|
* • Mobile-first; layout reflows below 720 px.
|
||||||
* • Dark, focused, content-first. Buyer reads this on a laptop
|
* • Dark, focused, content-first. Buyer reads this on a laptop
|
||||||
* between Shopify exports — keep it readable and skimmable.
|
* between messy accounting exports — keep it readable and skimmable.
|
||||||
* • Persona pages all share this sheet — niche differences live in
|
* • Persona pages all share this sheet — niche differences live in
|
||||||
* copy + accent-color variables overridden in each page's <style>.
|
* copy + accent-color variables overridden in each page's <style>.
|
||||||
*/
|
*/
|
||||||
@@ -18,7 +18,7 @@
|
|||||||
--text-mute: #9aa3b2;
|
--text-mute: #9aa3b2;
|
||||||
--text-soft: #c8ced8;
|
--text-soft: #c8ced8;
|
||||||
--rule: #252a36;
|
--rule: #252a36;
|
||||||
--accent: #6ee7b7; /* Shopify pet default — overridden per persona */
|
--accent: #6ee7b7; /* default accent — overridden per persona */
|
||||||
--accent-ink: #052e1a;
|
--accent-ink: #052e1a;
|
||||||
--warn: #fbbf24;
|
--warn: #fbbf24;
|
||||||
--max: 1080px;
|
--max: 1080px;
|
||||||
|
|||||||
391
landing/ap-1099/index.html
Normal file
391
landing/ap-1099/index.html
Normal file
@@ -0,0 +1,391 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DataTools for 1099 Prep — Clean Your Vendor Master & Recover Missing EINs Locally · $49</title>
|
||||||
|
<meta name="description" content="Build a clean 1099 vendor list — locally. Consolidates duplicate vendor rows, backfills scattered EINs, and flags the genuinely missing ones. 24 messy records → 8 complete vendors, 7 EINs recovered. Your data never leaves your computer. $49 one-time." />
|
||||||
|
<meta name="keywords" content="1099 vendor list, missing EIN, accounts payable cleanup, vendor master dedupe, 1099-NEC prep, QuickBooks vendor export, deduplicate vendors" />
|
||||||
|
<link rel="canonical" href="https://datatools.app/ap-1099/" />
|
||||||
|
<link rel="stylesheet" href="../_shared/styles.css" />
|
||||||
|
|
||||||
|
<!-- Persona accent: Accounts Payable / 1099 → amber/gold invoice tone -->
|
||||||
|
<style>
|
||||||
|
:root { --accent: #d97706; --accent-ink: #2a1604; }
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<!-- Open Graph -->
|
||||||
|
<meta property="og:title" content="DataTools for 1099 Prep — Clean Your Vendor Master & Recover Missing EINs Locally" />
|
||||||
|
<meta property="og:description" content="Consolidate duplicate vendors, backfill scattered EINs, file 1099-NECs on time. Local. No upload. $49 one-time." />
|
||||||
|
<meta property="og:type" content="product" />
|
||||||
|
<meta property="og:url" content="https://datatools.app/ap-1099/" />
|
||||||
|
|
||||||
|
<!-- Schema.org Product -->
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{
|
||||||
|
"@context": "https://schema.org",
|
||||||
|
"@type": "SoftwareApplication",
|
||||||
|
"name": "DataTools for 1099 Prep",
|
||||||
|
"operatingSystem": "Windows, macOS, Linux",
|
||||||
|
"applicationCategory": "BusinessApplication",
|
||||||
|
"offers": {
|
||||||
|
"@type": "Offer",
|
||||||
|
"price": "49",
|
||||||
|
"priceCurrency": "USD"
|
||||||
|
},
|
||||||
|
"description": "Clean your accounts-payable vendor master locally for 1099-NEC season. Six-tool data-cleaning bundle: dedupe-merge to consolidate duplicate vendor rows and backfill missing EINs, text-clean, format-standardize, missing-value handle, column-map, pipeline.",
|
||||||
|
"softwareVersion": "1.0"
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<!-- ============= Sticky buy bar ============= -->
|
||||||
|
<div class="buybar">
|
||||||
|
<div class="buybar-inner">
|
||||||
|
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for 1099 prep</span></div>
|
||||||
|
<div>
|
||||||
|
<span class="price-tag">$49 — one-time, no subscription</span>
|
||||||
|
<a class="btn" href="https://gumroad.com/l/datatools?from=ap-1099" rel="noopener">Get DataTools →</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ============= Hero ============= -->
|
||||||
|
<section class="hero">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For accounts payable · 1099-NEC season · vendor master cleanup</div>
|
||||||
|
<h1>Build a clean 1099 vendor list —<br /><strong>with the missing EINs filled in.</strong></h1>
|
||||||
|
<p class="lead">
|
||||||
|
The same vendor got entered three times across the year — one row has
|
||||||
|
the EIN, another the address, another the phone — and now it's January
|
||||||
|
and you can't file because the numbers are scattered. DataTools
|
||||||
|
consolidates each vendor to one row and backfills the gaps from the
|
||||||
|
duplicates: in our sample, <strong>24 messy records become 8 complete
|
||||||
|
vendors with 7 missing EINs recovered</strong> from duplicate rows.
|
||||||
|
<strong>Your data never leaves your computer.</strong>
|
||||||
|
</p>
|
||||||
|
<div class="cta-row">
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=ap-1099" rel="noopener">Get DataTools for Accounting — $49 →</a>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
||||||
|
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
||||||
|
</div>
|
||||||
|
<div class="stats">
|
||||||
|
<div class="stat"><div class="num">24→8</div><div class="label">messy records to complete vendors</div></div>
|
||||||
|
<div class="stat"><div class="num">7</div><div class="label">missing EINs recovered</div></div>
|
||||||
|
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pain points ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If any of these sound like your January</div>
|
||||||
|
<h2>Six pains DataTools fixes in one pass</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🧾</span>
|
||||||
|
<h3>The same vendor is in the list two or three times</h3>
|
||||||
|
<p>Different staff entered "Acme LLC", "Acme, L.L.C.", and "ACME Llc" across the year. Each is a separate row in the vendor master, and each only holds part of the story — so your 1099 totals split across three near-duplicate spellings.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> hours of manual matching, plus the risk of filing the wrong total.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔢</span>
|
||||||
|
<h3>The EIN is on a different row than the rest of the details</h3>
|
||||||
|
<p>One record captured the EIN at onboarding; the row you actually paid against doesn't have it. At 1099 time the field is blank even though you collected it months ago — it's just sitting on a duplicate.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> chasing W-9s you already have on file.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📵</span>
|
||||||
|
<h3>Phones, addresses, and amounts are formatted five different ways</h3>
|
||||||
|
<p>Remittance phone as <code>(212) 555-0147</code> on one row and <code>212.555.0147</code> on another. Amounts with stray <code>$</code> and commas. The export won't reconcile and the 1099-NEC box totals don't tie out.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> a half-day reconciling before you can even start filing.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">⛔</span>
|
||||||
|
<h3>You don't know which EINs are genuinely missing</h3>
|
||||||
|
<p>Some EINs are recoverable from a duplicate row. Some you never collected. Until the list is consolidated you can't tell the two apart — so you either over-chase vendors or under-file.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> late filings and TIN-mismatch penalties.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📤</span>
|
||||||
|
<h3>Your QuickBooks vendor export doesn't match your AP ledger</h3>
|
||||||
|
<p>The vendor master in QuickBooks, the payments spreadsheet, and the W-9 tracker each use different column names for "vendor name" / "Tax ID" / "amount paid." Merging them is an afternoon of manual rename before any analysis begins.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 4–8 hours per filing season manually merging exports.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔒</span>
|
||||||
|
<h3>Cloud cleaners want you to upload your vendor master</h3>
|
||||||
|
<p>Your vendor master holds EINs, remittance addresses, and payment history — exactly the data you should not be uploading to a SaaS to clean. DataTools is desktop-only — your vendor list never leaves your computer.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> nothing — and that's the point.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Live demo ============= -->
|
||||||
|
<section id="demo">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Live demo · runs in your browser</div>
|
||||||
|
<h2>Try it on a real-looking vendor master export</h2>
|
||||||
|
<p>
|
||||||
|
The demo below loads a sample 24-row vendor file with the pollution
|
||||||
|
we've seen in real AP systems: the same vendor entered two or three
|
||||||
|
times under slightly different spellings, EINs that live on one
|
||||||
|
duplicate row but not the one you paid against, phones and amounts
|
||||||
|
formatted five ways, and the usual mess of
|
||||||
|
<code>N/A</code> / <code>(blank)</code> / <code>?</code> sentinels.
|
||||||
|
Click <strong>Run pipeline</strong> and watch the 24 records collapse
|
||||||
|
to <strong>8 complete vendors with 7 EINs recovered</strong> in under
|
||||||
|
a second.
|
||||||
|
</p>
|
||||||
|
<div class="demo-frame">
|
||||||
|
<iframe
|
||||||
|
src="https://demo.datatools.app/?p=ap-1099"
|
||||||
|
loading="lazy"
|
||||||
|
title="DataTools live demo — accounts payable / 1099 vendor cleanup"
|
||||||
|
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
||||||
|
<div class="demo-caption">
|
||||||
|
Demo runs on free hosting (Streamlit Community Cloud). Capped at
|
||||||
|
100 input rows · output watermarked with one trailing row. The
|
||||||
|
paid product has no caps and runs entirely offline.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Built for AP / 1099 ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Built for the accounts-payable team</div>
|
||||||
|
<h2>Six workflows you do every filing season</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🧹</span>
|
||||||
|
<h3>Vendor-master consolidation</h3>
|
||||||
|
<p>Catches the same vendor that shows up as <code>Acme LLC</code>, <code>Acme, L.L.C.</code>, and <code>ACME Llc</code>. Fuzzy match merges the spellings; the dedup merge collapses them to one row and backfills the gaps from each duplicate.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔢</span>
|
||||||
|
<h3>EIN backfill & missing-EIN flagging</h3>
|
||||||
|
<p>Pulls the EIN off whichever duplicate row captured it and fills it into the survivor. The EINs that are <em>genuinely</em> missing get flagged so you know exactly which W-9s to chase.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">💵</span>
|
||||||
|
<h3>1099-NEC amount roll-up</h3>
|
||||||
|
<p>Before filing: standardize amounts, drop sentinels-as-missing, and merge so each vendor's total paid lands on one row and ties to your AP ledger.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📥</span>
|
||||||
|
<h3>QuickBooks vendor export cleanup</h3>
|
||||||
|
<p>Whitespace in Tax IDs, near-identical vendor names, copy-paste smart quotes in remittance addresses — gone. Audit log shows every change for your reviewer.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔗</span>
|
||||||
|
<h3>Merging the W-9 tracker into the AP ledger</h3>
|
||||||
|
<p>The vendor master, the payments spreadsheet, and the W-9 tracker each name "Tax ID" differently. Map Columns aligns them; the dedup merge consolidates across all three sources.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">⚙️</span>
|
||||||
|
<h3>Repeatable pipeline</h3>
|
||||||
|
<p>Save the cleanup as a JSON file. Drop next year's vendor export on it. Same consolidation, zero re-configuration. Automatable via the CLI.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Privacy moat ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">The thing every cloud cleaner can't say</div>
|
||||||
|
<h2>Your vendor master never leaves your computer.</h2>
|
||||||
|
<p>
|
||||||
|
DataTools is a desktop app. There's no upload step, no SaaS account,
|
||||||
|
no subscription, no "trust our security policy." The first thing you
|
||||||
|
can do after install is open your browser's network tab, run the
|
||||||
|
cleaner on your real vendor file, and verify zero outbound
|
||||||
|
requests.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Why it matters for AP:</strong> your vendor master holds EINs,
|
||||||
|
remittance addresses, and payment history. Cloud cleaners require you
|
||||||
|
to upload it. We don't.
|
||||||
|
</div>
|
||||||
|
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline vendor_1099.csv --pipeline vendor_1099_pipeline.json --apply
|
||||||
|
Reading vendor_1099.csv...
|
||||||
|
24 rows, 9 columns
|
||||||
|
Executing pipeline:
|
||||||
|
<span class="ok">✓</span> text_clean (38 ms) {cells_changed: 41}
|
||||||
|
<span class="ok">✓</span> format_standardize (62 ms) {cells_changed: 36} # phones, EINs, amounts
|
||||||
|
<span class="ok">✓</span> missing (11 ms) {sentinels_standardized: 9}
|
||||||
|
<span class="ok">✓</span> dedup (140 ms) {groups_merged: 8, rows_removed: 16, eins_backfilled: 7}
|
||||||
|
|
||||||
|
Initial rows: 24 → Final rows: 8 (8 complete vendors)
|
||||||
|
EINs recovered from duplicate rows: 7 | Still missing (flagged): 1
|
||||||
|
Unparseable cells: 0
|
||||||
|
Total elapsed: 0.25 s
|
||||||
|
<span class="prompt">$</span> # zero network calls. zero. promise.</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Audit moat ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For when your reviewer asks "what changed?"</div>
|
||||||
|
<h2>Every change auditable. Every cell logged.</h2>
|
||||||
|
<p>
|
||||||
|
Every modification is recorded with the original value, the new
|
||||||
|
value, and which rule fired. Hand the audit CSV to your controller or
|
||||||
|
your reviewer, or add it to your IRS-ready workpaper file, along with the cleaned
|
||||||
|
vendor list. No <em>"I trust the AI"</em> hand-waving — they see
|
||||||
|
exactly which EIN came from which duplicate row.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Real example:</strong> the demo above merged 24 records into
|
||||||
|
8 vendors and backfilled 7 EINs. The dedup audit lists every vendor
|
||||||
|
group with the survivor, its merged-in duplicates, and the source row
|
||||||
|
each recovered EIN was pulled from. The standardize audit lists every
|
||||||
|
phone, amount, and Tax ID it reformatted.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Format handling ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If your vendors are messy — most AP files are</div>
|
||||||
|
<h2>EINs, phones, addresses, and amounts in every shape.</h2>
|
||||||
|
<p>
|
||||||
|
One row has the EIN as <code>12-3456789</code>, another as
|
||||||
|
<code>123456789</code>. The remittance phone is <code>(212)
|
||||||
|
555-0147</code> on one and <code>212.555.0147</code> on the next.
|
||||||
|
An amount reads <code>$12,410.75</code> with a stray space. Excel
|
||||||
|
treats half of these as text errors. DataTools normalizes every one —
|
||||||
|
EINs to a single format, phones to E.164, amounts to clean numerics —
|
||||||
|
so the file reconciles and the 1099 box totals tie out.
|
||||||
|
</p>
|
||||||
|
<ul class="bullets">
|
||||||
|
<li><strong>EIN / Tax-ID normalization</strong> to one consistent <code>NN-NNNNNNN</code> shape, with genuinely-missing ones flagged.</li>
|
||||||
|
<li><strong>Phone standardization</strong> to E.164 via Google's libphonenumber.</li>
|
||||||
|
<li><strong>Amount parsing</strong> for <code>$</code> / commas / stray spaces — including amounts Excel mis-types as text.</li>
|
||||||
|
<li><strong>Address shape detection</strong> for US remittance addresses.</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= What you get ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">In the bundle</div>
|
||||||
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, gap-backfill merge, interactive review.</p></div>
|
||||||
|
<div class="card"><h3>2 · Clean Text</h3><p>Whitespace, smart chars, NBSP, BOM, line endings, case ops.</p></div>
|
||||||
|
<div class="card"><h3>3 · Standardize Formats</h3><p>EINs, amounts, dates, phones, emails, addresses, names, booleans.</p></div>
|
||||||
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Disguised-null detection, profile, flag genuinely-missing fields, drop strategies.</p></div>
|
||||||
|
<div class="card"><h3>5 · Map Columns</h3><p>Fuzzy auto-rename, target schema, type coercion, required-field defaults.</p></div>
|
||||||
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Chain tools in recommended order, save/load JSON, automate next year's vendor cleanup.</p></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pricing ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Pricing — pay once, own it</div>
|
||||||
|
<h2>$49. No subscription. No ceiling on rows or files.</h2>
|
||||||
|
<div class="pricing">
|
||||||
|
<div class="card featured">
|
||||||
|
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>DataTools for 1099 Prep</h3>
|
||||||
|
<ul>
|
||||||
|
<li>All 6 tools, full pipeline</li>
|
||||||
|
<li>Mac · Windows · Linux installers</li>
|
||||||
|
<li>Code-signed (no Gatekeeper warnings)</li>
|
||||||
|
<li>Free updates for the v1.x line</li>
|
||||||
|
<li>Bonus: ready-made vendor-master & 1099 pipelines</li>
|
||||||
|
</ul>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=ap-1099" rel="noopener">Buy on Gumroad →</a>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="row"><div class="price">$149</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>Full DataTools Suite</h3>
|
||||||
|
<p class="muted">Available when 3+ bundles ship. Includes everything in the 1099-prep pack plus the Bookkeeper and Accounts-Receivable bundles. Save $48.</p>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#" aria-disabled="true">Coming when ready</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= FAQ ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<h2>Questions</h2>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Does this work with my QuickBooks vendor export?</summary>
|
||||||
|
<p>Yes — the input is just CSV / Excel from any source. Your QuickBooks vendor export works the same as a Xero export, a Bill.com download, or a vendor spreadsheet you maintain by hand. The cleaner doesn't care where the file came from.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How does this compare to Excel's "Remove Duplicates"?</summary>
|
||||||
|
<p>Excel does <em>exact</em> deduplication and only deletes — it never backfills. <code>Acme LLC</code> and <code>Acme, L.L.C.</code> are different vendors to Excel, and even when it does catch a duplicate it throws the extra row away, taking the EIN with it. DataTools fuzzy-matches across spelling drift, merges the group to one survivor, and pulls the missing EIN, phone, and address off the rows it merges in.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How does it recover a missing EIN?</summary>
|
||||||
|
<p>When it merges a group of duplicate vendor rows, it keeps the survivor and backfills any empty field — including the EIN — from whichever duplicate row had it. In the sample file, 7 of the 8 vendors had their EIN recovered this way; the 1 that's truly missing gets flagged so you know to chase the W-9.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Do I need to know Python to use it?</summary>
|
||||||
|
<p>No. The GUI is a browser interface that opens automatically when you double-click the app. It loads your vendor file, you click Run, you download the cleaned list. The CLI is there for power users who want to script next year's cleanup.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What about my data privacy?</summary>
|
||||||
|
<p>Your vendor master — EINs, remittance addresses, payment history — never leaves your computer. There is no cloud component, no telemetry, no "anonymous usage stats." When the app is running you can confirm zero outbound network requests in your browser's developer tools.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What's your refund policy?</summary>
|
||||||
|
<p>Try the live demo above on the sample vendor dataset before you buy. If DataTools still doesn't fit your workflow, email us within 14 days of purchase for a refund — no questions asked.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Will there be updates?</summary>
|
||||||
|
<p>Yes. The v1.x line is included free for everyone who buys DataTools today. We ship a patch every 30 days adding format support, edge-case fixes, and small features.</p>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Final CTA ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container" style="text-align: center;">
|
||||||
|
<h2>Stop chasing scattered EINs by hand.</h2>
|
||||||
|
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Mac, Windows, or Linux. Runs offline. Consolidates 24 messy records into 8 complete vendors, recovers the 7 EINs hiding on duplicate rows, flags the ones genuinely missing, and saves a pipeline you can re-run on next year's vendor export.</p>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=ap-1099" rel="noopener">Get DataTools for Accounting — $49 →</a>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Footer ============= -->
|
||||||
|
<footer>
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<p><strong>DataTools</strong> — local data-cleaning for accounts payable, bookkeepers, and accounts-receivable teams.</p>
|
||||||
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p>
|
||||||
|
<a href="../bookkeeper/">For bookkeepers</a> ·
|
||||||
|
<a href="../ar-aging/">For accounts receivable</a><br />
|
||||||
|
<a href="https://gumroad.com/l/datatools?from=ap-1099">Buy on Gumroad</a> ·
|
||||||
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
358
landing/ar-aging/index.html
Normal file
358
landing/ar-aging/index.html
Normal file
@@ -0,0 +1,358 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DataTools for Accounts Receivable — Kill Duplicate Invoices Inflating Your AR Aging Report · $49</title>
|
||||||
|
<meta name="description" content="One tool to clean your open-invoices export: standardize invoice dates, due dates, and amounts, lowercase client emails, then remove double-entered invoice numbers so your AR aging report is accurate. 26 rows → 21, five duplicate invoices removed. Fully offline. $49 one-time." />
|
||||||
|
<meta name="keywords" content="accounts receivable aging, duplicate invoices, AR cleanup, open invoices export, invoice dedupe, aging report accuracy, receivables csv tool" />
|
||||||
|
<link rel="canonical" href="https://datatools.app/ar-aging/" />
|
||||||
|
<link rel="stylesheet" href="../_shared/styles.css" />
|
||||||
|
|
||||||
|
<!-- Persona accent: Accounts Receivable → receivables green -->
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--accent: #059669;
|
||||||
|
--accent-ink: #03241a;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<meta property="og:title" content="DataTools for Accounts Receivable — Kill Duplicate Invoices Inflating Your AR Aging Report" />
|
||||||
|
<meta property="og:description" content="Standardize invoice dates, due dates, and amounts, lowercase client emails, then dedupe double-entered invoices — one tool, no upload. $49 one-time." />
|
||||||
|
<meta property="og:type" content="product" />
|
||||||
|
<meta property="og:url" content="https://datatools.app/ar-aging/" />
|
||||||
|
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{
|
||||||
|
"@context": "https://schema.org",
|
||||||
|
"@type": "SoftwareApplication",
|
||||||
|
"name": "DataTools for Accounts Receivable",
|
||||||
|
"operatingSystem": "Windows, macOS, Linux",
|
||||||
|
"applicationCategory": "BusinessApplication",
|
||||||
|
"offers": {
|
||||||
|
"@type": "Offer",
|
||||||
|
"price": "49",
|
||||||
|
"priceCurrency": "USD"
|
||||||
|
},
|
||||||
|
"description": "Clean and dedupe your open-invoices export so the AR aging report is accurate. Standardize invoice dates, due dates, and amounts, lowercase client emails, then remove double-entered invoice numbers — backfilling a blank status from its twin row. Six-tool data-cleaning bundle for accounts receivable and accounting teams.",
|
||||||
|
"softwareVersion": "1.0"
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<div class="buybar">
|
||||||
|
<div class="buybar-inner">
|
||||||
|
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for Accounts Receivable</span></div>
|
||||||
|
<div>
|
||||||
|
<span class="price-tag">$49 — one-time, no subscription</span>
|
||||||
|
<a class="btn" href="https://gumroad.com/l/datatools?from=ar-aging" rel="noopener">Get DataTools →</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section class="hero">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For accounts receivable · controllers · collections · accounting teams</div>
|
||||||
|
<h1>Stop chasing the invoices<br /><strong>your aging report counted twice.</strong></h1>
|
||||||
|
<p class="lead">
|
||||||
|
The same invoice number gets posted twice — once as
|
||||||
|
<code>3/04/2026</code> for <code>$1,250.00</code>, again as
|
||||||
|
<code>2026-03-04</code> for <code>1250</code> — so your AR aging
|
||||||
|
report double-counts the receivable and your team chases a balance
|
||||||
|
that was never really open. DataTools standardizes every invoice
|
||||||
|
date, due date, and amount, lowercases client emails, then removes
|
||||||
|
the double-entered invoice numbers — taking a real open-invoices
|
||||||
|
export from <strong>26 rows to 21, five duplicate invoices
|
||||||
|
removed</strong> — all on your own machine, with nothing uploaded.
|
||||||
|
</p>
|
||||||
|
<div class="cta-row">
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=ar-aging" rel="noopener">Get DataTools for Accounting — $49 →</a>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
||||||
|
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
||||||
|
</div>
|
||||||
|
<div class="stats">
|
||||||
|
<div class="stat"><div class="num">26→21</div><div class="label">rows after dedupe</div></div>
|
||||||
|
<div class="stat"><div class="num">5</div><div class="label">duplicate invoices removed</div></div>
|
||||||
|
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pain points ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If your last aging report didn't tie out to cash</div>
|
||||||
|
<h2>Six pains DataTools fixes before you run the aging report</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">💸</span>
|
||||||
|
<h3>Double-entered invoices inflate every aging bucket</h3>
|
||||||
|
<p>The same invoice number posted twice — once in <code>MM/DD/YYYY</code>, once in ISO — lands in two rows and gets counted twice. Your 60-day bucket looks worse than it is, and the receivables total overstates what's actually owed.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> overstated AR, a balance sheet that won't reconcile, and a controller asking why.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📞</span>
|
||||||
|
<h3>Collections chases invoices that were already paid or never real</h3>
|
||||||
|
<p>When a duplicate invoice number shows as still-open, a collector emails the client about a balance that doesn't exist. The client pushes back, trust erodes, and your team burns a morning untangling it.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> wasted collections hours + an awkward "please disregard" to the client.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">⚖️</span>
|
||||||
|
<h3>Uploading the AR ledger to a cloud cleaner is a compliance headache</h3>
|
||||||
|
<p>Every cloud-based cleaner wants you to upload your full receivables ledger — client names, amounts, contact emails. That's a data-handling review your firm doesn't want to run. DataTools is desktop-only — no upload, no DPA, no review.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> weeks of review per tool, or just not cleaning the data at all.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🗓️</span>
|
||||||
|
<h3>Mixed date formats make due dates and aging unreliable</h3>
|
||||||
|
<p>Invoice dates arrive as <code>3/4/26</code>, <code>2026-03-04</code>, and <code>Mar 4 2026</code>; due dates are just as mixed. Sort by date and the buckets are wrong, so the wrong invoices show up in the wrong aging column.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 1–3 hours per close reconciling dates by hand, every period.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📧</span>
|
||||||
|
<h3>Messy client contacts break your remittance reminders</h3>
|
||||||
|
<p>Client names come in mixed casing and emails arrive as <code>Billing@ClientCo.com</code> in one row and <code>billing@clientco.com</code> in another — so the same client looks like two, and reminders go out twice or not at all.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> duplicate dunning, missed reminders, and a client list that won't group.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">❓</span>
|
||||||
|
<h3>Blank invoice statuses hide whether a receivable is really open</h3>
|
||||||
|
<p>When one of the two twin rows has a blank status, you can't tell if the invoice is open, partial, or paid — so it either gets dropped from the aging report or counted at the wrong stage.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> misclassified receivables and an aging report you can't trust.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section id="demo">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Live demo · runs in your browser</div>
|
||||||
|
<h2>Try it on a real-looking open-invoices export</h2>
|
||||||
|
<p>
|
||||||
|
The demo below loads a 26-row open-invoices export with five
|
||||||
|
double-entered invoice numbers — the same invoice posted twice in
|
||||||
|
different date and amount formats (<code>3/04/2026</code> vs
|
||||||
|
<code>2026-03-04</code>, <code>$1,250.00</code> vs <code>1250</code>),
|
||||||
|
client emails in mixed case, and one blank invoice status. Click
|
||||||
|
<strong>Run pipeline</strong> and watch the 5-step pipeline (text
|
||||||
|
clean → format → missing → column map → dedup) standardize both date
|
||||||
|
columns to ISO, coerce amounts to numbers, lowercase the emails, and
|
||||||
|
collapse 26 rows to 21 — backfilling the blank status from its twin
|
||||||
|
row so the aging report is accurate.
|
||||||
|
</p>
|
||||||
|
<div class="demo-frame">
|
||||||
|
<iframe
|
||||||
|
src="https://demo.datatools.app/?p=ar-aging"
|
||||||
|
loading="lazy"
|
||||||
|
title="DataTools live demo — Accounts Receivable"
|
||||||
|
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
||||||
|
<div class="demo-caption">
|
||||||
|
Demo runs on free hosting. Capped at 100 input rows · output
|
||||||
|
watermarked. The paid product has no caps and runs entirely offline.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Built for the receivables close</div>
|
||||||
|
<h2>Three workflows you do every period</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🪢</span>
|
||||||
|
<h3>Dedupe double-entered invoices</h3>
|
||||||
|
<p>Match on invoice number, drop the second posting, and keep one canonical row per invoice — backfilling a blank status, due date, or amount from its twin so nothing accurate is lost when the duplicate goes.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🗓️</span>
|
||||||
|
<h3>Standardize invoice and due dates</h3>
|
||||||
|
<p>Coerce every invoice date and due date to ISO and every amount to a clean number, so the aging buckets sort correctly and the receivables total ties out to the ledger.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📧</span>
|
||||||
|
<h3>Normalize client contacts for remittance</h3>
|
||||||
|
<p>Lowercase client emails and fix name casing so each client groups as one. Send remit-to reminders once, to a clean contact list — not twice because two rows looked like two clients.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If your export comes from QuickBooks, Xero, or a billing system</div>
|
||||||
|
<h2>Standardized dates and amounts. One row per invoice.</h2>
|
||||||
|
<p>
|
||||||
|
Your billing system exports <code>3/04/2026</code>. The re-post of
|
||||||
|
the same invoice has <code>2026-03-04</code>. The amount is
|
||||||
|
<code>$1,250.00</code> in one row and <code>1250</code> in the other.
|
||||||
|
DataTools reads each row, normalizes both date columns to ISO,
|
||||||
|
coerces the amount to a number, and then matches on invoice number
|
||||||
|
to keep exactly one canonical row per receivable.
|
||||||
|
</p>
|
||||||
|
<ul class="bullets">
|
||||||
|
<li><strong>Invoice date + due date</strong> both standardized to ISO, so every aging bucket sorts and totals correctly.</li>
|
||||||
|
<li><strong>Amounts coerced to numbers</strong>: <code>$1,250.00</code> and <code>1250</code> resolve to the same value — no false mismatch between twin rows.</li>
|
||||||
|
<li><strong>Client emails lowercased</strong> so the same client groups as one for remittance reminders.</li>
|
||||||
|
<li><strong>Status backfill on dedupe</strong>: when one twin row has a blank invoice status, the survivor inherits the filled-in status from the other row — so no open receivable goes missing from the report.</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For anyone who reports on receivables</div>
|
||||||
|
<h2>Every duplicate invoice you don't catch overstates your AR.</h2>
|
||||||
|
<p>
|
||||||
|
Your aging report is only as good as the export under it. Every
|
||||||
|
double-entered invoice number is a receivable counted twice — it
|
||||||
|
inflates the aging buckets, overstates the total owed, and sends
|
||||||
|
collections after balances that aren't really open. DataTools
|
||||||
|
catches them once, before the report runs, by matching on invoice
|
||||||
|
number with the date and amount noise already standardized away.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Real numbers from the demo:</strong> a 26-row open-invoices
|
||||||
|
export collapses to 21 — that's five double-entered invoices the
|
||||||
|
mixed date and amount formats were hiding, both date columns now
|
||||||
|
ISO, amounts numeric, emails lowercased, 0 unparseable, and a blank
|
||||||
|
status backfilled from its twin row. The aging report finally ties out.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">The thing every cloud cleaner can't say</div>
|
||||||
|
<h2>Your clients' receivables never leave your computer.</h2>
|
||||||
|
<p>
|
||||||
|
Cloud cleaning tools require you to upload your AR ledger — client
|
||||||
|
names, invoice amounts, remit-to contacts. That ledger is sensitive
|
||||||
|
client financial data, and once it's on someone else's server, your
|
||||||
|
firm owns a data-handling problem you didn't need. DataTools is a
|
||||||
|
desktop app. There is no upload step.
|
||||||
|
</p>
|
||||||
|
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline ar_open_invoices.csv --pipeline ar_open_invoices_pipeline.json --apply
|
||||||
|
Reading ar_open_invoices.csv...
|
||||||
|
26 rows, 9 columns
|
||||||
|
Executing pipeline:
|
||||||
|
<span class="ok">✓</span> text_clean (40 ms) {cells_changed: 31}
|
||||||
|
<span class="ok">✓</span> format_standardize (120 ms) {dates_to_iso: 41, amounts_to_number: 26, emails_lowercased: 18}
|
||||||
|
<span class="ok">✓</span> missing (30 ms) {sentinels_standardized: 4, status_backfilled: 1}
|
||||||
|
<span class="ok">✓</span> column_map (20 ms) {columns_renamed: 2}
|
||||||
|
<span class="ok">✓</span> dedup (60 ms) {duplicate_invoices_removed: 5, merged: 5}
|
||||||
|
|
||||||
|
Initial rows: 26 → Final rows: 21
|
||||||
|
Unparseable dates/amounts: 0
|
||||||
|
Total elapsed: 0.3 s
|
||||||
|
<span class="prompt">$</span> # 5 double-entered invoices gone. aging report ties out. for $49.</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">In the bundle</div>
|
||||||
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Match on invoice number; keep one canonical row per receivable and backfill blanks from the twin.</p></div>
|
||||||
|
<div class="card"><h3>2 · Clean Text</h3><p>Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.</p></div>
|
||||||
|
<div class="card"><h3>3 · Standardize Formats</h3><p>Invoice and due dates to ISO, amounts to clean numbers, client emails lowercased.</p></div>
|
||||||
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Detect <code>TBD</code>, <code>(unknown)</code>, <code>—</code> and backfill blank invoice statuses on dedupe.</p></div>
|
||||||
|
<div class="card"><h3>5 · Map Columns</h3><p>Project to your aging-report schema, coerce amount to a number, reorder fields for import.</p></div>
|
||||||
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Save the cleanup as JSON. Drop next period's open-invoices export on it. Same dedupe, automated.</p></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Pricing — pay once, own it</div>
|
||||||
|
<h2>$49. No subscription. No per-close fee.</h2>
|
||||||
|
<div class="pricing">
|
||||||
|
<div class="card featured">
|
||||||
|
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>DataTools for Accounts Receivable</h3>
|
||||||
|
<ul>
|
||||||
|
<li>All 6 tools, full pipeline</li>
|
||||||
|
<li>Mac · Windows · Linux installers</li>
|
||||||
|
<li>Code-signed (no Gatekeeper warnings)</li>
|
||||||
|
<li>Free updates for the v1.x line</li>
|
||||||
|
<li>Bonus: open-invoices dedupe pipeline preset</li>
|
||||||
|
<li><strong>Use on any number of clients</strong> — no seat limits</li>
|
||||||
|
</ul>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=ar-aging" rel="noopener">Buy on Gumroad →</a>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="row"><div class="price">$149</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>Full DataTools Suite</h3>
|
||||||
|
<p class="muted">Available when 3+ bundles ship. Includes everything in the Accounts Receivable pack plus the Bookkeeper and Accounts Payable / 1099 bundles. Save $48.</p>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#" aria-disabled="true">Coming when ready</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<h2>Questions</h2>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Does this replace my accounting system's deduplication?</summary>
|
||||||
|
<p>No — it cleans the export <em>before</em> you run the aging report or import it back. Most billing systems will happily hold two postings of the same invoice number; DataTools catches the double-entered invoice so it never inflates a single aging bucket.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How does it know two rows are the same invoice?</summary>
|
||||||
|
<p>It matches on invoice number after the date and amount formats are standardized away. So a posting dated <code>3/04/2026</code> for <code>$1,250.00</code> and its twin dated <code>2026-03-04</code> for <code>1250</code> are recognized as one invoice — and only one canonical row survives.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What happens to a blank invoice status when the duplicate is removed?</summary>
|
||||||
|
<p>It's backfilled. If one twin row has a blank status and the other says <code>open</code>, the surviving row inherits <code>open</code> — so no real receivable drops off the aging report just because the duplicate carried the better data.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Can I use it on multiple clients without paying again?</summary>
|
||||||
|
<p>Yes. The license is per-operator, not per-client. Run it on every client's open-invoices export for the same $49.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What does the audit trail look like?</summary>
|
||||||
|
<p>A row-by-row CSV: every modified cell with its original value, new value, and which rule fired — every date coerced to ISO, every amount normalized, every duplicate invoice removed. A separate JSON file describes the pipeline that produced it, so the cleanup reproduces deterministically and your client can verify it on their machine.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What's your refund policy?</summary>
|
||||||
|
<p>Try the live demo above on the sample open-invoices export before you buy. If DataTools doesn't fit your workflow within 14 days, email for a refund — no questions asked.</p>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container" style="text-align: center;">
|
||||||
|
<h2>Stop counting the same receivable twice.</h2>
|
||||||
|
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Standardizes invoice dates, due dates, and amounts, lowercases client emails, removes the double-entered invoices your aging report was counting twice, and saves a pipeline you can re-run on next period's open-invoices export.</p>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=ar-aging" rel="noopener">Get DataTools for Accounting — $49 →</a>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer>
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<p><strong>DataTools</strong> — local data-cleaning for bookkeepers, accounts payable, and accounts receivable teams.</p>
|
||||||
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p>
|
||||||
|
<a href="../bookkeeper/">For bookkeepers</a> ·
|
||||||
|
<a href="../ap-1099/">For accounts payable / 1099</a><br />
|
||||||
|
<a href="https://gumroad.com/l/datatools?from=ar-aging">Buy on Gumroad</a> ·
|
||||||
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
@@ -3,9 +3,9 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8" />
|
<meta charset="utf-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
<title>DataTools for Bookkeepers — Reconcile Bank Exports With An Audit Trail · $49</title>
|
<title>DataTools for Bookkeepers — Catch Bank Transactions Posted Twice · $49</title>
|
||||||
<meta name="description" content="Reconcile messy bank exports. Catch duplicate transactions QuickBooks imported twice. Standardize dates, amounts, and vendor casing — locally. Every change auditable. $49 one-time." />
|
<meta name="description" content="Catch the transactions your bank export posted twice. Standardize every date to ISO and every amount to numeric, then dedup on the real transaction so the reconciliation ties out — with a row-level audit trail. $49 one-time." />
|
||||||
<meta name="keywords" content="reconcile bank export csv, quickbooks duplicate transactions, vendor list cleanup, bookkeeper csv tool, bank export deduplicator, bookkeeper audit trail" />
|
<meta name="keywords" content="bank reconciliation, duplicate transactions, bank export csv cleanup, QuickBooks reconcile, bookkeeper csv tool" />
|
||||||
<link rel="canonical" href="https://datatools.app/bookkeeper/" />
|
<link rel="canonical" href="https://datatools.app/bookkeeper/" />
|
||||||
<link rel="stylesheet" href="../_shared/styles.css" />
|
<link rel="stylesheet" href="../_shared/styles.css" />
|
||||||
|
|
||||||
@@ -18,8 +18,8 @@
|
|||||||
</style>
|
</style>
|
||||||
|
|
||||||
<!-- Open Graph -->
|
<!-- Open Graph -->
|
||||||
<meta property="og:title" content="DataTools for Bookkeepers — Reconcile Bank Exports With An Audit Trail" />
|
<meta property="og:title" content="DataTools for Bookkeepers — Catch Bank Transactions Posted Twice" />
|
||||||
<meta property="og:description" content="Catch duplicate transactions. Standardize dates and amounts. Hand your client an audit trail. $49 one-time." />
|
<meta property="og:description" content="The same payment posts twice in two date/amount formats and a plain dedupe misses it. DataTools standardizes, dedups on the real transaction, and hands you an audit trail. $49 one-time." />
|
||||||
<meta property="og:type" content="product" />
|
<meta property="og:type" content="product" />
|
||||||
<meta property="og:url" content="https://datatools.app/bookkeeper/" />
|
<meta property="og:url" content="https://datatools.app/bookkeeper/" />
|
||||||
|
|
||||||
@@ -35,7 +35,7 @@
|
|||||||
"price": "49",
|
"price": "49",
|
||||||
"priceCurrency": "USD"
|
"priceCurrency": "USD"
|
||||||
},
|
},
|
||||||
"description": "Reconcile bank exports, dedupe vendor lists, and produce a hand-off-ready audit trail. Six-tool data-cleaning bundle for bookkeepers and freelance accountants.",
|
"description": "Catch the duplicate transactions your bank export posted twice across overlapping months, standardize dates and amounts, and produce a hand-off-ready audit trail. Six-tool data-cleaning bundle for bookkeepers and freelance accountants.",
|
||||||
"softwareVersion": "1.0"
|
"softwareVersion": "1.0"
|
||||||
}
|
}
|
||||||
</script>
|
</script>
|
||||||
@@ -47,7 +47,7 @@
|
|||||||
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for Bookkeepers</span></div>
|
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for Bookkeepers</span></div>
|
||||||
<div>
|
<div>
|
||||||
<span class="price-tag">$49 — one-time, no subscription</span>
|
<span class="price-tag">$49 — one-time, no subscription</span>
|
||||||
<a class="btn" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools →</a>
|
<a class="btn" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools for Bookkeepers — $49 →</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -55,24 +55,29 @@
|
|||||||
<section class="hero">
|
<section class="hero">
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<div class="eyebrow">For bookkeepers · freelance accountants · small-firm partners</div>
|
<div class="eyebrow">For bookkeepers · freelance accountants · small-firm partners</div>
|
||||||
<h1>Reconcile messy bank exports.<br /><strong>Hand your client an audit trail.</strong></h1>
|
<h1>Catch the transactions your bank export<br /><strong>posted twice.</strong></h1>
|
||||||
<p class="lead">
|
<p class="lead">
|
||||||
The Jan and Feb exports overlap and you've got the same transaction
|
The Jan and Feb exports overlap, so the <em>same</em> payment posts
|
||||||
booked twice. Vendor names are <em>"Amazon"</em>, <em>"amazon.com"</em>,
|
twice in two different shapes — <code>01/15/2025 +$3,450.00</code>
|
||||||
and <em>"AMAZON.COM*4F2X9"</em> in three different rows. Dates are a
|
in one export and <code>2025-01-15 3450.00</code> in the
|
||||||
smoosh of <code>01/15/2025</code>, <code>2025-01-15</code>, and
|
other — and a plain Excel dedupe never catches it because the dates and
|
||||||
<code>Jan 18 2025</code>. DataTools fixes all of it in one pass —
|
amounts don't match character-for-character. DataTools standardizes
|
||||||
and produces a row-by-row CSV showing every change so your client
|
every date to ISO and every amount to numeric (parens-negatives
|
||||||
can verify your work.
|
resolved), then dedups on the <em>real</em> transaction so the
|
||||||
|
reconciliation ties out. On the sample export that's
|
||||||
|
<strong>26 rows → 20</strong> — six phantom duplicate transactions
|
||||||
|
removed, 36 date/amount cells standardized, 0 unparseable — and you
|
||||||
|
get a row-by-row CSV showing every change so your client can verify
|
||||||
|
your work.
|
||||||
</p>
|
</p>
|
||||||
<div class="cta-row">
|
<div class="cta-row">
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools — $49 →</a>
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools for Bookkeepers — $49 →</a>
|
||||||
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
||||||
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
||||||
</div>
|
</div>
|
||||||
<div class="stats">
|
<div class="stats">
|
||||||
<div class="stat"><div class="num">6</div><div class="label">tools, one bundle</div></div>
|
<div class="stat"><div class="num">26→20</div><div class="label">rows, on the sample export</div></div>
|
||||||
<div class="stat"><div class="num">100 %</div><div class="label">auditable changes</div></div>
|
<div class="stat"><div class="num">6</div><div class="label">phantom duplicates removed</div></div>
|
||||||
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -129,13 +134,15 @@
|
|||||||
<div class="eyebrow">Live demo · runs in your browser</div>
|
<div class="eyebrow">Live demo · runs in your browser</div>
|
||||||
<h2>Try it on a sample bank export with a known overlap</h2>
|
<h2>Try it on a sample bank export with a known overlap</h2>
|
||||||
<p>
|
<p>
|
||||||
The demo below loads a 25-row export combining January and February
|
The demo below loads a 26-row export combining January and February
|
||||||
activity, with the month-boundary rows duplicated across exports —
|
activity, with the month-boundary rows duplicated across exports —
|
||||||
the exact scenario where QuickBooks (or any reconciler) silently
|
the exact scenario where QuickBooks (or any reconciler) silently
|
||||||
double-counts transactions. Click <strong>Run pipeline</strong> and
|
double-counts transactions. Click <strong>Run pipeline</strong> and
|
||||||
watch the dedup catch every overlap, dates land in ISO format, and
|
watch it standardize 36 date/amount cells, land every date in ISO
|
||||||
the parens-negative amounts (<code>($89.50)</code>) become proper
|
format, turn the parens-negative amounts (<code>($89.50)</code>) into
|
||||||
negative numbers.
|
proper negatives, flag the disguised-null categories, and dedup the
|
||||||
|
export down to <strong>20 real transactions</strong> — six phantom
|
||||||
|
duplicates removed, 0 unparseable.
|
||||||
</p>
|
</p>
|
||||||
<div class="demo-frame">
|
<div class="demo-frame">
|
||||||
<iframe
|
<iframe
|
||||||
@@ -197,13 +204,17 @@
|
|||||||
price. DataTools writes the audit by default, downloadable as a
|
price. DataTools writes the audit by default, downloadable as a
|
||||||
separate CSV alongside the cleaned file.
|
separate CSV alongside the cleaned file.
|
||||||
</div>
|
</div>
|
||||||
<div class="terminal"><span class="prompt">$</span> head -5 client_jan2025_changes.csv
|
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline bank_reconciliation.csv --pipeline bank_reconciliation_pipeline.json --apply
|
||||||
|
standardize · 36 date/amount cells normalized (ISO dates, numeric amounts, parens-negatives resolved)
|
||||||
|
missing · disguised-null categories flagged (—, N/A, (blank))
|
||||||
|
dedup · 6 phantom duplicate transactions removed
|
||||||
|
rows · 26 → 20 · 0 unparseable
|
||||||
|
✓ wrote bank_reconciliation.cleaned.csv + bank_reconciliation.changes.csv (row-level audit)
|
||||||
|
<span class="prompt">$</span> head -4 bank_reconciliation.changes.csv
|
||||||
row,column,field_type,old,new
|
row,column,field_type,old,new
|
||||||
0,"Date ",date,"01/15/2025","2025-01-15"
|
0,"Date ",date,"01/15/2025","2025-01-15"
|
||||||
0,Description,name," AMAZON.COM*4F2X9 PURCHASE","Amazon.com*4F2X9 Purchase"
|
0,Amount,currency,"+$3,450.00","3450.00"
|
||||||
0,Amount,currency,"-$129.99","-129.99"
|
0,Category,category,"—","(missing)"
|
||||||
1,Date ,date,"2025-01-15","2025-01-15"
|
|
||||||
<span class="prompt">$</span> # one row of audit per cell change. handed to the client. signed off.</div>
|
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
@@ -336,13 +347,13 @@ row,column,field_type,old,new
|
|||||||
<footer>
|
<footer>
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<div>
|
<div>
|
||||||
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
<p><strong>DataTools</strong> — local data-cleaning for bookkeepers, accounts payable, and accounts receivable teams.</p>
|
||||||
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
<p>
|
<p>
|
||||||
<a href="../shopify-pet/">For Shopify operators</a> ·
|
<a href="../ap-1099/">For accounts payable / 1099</a> ·
|
||||||
<a href="../revops/">For RevOps agencies</a><br />
|
<a href="../ar-aging/">For accounts receivable</a><br />
|
||||||
<a href="https://gumroad.com/l/datatools?from=bookkeeper">Buy on Gumroad</a> ·
|
<a href="https://gumroad.com/l/datatools?from=bookkeeper">Buy on Gumroad</a> ·
|
||||||
<a href="mailto:hello@datatools.app">Email support</a>
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
</p>
|
</p>
|
||||||
|
|||||||
@@ -11,7 +11,7 @@
|
|||||||
"gumroad_listing": "https://gumroad.com/l/datatools",
|
"gumroad_listing": "https://gumroad.com/l/datatools",
|
||||||
"support_email": "hello@datatools.app",
|
"support_email": "hello@datatools.app",
|
||||||
|
|
||||||
"personas": ["shopify-pet", "bookkeeper", "revops"],
|
"personas": ["bookkeeper", "ap-1099", "ar-aging"],
|
||||||
|
|
||||||
"_substitutions_made": [
|
"_substitutions_made": [
|
||||||
"{{site_origin}}/ → site_origin/",
|
"{{site_origin}}/ → site_origin/",
|
||||||
|
|||||||
@@ -7,9 +7,9 @@ to ``landing/deploy.config.json`` and filling in the real URLs:
|
|||||||
|
|
||||||
Output:
|
Output:
|
||||||
landing/dist/index.html
|
landing/dist/index.html
|
||||||
landing/dist/shopify-pet/index.html
|
|
||||||
landing/dist/bookkeeper/index.html
|
landing/dist/bookkeeper/index.html
|
||||||
landing/dist/revops/index.html
|
landing/dist/ap-1099/index.html
|
||||||
|
landing/dist/ar-aging/index.html
|
||||||
landing/dist/_shared/styles.css
|
landing/dist/_shared/styles.css
|
||||||
landing/dist/robots.txt
|
landing/dist/robots.txt
|
||||||
landing/dist/sitemap.xml
|
landing/dist/sitemap.xml
|
||||||
@@ -50,9 +50,9 @@ EXAMPLE_PATH = LANDING / "deploy.config.example.json"
|
|||||||
# Files to substitute and copy. Order matters only for readability.
|
# Files to substitute and copy. Order matters only for readability.
|
||||||
HTML_PAGES = [
|
HTML_PAGES = [
|
||||||
LANDING / "index.html",
|
LANDING / "index.html",
|
||||||
LANDING / "shopify-pet" / "index.html",
|
LANDING / "bookkeeper" / "index.html",
|
||||||
LANDING / "bookkeeper" / "index.html",
|
LANDING / "ap-1099" / "index.html",
|
||||||
LANDING / "revops" / "index.html",
|
LANDING / "ar-aging" / "index.html",
|
||||||
]
|
]
|
||||||
SHARED = LANDING / "_shared" / "styles.css"
|
SHARED = LANDING / "_shared" / "styles.css"
|
||||||
|
|
||||||
@@ -125,7 +125,7 @@ def _stamp_sitemap(cfg: dict) -> str:
|
|||||||
site = cfg["site_origin"].rstrip("/")
|
site = cfg["site_origin"].rstrip("/")
|
||||||
today = date.today().isoformat()
|
today = date.today().isoformat()
|
||||||
urls = [site + "/"] + [
|
urls = [site + "/"] + [
|
||||||
f"{site}/{p}/" for p in cfg.get("personas", ["shopify-pet", "bookkeeper", "revops"])
|
f"{site}/{p}/" for p in cfg.get("personas", ["bookkeeper", "ap-1099", "ar-aging"])
|
||||||
]
|
]
|
||||||
items = "\n".join(
|
items = "\n".join(
|
||||||
f" <url><loc>{u}</loc><lastmod>{today}</lastmod></url>"
|
f" <url><loc>{u}</loc><lastmod>{today}</lastmod></url>"
|
||||||
@@ -177,11 +177,11 @@ def _build_404_html(cfg: dict) -> str:
|
|||||||
<h1>That page isn't here.</h1>
|
<h1>That page isn't here.</h1>
|
||||||
<p class="lead" style="margin: 0 auto 28px;">Pick a workflow below to land somewhere useful.</p>
|
<p class="lead" style="margin: 0 auto 28px;">Pick a workflow below to land somewhere useful.</p>
|
||||||
<p>
|
<p>
|
||||||
<a class="btn" href="{site_origin}/shopify-pet/">For Shopify</a>
|
|
||||||
|
|
||||||
<a class="btn" href="{site_origin}/bookkeeper/">For bookkeepers</a>
|
<a class="btn" href="{site_origin}/bookkeeper/">For bookkeepers</a>
|
||||||
|
|
||||||
<a class="btn" href="{site_origin}/revops/">For RevOps</a>
|
<a class="btn" href="{site_origin}/ap-1099/">For AP / 1099</a>
|
||||||
|
|
||||||
|
<a class="btn" href="{site_origin}/ar-aging/">For AR</a>
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|||||||
@@ -3,13 +3,13 @@
|
|||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8" />
|
<meta charset="utf-8" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
<title>DataTools — Local CSV / Excel Cleaning for Shopify, Bookkeepers, and RevOps</title>
|
<title>DataTools — Local CSV / Excel Cleaning for Bookkeepers and Accountants</title>
|
||||||
<meta name="description" content="One desktop tool. Three workflows. Clean Shopify customer exports, reconcile messy bank statements, or dedupe lead lists across HubSpot and LinkedIn — all locally. $49 one-time." />
|
<meta name="description" content="One desktop tool for messy accounting exports. Reconcile bank statements, build clean 1099 vendor lists, and de-duplicate AR aging — all locally. $49 one-time." />
|
||||||
<link rel="canonical" href="https://datatools.app/" />
|
<link rel="canonical" href="https://datatools.app/" />
|
||||||
<link rel="stylesheet" href="_shared/styles.css" />
|
<link rel="stylesheet" href="_shared/styles.css" />
|
||||||
|
|
||||||
<meta property="og:title" content="DataTools — Local CSV / Excel Cleaning" />
|
<meta property="og:title" content="DataTools — Local CSV / Excel Cleaning for Accounting" />
|
||||||
<meta property="og:description" content="One desktop tool, three niche workflows. Runs entirely offline. $49 one-time." />
|
<meta property="og:description" content="Reconcile bank exports, prep 1099 vendor lists, clean AR aging — offline. $49 one-time." />
|
||||||
<meta property="og:type" content="website" />
|
<meta property="og:type" content="website" />
|
||||||
<meta property="og:url" content="https://datatools.app/" />
|
<meta property="og:url" content="https://datatools.app/" />
|
||||||
|
|
||||||
@@ -38,9 +38,9 @@
|
|||||||
box-shadow: var(--shadow);
|
box-shadow: var(--shadow);
|
||||||
text-decoration: none;
|
text-decoration: none;
|
||||||
}
|
}
|
||||||
.persona-card.shopify { --card-accent: #6ee7b7; }
|
|
||||||
.persona-card.bookkeeper{ --card-accent: #7dd3fc; }
|
.persona-card.bookkeeper{ --card-accent: #7dd3fc; }
|
||||||
.persona-card.revops { --card-accent: #c4b5fd; }
|
.persona-card.ap1099 { --card-accent: #fbbf24; }
|
||||||
|
.persona-card.ar { --card-accent: #6ee7b7; }
|
||||||
.persona-card .pill {
|
.persona-card .pill {
|
||||||
display: inline-block;
|
display: inline-block;
|
||||||
background: rgba(255,255,255,0.04);
|
background: rgba(255,255,255,0.04);
|
||||||
@@ -93,70 +93,69 @@
|
|||||||
|
|
||||||
<section class="hero">
|
<section class="hero">
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<div class="eyebrow">For Shopify operators · bookkeepers · marketing & RevOps agencies</div>
|
<div class="eyebrow">For bookkeepers · accounts payable · accounts receivable</div>
|
||||||
<h1>Local CSV / Excel cleaning.<br /><strong>One tool. Three workflows.</strong></h1>
|
<h1>Local CSV / Excel cleaning for accounting.<br /><strong>One tool. Three workflows.</strong></h1>
|
||||||
<p class="lead">
|
<p class="lead">
|
||||||
DataTools is a desktop app that fixes the data-cleaning headaches
|
DataTools is a desktop app that fixes the export headaches that
|
||||||
every small business hits — duplicates Excel can't catch,
|
throw off your books — the transaction your bank posted twice,
|
||||||
international phones it can't parse, dates and currencies in three
|
the vendor entered three ways at 1099 time, the invoice your aging
|
||||||
different formats per export. One $49 download. Works on Mac,
|
report counted twice. One $49 download. Mac, Windows, and Linux.
|
||||||
Windows, and Linux. <strong>Your data never leaves your
|
<strong>Your data never leaves your computer.</strong>
|
||||||
computer.</strong>
|
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
<div class="persona-grid">
|
<div class="persona-grid">
|
||||||
<a class="persona-card shopify" href="shopify-pet/">
|
|
||||||
<span class="pill">🛍️ Shopify operator</span>
|
|
||||||
<h3>Customer / vendor / subscriber export cleanup</h3>
|
|
||||||
<p>
|
|
||||||
Klaviyo-import-ready customer lists in 30 seconds. Catches
|
|
||||||
cross-device duplicates, standardizes international phones
|
|
||||||
and addresses, fixes the disguised nulls that break product
|
|
||||||
feeds.
|
|
||||||
</p>
|
|
||||||
<ul class="pain">
|
|
||||||
<li>· Fix Klaviyo per-contact billing on phantom dupes</li>
|
|
||||||
<li>· Repair feeds rejected by Google Merchant / Meta</li>
|
|
||||||
<li>· Unify orders from Shopify + Etsy + Amazon + Faire</li>
|
|
||||||
<li>· Resolve VAT-MOSS country-name drift</li>
|
|
||||||
</ul>
|
|
||||||
<span class="open">Open the Shopify demo & pricing</span>
|
|
||||||
</a>
|
|
||||||
|
|
||||||
<a class="persona-card bookkeeper" href="bookkeeper/">
|
<a class="persona-card bookkeeper" href="bookkeeper/">
|
||||||
<span class="pill">📒 Bookkeeper / accountant</span>
|
<span class="pill">📒 Bookkeeper</span>
|
||||||
<h3>Bank-export reconciliation with audit trail</h3>
|
<h3>Bank reconciliation with an audit trail</h3>
|
||||||
<p>
|
<p>
|
||||||
Catches the duplicate transaction QuickBooks imported twice
|
When the Jan and Feb exports overlap, the same payment posts
|
||||||
when Jan and Feb exports overlap. Standardizes dates,
|
twice in two formats. DataTools standardizes every date and
|
||||||
amounts, and vendor casing. Hands you a row-level audit log
|
amount, then dedupes on the real transaction so it ties out —
|
||||||
to share with the client.
|
with a row-level audit log to hand the client.
|
||||||
</p>
|
</p>
|
||||||
<ul class="pain">
|
<ul class="pain">
|
||||||
<li>· Catch month-overlap re-import dupes</li>
|
<li>· Catch month-overlap re-import duplicates</li>
|
||||||
<li>· Consolidate vendors for clean 1099 reports</li>
|
<li>· ISO dates, numeric amounts, parens-negatives resolved</li>
|
||||||
<li>· Produce hand-off-ready audit trail</li>
|
<li>· Hand-off-ready audit trail</li>
|
||||||
<li>· Multi-currency books (EUR / GBP / BRL)</li>
|
<li>· Sample: 26 rows → 20, six phantom duplicates removed</li>
|
||||||
</ul>
|
</ul>
|
||||||
<span class="open">Open the bookkeeper demo & pricing</span>
|
<span class="open">Open the bookkeeper demo & pricing</span>
|
||||||
</a>
|
</a>
|
||||||
|
|
||||||
<a class="persona-card revops" href="revops/">
|
<a class="persona-card ap1099" href="ap-1099/">
|
||||||
<span class="pill">🪢 Marketing / RevOps</span>
|
<span class="pill">🧾 Accounts payable / 1099</span>
|
||||||
<h3>Lead-list dedup across HubSpot, LinkedIn, scrapes</h3>
|
<h3>Clean 1099 vendor list — missing EINs filled in</h3>
|
||||||
<p>
|
<p>
|
||||||
One canonical lead per real person — across HubSpot,
|
The same vendor entered three times, each record holding only
|
||||||
LinkedIn, Apollo, ZoomInfo, and manual scrapes.
|
part of the details. DataTools consolidates each vendor to one
|
||||||
International phones (50+ country codes), per-row country
|
row and backfills the gaps from the duplicates, so the EINs you
|
||||||
column, fuzzy match with merge.
|
need at filing time are recovered.
|
||||||
</p>
|
</p>
|
||||||
<ul class="pain">
|
<ul class="pain">
|
||||||
<li>· Stop paying HubSpot tier price for cross-source dupes</li>
|
<li>· Consolidate vendor masters for 1099-NEC</li>
|
||||||
<li>· Protect sender reputation from invalid emails</li>
|
<li>· Recover EINs scattered across duplicate records</li>
|
||||||
<li>· Skip the 4–8 wk GDPR review on cloud cleaners</li>
|
<li>· Standardize phones, emails, and amounts</li>
|
||||||
<li>· Suppression-list sync across 5+ platforms</li>
|
<li>· Sample: 24 records → 8 vendors, 7 EINs recovered</li>
|
||||||
</ul>
|
</ul>
|
||||||
<span class="open">Open the RevOps demo & pricing</span>
|
<span class="open">Open the 1099 / AP demo & pricing</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<a class="persona-card ar" href="ar-aging/">
|
||||||
|
<span class="pill">💵 Accounts receivable</span>
|
||||||
|
<h3>AR aging without the double-counted invoices</h3>
|
||||||
|
<p>
|
||||||
|
Double-entered invoices inflate your aging report and your
|
||||||
|
follow-ups. DataTools standardizes invoice dates, due dates,
|
||||||
|
and amounts, lowercases client emails, then removes the
|
||||||
|
duplicate invoice numbers so the aging is accurate.
|
||||||
|
</p>
|
||||||
|
<ul class="pain">
|
||||||
|
<li>· Remove double-entered invoices from the aging</li>
|
||||||
|
<li>· ISO dates, numeric amounts, lowercased client emails</li>
|
||||||
|
<li>· Backfill a blank status from its twin row</li>
|
||||||
|
<li>· Sample: 26 rows → 21, five duplicate invoices removed</li>
|
||||||
|
</ul>
|
||||||
|
<span class="open">Open the AR demo & pricing</span>
|
||||||
</a>
|
</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -218,14 +217,14 @@
|
|||||||
<footer>
|
<footer>
|
||||||
<div class="container">
|
<div class="container">
|
||||||
<div>
|
<div>
|
||||||
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
<p><strong>DataTools</strong> — local data-cleaning for bookkeepers, accounts payable, and accounts receivable teams.</p>
|
||||||
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
<p>
|
<p>
|
||||||
<a href="shopify-pet/">For Shopify operators</a> ·
|
|
||||||
<a href="bookkeeper/">For bookkeepers</a> ·
|
<a href="bookkeeper/">For bookkeepers</a> ·
|
||||||
<a href="revops/">For RevOps agencies</a><br />
|
<a href="ap-1099/">For accounts payable / 1099</a> ·
|
||||||
|
<a href="ar-aging/">For accounts receivable</a><br />
|
||||||
<a href="mailto:hello@datatools.app">Email support</a>
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,352 +0,0 @@
|
|||||||
<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8" />
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
||||||
<title>DataTools for RevOps — Dedupe Lead Lists Across HubSpot, LinkedIn, and Manual Scrapes · $49</title>
|
|
||||||
<meta name="description" content="One tool to dedupe lead lists across HubSpot, LinkedIn, and manual scrapes. International phones (50+ country codes), per-row country normalization, fuzzy match across vendors, fully offline. $49 one-time." />
|
|
||||||
<meta name="keywords" content="dedupe lead list, hubspot deduplicate, linkedin lead cleanup, marketing data cleaning, revops csv tool, multi-vendor lead unification, international phone normalization" />
|
|
||||||
<link rel="canonical" href="https://datatools.app/revops/" />
|
|
||||||
<link rel="stylesheet" href="../_shared/styles.css" />
|
|
||||||
|
|
||||||
<!-- Persona accent: RevOps → vivid violet -->
|
|
||||||
<style>
|
|
||||||
:root {
|
|
||||||
--accent: #c4b5fd;
|
|
||||||
--accent-ink: #2e1065;
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
|
|
||||||
<meta property="og:title" content="DataTools for RevOps — Dedupe Lead Lists Across HubSpot, LinkedIn, and Manual Scrapes" />
|
|
||||||
<meta property="og:description" content="International phones, country normalization, fuzzy dedup with merge — one tool, no upload. $49 one-time." />
|
|
||||||
<meta property="og:type" content="product" />
|
|
||||||
<meta property="og:url" content="https://datatools.app/revops/" />
|
|
||||||
|
|
||||||
<script type="application/ld+json">
|
|
||||||
{
|
|
||||||
"@context": "https://schema.org",
|
|
||||||
"@type": "SoftwareApplication",
|
|
||||||
"name": "DataTools for RevOps",
|
|
||||||
"operatingSystem": "Windows, macOS, Linux",
|
|
||||||
"applicationCategory": "BusinessApplication",
|
|
||||||
"offers": {
|
|
||||||
"@type": "Offer",
|
|
||||||
"price": "49",
|
|
||||||
"priceCurrency": "USD"
|
|
||||||
},
|
|
||||||
"description": "Dedupe and unify lead lists across CRM, scraping, and manual sources. International phone normalization, per-row country, fuzzy match with merge. Six-tool data-cleaning bundle for RevOps and marketing agencies.",
|
|
||||||
"softwareVersion": "1.0"
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<div class="buybar">
|
|
||||||
<div class="buybar-inner">
|
|
||||||
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for RevOps</span></div>
|
|
||||||
<div>
|
|
||||||
<span class="price-tag">$49 — one-time, no subscription</span>
|
|
||||||
<a class="btn" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Get DataTools →</a>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<section class="hero">
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">For RevOps · marketing ops · agency lead-gen · audience-builders</div>
|
|
||||||
<h1>Dedupe lead lists across HubSpot, LinkedIn,<br /><strong>and manual scrapes — locally.</strong></h1>
|
|
||||||
<p class="lead">
|
|
||||||
The same prospect shows up as <code>alice@acme.com</code> in HubSpot,
|
|
||||||
<code>Alice.Johnson@acme.com</code> in LinkedIn Sales Navigator, and
|
|
||||||
<code>alice@acme.com</code> again from your VA's manual scrape. Their
|
|
||||||
phone is <code>(415) 555-1234</code> in one source and
|
|
||||||
<code>4155551234</code> in another. DataTools fuzzy-matches across
|
|
||||||
sources, normalizes phones to E.164 with per-row country awareness,
|
|
||||||
and produces one canonical lead per real person — without uploading
|
|
||||||
a single contact to a third-party tool.
|
|
||||||
</p>
|
|
||||||
<div class="cta-row">
|
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Get DataTools — $49 →</a>
|
|
||||||
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
|
||||||
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
|
||||||
</div>
|
|
||||||
<div class="stats">
|
|
||||||
<div class="stat"><div class="num">50+</div><div class="label">country codes</div></div>
|
|
||||||
<div class="stat"><div class="num">3</div><div class="label">CRM sources unified</div></div>
|
|
||||||
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Pain points ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">If your last campaign launch was held up by data hygiene</div>
|
|
||||||
<h2>Five pains DataTools fixes before you import to HubSpot</h2>
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">💸</span>
|
|
||||||
<h3>HubSpot / Marketo / Iterable bills you for every duplicate contact</h3>
|
|
||||||
<p>10 k contacts → enterprise tier at $4–8 k/mo. 18 % cross-source duplicate rate from Apollo + ZoomInfo + LinkedIn means you're at 8.2 k unique people but paying for 10 k. Every month. Forever.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> $200–$800 per 1 k duplicate contacts — recurring, every month.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🚫</span>
|
|
||||||
<h3>Sender reputation tanks when you mail to invalid or duplicate addresses</h3>
|
|
||||||
<p>One bad sending session — to addresses your team scraped or imported without hygiene — and your domain reputation takes weeks to recover. Your good campaigns sit in spam folders during the recovery.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> catastrophic — entire email programme degraded for 2–6 weeks.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">⚖️</span>
|
|
||||||
<h3>GDPR makes uploading to a cloud cleaner a legal-review marathon</h3>
|
|
||||||
<p>Every cloud-based lead-cleaner needs you to upload your prospect list. Your legal team needs 4–8 weeks to bless that. DataTools is desktop-only — no upload, no DPA, no review, no delay.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> 4–8 weeks of legal-review delay per tool, every time.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🪢</span>
|
|
||||||
<h3>Apollo + ZoomInfo + LinkedIn + manual scrapes all use different schemas</h3>
|
|
||||||
<p>Each export has its own column names, scoring scale, country format. Unifying them by hand for one campaign costs 1–3 days. Doing it for every campaign is unsustainable.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> 1–3 days per campaign of manual unification + judgement calls that drift across team members.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🛡️</span>
|
|
||||||
<h3>Suppression lists across 5+ marketing platforms get out of sync</h3>
|
|
||||||
<p>Each platform has its own suppression format. Out-of-sync lists let opted-out contacts slip through, triggering CAN-SPAM / GDPR exposure and the kind of "we got a complaint" email no one wants.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> compliance risk + churn-back cost + stakeholder trust.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">📞</span>
|
|
||||||
<h3>International dialer fails because phone formats vary</h3>
|
|
||||||
<p>A calling list spanning 15 countries with mixed phone formats means the dialer rejects 8–15 % of numbers, and your reps spend the day on "number invalid" tones instead of conversations.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> rep productivity × failure rate × team size.</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section id="demo">
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">Live demo · runs in your browser</div>
|
|
||||||
<h2>Try it on a real-looking 3-vendor lead list</h2>
|
|
||||||
<p>
|
|
||||||
The demo below loads a 25-row lead worksheet combining HubSpot,
|
|
||||||
LinkedIn Sales Navigator, and manual scraping — with the same prospect
|
|
||||||
appearing in two or three sources, country names spelled three
|
|
||||||
different ways (<code>USA</code>, <code>US</code>, <code>United
|
|
||||||
States</code>), and 13 different international phone formats. Click
|
|
||||||
<strong>Run pipeline</strong> and watch the 5-step pipeline (text
|
|
||||||
clean → format → missing → column map → dedup) collapse 25 rows to 19
|
|
||||||
with a single canonical record per prospect.
|
|
||||||
</p>
|
|
||||||
<div class="demo-frame">
|
|
||||||
<iframe
|
|
||||||
src="https://demo.datatools.app/?p=revops"
|
|
||||||
loading="lazy"
|
|
||||||
title="DataTools live demo — RevOps"
|
|
||||||
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
|
||||||
<div class="demo-caption">
|
|
||||||
Demo runs on free hosting. Capped at 100 input rows · output
|
|
||||||
watermarked. The paid product has no caps and runs entirely offline.
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">Built for the agency RevOps day</div>
|
|
||||||
<h2>Three workflows you do every campaign</h2>
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🪢</span>
|
|
||||||
<h3>Email-list dedup across lead sources</h3>
|
|
||||||
<p>HubSpot exports + LinkedIn Sales Navigator + the VA's spreadsheet, all merged. Fuzzy match across email + phone + name catches the cross-source duplicates that broke your last campaign send.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🌍</span>
|
|
||||||
<h3>Multi-platform audience reconciliation</h3>
|
|
||||||
<p>Build one canonical audience from Meta, Google Ads, LinkedIn, and your CRM. Each platform exports a different shape; Map Columns aligns them all, dedup merges the survivors with their most-complete fields.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🛡️</span>
|
|
||||||
<h3>Suppression-list management</h3>
|
|
||||||
<p>Suppression lists need to dedupe across email + phone + first-party identifiers. Add a row, dedupe, ship the canonical CSV to every platform — without uploading the suppression list to any of them.</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">If your campaigns target outside the US — almost everyone's do</div>
|
|
||||||
<h2>50+ country codes. Per-row country awareness.</h2>
|
|
||||||
<p>
|
|
||||||
Your HubSpot list has <code>(415) 555-1234</code>. Your scraped
|
|
||||||
list from the same prospect has <code>+1 415 555 1234</code>. Your
|
|
||||||
Italian prospect entered <code>+39 06 6982</code>. Your Brazilian
|
|
||||||
lead has <code>11 3071 0000</code>. Each comes from a row tagged
|
|
||||||
with its country — DataTools reads that column per row and parses
|
|
||||||
every phone correctly to E.164.
|
|
||||||
</p>
|
|
||||||
<ul class="bullets">
|
|
||||||
<li><strong>Per-row country column</strong> drives the parser — no global default that buckets UK numbers as malformed US.</li>
|
|
||||||
<li><strong>Country-name normalization</strong>: <code>USA</code> / <code>US</code> / <code>United States</code> all resolve to the same ISO-2 code.</li>
|
|
||||||
<li><strong>50+ country support</strong> via Google's libphonenumber, including KR, CN, IN, MX, BR, IL, TR, PL, DK, SE.</li>
|
|
||||||
<li><strong>Schema enforcement</strong> via Map Columns: project to your CRM's required shape, coerce score columns to integers, reorder fields to match the import contract.</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">For platforms that charge per contact</div>
|
|
||||||
<h2>Every duplicate you don't catch costs you for the life of the contract.</h2>
|
|
||||||
<p>
|
|
||||||
HubSpot prices on contacts. Klaviyo prices on contacts. Marketo,
|
|
||||||
Iterable, ActiveCampaign — all priced on contacts. Every duplicate
|
|
||||||
you don't catch is a recurring tax on your campaign. DataTools
|
|
||||||
catches them once, before import, with a fuzzy matcher that's
|
|
||||||
tuned to the cross-source noise you actually see.
|
|
||||||
</p>
|
|
||||||
<div class="callout">
|
|
||||||
<strong>Real numbers from the demo:</strong> 25 input rows from
|
|
||||||
three sources collapse to 19 — that's 6 duplicates the cross-source
|
|
||||||
noise was hiding. On a 50,000-row campaign list, that ratio
|
|
||||||
typically saves 12,000+ contacts a month, every month.
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">The thing every cloud cleaner can't say</div>
|
|
||||||
<h2>Your prospects' contact info never leaves your computer.</h2>
|
|
||||||
<p>
|
|
||||||
Cloud lead-cleaning tools require you to upload your audience.
|
|
||||||
That audience is your single most valuable agency asset — and once
|
|
||||||
it's on someone else's server, your client's privacy story is
|
|
||||||
no longer in your hands. DataTools is a desktop app. There is no
|
|
||||||
upload step.
|
|
||||||
</p>
|
|
||||||
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline campaign_q1.csv --pipeline revops_pipeline.json --apply
|
|
||||||
Reading campaign_q1.csv...
|
|
||||||
53,802 rows, 14 columns
|
|
||||||
Executing pipeline:
|
|
||||||
<span class="ok">✓</span> text_clean (160 ms) {cells_changed: 8,205}
|
|
||||||
<span class="ok">✓</span> format_standardize (1.4 s) {cells_changed: 41,889 — 50 country codes}
|
|
||||||
<span class="ok">✓</span> missing (140 ms) {sentinels_standardized: 6,710}
|
|
||||||
<span class="ok">✓</span> column_map (220 ms) {columns_renamed: 4, columns_added: 1}
|
|
||||||
<span class="ok">✓</span> dedup (4.8 s) {duplicates_removed: 12,344, merged: 12,344}
|
|
||||||
|
|
||||||
Initial rows: 53,802 → Final rows: 41,458
|
|
||||||
Total elapsed: 6.7 s
|
|
||||||
<span class="prompt">$</span> # 12,344 fewer contacts to pay for. for $49.</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">In the bundle</div>
|
|
||||||
<h2>Six tools. One pipeline. One $49 download.</h2>
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match across email + phone + name + company; merge survivors with most-complete fields.</p></div>
|
|
||||||
<div class="card"><h3>2 · Clean Text</h3><p>Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.</p></div>
|
|
||||||
<div class="card"><h3>3 · Standardize Formats</h3><p>E.164 phones with per-row country, canonical emails, name casing, ISO dates.</p></div>
|
|
||||||
<div class="card"><h3>4 · Fix Missing Values</h3><p>Detect <code>TBD</code>, <code>(unknown)</code>, <code>—</code> across vendor exports.</p></div>
|
|
||||||
<div class="card"><h3>5 · Map Columns</h3><p>Project to your CRM's required schema, coerce score to integer, reorder for import.</p></div>
|
|
||||||
<div class="card"><h3>6 · Automated Workflows</h3><p>Save the cleanup as JSON. Drop next campaign's combined export on it. Same dedup, automated.</p></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">Pricing — pay once, own it</div>
|
|
||||||
<h2>$49. No subscription. No per-campaign fee.</h2>
|
|
||||||
<div class="pricing">
|
|
||||||
<div class="card featured">
|
|
||||||
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
|
||||||
<h3>DataTools for RevOps</h3>
|
|
||||||
<ul>
|
|
||||||
<li>All 6 tools, full pipeline</li>
|
|
||||||
<li>Mac · Windows · Linux installers</li>
|
|
||||||
<li>Code-signed (no Gatekeeper warnings)</li>
|
|
||||||
<li>Free updates for the v1.x line</li>
|
|
||||||
<li>Bonus: 3-source unification pipeline preset</li>
|
|
||||||
<li><strong>Use on any number of clients</strong> — no seat limits</li>
|
|
||||||
</ul>
|
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Buy on Gumroad →</a>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<div class="row"><div class="price">$149</div><div class="price-suffix">one-time</div></div>
|
|
||||||
<h3>Full DataTools Suite</h3>
|
|
||||||
<p class="muted">Available when 3+ bundles ship. Includes everything in the RevOps pack plus the Shopify and Bookkeeper bundles. Save $48.</p>
|
|
||||||
<a class="btn btn-ghost btn-large" href="#" aria-disabled="true">Coming when ready</a>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<h2>Questions</h2>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>Does this replace HubSpot's deduplication?</summary>
|
|
||||||
<p>No — it cleans data <em>before</em> import to HubSpot (or LinkedIn, Marketo, Klaviyo, etc.). HubSpot's dedup runs on already-imported contacts; DataTools catches duplicates that haven't yet cost you a contract slot.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>Does it handle international phones correctly?</summary>
|
|
||||||
<p>Yes — via Google's libphonenumber, with 50+ country codes. The killer feature is per-row country: point a column at it (any column with values like <code>US</code>, <code>USA</code>, <code>United States</code>, <code>+1</code>, <code>JP</code>, <code>Japan</code>) and DataTools parses each row in its own region. No more UK numbers bucketed as malformed US.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>Can I use it on multiple clients without paying again?</summary>
|
|
||||||
<p>Yes. The licence is per-operator, not per-client. Run it on every agency client's lead list for the same $49.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>How does fuzzy match work across columns?</summary>
|
|
||||||
<p>Out of the box, the dedup engine builds default strategies based on column names — typically email + phone with exact match, name with Jaro-Winkler at 85%. You can override via JSON: pick which columns to match on, which algorithm, and what threshold. Strategies survive in the saved pipeline so next campaign uses the same rules.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>What does the audit trail look like?</summary>
|
|
||||||
<p>A row-by-row CSV: every modified cell with its original value, new value, and which rule fired. A separate JSON file describes the pipeline that produced it. Together they reproduce the cleanup deterministically — your client can verify it on their machine.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>What's your refund policy?</summary>
|
|
||||||
<p>Try the live demo above on the sample dataset before you buy. If DataTools doesn't fit your workflow within 14 days, email for a refund — no questions asked.</p>
|
|
||||||
</details>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<section>
|
|
||||||
<div class="container" style="text-align: center;">
|
|
||||||
<h2>Stop paying twice for the same contact.</h2>
|
|
||||||
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Catches the cross-source duplicates HubSpot and LinkedIn can't see, normalizes phones for 50+ countries, and saves a pipeline you can re-run on next campaign's combined list.</p>
|
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Get DataTools — $49 →</a>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<footer>
|
|
||||||
<div class="container">
|
|
||||||
<div>
|
|
||||||
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
|
||||||
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<p>
|
|
||||||
<a href="../shopify-pet/">For Shopify operators</a> ·
|
|
||||||
<a href="../bookkeeper/">For bookkeepers</a><br />
|
|
||||||
<a href="https://gumroad.com/l/datatools?from=revops">Buy on Gumroad</a> ·
|
|
||||||
<a href="mailto:hello@datatools.app">Email support</a>
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</footer>
|
|
||||||
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,381 +0,0 @@
|
|||||||
<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="utf-8" />
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
|
||||||
<title>DataTools for Shopify — Clean Customer & Product Exports Locally · $49</title>
|
|
||||||
<meta name="description" content="Clean Shopify customer, product, and subscriber exports — locally. Klaviyo-import-ready in 30 seconds. Catches duplicates Excel misses. Your data never leaves your computer. $49 one-time." />
|
|
||||||
<meta name="keywords" content="shopify customer cleanup, shopify csv cleaner, shopify product feed cleaner, klaviyo deduplicate, shopify customer dedup tool, shopify pet supplies" />
|
|
||||||
<link rel="canonical" href="https://datatools.app/shopify/" />
|
|
||||||
<link rel="stylesheet" href="../_shared/styles.css" />
|
|
||||||
|
|
||||||
<!-- Persona accent: Shopify pet → mint green (default in shared sheet) -->
|
|
||||||
|
|
||||||
<!-- Open Graph -->
|
|
||||||
<meta property="og:title" content="DataTools for Shopify — Clean Customer & Product Exports Locally" />
|
|
||||||
<meta property="og:description" content="Klaviyo-import-ready in 30 seconds. Local. No upload. $49 one-time." />
|
|
||||||
<meta property="og:type" content="product" />
|
|
||||||
<meta property="og:url" content="https://datatools.app/shopify/" />
|
|
||||||
|
|
||||||
<!-- Schema.org SoftwareApplication -->
|
|
||||||
<script type="application/ld+json">
|
|
||||||
{
|
|
||||||
"@context": "https://schema.org",
|
|
||||||
"@type": "SoftwareApplication",
|
|
||||||
"name": "DataTools for Shopify",
|
|
||||||
"operatingSystem": "Windows, macOS, Linux",
|
|
||||||
"applicationCategory": "BusinessApplication",
|
|
||||||
"offers": {
|
|
||||||
"@type": "Offer",
|
|
||||||
"price": "49",
|
|
||||||
"priceCurrency": "USD"
|
|
||||||
},
|
|
||||||
"description": "Clean Shopify customer, product, and subscriber CSV exports locally. Six-tool data-cleaning bundle: dedupe, text-clean, format-standardize, missing-value handle, column-map, pipeline.",
|
|
||||||
"softwareVersion": "1.0"
|
|
||||||
}
|
|
||||||
</script>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<!-- ============= Sticky buy bar ============= -->
|
|
||||||
<div class="buybar">
|
|
||||||
<div class="buybar-inner">
|
|
||||||
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for Shopify</span></div>
|
|
||||||
<div>
|
|
||||||
<span class="price-tag">$49 — one-time, no subscription</span>
|
|
||||||
<a class="btn" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Get DataTools →</a>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<!-- ============= Hero ============= -->
|
|
||||||
<section class="hero">
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">For Shopify operators · pet supplies · subscription stores · DTC</div>
|
|
||||||
<h1>Klaviyo-import-ready customer lists.<br /><strong>In 30 seconds. Locally.</strong></h1>
|
|
||||||
<p class="lead">
|
|
||||||
Your Shopify customer export is a mess of formatting drift, disguised
|
|
||||||
duplicates, and inconsistent phone numbers. DataTools fixes all of it
|
|
||||||
in one pass — fuzzy-dedupes the same customer Klaviyo would charge
|
|
||||||
you for twice, standardises phones across your international
|
|
||||||
subscribers, and hands you a cleaned CSV. <strong>Your data never
|
|
||||||
leaves your computer.</strong>
|
|
||||||
</p>
|
|
||||||
<div class="cta-row">
|
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Get DataTools — $49 →</a>
|
|
||||||
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
|
||||||
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
|
||||||
</div>
|
|
||||||
<div class="stats">
|
|
||||||
<div class="stat"><div class="num">6</div><div class="label">tools, one bundle</div></div>
|
|
||||||
<div class="stat"><div class="num">1 GB</div><div class="label">customer file in 2.5 min</div></div>
|
|
||||||
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Pain points ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">If any of these sound like your Tuesday</div>
|
|
||||||
<h2>Six pains DataTools fixes in one pass</h2>
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">💸</span>
|
|
||||||
<h3>Klaviyo / Mailchimp / Omnisend bills you for every duplicate</h3>
|
|
||||||
<p>The same customer signs up three times — once with a typo, once with a plus-tag, once on mobile. Your subscriber list has a 10–18 % duplicate rate and you're paying for every one of them, every month, forever.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> $30–$300/mo per percent of dupes on a 50k list — recurring.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">📵</span>
|
|
||||||
<h3>Your product feed got rejected by Google Merchant Center</h3>
|
|
||||||
<p>Smart quotes from a copy-paste in product titles. NBSP in SKU. Inconsistent attribute casing. Feed bounces, the launch sits for 24–72 hours while you try to find the bad row in a 12,000-line CSV.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> 1–3 days of delayed campaign × the campaign value.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🪢</span>
|
|
||||||
<h3>Orders from Shopify + Etsy + Amazon + Faire don't speak the same language</h3>
|
|
||||||
<p>Each platform's export uses different column names for "customer email" / "ship country" / "order total." Merging takes hours of manual rename and copy-paste before the analysis can even begin.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> 4–8 hours per month manually merging exports.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🔁</span>
|
|
||||||
<h3>Subscription churn looks higher than it is</h3>
|
|
||||||
<p>Pet-box subscribers cancel, then re-sub three months later under a different email or device. Your cohort report says churn is 20 % when it's actually 12 % — and you're over-paying for acquisition because LTV is mis-calculated.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> wrong CAC ceiling for the next year of paid ads.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🌍</span>
|
|
||||||
<h3>VAT MOSS / EU tax breaks because country is spelled three ways</h3>
|
|
||||||
<p>Your UK customers are tagged <code>UK</code>, <code>U.K.</code>, and <code>United Kingdom</code> — all in one export. The VAT report aggregates them as three different markets. Compliance friction every quarter.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> compliance risk + repeated manual normalization.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🔒</span>
|
|
||||||
<h3>Cloud cleaners want you to upload your customer list</h3>
|
|
||||||
<p>Your customer list is your single most valuable business asset. Uploading it to a SaaS to clean it is the privacy story you do not want. DataTools is desktop-only — your list never leaves your computer.</p>
|
|
||||||
<p class="muted"><strong>What it costs:</strong> nothing — and that's the point.</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Live demo ============= -->
|
|
||||||
<section id="demo">
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">Live demo · runs in your browser</div>
|
|
||||||
<h2>Try it on a real-looking Shopify customer export</h2>
|
|
||||||
<p>
|
|
||||||
The demo below loads a sample 15-row Shopify customer file with
|
|
||||||
pollution we've seen in actual stores: smart quotes from copy-paste,
|
|
||||||
duplicates with email-case drift, international phones from the UK,
|
|
||||||
Spain, Germany, Australia, and Japan, and the usual mess of
|
|
||||||
<code>N/A</code> / <code>(blank)</code> / <code>?</code> sentinels.
|
|
||||||
Click <strong>Run pipeline</strong> and watch every column get
|
|
||||||
cleaned in under a second.
|
|
||||||
</p>
|
|
||||||
<div class="demo-frame">
|
|
||||||
<iframe
|
|
||||||
src="https://demo.datatools.app/?p=shopify-pet"
|
|
||||||
loading="lazy"
|
|
||||||
title="DataTools live demo — Shopify pet supplies"
|
|
||||||
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
|
||||||
<div class="demo-caption">
|
|
||||||
Demo runs on free hosting (Streamlit Community Cloud). Capped at
|
|
||||||
100 input rows · output watermarked with one trailing row. The
|
|
||||||
paid product has no caps and runs entirely offline.
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Built for Shopify ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">Built for the Shopify operator</div>
|
|
||||||
<h2>Six workflows you do every week</h2>
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🧹</span>
|
|
||||||
<h3>Customer-list cleanup</h3>
|
|
||||||
<p>Catches the same customer who shows up as <code>john@gmail.com</code>, <code>John@Gmail.com</code>, and <code>j.ohn@gmail.com</code>. Fuzzy match merges the spellings, exact match catches the obvious ones.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">📦</span>
|
|
||||||
<h3>Product catalogue dedup</h3>
|
|
||||||
<p>SKU whitespace, near-identical product names, copy-paste smart quotes in titles — gone. Audit log shows every change.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🛒</span>
|
|
||||||
<h3>Abandoned-cart hygiene</h3>
|
|
||||||
<p>Before re-engagement: dedupe across email + phone, drop sentinels-as-missing, format dates so your sequence triggers fire correctly.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">📥</span>
|
|
||||||
<h3>Subscriber-list import to Klaviyo</h3>
|
|
||||||
<p>Klaviyo charges per contact. Every duplicate you don't catch costs you for the life of the subscription. Catch them once, pay once.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">🔗</span>
|
|
||||||
<h3>Multi-channel order consolidation</h3>
|
|
||||||
<p>Orders from Shopify + Etsy + a wholesale spreadsheet, each with a different column for "customer email." Map Columns aligns them; dedup merges across channels.</p>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<span class="icon">⚙️</span>
|
|
||||||
<h3>Repeatable pipeline</h3>
|
|
||||||
<p>Save the cleanup as a JSON file. Drop next week's export on it. Same cleanup, zero re-configuration. Automatable via the CLI.</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Privacy moat ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">The thing every cloud cleaner can't say</div>
|
|
||||||
<h2>Your customer list never leaves your computer.</h2>
|
|
||||||
<p>
|
|
||||||
DataTools is a desktop app. There's no upload step, no SaaS account,
|
|
||||||
no subscription, no "trust our security policy." The first thing you
|
|
||||||
can do after install is open your browser's network tab, run the
|
|
||||||
cleaner on your real customer file, and verify zero outbound
|
|
||||||
requests.
|
|
||||||
</p>
|
|
||||||
<div class="callout">
|
|
||||||
<strong>Why it matters for Shopify:</strong> your customer list is
|
|
||||||
your single most valuable business asset. Cloud cleaners require
|
|
||||||
you to upload it. We don't.
|
|
||||||
</div>
|
|
||||||
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline customers.csv --apply
|
|
||||||
Reading customers.csv...
|
|
||||||
47,832 rows, 14 columns
|
|
||||||
Executing pipeline:
|
|
||||||
<span class="ok">✓</span> text_clean (140 ms) {cells_changed: 12,408}
|
|
||||||
<span class="ok">✓</span> format_standardize (810 ms) {cells_changed: 31,202}
|
|
||||||
<span class="ok">✓</span> missing (95 ms) {sentinels_standardized: 8,129}
|
|
||||||
<span class="ok">✓</span> dedup (3.1 s) {duplicates_removed: 2,347}
|
|
||||||
|
|
||||||
Initial rows: 47,832 → Final rows: 45,485
|
|
||||||
Total elapsed: 4.2 s
|
|
||||||
<span class="prompt">$</span> # zero network calls. zero. promise.</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Audit moat ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">For when your client asks "what changed?"</div>
|
|
||||||
<h2>Every change auditable. Every cell logged.</h2>
|
|
||||||
<p>
|
|
||||||
Every modification is recorded with the original value, the new
|
|
||||||
value, and which rule fired. Hand the audit CSV to your accountant,
|
|
||||||
your marketing manager, or your boss along with the cleaned file.
|
|
||||||
No <em>"I trust the AI"</em> hand-waving — they see exactly what
|
|
||||||
happened.
|
|
||||||
</p>
|
|
||||||
<div class="callout">
|
|
||||||
<strong>Real example:</strong> the demo above standardized 27
|
|
||||||
cells across 15 customers. The audit log lists each one — row,
|
|
||||||
column, before, after, which standardizer fired. The dedup audit
|
|
||||||
lists every duplicate group with the survivor and its losers.
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= International ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">If you sell internationally — most pet brands do</div>
|
|
||||||
<h2>Phones, addresses, and currencies from anywhere on Earth.</h2>
|
|
||||||
<p>
|
|
||||||
Your subscriber from London entered her phone as <code>020 7946
|
|
||||||
0958</code>. Your Tokyo customer entered <code>03-3210-7000</code>.
|
|
||||||
Your German wholesale buyer wrote <code>€2.410,75</code>. Excel
|
|
||||||
thinks all of them are mistakes. DataTools knows what country each
|
|
||||||
row is from (per-row country column) and parses every one correctly
|
|
||||||
to E.164 phones, ISO dates, and numeric amounts.
|
|
||||||
</p>
|
|
||||||
<ul class="bullets">
|
|
||||||
<li><strong>50+ country codes</strong> via Google's libphonenumber.</li>
|
|
||||||
<li><strong>Currency auto-detect</strong> for $ / £ / € / ¥ / R$ / kr / zł — including the EU comma-decimal that breaks Excel.</li>
|
|
||||||
<li><strong>Address shape detection</strong> for US, UK, Canada, Germany, Australia.</li>
|
|
||||||
<li><strong>Locale-aware month names</strong> in English, French, German.</li>
|
|
||||||
</ul>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= What you get ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">In the bundle</div>
|
|
||||||
<h2>Six tools. One pipeline. One $49 download.</h2>
|
|
||||||
<div class="grid">
|
|
||||||
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, interactive review.</p></div>
|
|
||||||
<div class="card"><h3>2 · Clean Text</h3><p>Whitespace, smart chars, NBSP, BOM, line endings, case ops.</p></div>
|
|
||||||
<div class="card"><h3>3 · Standardize Formats</h3><p>Dates, phones, emails, addresses, names, currencies, booleans.</p></div>
|
|
||||||
<div class="card"><h3>4 · Fix Missing Values</h3><p>Disguised-null detection, profile, mean/median/mode/ffill, drop strategies.</p></div>
|
|
||||||
<div class="card"><h3>5 · Map Columns</h3><p>Fuzzy auto-rename, target schema, type coercion, required-field defaults.</p></div>
|
|
||||||
<div class="card"><h3>6 · Automated Workflows</h3><p>Chain tools in recommended order, save/load JSON, automate weekly cleanups.</p></div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Pricing ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<div class="eyebrow">Pricing — pay once, own it</div>
|
|
||||||
<h2>$49. No subscription. No ceiling on rows or files.</h2>
|
|
||||||
<div class="pricing">
|
|
||||||
<div class="card featured">
|
|
||||||
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
|
||||||
<h3>DataTools for Shopify</h3>
|
|
||||||
<ul>
|
|
||||||
<li>All 6 tools, full pipeline</li>
|
|
||||||
<li>Mac · Windows · Linux installers</li>
|
|
||||||
<li>Code-signed (no Gatekeeper warnings)</li>
|
|
||||||
<li>Free updates for the v1.x line</li>
|
|
||||||
<li>Bonus: 3 ready-made Shopify pipelines</li>
|
|
||||||
</ul>
|
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Buy on Gumroad →</a>
|
|
||||||
</div>
|
|
||||||
<div class="card">
|
|
||||||
<div class="row"><div class="price">$149</div><div class="price-suffix">one-time</div></div>
|
|
||||||
<h3>Full DataTools Suite</h3>
|
|
||||||
<p class="muted">Available when 3+ bundles ship. Includes everything in the Shopify pack plus the Bookkeeper and RevOps bundles. Save $48.</p>
|
|
||||||
<a class="btn btn-ghost btn-large" href="#" aria-disabled="true">Coming when ready</a>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= FAQ ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container">
|
|
||||||
<h2>Questions</h2>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>Does this work with Shopify Plus?</summary>
|
|
||||||
<p>Yes — the input is just CSV / Excel from any source. Your Shopify Plus exports work the same as the standard plan, the same as a Shopify-to-CSV pipeline you've stitched together yourself. The cleaner doesn't care.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>How does this compare to Excel's "Remove Duplicates"?</summary>
|
|
||||||
<p>Excel does <em>exact</em> deduplication. <code>John@Gmail.com</code> and <code>john@gmail.com</code> are different customers to Excel. DataTools fuzzy-matches across case, whitespace, formatting, and even close-but-not-identical strings. The demo above merges 4 customer pairs Excel would leave duplicated.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>How big a file can it handle?</summary>
|
|
||||||
<p>1 GB CSV with international phones + addresses processes in about 2.5 minutes on a typical workstation. Streaming mode keeps memory bounded regardless of input size — we tested it on 26 million rows.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>Do I need to know Python to use it?</summary>
|
|
||||||
<p>No. The GUI is a browser interface that opens automatically when you double-click the app. It loads your file, you click Run, you download the cleaned file. The CLI is there for power users who want to script weekly cleanups.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>What about my privacy?</summary>
|
|
||||||
<p>Your customer list never leaves your computer. There is no cloud component, no telemetry, no "anonymous usage stats." When the app is running you can confirm zero outbound network requests in your browser's developer tools.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>What's your refund policy?</summary>
|
|
||||||
<p>Try the live demo above on the sample dataset before you buy. If you still find DataTools doesn't fit your workflow within 14 days, email for a refund — no questions asked.</p>
|
|
||||||
</details>
|
|
||||||
|
|
||||||
<details class="faq">
|
|
||||||
<summary>Will there be updates?</summary>
|
|
||||||
<p>Yes. The v1.x line is included free for everyone who buys DataTools today. We ship a patch every 30 days adding country support, edge-case fixes, and small features.</p>
|
|
||||||
</details>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Final CTA ============= -->
|
|
||||||
<section>
|
|
||||||
<div class="container" style="text-align: center;">
|
|
||||||
<h2>Stop deduplicating customers by hand.</h2>
|
|
||||||
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Mac, Windows, or Linux. Runs offline. Catches the duplicates Excel misses, standardizes the phones from your international customers, and saves a pipeline you can re-run on next week's export.</p>
|
|
||||||
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Get DataTools — $49 →</a>
|
|
||||||
</div>
|
|
||||||
</section>
|
|
||||||
|
|
||||||
<!-- ============= Footer ============= -->
|
|
||||||
<footer>
|
|
||||||
<div class="container">
|
|
||||||
<div>
|
|
||||||
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
|
||||||
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<p>
|
|
||||||
<a href="../bookkeeper/">For bookkeepers</a> ·
|
|
||||||
<a href="../revops/">For RevOps agencies</a><br />
|
|
||||||
<a href="https://gumroad.com/l/datatools?from=shopify-pet">Buy on Gumroad</a> ·
|
|
||||||
<a href="mailto:hello@datatools.app">Email support</a>
|
|
||||||
</p>
|
|
||||||
</div>
|
|
||||||
</div>
|
|
||||||
</footer>
|
|
||||||
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
Lead ID,First Name,Last Name,Company,Title,Email,Phone,Country,Source,Score,Last Activity,Tags
|
|
||||||
HUB-001,Alice,Johnson,Acme Corp,VP Marketing,alice@acme.com,(415) 555-1234,USA,HubSpot,87,2025-12-04,Enterprise
|
|
||||||
HUB-002,bob,smith,Beta LLC,Director Growth,bob@beta.com,N/A,United States,HubSpot,N/A,2025-11-22,SMB
|
|
||||||
HUB-003,Carlos,Garcia,Gamma Inc,CEO,carlos@gamma.io,+34 91 411 1111,Spain,HubSpot,82,2025-10-30,Enterprise
|
|
||||||
HUB-004,DIANA,LEE,Delta Co,Marketing Manager,diana@delta.com,020 7946 0958,United Kingdom,HubSpot,74,2025-12-15,Mid-Market
|
|
||||||
HUB-005,Eve,Martinez,Epsilon Group,VP Ops,eve@epsilon.com,(none),Mexico,HubSpot,(blank),2025-09-15,SMB
|
|
||||||
LIN-006,Alice,Johnson,Acme Corporation,VP of Marketing,Alice.Johnson@acme.com,4155551234,US,LinkedIn,—,2025-12-04,Enterprise
|
|
||||||
LIN-007,Frank,Brown,Foxtrot Ltd,Head Sales,frank@foxtrot.de,+49 30 12345678,Germany,LinkedIn,68,2025-12-01,Mid-Market
|
|
||||||
LIN-008,Grace,Davis,Golf Industries,Marketing Lead,grace@golfind.com,+44 20 7946 0958,UK,LinkedIn,79,2025-11-08,Mid-Market
|
|
||||||
LIN-009,henry,wilson,Hotel Logistics,COO,henry@hotellog.com,+86 10 1234 5678,China,LinkedIn,91,2025-12-12,Enterprise
|
|
||||||
LIN-010,IVY CHEN,,India Tech,CTO,ivy@indiatech.in,+91 11 2345 6789,IN,LinkedIn,88,2025-11-30,Enterprise
|
|
||||||
LIN-011,Jack,Taylor,Juliet & Co,Founder,jack@juliet.co,unknown,United States,LinkedIn,?,(unknown),SMB
|
|
||||||
SCR-012,Diana,Lee,Delta Company,Marketing Manager,diana@delta.com,020-7946-0958,UK,Manual Scrape,74,12/15/2025,Mid-Market
|
|
||||||
SCR-013,kate,o'neil,Kilo Ventures,Partner,kate@kilo.vc,+1 415 555 2222,USA,Manual Scrape,N/A,?,Investor
|
|
||||||
SCR-014,Carlos,García,Gamma Incorporated,CEO,Carlos@gamma.io,+34-91-411-1111,Spain,Manual Scrape,82,Oct 30 2025,Enterprise
|
|
||||||
SCR-015,Liam,Park,Lima Solutions,Director Marketing,liam@limasol.kr,+82 2 2287 0114,South Korea,Manual Scrape,77,2025-11-20,Enterprise
|
|
||||||
SCR-016,Mia,nguyen,Mike Corp,VP Marketing,mia@mikecorp.com.au,02 9374 4000,Australia,Manual Scrape,72,2025-10-05,Mid-Market
|
|
||||||
SCR-017,Noah,Brown,November Inc,Head of Growth,noah@november.com,(555) 444-5555,US,Manual Scrape,—,#N/A,SMB
|
|
||||||
HUB-018,Frank,Brown,Foxtrot,Head of Sales,Frank@Foxtrot.de,+49-30-12345678,Germany,HubSpot,68,2025-12-01,Mid-Market
|
|
||||||
HUB-019,Olivia,Rossi,Oscar Italia,CMO,olivia@oscar.it,+39 06 6982,Italy,HubSpot,85,2025-12-08,Enterprise
|
|
||||||
HUB-020,papa,wong,Papa Trading,Founder,papa@papatrading.hk,+852 2123 4567,Hong Kong,HubSpot,69,2025-11-15,SMB
|
|
||||||
LIN-021,Quinn,Reyes,Quebec Group,VP Sales,quinn@quebec.mx,+52 55 5555 0000,Mexico,LinkedIn,80,2025-12-05,Mid-Market
|
|
||||||
LIN-022,Robert,Tan,Romeo Logistics,Director,r.tan@romeo.sg,+65 6123 4567,Singapore,LinkedIn,76,2025-11-28,Mid-Market
|
|
||||||
SCR-023,Sara,Khan,Sierra Foods,Head Marketing,sara@sierra.in,+91-22-1234-5678,India,Manual Scrape,73,2025-12-02,SMB
|
|
||||||
SCR-024,bob,Smith,Beta,Director Growth,Bob@Beta.com,(none),United States,Manual Scrape,(unknown),(unknown),SMB
|
|
||||||
HUB-025,Tara,Levi,Tango Tech,VP Product,tara@tango.il,+972 3 6957 0000,Israel,HubSpot,82,2025-12-10,Enterprise
|
|
||||||
HUB-026,Uma,Patel,Uniform Health,CMO,uma at uniform dot com,+44 20 7946 8888,United Kingdom,HubSpot,71,2025-12-12,Enterprise
|
|
||||||
LIN-027,Victor,Lee,Victor Co,Director,victor@@victorco.com,+1 415 555 8888,USA,LinkedIn,69,2025-11-30,SMB
|
|
||||||
SCR-028,Wendy,Akin,Whiskey Inc,CMO,wendy@whiskey.tr,+90 212 252 1111,Turkey,Manual Scrape,77,2025-12-04,Mid-Market
|
|
||||||
SCR-029,Xander,Ng,Xray Group,Founder,xander@xray.sg,+65 6234 5678,Singapore,Manual Scrape,65,2025-11-15,Suppressed
|
|
||||||
HUB-030,Yara,Costa,Yankee Foods,Marketing Lead,yara@yankee.br,+55 11 3071 2222,Brazil,HubSpot,—,2025-12-15,Opted Out
|
|
||||||
|
@@ -1,74 +0,0 @@
|
|||||||
{
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"tool": "text_clean",
|
|
||||||
"options": {},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "1. Clean text (whitespace + smart quotes from copy-paste)"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "format_standardize",
|
|
||||||
"options": {
|
|
||||||
"column_types": {
|
|
||||||
"First Name": "name",
|
|
||||||
"Last Name": "name",
|
|
||||||
"Company": "name",
|
|
||||||
"Email": "email",
|
|
||||||
"Phone": "phone"
|
|
||||||
},
|
|
||||||
"phone_country_column": "Country",
|
|
||||||
"phone_format": "E164",
|
|
||||||
"email_gmail_canonical": true
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "2. E.164 phones (per-row country) · canonical emails · name casing"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "missing",
|
|
||||||
"options": {
|
|
||||||
"strategy": "none",
|
|
||||||
"standardize_sentinels": true,
|
|
||||||
"sentinels": ["N/A", "n/a", "—", "?", "(unknown)", "unknown", "(blank)", "(none)", "TBD", "#N/A"]
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "3. Standardize sentinels across vendor exports"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "column_map",
|
|
||||||
"options": {
|
|
||||||
"schema": {
|
|
||||||
"fields": [
|
|
||||||
{"name": "Lead ID", "dtype": "string", "required": true},
|
|
||||||
{"name": "First Name", "dtype": "string"},
|
|
||||||
{"name": "Last Name", "dtype": "string"},
|
|
||||||
{"name": "Company", "dtype": "string"},
|
|
||||||
{"name": "Title", "dtype": "string"},
|
|
||||||
{"name": "Email", "dtype": "string"},
|
|
||||||
{"name": "Phone", "dtype": "string"},
|
|
||||||
{"name": "Country", "dtype": "string"},
|
|
||||||
{"name": "Source", "dtype": "string"},
|
|
||||||
{"name": "Score", "dtype": "integer"},
|
|
||||||
{"name": "Last Activity", "dtype": "date"},
|
|
||||||
{"name": "Tags", "dtype": "string"}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"auto_infer": true,
|
|
||||||
"unmapped": "keep",
|
|
||||||
"coerce_types": true,
|
|
||||||
"reorder_to_schema": true,
|
|
||||||
"enforce_required": false
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "4. Coerce types · reorder to canonical schema"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "dedup",
|
|
||||||
"options": {
|
|
||||||
"survivor_rule": "most_complete",
|
|
||||||
"merge": true
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "5. Dedup leads across HubSpot / LinkedIn / Manual Scrape (fuzzy + merge)"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
27
samples/demo/ar_open_invoices.csv
Normal file
27
samples/demo/ar_open_invoices.csv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
Invoice,Client,Email,Invoice_Date,Due_Date,Amount,Status
|
||||||
|
INV-1007,ACME LLC,AP@Acme.com,03/04/2025,04/03/2025,"$1,250.00",Open
|
||||||
|
INV-1007, Acme LLC ,ap@acme.com,2025-03-04,2025-04-03,"1,250.00",(blank)
|
||||||
|
INV-1001,northwind traders,billing@northwind.com,Mar 6 2025,04/05/2025,$980,Overdue
|
||||||
|
INV-1002,Globex Corp,AR@Globex.com,3/11/25,4/10/25,"2,400.50",Sent
|
||||||
|
INV-1011,initech,accounts@initech.com,04/01/2025,05/01/2025,"$ 1,100.00",?
|
||||||
|
INV-1011,Initech,Accounts@Initech.com,2025-04-01,2025-05-01,1100,Open
|
||||||
|
INV-1003,Stark Industries,ap@stark.com,Mar 6 2025,Apr 6 2025,$75.00,Open
|
||||||
|
INV-1004,Wayne Enterprises,ar@wayne.com,03/15/2025,04/14/2025,($300.00),—
|
||||||
|
INV-1015,Hooli,billing@hooli.com,3/11/25,4/10/25,"$4,300.00",Overdue
|
||||||
|
INV-1015,hooli,Billing@Hooli.com,2025-03-11,2025-04-10,4300,(none)
|
||||||
|
INV-1005,Soylent Corp,ap@soylent.com,2025-03-20,2025-04-19,"$1,875.25",Sent
|
||||||
|
INV-1006,Umbrella Co,ar@umbrella.com,03/22/2025,04/21/2025,$640.00,TBD
|
||||||
|
INV-1019,Cyberdyne Systems,ap@cyberdyne.com,Mar 25 2025,04/24/2025,"$2,050.00",unknown
|
||||||
|
INV-1019,cyberdyne systems,AP@Cyberdyne.com,2025-03-25,2025-04-24,"2,050.00",Open
|
||||||
|
INV-1008,Vandelay Industries,ar@vandelay.com,3/28/25,4/27/25,$915.00,Overdue
|
||||||
|
INV-1009,Gekko & Co,billing@gekko.com,2025-03-30,2025-04-29,"$3,120.75",Open
|
||||||
|
INV-1010,Pied Piper,ap@piedpiper.com,04/02/2025,05/02/2025,$180,Sent
|
||||||
|
INV-1023,Tyrell Corp,ar@tyrell.com,04/05/2025,05/05/2025,($300.00),(blank)
|
||||||
|
INV-1023,Tyrell Corp,AR@Tyrell.com,2025-04-05,2025-05-05,-300.00,Open
|
||||||
|
INV-1012,Oscorp,ap@oscorp.com,Apr 8 2025,05/08/2025,"$5,000.00",Overdue
|
||||||
|
INV-1013,Nakatomi Trading,ar@nakatomi.com,4/9/25,5/9/25,$725.50,Sent
|
||||||
|
INV-1014,Bluth Company,billing@bluth.com,2025-04-10,2025-05-10,"$1,420.00",Open
|
||||||
|
INV-1016,Dunder Mifflin,ap@dundermifflin.com,04/12/2025,05/12/2025,$960.00,Overdue
|
||||||
|
INV-1017,Prestige Worldwide,ar@prestige.com,Apr 14 2025,05/14/2025,"$2,680.00",Sent
|
||||||
|
INV-1018,Sterling Cooper,billing@sterlingcooper.com,4/15/25,5/15/25,"$3,950.00",Open
|
||||||
|
INV-1020,Wonka Industries,ap@wonka.com,2025-04-18,2025-05-18,"$1,050.00",Overdue
|
||||||
|
50
samples/demo/ar_open_invoices_pipeline.json
Normal file
50
samples/demo/ar_open_invoices_pipeline.json
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
{
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"tool": "text_clean",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"trim": true,
|
||||||
|
"collapse_whitespace": true,
|
||||||
|
"fold_smart_chars": true,
|
||||||
|
"strip_zero_width": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool": "format_standardize",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"column_types": {
|
||||||
|
"Invoice_Date": "date",
|
||||||
|
"Due_Date": "date",
|
||||||
|
"Amount": "currency",
|
||||||
|
"Email": "email"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool": "missing",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"strategy": "none",
|
||||||
|
"standardize_sentinels": true,
|
||||||
|
"sentinels": ["—", "-", "?", "(blank)", "TBD", "unknown", "(none)", "N/A", "#N/A"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool": "dedup",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"survivor_rule": "most_complete",
|
||||||
|
"merge": true,
|
||||||
|
"strategies": [
|
||||||
|
{
|
||||||
|
"columns": [
|
||||||
|
{"column": "Invoice", "algorithm": "exact", "threshold": 100}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
27
samples/demo/bank_reconciliation.csv
Normal file
27
samples/demo/bank_reconciliation.csv
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
Date,Description,Vendor,Category,Amount,Account
|
||||||
|
01/15/2025,“Stripe payout — weekly”,Stripe,Income,"+$3,450.00",Business Checking
|
||||||
|
2025-01-15,Verizon business line,Verizon,—,($89.50),Business Checking
|
||||||
|
Jan 18 2025,Adobe Creative Cloud ,Adobe,(blank),-$129.99,Business Checking
|
||||||
|
1/27/25,Office supplies,Amazon,Supplies,-$74.20,Business Checking
|
||||||
|
02/03/2025, Monthly office rent,Highland Properties,Rent,"$1,200.00",Business Checking
|
||||||
|
Feb 5 2025,Account service fee,First National Bank,?,(50.00),Business Checking
|
||||||
|
2025-01-09,Shipping labels,amazon.com,unknown,-$18.40,Business Checking
|
||||||
|
1/22/25,Contractor — landing page,Bright Lane Design,TBD,- $599.88,Business Checking
|
||||||
|
Jan 30 2025,Late fee adjustment,verizon,Utilities,-$12.00,Business Checking
|
||||||
|
2025-01-11,Packaging tape,AMAZON.COM,Supplies,-$31.75,Business Checking
|
||||||
|
01/06/2025,Client deposit — ACME Co,ACME Co,Income,"$2,500.00",Business Checking
|
||||||
|
2025-01-20,Google Workspace,Google,Software,-$36.00,Business Checking
|
||||||
|
Jan 24 2025,Fuel — delivery van,Shell,Vehicle,-$58.63,Business Checking
|
||||||
|
1/28/25,QuickBooks subscription,Intuit,Software,-$80.00,Business Checking
|
||||||
|
2025-01-15,Stripe payout weekly,Stripe,Income,3450.00,Business Checking
|
||||||
|
01/15/2025,Verizon business line,Verizon,Utilities,-89.50,Business Checking
|
||||||
|
2025-01-18,Adobe Creative Cloud,Adobe,Software,-129.99,Business Checking
|
||||||
|
2025-02-03,Monthly office rent,Highland Properties,Rent,1200.00,Business Checking
|
||||||
|
2025-02-05,Account service fee,First National Bank,Bank Fees,-50.00,Business Checking
|
||||||
|
2025-01-22,Contractor landing page,Bright Lane Design,Contractors,-599.88,Business Checking
|
||||||
|
02/10/2025,Client deposit — Globex,Globex,Income,"$1,800.00",Business Checking
|
||||||
|
2025-02-12,Slack subscription,Slack,Software,-$96.00,Business Checking
|
||||||
|
Feb 14 2025,Coffee — client meeting,Blue Bottle,Meals,-$23.10,Business Checking
|
||||||
|
2/18/25,Insurance premium,Hartford,Insurance,-$240.50,Business Checking
|
||||||
|
02/21/2025,Refund — returned printer,Staples,Supplies,$210.99,Business Checking
|
||||||
|
Feb 25 2025,Domain renewal,Namecheap,Software,-$13.98,Business Checking
|
||||||
|
6
samples/demo/bank_reconciliation_pipeline.json
Normal file
6
samples/demo/bank_reconciliation_pipeline.json
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
{"steps":[
|
||||||
|
{"tool":"text_clean","enabled":true,"options":{"trim":true,"collapse_whitespace":true,"fold_smart_chars":true,"strip_zero_width":true}},
|
||||||
|
{"tool":"format_standardize","enabled":true,"options":{"column_types":{"Date":"date","Amount":"currency"}}},
|
||||||
|
{"tool":"missing","enabled":true,"options":{"strategy":"none","standardize_sentinels":true,"sentinels":["—","(blank)","?","unknown","TBD","N/A","#N/A","(none)"]}},
|
||||||
|
{"tool":"dedup","enabled":true,"options":{"survivor_rule":"most_complete","merge":true,"strategies":[{"columns":[{"column":"Date","algorithm":"exact","threshold":100},{"column":"Amount","algorithm":"exact","threshold":100}]}]}}
|
||||||
|
]}
|
||||||
@@ -1,56 +0,0 @@
|
|||||||
{
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"tool": "text_clean",
|
|
||||||
"options": {},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "1. Clean text (header whitespace, smart quotes, em-dash)"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "format_standardize",
|
|
||||||
"options": {
|
|
||||||
"column_types": {
|
|
||||||
"Date": "date",
|
|
||||||
"Amount": "currency",
|
|
||||||
"Balance": "currency",
|
|
||||||
"Vendor": "name"
|
|
||||||
},
|
|
||||||
"currency_decimal": "auto",
|
|
||||||
"currency_preserve_code": false,
|
|
||||||
"currency_decimals": 2,
|
|
||||||
"date_output_format": "%Y-%m-%d"
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "2. ISO dates · numeric amounts (parens-negative) · vendor casing"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "missing",
|
|
||||||
"options": {
|
|
||||||
"strategy": "none",
|
|
||||||
"standardize_sentinels": true,
|
|
||||||
"sentinels": ["N/A", "n/a", "—", "-", "?", "(blank)", "(none)", "unknown", "#N/A"]
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "3. Standardize disguised nulls (— / N/A / (blank))"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "dedup",
|
|
||||||
"options": {
|
|
||||||
"survivor_rule": "most_complete",
|
|
||||||
"merge": false,
|
|
||||||
"date_column": "Date",
|
|
||||||
"strategies": [
|
|
||||||
{
|
|
||||||
"columns": [
|
|
||||||
{"column": "Date", "algorithm": "exact", "threshold": 100},
|
|
||||||
{"column": "Amount", "algorithm": "exact", "threshold": 100},
|
|
||||||
{"column": "Vendor", "algorithm": "jaro_winkler", "threshold": 80}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "4. Dedup transactions on Date+Amount+fuzzy Vendor"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
@@ -1,31 +0,0 @@
|
|||||||
Txn ID,Date ,Description,Amount,Balance,Account,Vendor,Category
|
|
||||||
TXN-2401,01/15/2025," AMAZON.COM*4F2X9 PURCHASE",-$129.99,"$2,450.01",Checking,Amazon,Office Supplies
|
|
||||||
TXN-2402,2025-01-15,"AMAZON.COM*4F2X9 PURCHASE",-$129.99,"2450.01",Checking,amazon.com,Office Supplies
|
|
||||||
TXN-2403,Jan 18 2025,"STAPLES #4422 — paper, toner",($89.50),$2360.51,Checking,STAPLES,Office Supplies
|
|
||||||
TXN-2404,01/22/2025,"Verizon Wireless ""autopay""",-$120.00,"$2,240.51",Checking,Verizon,Utilities
|
|
||||||
TXN-2405,2025-01-22,Verizon Wireless autopay,-120.00,"2,240.51",Checking,verizon,Utilities
|
|
||||||
TXN-2406,01-25-2025,"Stripe Payout — invoice #1077","+$3,450.00","$5,690.51",Checking,Stripe,Income
|
|
||||||
TXN-2407,1/27/25,"Office Lease - Suite 204",-1500.00,"$4,190.51",Checking,Acme Realty,Rent
|
|
||||||
TXN-2408,02/01/2025,"Wire — Acme Realty Mgmt","-$1,500.00","$2,690.51",Checking,acme realty,Rent
|
|
||||||
TXN-2409,2025-02-03,"Adobe Creative Cloud annual","- $599.88","$2,090.63",Credit Card,Adobe Inc.,Software
|
|
||||||
TXN-2410,02/03/2025,"ADOBE CREATIVE CLOUD ANN",-599.88,2090.63,Credit Card,adobe,Software
|
|
||||||
TXN-2411,Feb 5 2025,"FedEx — overnight to client A",-$32.50,"$2,058.13",Checking,FedEx,Shipping
|
|
||||||
TXN-2412,02/07/2025,"Square fee — invoice #1078","-$3.20","$2,054.93",Checking,Square,Fees
|
|
||||||
TXN-2413,02/10/2025,"Stripe Payout invoice #1079","+ $1,200.00","$3,254.93",Checking,Stripe,Income
|
|
||||||
TXN-2414,2025-02-12,"USPS PRIORITY — to vendor B","-12.40","$3,242.53",Checking,USPS,Shipping
|
|
||||||
TXN-2415,02/14/2025,"Zoom Video Comms — annual","-$149.90","$3,092.63",Credit Card,Zoom,Software
|
|
||||||
TXN-2416,2/14/25,"Zoom Video Communications","-149.90","3092.63",Credit Card,zoom,Software
|
|
||||||
TXN-2417,02/18/2025,"Costco Whse #421 — supplies","-$237.84","$2,854.79",Checking,Costco,Office Supplies
|
|
||||||
TXN-2418,2025-02-18,COSTCO WHSE #421,-237.84,"2,854.79",Checking,costco,Office Supplies
|
|
||||||
TXN-2419,02/22/2025,"Bank fee — int'l wire","-$45.00","$2,809.79",Checking,Bank Fee,Fees
|
|
||||||
TXN-2420,02/24/2025,"Stripe Payout — invoice #1080","+$2,100.00","$4,909.79",Checking,Stripe,Income
|
|
||||||
TXN-2421,02/28/2025," Refund — overcharge ","+$45.00","$4,954.79",Checking,—,Refunds
|
|
||||||
TXN-2422,Feb 28 2025,REFUND OVERCHARGE,45.00,4954.79,Checking,N/A,Refunds
|
|
||||||
TXN-2423,03/01/2025,"Office Lease — Suite 204","-$1,500.00","$3,454.79",Checking,Acme Realty,Rent
|
|
||||||
TXN-2424,2025-03-03,"Slack Technologies — annual","-$840.00","$2,614.79",Credit Card,Slack,Software
|
|
||||||
TXN-2425,03/05/2025,"Stripe Payout — invoice #1081","+$1,875.00","$4,489.79",Checking,Stripe,Income
|
|
||||||
TXN-2426,03/08/2025,"Wire — Berlin office rent (EUR vendor)","-€1.450,00","$2,989.79",Checking,Mietverwaltung GmbH,Rent
|
|
||||||
TXN-2427,03/10/2025,"London supplier invoice (GBP)","-£950.00","$1,939.79",Checking,Stationery Co Ltd,Office Supplies
|
|
||||||
TXN-2428,03/12/2025,"São Paulo agency retainer","-R$ 1.299,90","$1,679.79",Credit Card,Estúdio Ágil,Software
|
|
||||||
TXN-2429,03/14/2025,"VAT MOSS prep — multi-EU sales","($89.00)","$1,768.79",Checking,EU VAT Service,Fees
|
|
||||||
TXN-2430,03/14/2025,"VAT MOSS prep multi EU sales",-89.00,"1,768.79",Checking,eu vat service,Fees
|
|
||||||
|
@@ -1,21 +0,0 @@
|
|||||||
Customer ID,First Name,Last Name,Email,Phone,Address,City,State,ZIP,Country,Total Orders,Lifetime Value,Last Order Date,Tags
|
|
||||||
SHOP-1001, Alice ,Johnson,alice@petshop.com,(415) 555-1234,"123 Main St., Apt 4B",San Francisco,CA,94102,US,12,$1,240.50,2025-12-04,VIP
|
|
||||||
SHOP-1002,Bob,SMITH,Bob@PetShop.com,415.555.1234,"123 Main St, Apt 4B",San Francisco,CA,94102,US,12,"$1,240.50",N/A,VIP
|
|
||||||
SHOP-1003,carlos,garcia,carlos@petshop.com,5559876543,"742 Evergreen Terrace",Springfield,IL,62704,US,5,420.00,12/15/2025,Wholesale
|
|
||||||
SHOP-1004,Diana,Lee,diana@petshop.com,(555) 222-3344,"PO Box 12, Sherwood Forest",Nottingham,,NG1 5BA,GB,8,£890.25,2025-10-30,VIP|Wholesale
|
|
||||||
SHOP-1005,EVE MARTINEZ,,eve.martinez@petshop.com,555-9988,"Calle Mayor 45","Madrid",,"28013",ES,3,€180,2025-09-15,
|
|
||||||
SHOP-1006,Frank,Brown,frank@petshop.com,, ,"Berlin",BE,10115,DE,15,€2.410,75,(blank),Wholesale
|
|
||||||
SHOP-1007,Grace,Davis,grace@petshop.com,+1 555-111-1111,"888 Maple Ave",Toronto,ON,M5V 3A8,CA,1,$49.99,#N/A,New
|
|
||||||
SHOP-1008,henry,wilson,Henry@PetShop.com,5551111111,"888 Maple Avenue","Toronto",ON,M5V 3A8,CA,1,$49.99,2025-12-01,New
|
|
||||||
SHOP-1009,Ivy,Chen,IVY@petshop.com,+1 (555) 777-7777,"550 Elm Street, Suite 200",Brooklyn,NY,11201,US,4,"$320.50 ",10/12/2025,
|
|
||||||
SHOP-1010,Jack,Taylor,jack@petshop.com,(none),"550 elm street, suite 200",brooklyn,NY,11201,US,4,$320.50,2025-10-12,
|
|
||||||
SHOP-1011,kate,o'neil,kate.oneil@petshop.com,415-555-2222,"99 King's Rd","London",,SW3 4LX,GB,7,£675.00,?,VIP
|
|
||||||
SHOP-1012,luis,rodriguez,LUIS@petshop.com,+34 91 411 1111,"Avenida de la Paz 12, 3°D",Madrid,,28013,ES,2,"€89,99",unknown,
|
|
||||||
SHOP-1013,Mia,Park,mia@petshop.com,02-9374-4000,"Sydney Opera House Drive","Sydney",NSW,2000,AU,9,"A$ 1,299.00",2025-11-20,Wholesale
|
|
||||||
SHOP-1014,Noah,nguyen,noah@petshop.com,+81 3 3210 7000,"丸の内 2-7-3","Tokyo",,100-0005,JP,6,"¥75000",2025-12-10,VIP
|
|
||||||
SHOP-1015,Olivia,Brown,OLIVIA@PETSHOP.COM,(555) 333-4444,"742 evergreen terrace",springfield,IL,62704,US,3,$180.00,(none),
|
|
||||||
SHOP-1016,Pavel,Novak,pavel@petshop.com,+44 20 7946 1234,"22 Baker Street",London,,W1U 6AB,United Kingdom,4,£412.00,2025-11-18,VIP
|
|
||||||
SHOP-1017,Quinn,Murphy,quinn@petshop.com,+44 20 7946 5678,"5 Princes Street",Edinburgh,,EH2 2DA,U.K.,2,£189.50,2025-12-09,
|
|
||||||
SHOP-1018,Rachel,O'Brien,rachel@petshop.com,02-9374-9999,"100 George Street","Sydney",NSW,2000,UK,1,£75.00,?,New
|
|
||||||
SHOP-1019,Sam,Klein,sam@petshop.com,+49 30 99887766,"Friedrichstraße 100","Berlin",,10117,Germany,11,"€1.890,40",2025-12-11,VIP|Wholesale
|
|
||||||
SHOP-1020,Tara,Gianni,tara@petshop.com,+39 06 6982 4567,"Via del Corso 250",Roma,,00186,Italia,5,"€649,99",2025-12-03,
|
|
||||||
|
@@ -1,49 +0,0 @@
|
|||||||
{
|
|
||||||
"steps": [
|
|
||||||
{
|
|
||||||
"tool": "text_clean",
|
|
||||||
"options": {},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "1. Clean text (whitespace, smart quotes, NBSP, BOM)"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "format_standardize",
|
|
||||||
"options": {
|
|
||||||
"column_types": {
|
|
||||||
"First Name": "name",
|
|
||||||
"Last Name": "name",
|
|
||||||
"Email": "email",
|
|
||||||
"Phone": "phone",
|
|
||||||
"Address": "address",
|
|
||||||
"Lifetime Value": "currency",
|
|
||||||
"Last Order Date": "date"
|
|
||||||
},
|
|
||||||
"phone_country_column": "Country",
|
|
||||||
"address_country_column": "Country",
|
|
||||||
"currency_preserve_code": true,
|
|
||||||
"currency_decimal": "auto",
|
|
||||||
"email_gmail_canonical": false
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "2. Standardize phones, addresses, dates, currencies, names"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "missing",
|
|
||||||
"options": {
|
|
||||||
"strategy": "none",
|
|
||||||
"standardize_sentinels": true
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "3. Standardize disguised nulls (N/A, -, (blank), ?, #N/A)"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"tool": "dedup",
|
|
||||||
"options": {
|
|
||||||
"survivor_rule": "most_complete",
|
|
||||||
"merge": true
|
|
||||||
},
|
|
||||||
"enabled": true,
|
|
||||||
"name": "4. Dedup customers (fuzzy match, merge missing fields)"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
25
samples/demo/vendor_1099.csv
Normal file
25
samples/demo/vendor_1099.csv
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
Vendor,Contact,Email,Phone,EIN,Address,Total_Paid
|
||||||
|
Acme Realty,Bob Stein,acme.ap@acmerealty.com,(212) 555-0100,12-3456789,(blank),"$12,400.00"
|
||||||
|
acme realty llc,Bob Stein, ACME.AP@AcmeRealty.com ,,—,"118 Canal St, New York, NY 10013","$8,250"
|
||||||
|
ACME REALTY,R. Stein,Acme.AP@acmerealty.com,212.555.0100,N/A,TBD,"1,999.99"
|
||||||
|
Bright Books Bookkeeping,Dana Cole,hello@brightbooks.com,,98-7654321,(blank),"$6,000.00"
|
||||||
|
bright books,Dana Cole,HELLO@brightbooks.com,(415) 555-0142,unknown,"50 Market St, San Francisco, CA 94105","$6,000"
|
||||||
|
"Bright Books, LLC",D. Cole, hello@BrightBooks.com,4155550142,98-7654321,unknown,"5,500.00"
|
||||||
|
Northwind Logistics,Sam Reyes,ap@northwindlog.com,(312) 555-0198,—,(blank),"$22,750.00"
|
||||||
|
northwind logistics inc,Sam Reyes,AP@NorthwindLog.com,,45-6789012,"900 W Loop, Chicago, IL 60607","$22,750"
|
||||||
|
Pearl Design Studio,“Jo” Marsh,billing@pearldesign.co,,33-2211000,(blank),"$3,200.00"
|
||||||
|
pearl design,Jo Marsh,Billing@PearlDesign.co,(206) 555-0167,TBD,"77 Pike St, Seattle, WA 98101","$3,200"
|
||||||
|
PEARL DESIGN STUDIO,J. Marsh, billing@pearldesign.co ,206.555.0167,33-2211000,unknown,"2,800.00"
|
||||||
|
Cooper Plumbing,Lee Cooper,office@cooperplumb.com,(617) 555-0133,—,(blank),"$1,450.00"
|
||||||
|
cooper plumbing co,Lee Cooper,OFFICE@cooperplumb.com,,TBD,"12 Beacon St, Boston, MA 02108","$1,450"
|
||||||
|
COOPER PLUMBING,L. Cooper, office@CooperPlumb.com,6175550133,N/A,unknown,900.00
|
||||||
|
Vertex Marketing,Pat Nguyen,accounts@vertexmktg.com,(404) 555-0119,77-8899001,(blank),"$15,000.00"
|
||||||
|
vertex marketing group,Pat Nguyen,ACCOUNTS@VertexMktg.com,,unknown,"300 Peachtree St, Atlanta, GA 30308","$15,000"
|
||||||
|
Summit Consulting,Ray Brooks,invoices@summitconsult.net,,21-0099887,(blank),"$9,800.00"
|
||||||
|
summit consulting llc,Ray Brooks,INVOICES@summitconsult.net,(303) 555-0175,—,"1100 17th St, Denver, CO 80202","$9,800"
|
||||||
|
SUMMIT CONSULTING,R. Brooks, invoices@SummitConsult.net ,303.555.0175,21-0099887,TBD,"7,250.00"
|
||||||
|
Garcia Catering,Mia Garcia,ap@garciacatering.com,(305) 555-0188,—,(blank),"$4,600.00"
|
||||||
|
garcia catering services,Mia Garcia,AP@GarciaCatering.com,,66-1234509,"450 Ocean Dr, Miami, FL 33139",$600.00
|
||||||
|
Northwind Logistics,S. Reyes, ap@northwindlog.com ,312.555.0198,45-6789012,TBD,"21,000.00"
|
||||||
|
VERTEX MARKETING,P. Nguyen, accounts@vertexmktg.com ,404.555.0119,77-8899001,TBD,"14,500.00"
|
||||||
|
GARCIA CATERING,M. Garcia,ap@GARCIACATERING.com,305.555.0188,66-1234509,unknown,"4,200.00"
|
||||||
|
49
samples/demo/vendor_1099_pipeline.json
Normal file
49
samples/demo/vendor_1099_pipeline.json
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
{
|
||||||
|
"steps": [
|
||||||
|
{
|
||||||
|
"tool": "text_clean",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"trim": true,
|
||||||
|
"collapse_whitespace": true,
|
||||||
|
"fold_smart_chars": true,
|
||||||
|
"strip_zero_width": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool": "format_standardize",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"column_types": {
|
||||||
|
"Phone": "phone",
|
||||||
|
"Email": "email",
|
||||||
|
"Total_Paid": "currency"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool": "missing",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"strategy": "none",
|
||||||
|
"standardize_sentinels": true,
|
||||||
|
"sentinels": ["—", "-", "--", "(blank)", "TBD", "unknown", "N/A", "#N/A", "(none)"]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"tool": "dedup",
|
||||||
|
"enabled": true,
|
||||||
|
"options": {
|
||||||
|
"survivor_rule": "most_complete",
|
||||||
|
"merge": true,
|
||||||
|
"strategies": [
|
||||||
|
{
|
||||||
|
"columns": [
|
||||||
|
{"column": "Email", "algorithm": "exact", "threshold": 100, "normalizer": "email"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
customer_name,email,vendor,memo
|
|
||||||
Alice Johnson,alice@example.com,ACME Corp ,Welcome aboard
|
|
||||||
Bob Smith,bob@example.com,ACME Corp,Returning customer
|
|
||||||
Charlie Brown,charlie@example.com,Globex,Net 30
|
|
||||||
Diana Prince,diana@example.com,Globex,VIP
|
|
||||||
Edward Norton,ed@example.com,“Best Pet Supplies”,Order#42 - rush
|
|
||||||
Frank Castle,frank@example.com,Stark—Industries,"Line 1
|
|
||||||
Line 2
|
|
||||||
Line 3"
|
|
||||||
grace HOPPER ,grace@example.com,Globex,Loves long memos…
|
|
||||||
Henry Ford,henry@example.com,Ford Motor,Industrial
|
|
||||||
Iris West,iris@example.com,S.T.A.R. Labs,Notewith-bell
|
|
||||||
Jane Doe,jane@example.com,Acme,Standard
|
|
||||||
|
@@ -9,10 +9,10 @@ side-by-side, and converts the visitor to a Gumroad purchase.
|
|||||||
Launch:
|
Launch:
|
||||||
streamlit run src/gui/app_demo.py
|
streamlit run src/gui/app_demo.py
|
||||||
|
|
||||||
URL routing:
|
URL routing (all three personas serve one audience: accounting):
|
||||||
https://demo.datatools.app/?p=shopify-pet (Shopify operator)
|
https://demo.datatools.app/?p=bookkeeper (Bookkeeper — bank reconciliation)
|
||||||
https://demo.datatools.app/?p=bookkeeper (Bookkeeper)
|
https://demo.datatools.app/?p=ap-1099 (Accounts payable — 1099 vendor prep)
|
||||||
https://demo.datatools.app/?p=revops (RevOps agency)
|
https://demo.datatools.app/?p=ar-aging (Accounts receivable — open invoices)
|
||||||
|
|
||||||
Free / paid boundary (per docs/DEMO-PLAN.md §6):
|
Free / paid boundary (per docs/DEMO-PLAN.md §6):
|
||||||
- input rows capped at ``DEMO_ROW_CAP``
|
- input rows capped at ``DEMO_ROW_CAP``
|
||||||
@@ -64,59 +64,66 @@ GUMROAD_BASE: str = "https://gumroad.com/l/datatools"
|
|||||||
DEMO_DIR = _project_root / "samples" / "demo"
|
DEMO_DIR = _project_root / "samples" / "demo"
|
||||||
|
|
||||||
|
|
||||||
|
# All three personas serve one audience — accounting — entering through the
|
||||||
|
# three workflows where messy exports cost real money: bank reconciliation,
|
||||||
|
# 1099 / AP vendor prep, and AR aging. Each H1/sub names the exact pain and
|
||||||
|
# the validated demo outcome (see docs/DEMO-PLAN.md §4 for the numbers).
|
||||||
PERSONAS: dict[str, dict[str, Any]] = {
|
PERSONAS: dict[str, dict[str, Any]] = {
|
||||||
"shopify-pet": {
|
|
||||||
"label": "Shopify pet operator",
|
|
||||||
"icon": "🛍️",
|
|
||||||
"h1": "Klaviyo-import-ready customer lists. **In 30 seconds. Locally.**",
|
|
||||||
"sub": (
|
|
||||||
"Your Shopify customer export has duplicates Excel can't catch, "
|
|
||||||
"international phones Excel can't parse, and disguised nulls "
|
|
||||||
"(`N/A`, `(blank)`, `?`) that break Klaviyo's import. "
|
|
||||||
"DataTools fixes all of it in one pass — and your data never "
|
|
||||||
"leaves your computer."
|
|
||||||
),
|
|
||||||
"data_file": "shopify_pet_customers.csv",
|
|
||||||
"pipeline_file": "shopify_pet_pipeline.json",
|
|
||||||
"cta": "Get DataTools for Shopify — $49 →",
|
|
||||||
"landing": "https://datatools.app/shopify/",
|
|
||||||
},
|
|
||||||
"bookkeeper": {
|
"bookkeeper": {
|
||||||
"label": "Bookkeeper / freelance accountant",
|
"label": "Bookkeeper — bank reconciliation",
|
||||||
"icon": "📒",
|
"icon": "📒",
|
||||||
"h1": "Reconcile messy bank exports. **Hand your client an audit trail.**",
|
"h1": "Catch the transactions your bank export posted twice. **Locally.**",
|
||||||
"sub": (
|
"sub": (
|
||||||
"The Jan and Feb exports overlap; the same transaction posts twice. "
|
"When the Jan and Feb exports overlap, the same payment lands "
|
||||||
"Vendor names are *Amazon* / *amazon.com* / *AMAZON.COM*4F2X9* in "
|
"twice — once as `01/15/2025 +$3,450.00`, once as "
|
||||||
"three rows. DataTools dedups on Date + Amount + fuzzy Vendor, "
|
"`2025-01-15 3450.00`. DataTools standardizes every date and "
|
||||||
"produces ISO dates and numeric amounts, and gives you a row-level "
|
"amount, then dedups on the *real* transaction so your "
|
||||||
"audit log to hand the client."
|
"reconciliation ties out. In this sample: **26 rows → 20, six "
|
||||||
|
"phantom duplicates removed** — and your data never leaves your "
|
||||||
|
"computer."
|
||||||
),
|
),
|
||||||
"data_file": "bookkeeper_bank_reconcile.csv",
|
"data_file": "bank_reconciliation.csv",
|
||||||
"pipeline_file": "bookkeeper_bank_pipeline.json",
|
"pipeline_file": "bank_reconciliation_pipeline.json",
|
||||||
"cta": "Get DataTools for Bookkeepers — $49 →",
|
"cta": "Get DataTools for Bookkeepers — $49 →",
|
||||||
"landing": "https://datatools.app/bookkeeper/",
|
"landing": "https://datatools.app/bookkeeper/",
|
||||||
},
|
},
|
||||||
"revops": {
|
"ap-1099": {
|
||||||
"label": "Marketing / RevOps agency",
|
"label": "Accounts payable — 1099 prep",
|
||||||
"icon": "🪢",
|
"icon": "🧾",
|
||||||
"h1": "Dedupe lead lists across HubSpot, LinkedIn, and manual scrapes — **locally.**",
|
"h1": "Build a clean 1099 vendor list — **with the missing EINs filled in.**",
|
||||||
"sub": (
|
"sub": (
|
||||||
"The same prospect shows up in HubSpot as `alice@acme.com`, in "
|
"The same vendor was entered three times across the year — one "
|
||||||
"LinkedIn as `Alice.Johnson@acme.com`, and in your VA's manual "
|
"record has the EIN, another the address, a third the phone. "
|
||||||
"scrape as `alice@acme.com` again. Country is `USA` / `US` / "
|
"DataTools consolidates each vendor to one row and *backfills the "
|
||||||
"`United States`. DataTools fuzzy-matches across sources, "
|
"gaps from the duplicates*. In this sample: **24 messy records → "
|
||||||
"normalizes phones for 50+ countries, and merges survivors "
|
"8 complete vendors, with 7 missing EINs recovered** from the "
|
||||||
"with their most-complete fields — without uploading anything."
|
"duplicate rows. No upload, no VLOOKUP gymnastics."
|
||||||
),
|
),
|
||||||
"data_file": "agency_combined_leads.csv",
|
"data_file": "vendor_1099.csv",
|
||||||
"pipeline_file": "agency_leads_pipeline.json",
|
"pipeline_file": "vendor_1099_pipeline.json",
|
||||||
"cta": "Get DataTools for RevOps — $49 →",
|
"cta": "Get DataTools for Accounting — $49 →",
|
||||||
"landing": "https://datatools.app/revops/",
|
"landing": "https://datatools.app/accounting/",
|
||||||
|
},
|
||||||
|
"ar-aging": {
|
||||||
|
"label": "Accounts receivable — open invoices",
|
||||||
|
"icon": "💵",
|
||||||
|
"h1": "Stop chasing the invoices your aging report counted twice. **Locally.**",
|
||||||
|
"sub": (
|
||||||
|
"Double-entered invoices inflate your AR aging and your "
|
||||||
|
"follow-ups. DataTools standardizes invoice dates, due dates, and "
|
||||||
|
"amounts, lowercases client emails, then removes the duplicate "
|
||||||
|
"invoice numbers — backfilling any blank status from the twin row. "
|
||||||
|
"In this sample: **26 rows → 21, five phantom invoices off the "
|
||||||
|
"books** in one pass."
|
||||||
|
),
|
||||||
|
"data_file": "ar_open_invoices.csv",
|
||||||
|
"pipeline_file": "ar_open_invoices_pipeline.json",
|
||||||
|
"cta": "Get DataTools for Accounting — $49 →",
|
||||||
|
"landing": "https://datatools.app/accounting/",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFAULT_PERSONA = "shopify-pet"
|
DEFAULT_PERSONA = "bookkeeper"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|||||||
505
src/gui/components/pipeline_modules.py
Normal file
505
src/gui/components/pipeline_modules.py
Normal file
@@ -0,0 +1,505 @@
|
|||||||
|
"""Visual pipeline builder — per-step "module" cards + plain-language config panels.
|
||||||
|
|
||||||
|
The Automated Workflows page (``9_Pipeline_Runner.py``) used to configure each
|
||||||
|
step through a raw ``options_json`` text column. This module replaces that with
|
||||||
|
one **module card** per step: a friendly name + caption, an enable toggle,
|
||||||
|
reorder/remove controls, and a **Configure** expander that renders that tool's
|
||||||
|
own controls in plain language (no JSON). Raw JSON survives only as the page's
|
||||||
|
Advanced import/export surface.
|
||||||
|
|
||||||
|
Each config renderer takes the step's current ``options`` dict, renders the
|
||||||
|
curated controls from the design mockup (``layout-review/09_pipeline_runner.html``),
|
||||||
|
and returns an updated **JSON-serialisable** options dict — the same shape the
|
||||||
|
``TOOL_ADAPTERS`` in ``src/core/pipeline.py`` consume via ``Options.from_dict``.
|
||||||
|
|
||||||
|
Two hard Streamlit constraints shaped this:
|
||||||
|
* No nested expanders — the per-step Configure expander means config renderers
|
||||||
|
here must NOT open their own expander, and the page must not wrap the card
|
||||||
|
stack in an outer expander.
|
||||||
|
* Widget identity must be stable across reorder/remove — every widget key is
|
||||||
|
derived from a step's stable ``id``, never its list position.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from src.gui.tools_registry import tool_name
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Adapter-key → registry tool_id bridge
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Pipeline steps are keyed by adapter name (``text_clean``); the tools registry
|
||||||
|
# and i18n packs are keyed by tool_id (``02_text_cleaner``). The registry has no
|
||||||
|
# reverse lookup, so we keep the bridge here. ``step_label`` resolves the
|
||||||
|
# localized friendly name; ``step_caption`` returns a short, plain-English "what
|
||||||
|
# this step does" line for the card body.
|
||||||
|
|
||||||
|
# Bridge from pipeline adapter key (left) to tools-registry tool_id (right).
# Insertion order is kept exactly as originally declared.
PIPELINE_TOOL_META: dict[str, str] = {
    "text_clean":         "02_text_cleaner",
    "format_standardize": "03_format_standardizer",
    "missing":            "04_missing_handler",
    "column_map":         "05_column_mapper",
    "dedup":              "01_deduplicator",
}
|
||||||
|
|
||||||
|
_STEP_CAPTIONS: dict[str, str] = {
|
||||||
|
"text_clean": "Trim spaces, collapse repeats, strip invisible characters.",
|
||||||
|
"format_standardize": "Canonicalize phones, dates, currency, names per column.",
|
||||||
|
"missing": "Flag, fill, or drop blank cells (and disguised blanks).",
|
||||||
|
"column_map": "Rename source columns onto your target column names.",
|
||||||
|
"dedup": "Find duplicate rows and keep one survivor per group.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def step_label(tool: str) -> str:
    """Return the friendly, localized display name for a pipeline adapter key.

    The adapter key is translated to a registry tool_id through
    ``PIPELINE_TOOL_META`` and handed to ``tool_name``. Keys with no bridge
    entry are returned unchanged.
    """
    registry_id = PIPELINE_TOOL_META.get(tool)
    if not registry_id:
        return tool
    return tool_name(registry_id)
|
||||||
|
|
||||||
|
|
||||||
|
def step_caption(tool: str) -> str:
    """Return the short "what this step does" caption, or "" for unknown tools."""
    try:
        return _STEP_CAPTIONS[tool]
    except KeyError:
        return ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Plain-English result phrasing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Each adapter returns a stats dict (see ``TOOL_ADAPTERS`` in
|
||||||
|
# ``src/core/pipeline.py``). ``step_phrase`` turns that dict into the one-line
|
||||||
|
# sentence the mockup shows in the Results table ("312 duplicates removed across
|
||||||
|
# 147 groups …"); ``step_status`` derives the status pill + an optional inline
|
||||||
|
# detail line for steps that warn (e.g. unparseable cells) or error.
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt_cols(cols: list) -> str:
|
||||||
|
"""Join column names for prose: 'name', 'name & city', 'a, b & 2 more'."""
|
||||||
|
cols = [str(c) for c in cols]
|
||||||
|
if not cols:
|
||||||
|
return ""
|
||||||
|
if len(cols) == 1:
|
||||||
|
return cols[0]
|
||||||
|
if len(cols) == 2:
|
||||||
|
return f"{cols[0]} & {cols[1]}"
|
||||||
|
if len(cols) == 3:
|
||||||
|
return f"{cols[0]}, {cols[1]} & {cols[2]}"
|
||||||
|
return f"{cols[0]}, {cols[1]} & {len(cols) - 2} more"
|
||||||
|
|
||||||
|
|
||||||
|
def _in_cols(cols: list) -> str:
    """Return a sentence tail of the form `` in <columns>``, or "" when there are none."""
    joined = _fmt_cols(cols)
    if not joined:
        return ""
    return " in " + joined
|
||||||
|
|
||||||
|
|
||||||
|
def _n(count: int, noun: str) -> str:
|
||||||
|
"""'1 column' / '3 columns' — naive but covers every noun used here."""
|
||||||
|
return f"{count:,} {noun}" if count == 1 else f"{count:,} {noun}s"
|
||||||
|
|
||||||
|
|
||||||
|
def step_phrase(tool: str, summary: dict) -> str:
    """Describe what a step did as a single plain-English line.

    Args:
        tool: Pipeline adapter key (``text_clean``, ``format_standardize``,
            ``missing``, ``column_map`` or ``dedup``).
        summary: The step's stats dict; a falsy value is treated as empty.

    Returns:
        A sentence built from the stats relevant to ``tool``. For an
        unrecognized tool, a comma-separated ``key: value`` dump of the dict.
    """
    stats = summary or {}

    if tool == "text_clean":
        n_changed = stats.get("cells_changed", 0)
        if n_changed:
            head = _n(n_changed, 'cell')
            where = _in_cols(stats.get('columns_processed', []))
            return f"{head} cleaned{where}"
        return "No changes needed."

    if tool == "format_standardize":
        n_changed = stats.get("cells_changed", 0)
        n_bad = stats.get("cells_unparseable", 0)
        if not (n_changed or n_bad):
            return "Nothing to standardize."
        head = _n(n_changed, 'cell')
        where = _in_cols(stats.get('columns_processed', []))
        sentence = f"{head} standardized{where}"
        if n_bad:
            # Unparseable cells are reported as a parenthetical aside.
            return f"{sentence} ({n_bad:,} left unchanged)"
        return sentence

    if tool == "missing":
        clauses: list[str] = []
        if stats.get("cells_filled"):
            clauses.append(f"{_n(stats['cells_filled'], 'cell')} filled")
        if stats.get("rows_dropped"):
            clauses.append(f"{_n(stats['rows_dropped'], 'row')} dropped")
        if stats.get("columns_dropped"):
            clauses.append(f"{_n(len(stats['columns_dropped']), 'column')} dropped")
        # Sentinel standardization is only mentioned when nothing else happened.
        if not clauses and stats.get("sentinels_standardized"):
            clauses.append(f"{_n(stats['sentinels_standardized'], 'blank cell')} flagged")
        if clauses:
            return ", ".join(clauses)
        return "No missing values to handle."

    if tool == "column_map":
        clauses = []
        if stats.get("columns_renamed"):
            clauses.append(f"{_n(stats['columns_renamed'], 'column')} renamed")
        if stats.get("columns_added"):
            clauses.append(f"{_n(len(stats['columns_added']), 'column')} added")
        if stats.get("columns_dropped"):
            clauses.append(f"{_n(len(stats['columns_dropped']), 'column')} dropped")
        if clauses:
            return ", ".join(clauses)
        return "Columns already aligned."

    if tool == "dedup":
        n_removed = stats.get("duplicates_removed", 0)
        if not n_removed:
            return "No duplicates found."
        return (
            f"{_n(n_removed, 'duplicate')} removed across {_n(stats.get('groups', 0), 'group')} "
            f"({stats.get('input_rows', 0):,} → {stats.get('output_rows', 0):,} rows)"
        )

    # Unknown adapter: fall back to a raw listing of whatever stats exist.
    return ", ".join(f"{k}: {v}" for k, v in stats.items())
|
||||||
|
|
||||||
|
|
||||||
|
def step_status(
    tool: str, summary: dict, *, skipped: bool = False, error: Optional[str] = None,
) -> tuple[str, str, str]:
    """Return ``(pill_label, level, detail)`` for a step result.

    ``level`` is one of ``ok`` / ``warn`` / ``error`` / ``skipped``. ``detail``
    is a longer inline explanation for warn/error rows (else "").

    Precedence: a truthy ``error`` wins over ``skipped``, which wins over any
    warning derived from ``summary``.

    Args:
        tool: Tool name of the step (e.g. ``"format_standardize"``).
        summary: The step's summary dict; ``None`` is treated as empty.
        skipped: Whether the step was skipped.
        error: Error text for a failed step, if any.

    Returns:
        A ``(pill_label, level, detail)`` tuple of strings.
    """
    if error:
        # Use the first line with visible text. Taking line 0 blindly returns
        # "" when the message starts with a newline, and an empty detail makes
        # an error row indistinguishable from "nothing to report".
        headline = next(
            (line for line in error.splitlines() if line.strip()),
            "Unknown error",
        )
        return "✗ error", "error", headline
    if skipped:
        return "⏭ skipped", "skipped", ""

    s = summary or {}
    if tool == "format_standardize" and s.get("cells_unparseable"):
        n = s["cells_unparseable"]
        return (
            f"⚠ ok · {n:,} skipped", "warn",
            f"{n:,} values didn't match a known pattern and were left "
            "unchanged. The step still completed — review them in the output "
            "preview if needed.",
        )
    if tool == "column_map":
        fails = s.get("coercion_failures") or {}
        # Only a dict of per-key counts is summed; any other shape counts as 0.
        n_fail = sum(fails.values()) if isinstance(fails, dict) else 0
        missing_req = s.get("missing_required_targets") or []
        # Missing required targets take priority over coercion failures.
        if missing_req:
            return (
                "⚠ ok · missing targets", "warn",
                "Required target columns had no source match: "
                + ", ".join(map(str, missing_req)) + ".",
            )
        if n_fail:
            return (
                f"⚠ ok · {n_fail:,} not coerced", "warn",
                f"{n_fail:,} values couldn't be coerced to their target type "
                "and were left as-is.",
            )

    return "✓ ok", "ok", ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-tool config renderers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
#
|
||||||
|
# Uniform signature: ``render_<tool>_config(df, options, kp) -> options``.
|
||||||
|
# * ``df`` — the uploaded DataFrame (for column lists / type hints).
|
||||||
|
# * ``options`` — the step's current options dict (seed widget defaults).
|
||||||
|
# * ``kp`` — key prefix, unique per step (``f"{tool}_{id}"``).
|
||||||
|
# Returns a JSON-serialisable options dict. Renderers must not open expanders.
|
||||||
|
|
||||||
|
|
||||||
|
_CASE_LABELS: list[tuple[str, Optional[str]]] = [
    ("Leave as-is", None),
    ("UPPERCASE", "upper"),
    ("lowercase", "lower"),
    ("Title Case", "title"),
    ("Sentence case", "sentence"),
]


def render_text_clean_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
    """Render the text-clean Configure panel and return its options dict.

    Draws four checkboxes (each defaulting to on when the option is absent)
    followed by a letter-case selectbox. The returned dict always carries the
    four boolean flags; ``"case"`` is included only when something other than
    "Leave as-is" is selected. ``df`` is accepted for the uniform renderer
    signature and is not read here.
    """
    # (option key, checkbox label, widget-key suffix), in display order.
    flag_specs = (
        ("trim", "Trim leading & trailing whitespace", "trim"),
        ("collapse_whitespace", "Collapse repeated spaces to one", "collapse"),
        ("fold_smart_chars", "Normalize smart quotes & dashes to plain ASCII", "fold"),
        ("strip_zero_width", "Strip zero-width / invisible characters", "zw"),
    )
    out: dict[str, Any] = {}
    for opt_key, caption, suffix in flag_specs:
        out[opt_key] = st.checkbox(
            caption,
            value=bool(options.get(opt_key, True)),
            key=f"{kp}_{suffix}",
        )

    # Seed the selectbox from the stored value; unknown values fall back to
    # the first entry ("Leave as-is").
    stored_case = options.get("case")
    start_at = 0
    for position, (_, candidate) in enumerate(_CASE_LABELS):
        if candidate == stored_case:
            start_at = position
            break
    picked = st.selectbox(
        "Letter case",
        [label for label, _ in _CASE_LABELS],
        index=start_at, key=f"{kp}_case",
    )
    selected_case = next(value for label, value in _CASE_LABELS if label == picked)

    if selected_case is not None:
        out["case"] = selected_case
    return out
|
||||||
|
|
||||||
|
|
||||||
|
_FORMAT_LABELS: list[tuple[str, Optional[str]]] = [
    ("Leave as-is", None),
    ("Date", "date"),
    ("Phone number", "phone"),
    ("Currency", "currency"),
    ("Name", "name"),
    ("Address", "address"),
    ("Email", "email"),
    ("Boolean (yes/no)", "boolean"),
]


def render_format_standardize_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
    """Render one format selectbox per column and return the chosen types.

    Args:
        df: Uploaded DataFrame; its columns drive the list of selectboxes.
        options: Current step options; ``options["column_types"]`` seeds the
            selectbox defaults.
        kp: Key prefix used to build the widget keys.

    Returns:
        ``{"column_types": {str(column): format_value}}`` containing only the
        columns whose selection is not "Leave as-is".
    """
    st.caption(
        "Pick a target format for each column. Columns left as “Leave as-is” "
        "are untouched."
    )
    # ``or {}`` tolerates an explicit null/None value for "column_types".
    current = dict(options.get("column_types") or {})
    labels = [lbl for lbl, _ in _FORMAT_LABELS]
    column_types: dict[str, str] = {}
    for col in df.columns:
        col_name = str(col)
        # Selections are stored under ``str(col)`` below, so the seed must be
        # looked up the same way or a non-string column label never recovers
        # its saved choice. The raw label is still tried first so that
        # existing options keyed by the raw label keep working.
        cur_val = current.get(col, current.get(col_name))
        idx = next((i for i, (_, v) in enumerate(_FORMAT_LABELS) if v == cur_val), 0)
        choice = st.selectbox(
            col_name, labels, index=idx, key=f"{kp}_fmt__{col}",
        )
        val = next(v for lbl, v in _FORMAT_LABELS if lbl == choice)
        if val is not None:
            column_types[col_name] = val
    return {"column_types": column_types}
|
||||||
|
|
||||||
|
|
||||||
|
# Plain-language blank-handling choices → core strategy values. "fill" is a UI
|
||||||
|
# token expanded to numeric median + categorical mode (MissingOptions handles
|
||||||
|
# the per-dtype split via ``categorical_strategy``).
|
||||||
|
_MISSING_CHOICES: list[tuple[str, str]] = [
|
||||||
|
("Flag them (mark blanks, change nothing)", "flag"),
|
||||||
|
("Fill them in (numbers → median, text → most common)", "fill"),
|
||||||
|
("Drop rows that have any blank", "drop"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _missing_mode_from_strategy(strategy: Optional[str]) -> str:
|
||||||
|
if strategy in ("drop_row", "drop_col", "drop_both"):
|
||||||
|
return "drop"
|
||||||
|
if strategy in ("mean", "median", "mode", "constant", "ffill", "bfill", "interpolate"):
|
||||||
|
return "fill"
|
||||||
|
return "flag"
|
||||||
|
|
||||||
|
|
||||||
|
def render_missing_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
    """Render the blank-handling Configure panel and return its options dict.

    Args:
        df: Uploaded DataFrame (part of the uniform renderer signature; not
            read here).
        options: Current step options; ``strategy`` seeds the radio and
            ``sentinels`` seeds the text input.
        kp: Key prefix used to build the widget keys.

    Returns:
        A dict with ``standardize_sentinels`` (always ``True``), ``sentinels``
        (list of non-empty, stripped strings) and ``strategy``. The "fill"
        choice also sets ``categorical_strategy`` to ``"mode"``.
    """
    from src.core.missing import DEFAULT_SENTINELS

    cur_mode = _missing_mode_from_strategy(options.get("strategy"))
    mode_idx = next((i for i, (_, v) in enumerate(_MISSING_CHOICES) if v == cur_mode), 0)
    mode_choice = st.radio(
        "What should happen to blank cells?",
        [lbl for lbl, _ in _MISSING_CHOICES],
        index=mode_idx, key=f"{kp}_strategy",
    )
    mode = next(v for lbl, v in _MISSING_CHOICES if lbl == mode_choice)

    seed_sentinels = options.get("sentinels") or list(DEFAULT_SENTINELS)
    sent_text = st.text_input(
        "Treat these as blank (comma-separated)",
        # Stringify each entry: stored options may hold non-string sentinels
        # (e.g. a number), and ``str.join`` raises TypeError on those.
        value=", ".join(str(item) for item in seed_sentinels),
        key=f"{kp}_sentinels",
        help="Matched case-insensitively after stripping whitespace.",
    )
    sentinels = [s.strip() for s in sent_text.split(",") if s.strip()]

    out: dict[str, Any] = {
        "standardize_sentinels": True,
        "sentinels": sentinels,
    }
    if mode == "flag":
        out["strategy"] = "none"
    elif mode == "fill":
        out["strategy"] = "median"
        out["categorical_strategy"] = "mode"
    else:  # drop
        out["strategy"] = "drop_row"
    return out
|
||||||
|
|
||||||
|
|
||||||
|
_UNMAPPED_CHOICES = ["keep", "drop", "error"]


def render_column_map_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
    """Render the column-rename Configure panel and return its options dict.

    Shows an editable two-column table (source is read-only, target is
    editable), a selectbox for unmapped columns and a type-coercion checkbox.

    Args:
        df: Uploaded DataFrame; its columns populate the ``source`` column.
        options: Current step options; ``mapping``, ``unmapped`` and
            ``coerce_types`` seed the widgets.
        kp: Key prefix used to build the widget keys.

    Returns:
        ``{"mapping": {source: target}, "unmapped": str, "coerce_types": bool}``
        where ``mapping`` only contains rows with a non-blank target.
    """
    st.caption(
        "Type the target name each source column should become. Leave a target "
        "blank to keep that column's name unchanged."
    )
    # ``or {}`` tolerates an explicit null/None value for "mapping".
    current = dict(options.get("mapping") or {})
    table = pd.DataFrame(
        {
            "source": [str(c) for c in df.columns],
            "target": [current.get(str(c), "") for c in df.columns],
        }
    )
    edited = st.data_editor(
        table,
        width="stretch",
        hide_index=True,
        disabled=["source"],
        column_config={
            "source": st.column_config.TextColumn("Source column"),
            "target": st.column_config.TextColumn("Rename to"),
        },
        key=f"{kp}_mapping",
    )

    mapping: dict[str, str] = {}
    for _, row in edited.iterrows():
        target = row.get("target")
        # Treat missing-value markers as blank. NaN is truthy, so without this
        # check it would slip past ``or ""`` and be stored as the literal
        # rename "nan".
        if target is None or pd.isna(target):
            continue
        cleaned = str(target or "").strip()
        if cleaned:
            mapping[str(row["source"])] = cleaned

    c1, c2 = st.columns(2)
    with c1:
        cur_unmapped = options.get("unmapped")
        unmapped = st.selectbox(
            "Columns with no rename",
            _UNMAPPED_CHOICES,
            # Unknown or missing values fall back to the first choice ("keep").
            index=_UNMAPPED_CHOICES.index(cur_unmapped)
            if cur_unmapped in _UNMAPPED_CHOICES else 0,
            key=f"{kp}_unmapped",
            help="keep: leave them in place · drop: remove them · error: stop the run.",
        )
    with c2:
        coerce = st.checkbox(
            "Coerce values to target types",
            value=bool(options.get("coerce_types", False)), key=f"{kp}_coerce",
        )
    return {"mapping": mapping, "unmapped": unmapped, "coerce_types": coerce}
|
||||||
|
|
||||||
|
|
||||||
|
_SURVIVOR_LABELS: list[tuple[str, str]] = [
    ("Keep the most complete row", "most_complete"),
    ("Keep the first seen", "first"),
    ("Keep the last seen", "last"),
    ("Keep the most recent (by date)", "most_recent"),
]


def render_dedup_config(df: pd.DataFrame, options: dict, kp: str) -> dict:
    """Render the duplicate-finder Configure panel and return its options dict.

    Args:
        df: Uploaded DataFrame; its columns populate the match-column and
            date-column pickers.
        options: Current step options; ``survivor_rule``, ``merge``,
            ``strategies`` and ``date_column`` seed the widgets.
        kp: Key prefix used to build the widget keys.

    Returns:
        A dict with ``survivor_rule`` and ``merge``. ``strategies`` (a single
        exact-match strategy over the chosen columns) is added only when match
        columns are selected. ``date_column`` is added only for the
        ``most_recent`` rule, and is ``None`` when the frame has no columns.
    """
    cur_rule = options.get("survivor_rule", "first")
    rule_idx = next((i for i, (_, v) in enumerate(_SURVIVOR_LABELS) if v == cur_rule), 0)
    rule_choice = st.selectbox(
        "When rows match, which one survives?",
        [lbl for lbl, _ in _SURVIVOR_LABELS],
        index=rule_idx, key=f"{kp}_survivor",
    )
    survivor_rule = next(v for lbl, v in _SURVIVOR_LABELS if lbl == rule_choice)

    merge = st.checkbox(
        "Merge matched rows (fill each survivor's blanks from its duplicates)",
        value=bool(options.get("merge", False)), key=f"{kp}_merge",
    )

    # Recover the previously-selected match columns from the stored strategies.
    # A column may appear in more than one stored strategy, so collect each
    # name once (first-seen order) rather than passing repeated defaults to the
    # multiselect. ``or []`` tolerates explicit null values.
    prev_cols: list[str] = []
    for strat in options.get("strategies") or []:
        for spec in strat.get("columns") or []:
            col_name = spec.get("column")
            if col_name and col_name not in prev_cols:
                prev_cols.append(col_name)
    all_cols = [str(c) for c in df.columns]
    match_cols = st.multiselect(
        "Match on these columns",
        all_cols,
        # Drop stored names that no longer exist in the current frame.
        default=[c for c in prev_cols if c in all_cols],
        key=f"{kp}_matchcols",
        help="Rows are duplicates when these columns all match. Leave empty to auto-detect.",
    )

    out: dict[str, Any] = {"survivor_rule": survivor_rule, "merge": merge}
    if match_cols:
        out["strategies"] = [
            {"columns": [
                {"column": c, "algorithm": "exact", "threshold": 100}
                for c in match_cols
            ]}
        ]
    if survivor_rule == "most_recent":
        date_default = options.get("date_column")
        date_idx = all_cols.index(date_default) if date_default in all_cols else 0
        # No selectbox is drawn when there are no columns to choose from.
        out["date_column"] = st.selectbox(
            "Date column (for most-recent)",
            all_cols, index=date_idx, key=f"{kp}_datecol",
        ) if all_cols else None
    return out
|
||||||
|
|
||||||
|
|
||||||
|
# Dispatch table: tool name -> the function that renders that tool's
# Configure panel. Tools without an entry have no renderer.
CONFIG_RENDERERS: dict[str, Callable[[pd.DataFrame, dict, str], dict]] = dict(
    text_clean=render_text_clean_config,
    format_standardize=render_format_standardize_config,
    missing=render_missing_config,
    column_map=render_column_map_config,
    dedup=render_dedup_config,
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Module card
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def render_step_card(
    df: pd.DataFrame, step: dict, idx: int, total: int,
) -> Optional[str]:
    """Draw a single pipeline step as a bordered module card.

    ``step`` is updated in place: its ``enabled`` flag follows the toggle and
    its ``options`` are replaced by whatever the tool's Configure renderer
    returns. The return value is ``"up"``, ``"down"`` or ``"remove"`` when the
    matching control was clicked on this run, otherwise ``None``; applying the
    action to the step list (and rerunning) is left to the caller.
    """
    tool = step["tool"]
    kp = f"{tool}_{step['id']}"
    clicked: Optional[str] = None

    with st.container(border=True):
        head, toggle, up, down, rm = st.columns([0.66, 0.12, 0.07, 0.07, 0.08])
        with head:
            st.markdown(f"**{idx + 1}. {step_label(step['tool'])}**")
            st.caption(step_caption(step["tool"]))
        with toggle:
            step["enabled"] = st.toggle(
                "On", value=step.get("enabled", True), key=f"{kp}_enabled",
                help="Disabled steps are kept in the pipeline but skipped at run time.",
            )

        # (column, glyph, key suffix, action, disabled?, tooltip), left to
        # right. The first card cannot move up and the last cannot move down.
        controls = (
            (up, "▲", "up", "up", idx == 0, "Move up"),
            (down, "▼", "down", "down", idx == total - 1, "Move down"),
            (rm, "✕", "rm", "remove", False, "Remove step"),
        )
        for column, glyph, suffix, action_name, is_disabled, tip in controls:
            with column:
                if st.button(glyph, key=f"{kp}_{suffix}", disabled=is_disabled,
                             help=tip, width="stretch"):
                    clicked = action_name

        renderer = CONFIG_RENDERERS.get(step["tool"])
        with st.expander(f"Configure: {step_label(step['tool'])}"):
            if renderer is not None:
                step["options"] = renderer(df, step.get("options", {}) or {}, kp)
            else:
                st.caption("This step has no options.")

    return clicked
|
||||||
@@ -32,6 +32,12 @@ from src.core.pipeline import (
|
|||||||
run_pipeline,
|
run_pipeline,
|
||||||
validate_pipeline,
|
validate_pipeline,
|
||||||
)
|
)
|
||||||
|
from src.gui.components.pipeline_modules import (
|
||||||
|
render_step_card,
|
||||||
|
step_label,
|
||||||
|
step_phrase,
|
||||||
|
step_status,
|
||||||
|
)
|
||||||
from src.license import FeatureFlag
|
from src.license import FeatureFlag
|
||||||
|
|
||||||
hide_streamlit_chrome()
|
hide_streamlit_chrome()
|
||||||
@@ -104,135 +110,186 @@ st.divider()
|
|||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Pipeline builder
|
# Pipeline builder — visual module cards
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
#
|
#
|
||||||
# Wrapped in an outer expander whose default state mirrors the preview
|
# Each step is a "module" card (src/gui/components/pipeline_modules.py) with a
|
||||||
# expander above: open before a result exists, folded once the user has
|
# plain-language Configure panel — no raw JSON. Steps live in session state as
|
||||||
# clicked Run Pipeline. The pipeline editor is this page's "Options"
|
# an ordered list of dicts, each carrying a STABLE integer id so widget keys
|
||||||
# section — structurally analogous to Text Cleaner's options block.
|
# survive reorder/remove. Raw JSON is import/export only, under Advanced.
|
||||||
|
#
|
||||||
|
# NB: the builder is NOT wrapped in an outer expander — per-step Configure
|
||||||
|
# panels are expanders, and Streamlit forbids nesting expanders.
|
||||||
|
|
||||||
with st.expander("Options", expanded=not _has_result):
|
|
||||||
mode = st.radio(
|
def _seed_steps_from(pipeline) -> None:
|
||||||
"How would you like to define the pipeline?",
|
"""Replace the session step list from a Pipeline, assigning fresh ids."""
|
||||||
[
|
seq = st.session_state.get("pipeline_step_seq", 0)
|
||||||
"Use the recommended default (text-clean → format → missing → dedup)",
|
steps: list[dict] = []
|
||||||
"Build interactively",
|
for s in pipeline.steps:
|
||||||
"Import a saved pipeline JSON",
|
steps.append({
|
||||||
],
|
"id": seq, "tool": s.tool,
|
||||||
index=0,
|
"enabled": s.enabled, "options": dict(s.options),
|
||||||
|
})
|
||||||
|
seq += 1
|
||||||
|
st.session_state["pipeline_steps"] = steps
|
||||||
|
st.session_state["pipeline_step_seq"] = seq
|
||||||
|
|
||||||
|
|
||||||
|
if "pipeline_steps" not in st.session_state:
|
||||||
|
_seed_steps_from(recommended_pipeline())
|
||||||
|
|
||||||
|
st.subheader("Build your pipeline")
|
||||||
|
|
||||||
|
mode = st.radio(
|
||||||
|
"How would you like to define the pipeline?",
|
||||||
|
[
|
||||||
|
"Use the recommended default (Clean Text → Standardize → Fix Missing → Find Duplicates)",
|
||||||
|
"Build interactively",
|
||||||
|
"Import a saved pipeline JSON",
|
||||||
|
],
|
||||||
|
index=0,
|
||||||
|
key="pipeline_mode",
|
||||||
|
)
|
||||||
|
|
||||||
|
if mode.startswith("Use the recommended"):
|
||||||
|
# Only reseed on an explicit click that lands here while the steps already
|
||||||
|
# diverge — otherwise every rerun would wipe edits. We detect "user just
|
||||||
|
# selected this mode" by comparing against the recommended default and
|
||||||
|
# offering a one-click restore rather than silently discarding.
|
||||||
|
rec_dict = recommended_pipeline().to_dict()
|
||||||
|
cur_dict = {
|
||||||
|
"steps": [
|
||||||
|
{"tool": s["tool"], "options": s["options"],
|
||||||
|
"enabled": s["enabled"], "name": None}
|
||||||
|
for s in st.session_state["pipeline_steps"]
|
||||||
|
]
|
||||||
|
}
|
||||||
|
if cur_dict != rec_dict:
|
||||||
|
st.info(
|
||||||
|
"You've edited the recommended steps, so they're now yours to "
|
||||||
|
"change — you're effectively in **Build interactively** mode. "
|
||||||
|
"Restore the suggested steps to discard your edits."
|
||||||
|
)
|
||||||
|
if st.button("↺ Restore recommended steps"):
|
||||||
|
_seed_steps_from(recommended_pipeline())
|
||||||
|
st.rerun()
|
||||||
|
elif mode.startswith("Import"):
|
||||||
|
pipeline_file = st.file_uploader(
|
||||||
|
"Pipeline JSON", type=["json"], key="pipeline_upload",
|
||||||
|
)
|
||||||
|
if pipeline_file is not None:
|
||||||
|
try:
|
||||||
|
data = json.loads(pipeline_file.getvalue())
|
||||||
|
_seed_steps_from(Pipeline.from_dict(data))
|
||||||
|
st.success(
|
||||||
|
f"Loaded {len(st.session_state['pipeline_steps'])} step(s). "
|
||||||
|
"Switch to **Build interactively** to tweak them."
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
from src.core.errors import format_for_user
|
||||||
|
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
||||||
|
|
||||||
|
st.caption(
|
||||||
|
"Each step is a module: toggle it on/off, reorder with ▲ ▼, remove with ✕, "
|
||||||
|
"and open **Configure** to set its options in plain language. Tool order is "
|
||||||
|
"recommended, not enforced — violations surface as warnings below."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Render the module stack. A reorder/remove action mutates the list and reruns.
|
||||||
|
steps = st.session_state["pipeline_steps"]
|
||||||
|
total = len(steps)
|
||||||
|
pending_action: tuple[str, int] | None = None
|
||||||
|
for i, step in enumerate(steps):
|
||||||
|
act = render_step_card(df, step, i, total)
|
||||||
|
if act is not None:
|
||||||
|
pending_action = (act, i)
|
||||||
|
|
||||||
|
if pending_action is not None:
|
||||||
|
act, i = pending_action
|
||||||
|
if act == "remove":
|
||||||
|
steps.pop(i)
|
||||||
|
elif act == "up" and i > 0:
|
||||||
|
steps[i - 1], steps[i] = steps[i], steps[i - 1]
|
||||||
|
elif act == "down" and i < total - 1:
|
||||||
|
steps[i + 1], steps[i] = steps[i], steps[i + 1]
|
||||||
|
st.session_state["pipeline_steps"] = steps
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Add-step control.
|
||||||
|
add_col, btn_col = st.columns([0.7, 0.3])
|
||||||
|
with add_col:
|
||||||
|
add_tool = st.selectbox(
|
||||||
|
"Add a step",
|
||||||
|
TOOL_NAMES,
|
||||||
|
format_func=step_label,
|
||||||
|
key="pipeline_add_tool",
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
with btn_col:
|
||||||
|
if st.button("➕ Add step", width="stretch"):
|
||||||
|
seq = st.session_state.get("pipeline_step_seq", 0)
|
||||||
|
steps.append({"id": seq, "tool": add_tool, "enabled": True, "options": {}})
|
||||||
|
st.session_state["pipeline_step_seq"] = seq + 1
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Build a Pipeline object from the step list.
|
||||||
|
steps_list: list[Step] = []
|
||||||
|
parse_errors: list[str] = []
|
||||||
|
for i, step in enumerate(steps):
|
||||||
|
try:
|
||||||
|
steps_list.append(Step(
|
||||||
|
tool=str(step["tool"]),
|
||||||
|
options=dict(step.get("options") or {}),
|
||||||
|
enabled=bool(step.get("enabled", True)),
|
||||||
|
))
|
||||||
|
except Exception as e:
|
||||||
|
parse_errors.append(f"Step {i + 1} ({step.get('tool')}): {e}")
|
||||||
|
|
||||||
|
for err in parse_errors:
|
||||||
|
st.error(err)
|
||||||
|
|
||||||
|
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
||||||
|
|
||||||
|
if current_pipeline is not None:
|
||||||
|
warnings = validate_pipeline(current_pipeline)
|
||||||
|
if warnings:
|
||||||
|
st.warning(
|
||||||
|
"Pipeline is out of recommended order:\n\n"
|
||||||
|
+ "\n".join(f"- {w}" for w in warnings)
|
||||||
|
+ "\n\nThe pipeline will still run — these are recommendations only."
|
||||||
|
)
|
||||||
|
|
||||||
|
with st.expander("Recommended tool order — why each step belongs where it does"):
|
||||||
|
st.markdown(
|
||||||
|
"\n".join(
|
||||||
|
f"- **{step_label(e)}** before **{step_label(l)}** — {why}"
|
||||||
|
for e, l, why in SOFT_DEPENDENCIES
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
if "pipeline_rows" not in st.session_state:
|
with st.expander("Advanced — import / export pipeline as JSON"):
|
||||||
default = recommended_pipeline()
|
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in default.steps
|
|
||||||
])
|
|
||||||
|
|
||||||
if mode.startswith("Use the recommended"):
|
|
||||||
default = recommended_pipeline()
|
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in default.steps
|
|
||||||
])
|
|
||||||
elif mode.startswith("Import"):
|
|
||||||
pipeline_file = st.file_uploader(
|
|
||||||
"Pipeline JSON", type=["json"], key="pipeline_upload",
|
|
||||||
)
|
|
||||||
if pipeline_file is not None:
|
|
||||||
try:
|
|
||||||
data = json.loads(pipeline_file.getvalue())
|
|
||||||
uploaded_pipe = Pipeline.from_dict(data)
|
|
||||||
st.session_state["pipeline_rows"] = pd.DataFrame([
|
|
||||||
{
|
|
||||||
"tool": s.tool, "enabled": s.enabled,
|
|
||||||
"options_json": json.dumps(s.options),
|
|
||||||
}
|
|
||||||
for s in uploaded_pipe.steps
|
|
||||||
])
|
|
||||||
st.success(f"Loaded {len(uploaded_pipe.steps)} step(s).")
|
|
||||||
except Exception as e:
|
|
||||||
from src.core.errors import format_for_user
|
|
||||||
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
|
||||||
|
|
||||||
st.caption(
|
st.caption(
|
||||||
"Edit the table to add, remove, reorder (drag the row index), enable, "
|
"For sharing or version control. Editing is done in the step panels "
|
||||||
"or configure each step. Tool order is recommended, not enforced — "
|
"above — this is just the saved form of the same settings. The same "
|
||||||
"violations surface as warnings below the table."
|
"JSON runs in the CLI via `--pipeline pipeline.json`."
|
||||||
)
|
)
|
||||||
edited = st.data_editor(
|
export_json = json.dumps(
|
||||||
st.session_state["pipeline_rows"],
|
current_pipeline.to_dict() if current_pipeline else {"steps": []},
|
||||||
width="stretch",
|
indent=2, default=str,
|
||||||
num_rows="dynamic",
|
|
||||||
column_config={
|
|
||||||
"tool": st.column_config.SelectboxColumn(
|
|
||||||
"Tool", options=TOOL_NAMES, required=True,
|
|
||||||
),
|
|
||||||
"enabled": st.column_config.CheckboxColumn("Enabled"),
|
|
||||||
"options_json": st.column_config.TextColumn(
|
|
||||||
"Options (JSON)",
|
|
||||||
help='e.g. {"column_types": {"phone": "phone"}}',
|
|
||||||
),
|
|
||||||
},
|
|
||||||
key="pipeline_editor",
|
|
||||||
)
|
)
|
||||||
st.session_state["pipeline_rows"] = edited
|
st.code(export_json, language="json")
|
||||||
|
adv_paste = st.text_area(
|
||||||
# Build a Pipeline object from the editor state.
|
"Paste pipeline JSON to load it", key="pipeline_json_paste", height=140,
|
||||||
steps_list: list[Step] = []
|
)
|
||||||
parse_errors: list[str] = []
|
if st.button("Load pasted JSON", disabled=not adv_paste.strip()):
|
||||||
for i, row in edited.iterrows():
|
|
||||||
tool = row.get("tool")
|
|
||||||
if not tool or pd.isna(tool):
|
|
||||||
continue
|
|
||||||
raw_opts = row.get("options_json") or "{}"
|
|
||||||
if pd.isna(raw_opts):
|
|
||||||
raw_opts = "{}"
|
|
||||||
try:
|
try:
|
||||||
opts = json.loads(raw_opts) if isinstance(raw_opts, str) else dict(raw_opts)
|
_seed_steps_from(Pipeline.from_dict(json.loads(adv_paste)))
|
||||||
if not isinstance(opts, dict):
|
st.success("Loaded. Scroll up to see the steps.")
|
||||||
raise ValueError("options must be a JSON object")
|
st.rerun()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
parse_errors.append(f"Step {i + 1}: {e}")
|
from src.core.errors import format_for_user
|
||||||
continue
|
st.error(f"**Could not parse pipeline**\n\n```\n{format_for_user(e)}\n```")
|
||||||
try:
|
|
||||||
steps_list.append(Step(
|
|
||||||
tool=str(tool),
|
|
||||||
options=opts,
|
|
||||||
enabled=bool(row.get("enabled", True)),
|
|
||||||
))
|
|
||||||
except Exception as e:
|
|
||||||
parse_errors.append(f"Step {i + 1}: {e}")
|
|
||||||
|
|
||||||
if parse_errors:
|
|
||||||
for err in parse_errors:
|
|
||||||
st.error(err)
|
|
||||||
|
|
||||||
current_pipeline = Pipeline(steps=steps_list) if steps_list else None
|
|
||||||
|
|
||||||
if current_pipeline is not None:
|
|
||||||
warnings = validate_pipeline(current_pipeline)
|
|
||||||
if warnings:
|
|
||||||
st.warning(
|
|
||||||
"Pipeline is out of recommended order:\n\n"
|
|
||||||
+ "\n".join(f"- {w}" for w in warnings)
|
|
||||||
+ "\n\nThe pipeline will still run — these are recommendations only."
|
|
||||||
)
|
|
||||||
|
|
||||||
with st.expander("Recommended tool order — why each step belongs where it does"):
|
|
||||||
st.markdown(
|
|
||||||
"\n".join(
|
|
||||||
f"- **{e}** before **{l}** — {why}"
|
|
||||||
for e, l, why in SOFT_DEPENDENCIES
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
st.divider()
|
st.divider()
|
||||||
|
|
||||||
@@ -257,14 +314,14 @@ if st.button(
|
|||||||
def _on_step(sr) -> None:
|
def _on_step(sr) -> None:
|
||||||
completed[0] += 1
|
completed[0] += 1
|
||||||
if sr.skipped:
|
if sr.skipped:
|
||||||
log_lines.append(f"○ {sr.step.display_name()} (skipped)")
|
log_lines.append(f"○ {step_label(sr.step.tool)} (skipped)")
|
||||||
elif sr.error:
|
elif sr.error:
|
||||||
log_lines.append(
|
log_lines.append(
|
||||||
f"✗ {sr.step.display_name()} — {sr.error.splitlines()[0]}"
|
f"✗ {step_label(sr.step.tool)} — {sr.error.splitlines()[0]}"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
log_lines.append(
|
log_lines.append(
|
||||||
f"✓ {sr.step.display_name()} — {sr.elapsed_seconds*1000:.0f} ms"
|
f"✓ {step_label(sr.step.tool)} — {sr.elapsed_seconds*1000:.0f} ms"
|
||||||
)
|
)
|
||||||
log_box.markdown("\n".join(log_lines))
|
log_box.markdown("\n".join(log_lines))
|
||||||
progress.progress(
|
progress.progress(
|
||||||
@@ -328,22 +385,38 @@ m3.metric("Steps run", sum(1 for s in result.step_results if not s.skipped))
|
|||||||
m4.metric("Elapsed", f"{result.total_elapsed:.2f} s")
|
m4.metric("Elapsed", f"{result.total_elapsed:.2f} s")
|
||||||
|
|
||||||
st.markdown("**Per-step summary**")
|
st.markdown("**Per-step summary**")
|
||||||
|
# Plain-English status pill + summary phrase per step (mockup §Results). The
|
||||||
|
# at-a-glance table stays scannable; any warn/error step also gets an inline
|
||||||
|
# detail callout directly below it, so a non-fatal issue surfaces in context
|
||||||
|
# without a dedicated always-empty column.
|
||||||
step_df = pd.DataFrame([
|
step_df = pd.DataFrame([
|
||||||
{
|
{
|
||||||
"step": sr.step.display_name(),
|
"step": step_label(sr.step.tool),
|
||||||
"status": (
|
"status": step_status(
|
||||||
"skipped" if sr.skipped
|
sr.step.tool, sr.summary, skipped=sr.skipped, error=sr.error,
|
||||||
else "error" if sr.error
|
)[0],
|
||||||
else "ok"
|
"elapsed": f"{int(sr.elapsed_seconds * 1000)} ms",
|
||||||
|
"summary": (
|
||||||
|
"—" if sr.skipped
|
||||||
|
else step_phrase(sr.step.tool, sr.summary)
|
||||||
),
|
),
|
||||||
"elapsed_ms": int(sr.elapsed_seconds * 1000),
|
|
||||||
"summary": json.dumps(sr.summary, default=str)[:200],
|
|
||||||
"error": sr.error or "",
|
|
||||||
}
|
}
|
||||||
for sr in result.step_results
|
for sr in result.step_results
|
||||||
])
|
])
|
||||||
st.dataframe(step_df, width="stretch", hide_index=True)
|
st.dataframe(step_df, width="stretch", hide_index=True)
|
||||||
|
|
||||||
|
for sr in result.step_results:
|
||||||
|
_label, level, detail = step_status(
|
||||||
|
sr.step.tool, sr.summary, skipped=sr.skipped, error=sr.error,
|
||||||
|
)
|
||||||
|
if not detail:
|
||||||
|
continue
|
||||||
|
name = step_label(sr.step.tool)
|
||||||
|
if level == "error":
|
||||||
|
st.error(f"**{name}** — {detail}")
|
||||||
|
else:
|
||||||
|
st.warning(f"**{name}** — {detail}")
|
||||||
|
|
||||||
st.markdown("**Output preview (first 10 rows)**")
|
st.markdown("**Output preview (first 10 rows)**")
|
||||||
st.dataframe(result.final_df.head(10), width="stretch")
|
st.dataframe(result.final_df.head(10), width="stretch")
|
||||||
|
|
||||||
|
|||||||
116
tests/gui/test_app_demo.py
Normal file
116
tests/gui/test_app_demo.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
"""Public demo app (``src/gui/app_demo.py``) behavior — AppTest.
|
||||||
|
|
||||||
|
The demo app is the marketing surface: it preloads one accounting persona's
|
||||||
|
dataset, runs the saved pipeline, and shows BEFORE/AFTER + a buy CTA. These
|
||||||
|
tests pin that every persona renders, the run produces its headline value,
|
||||||
|
persona switching works, and the buy path is present — so a regression can't
|
||||||
|
silently ship a broken or empty demo to a prospect.
|
||||||
|
|
||||||
|
The dataset value numbers themselves are pinned separately in
|
||||||
|
``tests/test_demo_pipelines.py``; here we assert the *app* surfaces them.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
from streamlit.testing.v1 import AppTest
|
||||||
|
|
||||||
|
_PAGE = str(
|
||||||
|
Path(__file__).resolve().parent.parent.parent / "src" / "gui" / "app_demo.py"
|
||||||
|
)
|
||||||
|
_DEMO = Path(__file__).resolve().parent.parent.parent / "samples" / "demo"
|
||||||
|
|
||||||
|
# (persona key, data file, expected rows before -> after, a label substring)
|
||||||
|
_PERSONAS = [
|
||||||
|
("bookkeeper", "bank_reconciliation.csv", 26, 20, "Bookkeeper"),
|
||||||
|
("ap-1099", "vendor_1099.csv", 24, 8, "payable"),
|
||||||
|
("ar-aging", "ar_open_invoices.csv", 26, 21, "receivable"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _app(persona: str | None = None) -> AppTest:
    """Load the demo page and run it once.

    When *persona* is given it is set as the ``p`` query parameter before
    the run; otherwise the page runs with no query parameters set here.
    """
    harness = AppTest.from_file(_PAGE, default_timeout=60)
    if persona is None:
        return harness.run()
    harness.query_params["p"] = persona
    return harness.run()
|
||||||
|
|
||||||
|
|
||||||
|
def _md(at: AppTest) -> str:
|
||||||
|
return " ".join(m.value for m in at.markdown)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("key,data_file,before,after,label", _PERSONAS)
def test_persona_renders_with_its_dataset(key, data_file, before, after, label):
    """Each persona renders its label, BEFORE row count, saved steps and a run button."""
    at = _app(key)
    assert not at.exception
    rendered = _md(at)
    assert label in rendered, f"persona label {label!r} not rendered"

    # The expected size in ``_PERSONAS`` must equal the CSV on disk, so silent
    # data drift fails here rather than in the header check below.
    source = pd.read_csv(_DEMO / data_file, dtype=str, keep_default_na=False)
    real_rows = len(source)
    assert real_rows == before
    assert f"BEFORE — {before} rows" in rendered

    # The saved pipeline's step names appear in the rendered markdown.
    for tool in ("text_clean", "dedup"):
        assert tool in rendered
    run_buttons = [b for b in at.button if "Run pipeline" in b.label]
    assert run_buttons
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_persona_is_bookkeeper():
    """With no persona query parameter the page renders the Bookkeeper persona."""
    at = _app(None)
    assert not at.exception
    rendered = _md(at)
    assert "Bookkeeper" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_unknown_persona_falls_back_to_default():
    """An unrecognized persona key renders the Bookkeeper persona without raising."""
    at = _app("not-a-real-persona")
    assert not at.exception
    rendered = _md(at)
    assert "Bookkeeper" in rendered
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("key,data_file,before,after,label", _PERSONAS)
def test_run_shows_after_value_and_buy_path(key, data_file, before, after, label):
    """Running a persona's pipeline shows the row reduction, a download and the buy link."""
    at = _app(key)
    run_buttons = [b for b in at.button if "Run pipeline" in b.label]
    run_buttons[0].click().run()
    assert not at.exception, at.exception

    # The run result is stored in session state and the header reports the
    # before → after row counts.
    assert "demo_result" in at.session_state
    result = at.session_state["demo_result"]
    assert len(result.final_df) == after
    assert result.final_rows < result.initial_rows
    rendered = _md(at)
    assert f"{before} → {after} rows" in rendered

    # The buy path after a run: the cleaned-CSV download (a download_button,
    # not a plain button) plus the Gumroad link tagged with the persona key.
    download_labels = [d.label for d in at.get("download_button")]
    assert any("Download cleaned CSV" in text for text in download_labels)
    assert f"gumroad.com/l/datatools?from={key}" in _md(at)
|
||||||
|
|
||||||
|
|
||||||
|
def test_persona_switch_clears_stale_result():
    """Switching persona via the dropdown drops the previous persona's run result."""
    # The switch is driven through the ``persona_switch`` selectbox because a
    # raw query-param change is overridden by the dropdown's persisted value.
    at = _app("bookkeeper")
    run_buttons = [b for b in at.button if "Run pipeline" in b.label]
    run_buttons[0].click().run()
    assert "demo_result" in at.session_state

    switches = [s for s in at.selectbox if s.key == "persona_switch"]
    persona_switch = switches[0]
    persona_switch.set_value("ap-1099").run()
    assert not at.exception

    # The stale bookkeeper result must be gone, so the visitor never sees the
    # wrong dataset's AFTER block.
    assert "demo_result" not in at.session_state
    rendered = _md(at)
    assert "payable" in rendered  # now showing the AP/1099 persona
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_offers_a_watermarked_download():
    """After a run the visitor gets a download, labeled as watermarked
    (the free/paid boundary from DEMO-PLAN §6)."""
    at = _app("bookkeeper")
    [b for b in at.button if "Run pipeline" in b.label][0].click().run()
    # Fail on an app exception first: otherwise a crashed run would be
    # misreported below as "no cleaned-CSV download after a run".
    assert not at.exception, at.exception
    dl = [d for d in at.get("download_button") if "Download cleaned CSV" in d.label]
    assert dl, "no cleaned-CSV download after a run"
    assert "watermark" in dl[0].label.lower()
|
||||||
281
tests/gui/test_pipeline_builder.py
Normal file
281
tests/gui/test_pipeline_builder.py
Normal file
@@ -0,0 +1,281 @@
|
|||||||
|
"""Pipeline Runner — visual module-card builder contract (AppTest).
|
||||||
|
|
||||||
|
Pins the behaviors the JSON-table → module-card rewrite introduced:
|
||||||
|
recommended steps seed as cards with friendly names, each step exposes a
|
||||||
|
plain-language Configure panel (no raw per-row JSON), steps can be toggled /
|
||||||
|
added / removed, JSON lives only under Advanced, and a run produces results
|
||||||
|
with friendly step names. The page's bare initial-render contract across junk
|
||||||
|
files is covered separately in ``tests/test_junk_corpus_tool_pages.py``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from streamlit.testing.v1 import AppTest
|
||||||
|
|
||||||
|
_PAGE = (
|
||||||
|
Path(__file__).resolve().parent.parent.parent
|
||||||
|
/ "src" / "gui" / "pages" / "9_Pipeline_Runner.py"
|
||||||
|
)
|
||||||
|
|
||||||
|
_CSV = (
|
||||||
|
b"name,email,phone,signup_date\n"
|
||||||
|
b" Jane Doe ,jane@acme.io,512-555-0190,2024-01-04\n"
|
||||||
|
b"jane doe,JANE@ACME.IO,(512) 555-0190,01/04/2024\n"
|
||||||
|
b"Bob Smith,bob@globex.com,720.555.7781,2024-02-11\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _app() -> AppTest:
    """Run the Pipeline Runner page once with ``_CSV`` pre-seeded as the upload."""
    harness = AppTest.from_file(str(_PAGE), default_timeout=30)
    seeded_upload = {
        "home_uploaded_bytes": _CSV,
        "home_uploaded_name": "customers.csv",
        "home_uploaded_size": len(_CSV),
    }
    for state_key, value in seeded_upload.items():
        harness.session_state[state_key] = value
    return harness.run()
|
||||||
|
|
||||||
|
|
||||||
|
def test_recommended_steps_seed_as_named_cards():
    """The initial render seeds the four default steps and shows their friendly names."""
    at = _app()
    assert not at.exception
    seeded = []
    for step in at.session_state["pipeline_steps"]:
        seeded.append(step["tool"])
    assert seeded == ["text_clean", "format_standardize", "missing", "dedup"]

    rendered = " ".join(m.value for m in at.markdown)
    friendly_names = (
        "Clean Text",
        "Standardize Formats",
        "Fix Missing Values",
        "Find Duplicates",
    )
    for friendly in friendly_names:
        assert friendly in rendered
|
||||||
|
|
||||||
|
|
||||||
|
def test_each_step_has_a_configure_panel_and_json_is_advanced_only():
    """Configure expanders exist for steps and the import/export expander is present."""
    at = _app()
    # Surface a render crash as itself rather than as a missing-label failure.
    assert not at.exception, at.exception
    labels = [e.label for e in at.get("expander")]
    assert any(label.startswith("Configure: Clean Text") for label in labels)
    assert any(label.startswith("Configure: Find Duplicates") for label in labels)
    # Raw JSON is import/export only — never a per-step editing surface.
    assert any("Advanced — import / export" in label for label in labels)
|
||||||
|
|
||||||
|
|
||||||
|
def test_toggle_disables_step_and_persists():
    """Switching the first toggle off stores ``enabled = False`` on the first step."""
    at = _app()
    first_toggle = at.toggle[0]
    first_toggle.set_value(False).run()
    first_step = at.session_state["pipeline_steps"][0]
    assert first_step["enabled"] is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_step_appends_a_working_config_panel():
    """Adding ``column_map`` appends it as the last step with a Configure expander."""
    at = _app()
    tool_pickers = [s for s in at.selectbox if s.key == "pipeline_add_tool"]
    tool_pickers[0].set_value("column_map").run()
    add_buttons = [b for b in at.button if "Add step" in b.label]
    add_buttons[0].click().run()
    assert not at.exception

    last_step = at.session_state["pipeline_steps"][-1]
    assert last_step["tool"] == "column_map"
    expander_labels = [e.label for e in at.get("expander")]
    assert any(
        text.startswith("Configure: Map Columns") for text in expander_labels
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_step_drops_it():
    """Clicking the first ✕ button shrinks the step list by exactly one."""
    at = _app()
    count_before = len(at.session_state["pipeline_steps"])
    # The first ✕ remove button in the card stack.
    remove_buttons = [b for b in at.button if b.label == "✕"]
    remove_buttons[0].click().run()
    assert not at.exception
    count_after = len(at.session_state["pipeline_steps"])
    assert count_after == count_before - 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_produces_results_with_friendly_names():
    """Running the seeded pipeline stores a result: 3 rows in, 2 out, no step errors."""
    at = _app()
    run_buttons = [b for b in at.button if b.label == "Run Pipeline"]
    run_buttons[0].click().run()
    assert not at.exception, at.exception
    assert "pipeline_result" in at.session_state

    res = at.session_state["pipeline_result"]
    # the two Jane rows merge
    assert res.initial_rows == 3
    assert res.final_rows == 2
    for step_result in res.step_results:
        assert step_result.error is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_is_plain_english_not_json():
    """``step_phrase`` / ``step_status`` produce readable prose and pill levels."""
    from src.gui.components.pipeline_modules import step_phrase, step_status

    # dedup phrasing mirrors the design mockup wording exactly.
    dedup_summary = {
        "input_rows": 18442,
        "output_rows": 18130,
        "duplicates_removed": 312,
        "groups": 147,
    }
    phrase = step_phrase("dedup", dedup_summary)
    assert phrase == "312 duplicates removed across 147 groups (18,442 → 18,130 rows)"

    # text_clean lists affected columns in prose, with thousands separators.
    text_clean_summary = {
        "cells_changed": 1204,
        "columns_processed": ["name", "city"],
    }
    assert step_phrase("text_clean", text_clean_summary) == "1,204 cells cleaned in name & city"

    # singular nouns pluralize correctly
    missing_summary = {"rows_dropped": 1, "columns_dropped": ["x"]}
    assert step_phrase("missing", missing_summary) == "1 row dropped, 1 column dropped"

    # unparseable cells downgrade the pill to warn with an inline detail
    label, level, detail = step_status(
        "format_standardize",
        {"cells_changed": 100, "cells_unparseable": 141},
    )
    assert level == "warn"
    assert "141 skipped" in label
    assert detail

    # a clean step is "ok" with no detail
    clean_status = step_status("text_clean", {"cells_changed": 5})
    assert clean_status[1] == "ok"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers for the reorder / config tests below
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _ids(at) -> dict:
|
||||||
|
"""Map tool name → that step's stable id (assumes unique tools)."""
|
||||||
|
return {s["tool"]: s["id"] for s in at.session_state["pipeline_steps"]}
|
||||||
|
|
||||||
|
|
||||||
|
def _tools(at) -> list:
|
||||||
|
return [s["tool"] for s in at.session_state["pipeline_steps"]]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Reorder
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_reorder_down_swaps_with_next_step():
    """Clicking a step's down button swaps it with the step after it."""
    at = _app()
    sid = _ids(at)["text_clean"]
    initial_order = _tools(at)
    assert initial_order == ["text_clean", "format_standardize", "missing", "dedup"]

    down_key = f"text_clean_{sid}_down"
    down_buttons = [b for b in at.button if b.key == down_key]
    down_buttons[0].click().run()
    assert not at.exception
    assert _tools(at) == ["format_standardize", "text_clean", "missing", "dedup"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_reorder_up_swaps_with_previous_step():
    """Clicking a step's up button swaps it with the step before it."""
    at = _app()
    sid = _ids(at)["missing"]
    up_key = f"missing_{sid}_up"
    up_buttons = [b for b in at.button if b.key == up_key]
    up_buttons[0].click().run()
    assert not at.exception
    assert _tools(at) == ["text_clean", "missing", "format_standardize", "dedup"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_first_up_and_last_down_buttons_are_disabled():
    """The first step's up and last step's down buttons are disabled; interior ones are not."""
    at = _app()
    ids = _ids(at)

    def button(key):
        # First button whose widget key equals *key*.
        return [b for b in at.button if b.key == key][0]

    first_up = button(f"text_clean_{ids['text_clean']}_up")
    last_down = button(f"dedup_{ids['dedup']}_down")
    assert first_up.disabled is True
    assert last_down.disabled is True

    # interior steps are freely movable
    mid_up = button(f"missing_{ids['missing']}_up")
    assert mid_up.disabled is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_disabled_step_stays_disabled_after_reorder():
    """A step that was toggled off keeps ``enabled = False`` after being moved."""
    at = _app()
    sid = _ids(at)["text_clean"]
    at.toggle[0].set_value(False).run()
    first_step = at.session_state["pipeline_steps"][0]
    assert first_step["enabled"] is False

    # move the now-disabled first step down one slot
    down_key = f"text_clean_{sid}_down"
    down_buttons = [b for b in at.button if b.key == down_key]
    down_buttons[0].click().run()
    assert not at.exception

    steps = at.session_state["pipeline_steps"]
    matches = [s for s in steps if s["tool"] == "text_clean"]
    moved = matches[0]
    assert steps.index(moved) == 1  # it moved
    assert moved["enabled"] is False  # ...and stayed disabled
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Restore recommended steps
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_restore_recommended_steps_button():
    """After a step is removed, the restore button brings back the default four steps."""
    at = _app()
    # Diverge from the recommended default by removing a step.
    remove_buttons = [b for b in at.button if b.label == "✕"]
    remove_buttons[0].click().run()
    assert _tools(at) == ["format_standardize", "missing", "dedup"]

    restore = [b for b in at.button if "Restore recommended steps" in b.label]
    assert len(restore) == 1
    only_restore = restore[0]
    only_restore.click().run()
    assert not at.exception
    assert _tools(at) == ["text_clean", "format_standardize", "missing", "dedup"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_restore_button_absent_when_steps_match_default():
    """No restore button is rendered while the steps are the untouched defaults."""
    at = _app()
    # Untouched recommended steps → no restore prompt.
    restore_buttons = [
        b for b in at.button if "Restore recommended steps" in b.label
    ]
    assert not restore_buttons
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Advanced JSON export / import
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_advanced_json_export_reflects_current_steps():
    """The first code element holds JSON whose steps track the current step list."""
    at = _app()
    exported = json.loads(at.code[0].value)
    exported_tools = [s["tool"] for s in exported["steps"]]
    assert exported_tools == ["text_clean", "format_standardize", "missing", "dedup"]

    # Remove a step and confirm the exported JSON drops it too.
    remove_buttons = [b for b in at.button if b.label == "✕"]
    remove_buttons[0].click().run()
    exported = json.loads(at.code[0].value)
    exported_tools = [s["tool"] for s in exported["steps"]]
    assert exported_tools == ["format_standardize", "missing", "dedup"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_pasted_json_replaces_the_step_list():
    """Pasting a one-step pipeline and loading it replaces the seeded steps."""
    at = _app()
    payload = {"steps": [{"tool": "dedup", "options": {}, "enabled": True}]}
    one_step = json.dumps(payload)

    paste_areas = [t for t in at.text_area if t.key == "pipeline_json_paste"]
    paste_areas[0].set_value(one_step).run()
    load_buttons = [b for b in at.button if b.label == "Load pasted JSON"]
    load_buttons[0].click().run()

    assert not at.exception
    assert _tools(at) == ["dedup"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Config renderers emit the right options
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_format_standardize_config_emits_column_types():
    """Choosing "Phone number" for ``phone`` stores ``column_types["phone"] == "phone"``."""
    at = _app()
    fid = _ids(at)["format_standardize"]
    picker_key = f"format_standardize_{fid}_fmt__phone"
    pickers = [s for s in at.selectbox if s.key == picker_key]
    pickers[0].set_value("Phone number").run()

    run_buttons = [b for b in at.button if b.label == "Run Pipeline"]
    run_buttons[0].click().run()
    assert not at.exception

    matching = [
        s for s in at.session_state["pipeline_steps"]
        if s["tool"] == "format_standardize"
    ]
    step = matching[0]
    assert step["options"]["column_types"].get("phone") == "phone"
|
||||||
|
|
||||||
|
|
||||||
|
def test_missing_config_drop_radio_emits_drop_row_strategy():
    """Picking the drop-rows radio option stores ``strategy == "drop_row"``."""
    at = _app()
    mid = _ids(at)["missing"]
    radio_key = f"missing_{mid}_strategy"
    radios = [r for r in at.radio if r.key == radio_key]
    radios[0].set_value("Drop rows that have any blank").run()

    run_buttons = [b for b in at.button if b.label == "Run Pipeline"]
    run_buttons[0].click().run()
    assert not at.exception

    matching = [
        s for s in at.session_state["pipeline_steps"] if s["tool"] == "missing"
    ]
    step = matching[0]
    assert step["options"]["strategy"] == "drop_row"
|
||||||
|
|
||||||
|
|
||||||
|
def test_dedup_config_multiselect_builds_strategies():
    """Selecting ``email`` as the match column yields one exact-match column strategy."""
    at = _app()
    did = _ids(at)["dedup"]
    select_key = f"dedup_{did}_matchcols"
    selectors = [m for m in at.multiselect if m.key == select_key]
    selectors[0].set_value(["email"]).run()

    run_buttons = [b for b in at.button if b.label == "Run Pipeline"]
    run_buttons[0].click().run()
    assert not at.exception

    matching = [
        s for s in at.session_state["pipeline_steps"] if s["tool"] == "dedup"
    ]
    step = matching[0]
    strategies = step["options"]["strategies"]
    first_columns = strategies[0]["columns"]
    cols = [c["column"] for c in first_columns]
    assert cols == ["email"]
    assert strategies[0]["columns"][0]["algorithm"] == "exact"
|
||||||
254
tests/gui/test_pipeline_phrasing.py
Normal file
254
tests/gui/test_pipeline_phrasing.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
"""Pure-function tests for pipeline_modules phrasing helpers.
|
||||||
|
|
||||||
|
These cover the adapter-key → tool bridge, the plain-English ``step_phrase``
|
||||||
|
wording, ``step_status`` pill levels, and the column-prose / pluralization
|
||||||
|
helpers (``_fmt_cols`` / ``_n``). No Streamlit / AppTest needed — every symbol
|
||||||
|
under test is a pure function over plain dicts/lists.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.pipeline import TOOL_NAMES
|
||||||
|
from src.gui.components.pipeline_modules import (
|
||||||
|
CONFIG_RENDERERS,
|
||||||
|
PIPELINE_TOOL_META,
|
||||||
|
_fmt_cols,
|
||||||
|
_n,
|
||||||
|
step_label,
|
||||||
|
step_phrase,
|
||||||
|
step_status,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Bridge completeness
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("tool", TOOL_NAMES)
def test_pipeline_tool_meta_covers_every_tool(tool):
    """Every name in ``TOOL_NAMES`` has a truthy entry in ``PIPELINE_TOOL_META``."""
    assert tool in PIPELINE_TOOL_META
    tool_id = PIPELINE_TOOL_META[tool]
    assert tool_id  # non-empty tool_id
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("tool", TOOL_NAMES)
def test_step_label_is_friendly_and_not_the_raw_key(tool):
    """``step_label`` returns a non-empty string that differs from the tool key."""
    friendly = step_label(tool)
    assert isinstance(friendly, str)
    assert friendly
    assert friendly != tool
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("tool", TOOL_NAMES)
def test_every_tool_has_a_config_renderer(tool):
    """Every name in ``TOOL_NAMES`` maps to a callable in ``CONFIG_RENDERERS``."""
    assert tool in CONFIG_RENDERERS
    renderer = CONFIG_RENDERERS[tool]
    assert callable(renderer)
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_label_falls_back_to_raw_key_for_unknown_tool():
    """An unknown tool key is returned unchanged by ``step_label``."""
    unknown = "not_a_tool"
    assert step_label("not_a_tool") == unknown
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# step_phrase — populated + no-op cases for all five tools
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_text_clean_populated_and_noop():
    """text_clean: populated summary gives a count phrase; zero or empty gives the no-op text."""
    populated = {"cells_changed": 1204, "columns_processed": ["name", "city"]}
    assert step_phrase("text_clean", populated) == "1,204 cells cleaned in name & city"

    for noop_summary in ({"cells_changed": 0}, {}):
        assert step_phrase("text_clean", noop_summary) == "No changes needed."
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_format_standardize_populated_and_noop():
    """format_standardize: count phrase, optional "left unchanged" tail, and no-op text."""
    populated = {"cells_changed": 50, "columns_processed": ["phone"]}
    assert step_phrase("format_standardize", populated) == "50 cells standardized in phone"

    # unparseable cells append a "left unchanged" tail
    with_unparseable = {
        "cells_changed": 50,
        "cells_unparseable": 3,
        "columns_processed": ["phone"],
    }
    phrase = step_phrase("format_standardize", with_unparseable)
    assert phrase == "50 cells standardized in phone (3 left unchanged)"

    assert step_phrase("format_standardize", {}) == "Nothing to standardize."
    all_zero = {"cells_changed": 0, "cells_unparseable": 0}
    assert step_phrase("format_standardize", all_zero) == "Nothing to standardize."
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_missing_populated_and_noop():
    """missing: filled/dropped counts, the empty-summary text, and the sentinel-only phrase."""
    populated = {
        "cells_filled": 12,
        "rows_dropped": 4,
        "columns_dropped": ["x", "y"],
    }
    expected = "12 cells filled, 4 rows dropped, 2 columns dropped"
    assert step_phrase("missing", populated) == expected

    assert step_phrase("missing", {}) == "No missing values to handle."

    # sentinel-only flagging path
    sentinel_only = {"sentinels_standardized": 7}
    assert step_phrase("missing", sentinel_only) == "7 blank cells flagged"
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_column_map_populated_and_noop():
    """column_map: renamed/added/dropped counts, and the empty-summary text."""
    populated = {
        "columns_renamed": 3,
        "columns_added": ["new"],
        "columns_dropped": ["old", "gone"],
    }
    expected = "3 columns renamed, 1 column added, 2 columns dropped"
    assert step_phrase("column_map", populated) == expected
    assert step_phrase("column_map", {}) == "Columns already aligned."
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_dedup_mockup_case():
    """dedup: removed/group counts plus the input → output row span."""
    summary = {
        "input_rows": 18442,
        "output_rows": 18130,
        "duplicates_removed": 312,
        "groups": 147,
    }
    expected = "312 duplicates removed across 147 groups (18,442 → 18,130 rows)"
    assert step_phrase("dedup", summary) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_dedup_noop():
    """dedup: zero removals or an empty summary gives the no-duplicates text."""
    for noop_summary in ({"duplicates_removed": 0}, {}):
        assert step_phrase("dedup", noop_summary) == "No duplicates found."
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Pluralization (_n) through step_phrase
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_dedup_singular():
    """dedup: a count of one uses the singular "duplicate" and "group"."""
    summary = {
        "input_rows": 10,
        "output_rows": 9,
        "duplicates_removed": 1,
        "groups": 1,
    }
    expected = "1 duplicate removed across 1 group (10 → 9 rows)"
    assert step_phrase("dedup", summary) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_phrase_missing_singular():
    """missing: a count of one uses the singular "row" and "column"."""
    summary = {"rows_dropped": 1, "columns_dropped": ["x"]}
    expected = "1 row dropped, 1 column dropped"
    assert step_phrase("missing", summary) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_n_singular_vs_plural_every_noun():
    """``_n`` keeps the noun singular for 1 and appends "s" for larger counts."""
    cases = [
        (1, "cell", "1 cell"),
        (2, "cell", "2 cells"),
        (1, "row", "1 row"),
        (3, "row", "3 rows"),
        (1, "column", "1 column"),
        (5, "column", "5 columns"),
        (1, "duplicate", "1 duplicate"),
        (9, "duplicate", "9 duplicates"),
        (1, "group", "1 group"),
        (4, "group", "4 groups"),
    ]
    for count, noun, expected in cases:
        assert _n(count, noun) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_n_thousands_separator():
    """``_n`` formats counts of a thousand or more with a comma separator."""
    cells = _n(1204, "cell")
    assert cells == "1,204 cells"
    rows = _n(18442, "row")
    assert rows == "18,442 rows"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Column prose (_fmt_cols)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_fmt_cols_zero():
    """An empty column list formats to the empty string."""
    formatted = _fmt_cols([])
    assert formatted == ""
|
||||||
|
|
||||||
|
|
||||||
|
def test_fmt_cols_one():
    """A single column formats to its own name."""
    formatted = _fmt_cols(["name"])
    assert formatted == "name"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fmt_cols_two():
    """Two columns are joined with " & "."""
    formatted = _fmt_cols(["name", "city"])
    assert formatted == "name & city"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fmt_cols_three():
    """Three columns are joined with a comma and then " & "."""
    formatted = _fmt_cols(["a", "b", "c"])
    assert formatted == "a, b & c"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fmt_cols_four_or_more():
    """From four columns up, the first two are named and the rest counted as "N more"."""
    four = ["a", "b", "c", "d"]
    assert _fmt_cols(four) == "a, b & 2 more"
    five = ["a", "b", "c", "d", "e"]
    assert _fmt_cols(five) == "a, b & 3 more"
|
||||||
|
|
||||||
|
|
||||||
|
def test_fmt_cols_coerces_non_strings():
    """Non-string column names are rendered as text."""
    formatted = _fmt_cols([1, 2])
    assert formatted == "1 & 2"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# step_status — pill levels + details
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_clean_is_ok():
    """A summary with only ``cells_changed`` yields the ok pill with no detail."""
    status = step_status("text_clean", {"cells_changed": 5})
    assert status == ("✓ ok", "ok", "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_skipped():
    """``skipped=True`` yields the skipped level, an empty detail and a "skipped" label."""
    status = step_status("text_clean", {"cells_changed": 5}, skipped=True)
    label, level, detail = status
    assert level == "skipped"
    assert detail == ""
    assert "skipped" in label
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_error_uses_first_line_only():
    """For a multi-line error, the detail is the first line only."""
    multi_line_error = "X: msg\nline2\nline3"
    label, level, detail = step_status("dedup", {}, error=multi_line_error)
    assert level == "error"
    assert detail == "X: msg"
    assert "error" in label
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_error_takes_precedence_over_skipped():
    """When both ``skipped`` and ``error`` are given, the error level is reported."""
    status = step_status("text_clean", {}, skipped=True, error="boom\nsecond")
    label, level, detail = status
    assert level == "error"
    assert detail == "boom"
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_format_standardize_unparseable_warns():
    """Unparseable cells produce a warn pill with the count in the label and a detail."""
    summary = {"cells_changed": 100, "cells_unparseable": 141}
    label, level, detail = step_status("format_standardize", summary)
    assert level == "warn"
    assert "141 skipped" in label
    assert detail  # non-empty inline detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_format_standardize_no_unparseable_is_ok():
    """Without ``cells_unparseable`` the format_standardize pill is ok with no detail."""
    status = step_status("format_standardize", {"cells_changed": 100})
    assert status == ("✓ ok", "ok", "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_column_map_coercion_failures_warn():
    """Coercion failures produce a warn pill with the failure count in the label."""
    summary = {"coercion_failures": {"age": 4}}
    label, level, detail = step_status("column_map", summary)
    assert level == "warn"
    assert "4 not coerced" in label
    assert detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_column_map_missing_required_targets_warn():
    """Missing required targets produce a warn pill that names the target in the detail."""
    summary = {"missing_required_targets": ["email"]}
    label, level, detail = step_status("column_map", summary)
    assert level == "warn"
    assert "missing targets" in label
    assert "email" in detail
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_column_map_missing_targets_take_precedence_over_coercion():
    """With both problems present, the label reports the missing targets."""
    # both present → missing-targets branch wins
    summary = {
        "missing_required_targets": ["email"],
        "coercion_failures": {"age": 4},
    }
    label, level, detail = step_status("column_map", summary)
    assert level == "warn"
    assert "missing targets" in label
|
||||||
|
|
||||||
|
|
||||||
|
def test_step_status_unknown_tool_is_ok():
    """An unrecognized tool name yields the ok pill with no detail."""
    status = step_status("mystery", {"foo": 1})
    assert status == ("✓ ok", "ok", "")
|
||||||
293
tests/test_cli_pipeline.py
Normal file
293
tests/test_cli_pipeline.py
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
"""Integration tests for the pipeline CLI (src/cli_pipeline.py).
|
||||||
|
|
||||||
|
The Typer ``app`` is invoked directly via ``CliRunner`` to bypass the
|
||||||
|
license ``guard(...)`` that ``main()`` runs before ``app()`` — matching the
|
||||||
|
house pattern in ``test_cli_text_clean.py``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
|
from src.cli_pipeline import app
|
||||||
|
from src.core.pipeline import Pipeline, _DEFAULT_ORDER
|
||||||
|
|
||||||
|
runner = CliRunner()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def messy_csv(tmp_path):
    """Write a four-row CSV with whitespace / case / format variants; return its path."""
    columns = {
        "name": [" Alice ", "alice", "Bob", "Charlie"],
        "email": ["A@X.COM", "a@x.com", "bob@x.com", "charlie@x.com"],
        "phone": ["555-1234", "5551234", "555-9999", "555-0000"],
        "signup_date": ["2020-01-01", "2020-01-01", "2020-02-02", "2020-03-03"],
    }
    path = tmp_path / "messy.csv"
    pd.DataFrame(columns).to_csv(path, index=False)
    return path
|
||||||
|
|
||||||
|
|
||||||
|
def _pipeline_artifacts(csv_path):
|
||||||
|
"""The output CSV + audit JSON the CLI writes next to *csv_path*."""
|
||||||
|
out_csv = csv_path.parent / f"{csv_path.stem}_pipeline.csv"
|
||||||
|
audit = csv_path.parent / f"{csv_path.stem}_pipeline.json"
|
||||||
|
return out_csv, audit
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# --recommend
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRecommend:
    """``--recommend``: print the default pipeline as JSON, or save it with ``--output``."""

    def test_recommend_prints_valid_json(self):
        """Without ``--output`` the output parses as JSON listing the default tools."""
        result = runner.invoke(app, ["--recommend"])
        assert result.exit_code == 0
        data = json.loads(result.output)
        assert "steps" in data
        tools = [s["tool"] for s in data["steps"]]
        assert tools == list(_DEFAULT_ORDER)

    def test_recommend_default_tools_in_order(self):
        """The recommended tools are exactly the four defaults, in this order."""
        result = runner.invoke(app, ["--recommend"])
        # Check the exit code before parsing so a CLI failure is reported as
        # such rather than as a JSON decode error.
        assert result.exit_code == 0, result.output
        data = json.loads(result.output)
        tools = [s["tool"] for s in data["steps"]]
        # Equality with a four-element list already pins the length.
        assert tools == ["text_clean", "format_standardize", "missing", "dedup"]

    def test_recommend_output_writes_loadable_file(self, tmp_path):
        """``--output`` writes a file that ``Pipeline.from_file`` loads back."""
        out = tmp_path / "pipeline.json"
        result = runner.invoke(app, ["--recommend", "--output", str(out)])
        assert result.exit_code == 0
        assert out.exists()
        # Confirmation message printed instead of raw JSON.
        assert str(out) in result.output
        pipe = Pipeline.from_file(out)
        assert [s.tool for s in pipe.steps] == list(_DEFAULT_ORDER)

    def test_recommend_output_message_not_json(self, tmp_path):
        """With ``--output`` the console shows a "saved to" message, not the JSON."""
        out = tmp_path / "pipeline.json"
        result = runner.invoke(app, ["--recommend", "--output", str(out)])
        assert result.exit_code == 0, result.output
        assert "saved to" in result.output.lower()
        # Pin the "not JSON" half of the test name: the console output must
        # not parse as a JSON document.
        with pytest.raises(json.JSONDecodeError):
            json.loads(result.output)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Argument / input validation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestArgValidation:
    """Missing, nonexistent, or conflicting CLI arguments are rejected."""

    def test_no_args_exits_2(self):
        outcome = runner.invoke(app, [])
        assert outcome.exit_code == 2
        assert "input file is required" in outcome.output.lower()

    def test_nonexistent_input_exits_1(self, tmp_path):
        absent = tmp_path / "does_not_exist_xyz.csv"
        outcome = runner.invoke(app, [str(absent)])
        assert outcome.exit_code == 1
        assert "not found" in outcome.output.lower()

    def test_pipeline_and_steps_together_exits_1(self, messy_csv, tmp_path):
        saved = tmp_path / "p.json"
        Pipeline.from_dict({"steps": [{"tool": "text_clean"}]}).to_file(saved)
        argv = [str(messy_csv), "--pipeline", str(saved), "--steps", "text_clean"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 1
        assert "not both" in outcome.output.lower()

    def test_pipeline_nonexistent_exits_1(self, messy_csv, tmp_path):
        absent = tmp_path / "no_such_pipeline.json"
        argv = [str(messy_csv), "--pipeline", str(absent)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 1
        assert "not found" in outcome.output.lower()

    def test_unknown_tool_in_steps_errors(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv), "--steps", "bogus_tool"])
        assert outcome.exit_code != 0
        # The error should name the offending value.
        assert "bogus_tool" in outcome.output
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Dry-run (default)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDryRun:
    """Invoking the CLI without ``--apply`` prints a plan and writes nothing."""

    def test_dry_run_exit_0_and_plan_printed(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv)])
        assert outcome.exit_code == 0
        for expected in ("Pipeline plan:", "plan-only run"):
            assert expected in outcome.output

    def test_dry_run_writes_no_artifacts(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv)])
        assert outcome.exit_code == 0
        csv_out, audit_out = _pipeline_artifacts(messy_csv)
        assert not csv_out.exists()
        assert not audit_out.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# --apply
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestApply:
    """``--apply`` runs the pipeline and writes the output CSV + audit JSON."""

    def test_apply_default_pipeline_writes_outputs(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        csv_out, audit_out = _pipeline_artifacts(messy_csv)
        assert csv_out.exists()
        assert audit_out.exists()
        # The written CSV must be readable back.
        frame = pd.read_csv(csv_out)
        assert len(frame.columns) >= 1

    def test_apply_audit_has_documented_keys(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        _, audit_out = _pipeline_artifacts(messy_csv)
        data = json.loads(audit_out.read_text())

        top_level_keys = (
            "pipeline", "warnings", "initial_rows", "final_rows",
            "total_elapsed_seconds", "steps",
        )
        for key in top_level_keys:
            assert key in data, f"missing audit key: {key}"

        # One step entry per pipeline step (default = 4).
        assert len(data["steps"]) == len(_DEFAULT_ORDER)

        step_keys = (
            "tool", "name", "enabled", "skipped",
            "elapsed_seconds", "summary", "error",
        )
        for step in data["steps"]:
            for k in step_keys:
                assert k in step, f"missing step key: {k}"

    def test_apply_dedup_reduces_rows(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv), "--apply"])
        assert outcome.exit_code == 0
        _, audit_out = _pipeline_artifacts(messy_csv)
        data = json.loads(audit_out.read_text())
        # 4 input rows; the first two are duplicates once cleaned/standardized.
        assert data["initial_rows"] == 4
        assert data["final_rows"] < data["initial_rows"]

    def test_apply_custom_output_path(self, messy_csv, tmp_path):
        custom = tmp_path / "custom.csv"
        argv = [str(messy_csv), "--apply", "--output", str(custom)]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        assert custom.exists()
        default_csv, audit_out = _pipeline_artifacts(messy_csv)
        # Default-named CSV should NOT be written when --output is given.
        assert not default_csv.exists()
        # Audit JSON is still written next to the input.
        assert audit_out.exists()

    def test_apply_custom_steps_subset(self, messy_csv):
        argv = [str(messy_csv), "--apply", "--steps", "text_clean,missing"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        _, audit_out = _pipeline_artifacts(messy_csv)
        data = json.loads(audit_out.read_text())
        assert [s["tool"] for s in data["steps"]] == ["text_clean", "missing"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Strict mode
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestStrict:
    """``--strict`` aborts an out-of-order pipeline before anything is written."""

    # dedup placed ahead of text_clean is the out-of-order case under test.
    _OUT_OF_ORDER = ["--steps", "dedup,text_clean", "--strict", "--apply"]

    def test_strict_out_of_order_exits_2(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv), *self._OUT_OF_ORDER])
        assert outcome.exit_code == 2
        assert "abort" in outcome.output.lower()

    def test_strict_out_of_order_writes_nothing(self, messy_csv):
        outcome = runner.invoke(app, [str(messy_csv), *self._OUT_OF_ORDER])
        assert outcome.exit_code == 2
        csv_out, audit_out = _pipeline_artifacts(messy_csv)
        assert not csv_out.exists()
        assert not audit_out.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Round-trip: --recommend --output then --pipeline --apply
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRoundTrip:
    """A pipeline saved by ``--recommend --output`` can be fed back via ``--pipeline``."""

    def test_save_then_run_saved_pipeline(self, messy_csv, tmp_path):
        saved = tmp_path / "p.json"

        save_run = runner.invoke(app, ["--recommend", "--output", str(saved)])
        assert save_run.exit_code == 0
        assert saved.exists()

        apply_args = [str(messy_csv), "--pipeline", str(saved), "--apply"]
        apply_run = runner.invoke(app, apply_args)
        assert apply_run.exit_code == 0

        csv_out, audit_out = _pipeline_artifacts(messy_csv)
        assert csv_out.exists()
        assert audit_out.exists()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Step error handling (--continue-on-error)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestStepError:
    """Stop-on-error vs ``--continue-on-error`` behaviour of the CLI.

    A dedup step with an invalid survivor_rule raises a ConfigError at run
    time, which is what lets these tests exercise both sides of the contract.
    """

    def _bad_pipeline(self, tmp_path):
        """Write a one-step pipeline with an invalid survivor_rule; return its path."""
        target = tmp_path / "bad.json"
        broken_step = {
            "tool": "dedup",
            "options": {"survivor_rule": "not_a_real_rule"},
        }
        Pipeline.from_dict({"steps": [broken_step]}).to_file(target)
        return target

    def test_step_error_halts_without_continue(self, messy_csv, tmp_path):
        bad = self._bad_pipeline(tmp_path)
        argv = [str(messy_csv), "--pipeline", str(bad), "--apply"]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code != 0
        csv_out, audit_out = _pipeline_artifacts(messy_csv)
        # Halted before writing output.
        assert not csv_out.exists()
        assert not audit_out.exists()

    def test_continue_on_error_completes_and_records_error(self, messy_csv, tmp_path):
        bad = self._bad_pipeline(tmp_path)
        argv = [
            str(messy_csv), "--pipeline", str(bad), "--apply",
            "--continue-on-error",
        ]
        outcome = runner.invoke(app, argv)
        assert outcome.exit_code == 0
        csv_out, audit_out = _pipeline_artifacts(messy_csv)
        assert csv_out.exists()
        assert audit_out.exists()
        data = json.loads(audit_out.read_text())
        assert len(data["steps"]) == 1
        assert data["steps"][0]["error"], "expected the failed step's error recorded"
|
||||||
116
tests/test_demo_pipelines.py
Normal file
116
tests/test_demo_pipelines.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
"""Demo pipelines must keep showing value (accounting personas).
|
||||||
|
|
||||||
|
Each persona's preloaded dataset + saved pipeline is the marketing surface
|
||||||
|
driven by ``src/gui/app_demo.py``. These tests pin that every demo loads,
|
||||||
|
runs clean, and produces its headline value (duplicate rows removed, clean
|
||||||
|
parse, disguised nulls caught) — so a stale dataset or an engine change can't
|
||||||
|
silently gut the sales demo. The read path mirrors ``app_demo._load_demo``
|
||||||
|
exactly (``dtype=str, keep_default_na=False`` so every disguised null survives
|
||||||
|
to the pipeline).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.core.pipeline import Pipeline, run_pipeline
|
||||||
|
|
||||||
|
# Repository root: two directory levels above this test module.
_REPO = Path(__file__).resolve().parents[1]
_DEMO = _REPO.joinpath("samples", "demo")

# One (data_file, pipeline_file, min_duplicates_removed) entry per accounting
# persona in app_demo.PERSONAS. The dup floors are the validated demo numbers.
_DEMOS = [
    ("bank_reconciliation.csv", "bank_reconciliation_pipeline.json", 6),
    ("vendor_1099.csv", "vendor_1099_pipeline.json", 8),
    ("ar_open_invoices.csv", "ar_open_invoices_pipeline.json", 5),
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("data_file,pipeline_file,min_dupes", _DEMOS)
def test_demo_runs_clean_and_shows_value(data_file, pipeline_file, min_dupes):
    """Run one persona's saved pipeline on its dataset and check the headline numbers."""
    frame = pd.read_csv(_DEMO / data_file, dtype=str, keep_default_na=False)
    result = run_pipeline(
        frame, Pipeline.from_file(_DEMO / pipeline_file), stop_on_error=True,
    )

    def first_result_for(tool):
        # First StepResult produced by the named tool.
        return next(sr for sr in result.step_results if sr.step.tool == tool)

    # 1. Nothing errored — the demo never shows a visitor a red banner.
    assert all(sr.error is None for sr in result.step_results), [
        (sr.step.tool, sr.error) for sr in result.step_results
    ]

    # 2. Dedup removed the designed duplicate rows (the headline value).
    assert result.final_rows < result.initial_rows
    dedup_summary = first_result_for("dedup").summary
    assert dedup_summary["duplicates_removed"] >= min_dupes

    # 3. Standardization parsed every typed cell — a demo with unparseable
    #    cells reads as "the tool choked," which kills the pitch.
    fmt_summary = first_result_for("format_standardize").summary
    assert fmt_summary["cells_unparseable"] == 0
    assert fmt_summary["cells_changed"] > 0

    # 4. The disguised nulls (—, (blank), TBD, …) were caught.
    missing_summary = first_result_for("missing").summary
    assert missing_summary["sentinels_standardized"] > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_app_demo_references_each_demo_file():
    """Each demo data/pipeline file is named in app_demo.py and exists on disk.

    Catches a rename on either side (app_demo.py or samples/demo/) that
    would otherwise leave the two out of sync.
    """
    app_demo_path = _REPO / "src" / "gui" / "app_demo.py"
    src = app_demo_path.read_text(encoding="utf-8")
    for data_file, pipeline_file, _min_dupes in _DEMOS:
        assert data_file in src, f"{data_file} not referenced in app_demo.py"
        assert pipeline_file in src, f"{pipeline_file} not referenced in app_demo.py"
        data_path = _DEMO / data_file
        assert data_path.exists(), f"missing {data_file}"
        pipeline_path = _DEMO / pipeline_file
        assert pipeline_path.exists(), f"missing {pipeline_file}"
|
||||||
|
|
||||||
|
|
||||||
|
_LANDING = _REPO / "landing"

# (key, data-file stem) for each accounting persona the demo app serves.
# Every key must line up with a landing page that embeds the matching demo.
_PERSONA_KEYS = [
    ("bookkeeper", "bank_reconciliation"),
    ("ap-1099", "vendor_1099"),
    ("ar-aging", "ar_open_invoices"),
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("key,stem", _PERSONA_KEYS)
def test_landing_page_embeds_the_matching_demo(key, stem):
    """The persona's landing page exists and wires its iframe + CTA to *key*.

    Keeps the sales surface (landing -> demo app -> dataset) coherent.
    """
    app_demo_path = _REPO / "src" / "gui" / "app_demo.py"
    app_src = app_demo_path.read_text(encoding="utf-8")
    assert f'"{key}"' in app_src, f"persona key {key!r} not served by app_demo.py"

    page = _LANDING / key / "index.html"
    assert page.exists(), f"missing landing page for {key}"
    html = page.read_text(encoding="utf-8")
    assert f"?p={key}" in html, f"{key} landing iframe doesn't load ?p={key}"
    assert f"from={key}" in html, f"{key} landing CTA isn't tagged from={key}"

    # The hub page must link to this persona's page.
    hub_path = _LANDING / "index.html"
    hub = hub_path.read_text(encoding="utf-8")
    assert f'href="{key}/"' in hub, f"hub doesn't link to {key}/"
|
||||||
|
|
||||||
|
|
||||||
|
def test_landing_surface_has_no_stale_persona_refs():
    """No retired Shopify / RevOps persona language remains in landing HTML."""
    # Sorted so a failure always names the same file first across runs.
    pages = sorted(_LANDING.rglob("*.html"))
    # Without this guard the loop below passes vacuously if the landing
    # directory is missing, moved, or empty.
    assert pages, f"no landing HTML found under {_LANDING}"
    for html_file in pages:
        # Lower-cased once so the stale terms match case-insensitively.
        text = html_file.read_text(encoding="utf-8").lower()
        for stale in ("shopify", "revops", "klaviyo", "hubspot"):
            assert stale not in text, f"{stale!r} still in {html_file.relative_to(_REPO)}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_demo_app_builds_a_single_watermark_row():
    """app_demo.py appends one trailing watermark row to the demo download.

    (DEMO-PLAN §6: the AFTER preview must read as production-quality.)
    This is a source-text check: it looks for the watermark label and for
    the concat call that adds the row to the result frame.
    """
    app_demo_path = _REPO / "src" / "gui" / "app_demo.py"
    src = app_demo_path.read_text(encoding="utf-8")
    assert "DataTools demo — buy at" in src
    # One trailing row concatenated onto the result frame.
    assert "watermark_row" in src
    assert "pd.concat([result.final_df, watermark_row]" in src
|
||||||
@@ -322,3 +322,499 @@ class TestSoftDependencies:
|
|||||||
assert len(order) == len(TOOL_NAMES), (
|
assert len(order) == len(TOOL_NAMES), (
|
||||||
f"SOFT_DEPENDENCIES contain a cycle; topo order={order}"
|
f"SOFT_DEPENDENCIES contain a cycle; topo order={order}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-adapter summary correctness — exact numbers on KNOWN-messy input.
|
||||||
|
# Each adapter is also exercised through run_pipeline so the StepResult
|
||||||
|
# carries the adapter's summary verbatim.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _run_one(df, tool, options):
    """Run *tool* as a one-step pipeline over *df*.

    Returns ``(StepResult, PipelineResult)``, where the StepResult is the
    first entry of the pipeline result's ``step_results``.
    """
    single_step = Pipeline(steps=[Step(tool, options)])
    pipeline_result = run_pipeline(df, single_step)
    return pipeline_result.step_results[0], pipeline_result
|
||||||
|
|
||||||
|
|
||||||
|
class TestTextCleanSummary:
    """Exact summary numbers reported by the ``text_clean`` adapter."""

    def test_two_trimmable_cells_counted(self):
        df = pd.DataFrame({
            "a": [" x ", "y", " z"],        # 2 cells need trimming (" x ", " z")
            "b": ["ok", "fine", "good"],    # already clean
        })
        out, summary = TOOL_ADAPTERS["text_clean"](df, {"trim": True})
        assert summary["cells_total"] == 6
        assert summary["cells_changed"] == 2
        assert sorted(summary["columns_processed"]) == ["a", "b"]
        assert out["a"].tolist() == ["x", "y", "z"]

    def test_title_case_changes_all_cells(self):
        df = pd.DataFrame({"name": ["alice smith", "BOB JONES"]})
        out, summary = TOOL_ADAPTERS["text_clean"](df, {"case": "title"})
        assert summary["cells_changed"] == 2
        assert out["name"].tolist() == ["Alice Smith", "Bob Jones"]

    def test_collapse_whitespace_counts_internal_runs(self):
        # The first and last values carry runs of several internal spaces.
        # With single-spaced inputs nothing would collapse, and the
        # cells_changed == 2 assertion below could never hold.
        df = pd.DataFrame({"name": ["a   b", "c d", "e    f"]})
        out, summary = TOOL_ADAPTERS["text_clean"](
            df, {"trim": True, "collapse_whitespace": True},
        )
        # "a   b" and "e    f" collapse; "c d" is already single-spaced.
        assert summary["cells_changed"] == 2
        assert out["name"].tolist() == ["a b", "c d", "e f"]

    def test_summary_visible_through_run_pipeline(self):
        df = pd.DataFrame({"a": [" x ", "y"]})
        sr, _res = _run_one(df, "text_clean", {"trim": True})
        assert sr.skipped is False
        assert sr.error is None
        assert sr.summary["cells_changed"] == 1
        assert sr.summary["cells_total"] == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestFormatStandardizeSummary:
    """Exact summary numbers reported by the ``format_standardize`` adapter."""

    def test_one_unparseable_phone(self):
        frame = pd.DataFrame({
            "phone": ["(415) 555-1234", "not a phone", "+44 20 7946 0958"],
        })
        options = {"column_types": {"phone": "phone"}}
        result, stats = TOOL_ADAPTERS["format_standardize"](frame, options)
        assert stats["cells_total"] == 3
        assert stats["cells_unparseable"] == 1
        assert stats["cells_changed"] == 2
        assert stats["columns_processed"] == ["phone"]
        expected = ["+14155551234", "not a phone", "+442079460958"]
        assert result["phone"].tolist() == expected

    def test_date_standardization_counts(self):
        frame = pd.DataFrame({"signup_date": ["2024-01-05", "Jan 5 2024", "garbage"]})
        options = {"column_types": {"signup_date": "date"}}
        result, stats = TOOL_ADAPTERS["format_standardize"](frame, options)
        # "2024-01-05" already canonical; "Jan 5 2024" rewritten; "garbage" fails.
        assert stats["cells_unparseable"] == 1
        assert stats["cells_changed"] == 1
        assert result["signup_date"].tolist()[:2] == ["2024-01-05", "2024-01-05"]

    def test_summary_visible_through_run_pipeline(self):
        frame = pd.DataFrame({"phone": ["(415) 555-1234", "bad"]})
        options = {"column_types": {"phone": "phone"}}
        step_result, _ = _run_one(frame, "format_standardize", options)
        assert step_result.summary["cells_unparseable"] == 1
        assert step_result.summary["columns_processed"] == ["phone"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestMissingSummary:
    """Exact summary numbers reported by the ``missing`` adapter."""

    def test_median_fills_each_blank(self):
        frame = pd.DataFrame({"val": [1.0, np.nan, 3.0, np.nan, 5.0]})
        result, stats = TOOL_ADAPTERS["missing"](frame, {"strategy": "median"})
        assert stats["cells_filled"] == 2  # exactly the 2 NaNs
        assert stats["rows_dropped"] == 0
        # The median of 1, 3, 5 is 3.0, so both gaps become 3.0.
        assert result["val"].tolist() == [1.0, 3.0, 3.0, 3.0, 5.0]

    def test_drop_row_by_threshold(self):
        # row_drop_threshold is the *fraction* of nulls needed to drop a row.
        frame = pd.DataFrame({
            "a": [1.0, np.nan, 3.0],
            "b": ["x", np.nan, "z"],  # middle row is 100% null
        })
        options = {"strategy": "drop_row", "row_drop_threshold": 0.4}
        result, stats = TOOL_ADAPTERS["missing"](frame, options)
        assert stats["rows_dropped"] == 1
        assert len(result) == 2

    def test_sentinel_standardization_count(self):
        frame = pd.DataFrame({"x": ["ok", "N/A", "fine", "N/A"]})
        options = {
            "strategy": "none",
            "sentinels": ["N/A"],
            "standardize_sentinels": True,
        }
        result, stats = TOOL_ADAPTERS["missing"](frame, options)
        assert stats["sentinels_standardized"] == 2
        # The two "N/A" cells became real NaN.
        assert result["x"].isna().sum() == 2
        assert result["x"].tolist()[0] == "ok"

    def test_summary_visible_through_run_pipeline(self):
        frame = pd.DataFrame({"val": [1.0, np.nan, 3.0]})
        step_result, _ = _run_one(frame, "missing", {"strategy": "median"})
        assert step_result.summary["cells_filled"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestColumnMapSummary:
    """Exact summary numbers reported by the ``column_map`` adapter."""

    def test_single_rename(self):
        frame = pd.DataFrame({"old": [1, 2], "keep": [3, 4]})
        options = {"mapping": {"old": "new"}, "unmapped": "keep"}
        result, stats = TOOL_ADAPTERS["column_map"](frame, options)
        assert stats["columns_renamed"] == 1
        assert stats["columns_dropped"] == []
        assert list(result.columns) == ["new", "keep"]

    def test_unmapped_drop_reports_dropped_columns(self):
        frame = pd.DataFrame({"old": [1, 2], "keep": [3, 4]})
        options = {"mapping": {"old": "new"}, "unmapped": "drop"}
        result, stats = TOOL_ADAPTERS["column_map"](frame, options)
        assert stats["columns_renamed"] == 1
        assert stats["columns_dropped"] == ["keep"]
        assert list(result.columns) == ["new"]

    def test_summary_visible_through_run_pipeline(self):
        frame = pd.DataFrame({"old": [1], "keep": [2]})
        step_result, _ = _run_one(frame, "column_map", {"mapping": {"old": "new"}})
        assert step_result.summary["columns_renamed"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestDedupSummary:
    """Exact summary numbers reported by the ``dedup`` adapter."""

    def test_exact_duplicate_rows(self):
        frame = pd.DataFrame({
            "email": ["a@x.com", "b@x.com", "a@x.com", "a@x.com"],
            "name": ["A", "B", "A", "A"],
        })
        result, stats = TOOL_ADAPTERS["dedup"](frame, {"survivor_rule": "first"})
        assert stats["input_rows"] == 4
        assert stats["output_rows"] == 2
        assert stats["duplicates_removed"] == 2
        assert stats["groups"] == 1
        assert result["email"].tolist() == ["a@x.com", "b@x.com"]

    def test_explicit_exact_strategy_on_column(self):
        frame = pd.DataFrame({
            "email": ["a@x.com", "b@x.com", "a@x.com"],
            "name": ["A", "B", "C"],
        })
        email_exact = {"column": "email", "algorithm": "exact", "threshold": 100}
        options = {
            "survivor_rule": "first",
            "strategies": [{"columns": [email_exact]}],
        }
        result, stats = TOOL_ADAPTERS["dedup"](frame, options)
        assert stats["duplicates_removed"] == 1
        assert stats["groups"] == 1

    def test_most_complete_keeps_fuller_survivor(self):
        frame = pd.DataFrame({
            "email": ["a@x.com", "a@x.com"],
            "name": ["", "Alice"],  # second row is more complete
            "phone": ["111", "111"],
        })
        options = {"survivor_rule": "most_complete"}
        result, stats = TOOL_ADAPTERS["dedup"](frame, options)
        assert stats["duplicates_removed"] == 1
        assert result.iloc[0]["name"] == "Alice"

    def test_no_duplicates_is_noop(self):
        frame = pd.DataFrame({"email": ["a@x.com", "b@x.com"], "name": ["A", "B"]})
        result, stats = TOOL_ADAPTERS["dedup"](frame, {"survivor_rule": "first"})
        assert stats["duplicates_removed"] == 0
        assert stats["output_rows"] == 2

    def test_summary_visible_through_run_pipeline(self):
        frame = pd.DataFrame({"email": ["a@x.com", "a@x.com"], "name": ["A", "A"]})
        step_result, pipeline_result = _run_one(
            frame, "dedup", {"survivor_rule": "first"},
        )
        assert step_result.summary["duplicates_removed"] == 1
        assert pipeline_result.final_rows == 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Data flow — a later step depends on an earlier step's output
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDataFlow:
    """Later steps operate on the output of earlier steps."""

    def test_text_clean_enables_dedup_match(self):
        # The two phones differ only by surrounding whitespace; without
        # the trim they are distinct, so dedup alone would keep both.
        frame = pd.DataFrame({"phone": [" +14155551234 ", "+14155551234"]})
        steps = [
            Step("text_clean", {"trim": True}),
            Step("dedup", {"survivor_rule": "first"}),
        ]
        outcome = run_pipeline(frame, Pipeline(steps=steps))
        assert outcome.initial_rows == 2
        assert outcome.final_rows == 1
        assert outcome.final_df["phone"].tolist() == ["+14155551234"]

    def test_dedup_default_matching_normalizes_whitespace(self):
        # Note: dedup's exact matcher already normalizes surrounding
        # whitespace, so the two phones collapse even WITHOUT a prior
        # text_clean. The survivor still carries the un-trimmed value.
        frame = pd.DataFrame({"phone": [" +14155551234 ", "+14155551234"]})
        dedup_only = Pipeline(steps=[Step("dedup", {"survivor_rule": "first"})])
        outcome = run_pipeline(frame, dedup_only)
        assert outcome.final_rows == 1
        # Survivor keeps the raw (still-padded) text — dedup does not clean.
        assert outcome.final_df["phone"].tolist() == [" +14155551234 "]

    def test_chained_initial_and_final_rows(self):
        frame = pd.DataFrame({
            "name": [" Al ", "al", "Bob"],
            "v": [1, 1, 2],
        })
        steps = [
            Step("text_clean", {"trim": True, "case": "title"}),
            Step("dedup", {"survivor_rule": "first"}),
        ]
        outcome = run_pipeline(frame, Pipeline(steps=steps))
        # " Al " and "al" both become "Al" → duplicate rows collapse.
        assert outcome.initial_rows == 3
        assert outcome.final_rows == 2
        assert "Al" in outcome.final_df["name"].tolist()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Error handling — stop_on_error semantics
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestErrorHandling:
    """``stop_on_error`` semantics of ``run_pipeline``."""

    def test_most_recent_without_date_raises_by_default(self, messy_df):
        # dedup with survivor_rule="most_recent" but no date_column errors.
        failing = Pipeline(steps=[Step("dedup", {"survivor_rule": "most_recent"})])
        with pytest.raises(InputValidationError):
            run_pipeline(messy_df, failing)

    def test_continue_on_error_sets_error_string(self):
        frame = pd.DataFrame({"email": ["a@x.com", "a@x.com"], "name": ["A", "B"]})
        steps = [
            Step("text_clean", {"trim": True}),
            Step("dedup", {"survivor_rule": "most_recent"}),  # will fail
            Step("missing", {"strategy": "none"}),
        ]
        outcome = run_pipeline(frame, Pipeline(steps=steps), stop_on_error=False)
        failed = outcome.step_results[1]
        assert failed.error is not None
        assert isinstance(failed.error, str)
        assert failed.error.strip()
        # The failed step did NOT change the row count — previous df carried.
        assert outcome.step_results[2].error is None
        assert outcome.final_rows == 2

    def test_failed_step_summary_is_empty(self):
        frame = pd.DataFrame({"e": ["a", "a"], "n": ["x", "y"]})
        failing = Pipeline(steps=[Step("dedup", {"survivor_rule": "most_recent"})])
        outcome = run_pipeline(frame, failing, stop_on_error=False)
        only_step = outcome.step_results[0]
        assert only_step.summary == {}
        assert only_step.skipped is False

    def test_config_error_on_bad_survivor_rule_propagates(self, messy_df):
        failing = Pipeline(steps=[Step("dedup", {"survivor_rule": "nonsense"})])
        with pytest.raises(ConfigError):
            run_pipeline(messy_df, failing)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Edge inputs
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestEdgeInputs:
    """Empty frames, single columns, disabled steps, and empty pipelines."""

    def test_empty_dataframe_runs_clean(self):
        no_rows = pd.DataFrame({"name": [], "phone": []})
        pipeline = recommended_pipeline(options={"missing": {"strategy": "none"}})
        outcome = run_pipeline(no_rows, pipeline)
        assert outcome.initial_rows == 0
        assert outcome.final_rows == 0
        assert all(sr.error is None for sr in outcome.step_results if not sr.skipped)

    def test_single_column_dataframe(self):
        frame = pd.DataFrame({"name": [" Al ", "al"]})
        pipeline = Pipeline(steps=[Step("text_clean", {"trim": True, "case": "title"})])
        outcome = run_pipeline(frame, pipeline)
        assert outcome.final_df["name"].tolist() == ["Al", "Al"]

    def test_all_steps_disabled_returns_unchanged(self, messy_df):
        before = messy_df.copy(deep=True)
        tool_names = ("text_clean", "format_standardize", "missing", "dedup")
        all_off = Pipeline(steps=[Step(tool, enabled=False) for tool in tool_names])
        outcome = run_pipeline(messy_df, all_off)
        assert all(sr.skipped is True for sr in outcome.step_results)
        assert outcome.final_rows == outcome.initial_rows
        assert outcome.initial_rows == 5
        pd.testing.assert_frame_equal(outcome.final_df, before)

    def test_empty_pipeline_is_identity(self, messy_df):
        outcome = run_pipeline(messy_df, Pipeline(steps=[]))
        assert outcome.step_results == []
        assert outcome.final_rows == 5
        pd.testing.assert_frame_equal(outcome.final_df, messy_df)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Serialization round-trips with disabled / named / nested-option steps
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestSerializationRoundtrips:
    """Round-trips through dict / file keep enabled flags, names and nested options."""

    def test_disabled_and_named_step_survive_dict(self):
        """The enabled flag, custom name and options come back from to_dict/from_dict."""
        pre_clean = Step("text_clean", {"trim": True}, enabled=False, name="Pre-clean")
        final_dedup = Step("dedup", {"survivor_rule": "first"}, name="Final dedup")
        source = Pipeline(steps=[pre_clean, final_dedup])

        restored = Pipeline.from_dict(source.to_dict())

        head = restored.steps[0]
        assert head.enabled is False
        assert head.name == "Pre-clean"
        assert head.options == {"trim": True}

        tail = restored.steps[1]
        assert tail.name == "Final dedup"
        assert tail.display_name() == "Final dedup"

    def test_nested_options_survive_dict(self):
        """Nested dict / list option payloads are preserved by the dict round-trip."""
        format_opts = {
            "column_types": {"phone": "phone", "signup_date": "date"},
        }
        email_rule = {"column": "email", "algorithm": "exact", "threshold": 100}
        dedup_opts = {
            "survivor_rule": "most_complete",
            "strategies": [{"columns": [email_rule]}],
        }
        source = Pipeline(steps=[
            Step("format_standardize", format_opts),
            Step("dedup", dedup_opts),
        ])

        restored = Pipeline.from_dict(source.to_dict())

        assert restored.steps[0].options["column_types"] == format_opts["column_types"]
        assert restored.steps[1].options["strategies"] == dedup_opts["strategies"]

    def test_nested_options_survive_file(self, tmp_path):
        """The same guarantees hold when going through to_file/from_file."""
        step = Step(
            "format_standardize",
            {"column_types": {"phone": "phone"}},
            enabled=False,
            name="formats",
        )
        source = Pipeline(steps=[step])
        target = tmp_path / "pipe.json"

        source.to_file(target)
        restored = Pipeline.from_file(target)

        only_step = restored.steps[0]
        assert only_step.enabled is False
        assert only_step.name == "formats"
        assert only_step.options == {"column_types": {"phone": "phone"}}

    def test_roundtrip_is_idempotent(self):
        """Serializing once or twice yields the same dict as the original pipeline."""
        source = Pipeline(steps=[
            Step("text_clean", {"trim": True}, enabled=False, name="x"),
            Step("missing", {"strategy": "median"}),
        ])
        first_pass = Pipeline.from_dict(source.to_dict())
        second_pass = Pipeline.from_dict(first_pass.to_dict())
        assert first_pass.to_dict() == second_pass.to_dict() == source.to_dict()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# recommended_pipeline(include=...) — subsetting, ordering, option seeding
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestRecommendedInclude:
    """recommended_pipeline(include=...): subsetting, ordering and option seeding."""

    def test_subset_preserves_given_order(self):
        """Steps come out in the order requested via include."""
        pipeline = recommended_pipeline(include=["dedup", "text_clean"])
        tools = []
        for step in pipeline.steps:
            tools.append(step.tool)
        assert tools == ["dedup", "text_clean"]

    def test_column_map_first(self):
        """column_map may lead the pipeline; all five requested steps are built."""
        requested = [
            "column_map", "text_clean", "format_standardize", "missing", "dedup",
        ]
        pipeline = recommended_pipeline(include=requested)
        assert pipeline.steps[0].tool == "column_map"
        assert len(pipeline.steps) == 5

    def test_column_map_last(self):
        """column_map may equally be placed at the end."""
        requested = [
            "text_clean", "format_standardize", "missing", "dedup", "column_map",
        ]
        pipeline = recommended_pipeline(include=requested)
        assert pipeline.steps[-1].tool == "column_map"

    def test_unknown_tool_in_include_raises(self):
        """An unrecognized tool name is rejected with InputValidationError."""
        with pytest.raises(InputValidationError):
            recommended_pipeline(include=["text_clean", "not_a_tool"])

    def test_options_seeding_only_targets_named_tool(self):
        """Options keyed by tool name seed that tool only."""
        pipeline = recommended_pipeline(
            include=["text_clean", "dedup"],
            options={"dedup": {"survivor_rule": "last"}},
        )
        text_clean_step = pipeline.steps[0]
        assert text_clean_step.options == {}  # text_clean unseeded
        dedup_step = pipeline.steps[1]
        assert dedup_step.options == {"survivor_rule": "last"}

    def test_empty_include_yields_no_steps(self):
        """An empty include list produces a pipeline with no steps."""
        pipeline = recommended_pipeline(include=[])
        assert pipeline.steps == []

    def test_seeded_options_are_independent_copies(self):
        """Editing a built step's options must not write through to the seed dict."""
        seed = {"text_clean": {"trim": True}}
        pipeline = recommended_pipeline(include=["text_clean"], options=seed)
        # Flip the value on the produced step, then check the seed is untouched.
        pipeline.steps[0].options["trim"] = False
        assert seed["text_clean"]["trim"] is True
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Realistic demo integration — messy customers table end-to-end
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
class TestDemoIntegration:
    """End-to-end run over a small, deliberately messy customers table."""

    @pytest.fixture
    def customers_df(self):
        """Four customer rows with stray whitespace, a repeated phone and a blank name."""
        columns = {
            "Full Name": ["  alice smith ", "BOB JONES", "alice smith", ""],
            "Email": ["alice@x.com ", "bob@x.com", "alice@x.com", "carol@x.com"],
            "Phone": [
                " +14155551234 ", "+442079460958",
                "+14155551234", "+13035551111",
            ],
        }
        return pd.DataFrame(columns)

    def test_full_recommended_plus_column_map(self, customers_df):
        """Clean, dedup on Phone, then rename headers with a trailing column_map."""
        phone_exact = {"column": "Phone", "algorithm": "exact", "threshold": 100}
        seeds = {
            "text_clean": {"trim": True, "collapse_whitespace": True},
            "missing": {"strategy": "none"},
            "dedup": {
                "survivor_rule": "most_complete",
                "strategies": [{"columns": [phone_exact]}],
            },
            "column_map": {
                "mapping": {
                    "Full Name": "name",
                    "Email": "email",
                    "Phone": "phone",
                },
                "unmapped": "keep",
            },
        }
        pipeline = recommended_pipeline(
            include=[
                "text_clean", "format_standardize", "missing",
                "dedup", "column_map",
            ],
            options=seeds,
        )
        result = run_pipeline(customers_df, pipeline)

        # Two rows share the same phone after trimming → one duplicate removed.
        assert result.initial_rows == 4
        assert result.final_rows == 3
        assert result.final_rows < result.initial_rows

        # The trailing column_map step renamed the headers.
        assert list(result.final_df.columns) == ["name", "email", "phone"]

        # Exactly one row with Alice's (trimmed) phone is left.
        surviving_phones = result.final_df["phone"].tolist()
        assert "+14155551234" in surviving_phones
        assert surviving_phones.count("+14155551234") == 1  # only one Alice survives

        # No executed (non-skipped) step reported an error.
        assert all(
            sr.error is None for sr in result.step_results if not sr.skipped
        )
        # The last step is column_map and it reports three renames.
        column_map_result = result.step_results[-1]
        assert column_map_result.step.tool == "column_map"
        assert column_map_result.summary["columns_renamed"] == 3

    def test_demo_dedup_step_reports_one_duplicate(self, customers_df):
        """The dedup step's summary shows one group and one removed duplicate."""
        phone_exact = {"column": "Phone", "algorithm": "exact", "threshold": 100}
        seeds = {
            "text_clean": {"trim": True},
            "missing": {"strategy": "none"},
            "dedup": {
                "survivor_rule": "most_complete",
                "strategies": [{"columns": [phone_exact]}],
            },
        }
        pipeline = recommended_pipeline(options=seeds)
        result = run_pipeline(customers_df, pipeline)

        dedup_result = next(
            sr for sr in result.step_results if sr.step.tool == "dedup"
        )
        assert dedup_result.summary["duplicates_removed"] == 1
        assert dedup_result.summary["groups"] == 1
|
||||||
|
|||||||
Reference in New Issue
Block a user