Compare commits
180 Commits
3f007ef3d6
...
rollback-2
| Author | SHA1 | Date | |
|---|---|---|---|
| 58d0009849 | |||
| b6c39d7a09 | |||
| b2fa8503e6 | |||
| b703911df3 | |||
| 93ccada974 | |||
| 17faf84aed | |||
| 4d8513b1a3 | |||
| ac94208d8f | |||
| 4955fb239b | |||
| 4a8961d58a | |||
| fe4b5dc755 | |||
| 209b5fb1aa | |||
| 904356f4e8 | |||
| 7203a81af7 | |||
| dd3b9bd59d | |||
| 2bd94c4441 | |||
| 9c426194b1 | |||
| 6627895a10 | |||
| ea99e292d2 | |||
| 0be59c0f03 | |||
| 3a3a9a895b | |||
| d090f8cb5e | |||
| e44af3a45e | |||
| 450d4fc9a8 | |||
| a0042d4aba | |||
| a18b126885 | |||
| 981a1a9cba | |||
| dbcf4d4048 | |||
| 34b56b404a | |||
| ad7c22d7fb | |||
| 6f2ad57490 | |||
| a1824b8dc4 | |||
| 155dd30746 | |||
| 3cf935c999 | |||
| 263af3c7c2 | |||
| bece2b4030 | |||
| 60969c0770 | |||
| 48cd9e8249 | |||
| d80befd05a | |||
| 10015c40e1 | |||
| e6ee2e3481 | |||
| 538e23d219 | |||
| 2d927bc95f | |||
| 967d3f6a11 | |||
| b86828d791 | |||
| 5a8e2ec9e1 | |||
| 2f349e8191 | |||
| aea520d2f7 | |||
| b8aff862ed | |||
| c16e2a5e29 | |||
| 7c9139f199 | |||
| b3ae913bb9 | |||
| ba07dcb6c7 | |||
| 76c9f5a679 | |||
| a8ff8f4bd0 | |||
| 4451f74895 | |||
| a022059b1e | |||
| 69240fc922 | |||
| 9a7d861903 | |||
| 1016a4d2c4 | |||
| 6c3939d21b | |||
| d436e34a45 | |||
| 0bb72ecd7e | |||
| 74d0ee270f | |||
| 06f1ea6cf7 | |||
| 784695e3a7 | |||
| 4816da1ad6 | |||
| 6703e2c15c | |||
| a9788ba712 | |||
| da7d86f457 | |||
| 2501119ac2 | |||
| 444dffbc63 | |||
| 3c4b80895e | |||
| b0ee65e922 | |||
| 65b663be97 | |||
| c942b8aa19 | |||
| 61e63913cb | |||
| e011c0b6e6 | |||
| 2fe324279e | |||
| 04dc326020 | |||
| d487a44170 | |||
| f106275643 | |||
| 8232ab1ca7 | |||
| 4c8e1199a4 | |||
| e282f061dc | |||
| 5daae9e5fa | |||
| 48cb802dfb | |||
| d022167ba2 | |||
| 24ee021314 | |||
| add3b866ee | |||
| b568773a1f | |||
| 4a7f99f0ec | |||
| b2449d3139 | |||
| d840230e48 | |||
| 9e8b4b2ca9 | |||
| dd231f5a38 | |||
| 143c775cdf | |||
| d1b9f642e2 | |||
| 65c85107b6 | |||
| d9e32e578b | |||
| 7cb1bc922d | |||
| be7191a5d1 | |||
| 2d2ff43754 | |||
| 36510eee7b | |||
| 1caedbbbc7 | |||
| c0bfd4dbc9 | |||
| 59c6d0f914 | |||
| ee0b1f6f6b | |||
| c73d716d06 | |||
| f0885aeb1e | |||
| 229e1afd45 | |||
| 7ad19ac7f4 | |||
| 84e4665ab0 | |||
| 4685bb4289 | |||
| e96d5901f4 | |||
| ecfc52499f | |||
| 21fd8a4cd7 | |||
| 42f8d78dd5 | |||
| 0f89d7ba66 | |||
| b9147f3b66 | |||
| 5128d35961 | |||
| 696996c119 | |||
| ae9d4a2db5 | |||
| ef9f8b5de4 | |||
| aeead05e4c | |||
| 6415be8bf4 | |||
| d1aaf3c2b9 | |||
| 27f0648093 | |||
| 0a61d52200 | |||
| ca14ce2952 | |||
| 502a72cd46 | |||
| 604debb9a9 | |||
| c575efd26e | |||
| 175389219f | |||
| c568aec8a7 | |||
| ff2eaeb6c4 | |||
| dad744f17f | |||
| fc6c22c6a7 | |||
| db5ec084da | |||
| 93e43fc0d9 | |||
| 624f99653e | |||
| 86ad21db79 | |||
| 2bbaba954b | |||
| b5cd74d474 | |||
| 1cf69dd23b | |||
| 673b902377 | |||
| bab2c9468c | |||
| 4179cb5156 | |||
| 52e04f63a9 | |||
| 23c51fd759 | |||
| 65e17e0a70 | |||
| e534fb4989 | |||
| d32b58e61a | |||
| e612c751a8 | |||
| e435103113 | |||
| b2c7b94fe9 | |||
| 070e3c9f06 | |||
| 35d46a0c1a | |||
| d0423a8912 | |||
| 64452dd783 | |||
| e5f632bcd6 | |||
| 5b672370a6 | |||
| 318b9b45dc | |||
| 38011872e1 | |||
| c4ce86bd64 | |||
| 4706ed571e | |||
| ea89c4d399 | |||
| 701108c9d5 | |||
| 340614e642 | |||
| 58c0195def | |||
| 30e257cc44 | |||
| 0c25d80146 | |||
| e1f364f010 | |||
| 966af8ef94 | |||
| d18b95880d | |||
| abb720997e | |||
| 26b9771625 | |||
| 2eece6467d | |||
| b23a27d4e3 | |||
| 4adeb5c7f3 |
196
.github/workflows/build.yml
vendored
Normal file
196
.github/workflows/build.yml
vendored
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
name: Build installers
|
||||||
|
|
||||||
|
# Triggers:
|
||||||
|
# * Tag push (v*) → produces installers + portable zips, attaches them
|
||||||
|
# to a GitHub Release.
|
||||||
|
# * Manual dispatch → uploads everything as workflow artifacts only.
|
||||||
|
#
|
||||||
|
# Outputs per platform (downloadable by buyers):
|
||||||
|
# * macOS: .dmg installer + portable .zip (signed .app inside).
|
||||||
|
# * Windows: .exe installer + portable .zip (no-install).
|
||||||
|
# * Linux: .AppImage (already portable; no separate zip).
|
||||||
|
#
|
||||||
|
# Self-contained: every artifact ships its own Python interpreter + every
|
||||||
|
# runtime dep through PyInstaller. No pre/post install steps on the
|
||||||
|
# buyer's machine.
|
||||||
|
#
|
||||||
|
# What this workflow doesn't do (yet):
|
||||||
|
# * Code signing (Mac Developer ID, Windows code-signing cert).
|
||||||
|
# Those need GitHub Secrets the owner sets up first. See
|
||||||
|
# build/README.md "Signing" for the secret names this workflow
|
||||||
|
# will read once they exist.
|
||||||
|
# * Auto-update endpoint generation. v1 distributes via Gumroad;
|
||||||
|
# buyers re-download for updates.
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
push:
|
||||||
|
tags:
|
||||||
|
- 'v*'
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: write # needed to create the release on tag push
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
name: Build (${{ matrix.os }})
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- os: macos-latest
|
||||||
|
platform: mac
|
||||||
|
installer_glob: dist/DataTools-*-mac.dmg
|
||||||
|
portable_glob: dist/DataTools-*-mac-portable.zip
|
||||||
|
- os: windows-latest
|
||||||
|
platform: win
|
||||||
|
installer_glob: dist/DataTools-*-win-setup.exe
|
||||||
|
portable_glob: dist/DataTools-*-win-portable.zip
|
||||||
|
- os: ubuntu-latest
|
||||||
|
platform: linux
|
||||||
|
installer_glob: dist/DataTools-*-linux-x86_64.AppImage
|
||||||
|
portable_glob: '' # AppImage is already a portable single file
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.12'
|
||||||
|
cache: pip
|
||||||
|
|
||||||
|
- name: Install build deps
|
||||||
|
run: |
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller pillow
|
||||||
|
|
||||||
|
# ---- Tesseract bundling cache --------------------------------
|
||||||
|
# The fetch logic inside build/make_release.py downloads:
|
||||||
|
# * build/vendor/tessdata/eng.traineddata (~16 MB, shared)
|
||||||
|
# * build/_tesseract/<platform>/ (binary + libs, 30-120 MB)
|
||||||
|
# Cache both so iterative CI runs don't re-download. The
|
||||||
|
# cache key bakes in the pinned Tesseract version + tessdata
|
||||||
|
# URL so a version bump invalidates automatically.
|
||||||
|
- name: Cache Tesseract bundle inputs
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
build/_tesseract
|
||||||
|
build/vendor/tessdata
|
||||||
|
key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1
|
||||||
|
|
||||||
|
# ---- Linux: install patchelf so make_release.py can rewrite
|
||||||
|
# RPATH on the bundled tesseract binary. apt-get install
|
||||||
|
# tesseract-ocr is handled inside make_release.py itself. -----
|
||||||
|
- name: Install Linux build prereqs for Tesseract bundling
|
||||||
|
if: matrix.os == 'ubuntu-latest'
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y patchelf
|
||||||
|
|
||||||
|
- name: Read version
|
||||||
|
id: version
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
VER=$(python -c "import re; print(re.search(r'__version__\s*=\s*\"([^\"]+)\"', open('src/__init__.py').read()).group(1))")
|
||||||
|
echo "version=$VER" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Generate platform icons
|
||||||
|
run: python build/generate_icons.py
|
||||||
|
|
||||||
|
# Stage Tesseract before PyInstaller. The make_release.py
|
||||||
|
# helpers handle the per-platform fetch (UB-Mannheim on Win,
|
||||||
|
# brew on Mac, apt on Linux) and stage the binary + libs into
|
||||||
|
# build/_tesseract/<platform>/ where the spec picks them up.
|
||||||
|
# We invoke a tiny inline Python so the workflow doesn't have
|
||||||
|
# to know the per-platform target string.
|
||||||
|
- name: Stage Tesseract binary + tessdata
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
DATATOOLS_PLATFORM: ${{ matrix.platform }}
|
||||||
|
run: |
|
||||||
|
python - <<'PY'
|
||||||
|
import os, sys
|
||||||
|
sys.path.insert(0, "build")
|
||||||
|
from make_release import fetch_tessdata, fetch_tesseract_for_platform
|
||||||
|
target = os.environ["DATATOOLS_PLATFORM"]
|
||||||
|
fetch_tessdata()
|
||||||
|
fetch_tesseract_for_platform(target)
|
||||||
|
PY
|
||||||
|
|
||||||
|
- name: Build PyInstaller bundle
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
# The spec reads this to find the per-platform staging dir;
|
||||||
|
# see build/datatools.spec for the contract.
|
||||||
|
DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }}
|
||||||
|
run: pyinstaller build/datatools.spec --clean --noconfirm
|
||||||
|
|
||||||
|
# ---- Per-platform installer packaging ------------------------
|
||||||
|
|
||||||
|
- name: Package macOS DMG (installer)
|
||||||
|
if: matrix.os == 'macos-latest'
|
||||||
|
run: bash build/macos/build_dmg.sh "${{ steps.version.outputs.version }}"
|
||||||
|
|
||||||
|
- name: Package macOS portable .zip
|
||||||
|
if: matrix.os == 'macos-latest'
|
||||||
|
run: bash build/macos/build_zip.sh "${{ steps.version.outputs.version }}"
|
||||||
|
|
||||||
|
- name: Install Inno Setup (Windows)
|
||||||
|
if: matrix.os == 'windows-latest'
|
||||||
|
run: choco install innosetup --no-progress -y
|
||||||
|
|
||||||
|
- name: Package Windows installer
|
||||||
|
if: matrix.os == 'windows-latest'
|
||||||
|
shell: cmd
|
||||||
|
run: |
|
||||||
|
iscc /DAppVersion=${{ steps.version.outputs.version }} build\installer.iss
|
||||||
|
|
||||||
|
- name: Package Windows portable .zip
|
||||||
|
if: matrix.os == 'windows-latest'
|
||||||
|
run: python build/build_portable_zip.py win ${{ steps.version.outputs.version }}
|
||||||
|
|
||||||
|
# appimagetool is written into /usr/local/bin, so the download itself
|
||||||
|
# runs under sudo, the same as the chmod that follows it.
|
||||||
|
- name: Install AppImage tooling (Linux)
|
||||||
|
if: matrix.os == 'ubuntu-latest'
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y libfuse2 wget
|
||||||
|
sudo wget -q https://github.com/AppImage/AppImageKit/releases/download/continuous/appimagetool-x86_64.AppImage -O /usr/local/bin/appimagetool
|
||||||
|
sudo chmod +x /usr/local/bin/appimagetool
|
||||||
|
|
||||||
|
- name: Package Linux AppImage
|
||||||
|
if: matrix.os == 'ubuntu-latest'
|
||||||
|
run: bash build/appimage/build.sh "${{ steps.version.outputs.version }}"
|
||||||
|
|
||||||
|
# ---- Upload + release ----------------------------------------
|
||||||
|
|
||||||
|
- name: Upload installer artifact
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: DataTools-${{ matrix.platform }}-installer
|
||||||
|
path: ${{ matrix.installer_glob }}
|
||||||
|
if-no-files-found: error
|
||||||
|
|
||||||
|
- name: Upload portable artifact
|
||||||
|
if: matrix.portable_glob != ''
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: DataTools-${{ matrix.platform }}-portable
|
||||||
|
path: ${{ matrix.portable_glob }}
|
||||||
|
if-no-files-found: error
|
||||||
|
|
||||||
|
- name: Attach installer to Release (tag push only)
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
files: ${{ matrix.installer_glob }}
|
||||||
|
fail_on_unmatched_files: true
|
||||||
|
generate_release_notes: true
|
||||||
|
|
||||||
|
- name: Attach portable to Release (tag push only)
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v') && matrix.portable_glob != ''
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
files: ${{ matrix.portable_glob }}
|
||||||
|
fail_on_unmatched_files: true
|
||||||
27
.gitignore
vendored
27
.gitignore
vendored
@@ -5,5 +5,30 @@ __pycache__/
|
|||||||
logs/
|
logs/
|
||||||
*.egg-info/
|
*.egg-info/
|
||||||
dist/
|
dist/
|
||||||
build/
|
# PyInstaller writes intermediate artifacts to build/build/<spec>/ when the
|
||||||
|
# spec lives in build/. The spec, launcher, and hooks themselves are source
|
||||||
|
# and should be committed; only the generated artifacts are ignored.
|
||||||
|
build/build/
|
||||||
|
build/__pycache__/
|
||||||
|
build/dist/
|
||||||
|
# Generated by build/generate_icons.py from src/gui/assets/datatools_icon_256.png.
|
||||||
|
# Build artifacts, not source — regenerated each CI run.
|
||||||
|
build/icon.ico
|
||||||
|
build/icon.icns
|
||||||
|
build/icon.png
|
||||||
|
|
||||||
|
# Tesseract bundling — fetched at build time, not committed. See
|
||||||
|
# build/vendor/README.md for the canonical URLs and rationale.
|
||||||
|
# - build/_tesseract/ : per-platform binary + DLLs/dylibs staging dir
|
||||||
|
# - build/vendor/tessdata/eng.traineddata : ~16 MB language data
|
||||||
|
build/_tesseract/
|
||||||
|
build/vendor/tessdata/*.traineddata
|
||||||
|
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
|
||||||
|
# Claude Code agent worktrees + local settings
|
||||||
|
.claude/
|
||||||
|
|
||||||
|
# Landing-page deploy outputs and operator config (real URLs, not committed)
|
||||||
|
landing/dist/
|
||||||
|
landing/deploy.config.json
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
[client]
|
[client]
|
||||||
toolbarMode = "minimal"
|
# ``viewer`` is the most aggressive — hides Streamlit's running
|
||||||
|
# indicator, deploy button, and status icons. Keeps the main content
|
||||||
|
# area's top-right corner clean.
|
||||||
|
toolbarMode = "viewer"
|
||||||
|
|
||||||
[browser]
|
[browser]
|
||||||
gatherUsageStats = false
|
gatherUsageStats = false
|
||||||
@@ -9,3 +12,17 @@ gatherUsageStats = false
|
|||||||
# reads "Limit 1024MB per file" — matches the analyzer + gate's stated
|
# reads "Limit 1024MB per file" — matches the analyzer + gate's stated
|
||||||
# 1 GB efficiency target. See docs/REQUIREMENTS.md §1.1.
|
# 1 GB efficiency target. See docs/REQUIREMENTS.md §1.1.
|
||||||
maxUploadSize = 1024
|
maxUploadSize = 1024
|
||||||
|
|
||||||
|
# Warm, editorial palette inspired by the
|
||||||
|
# ``datatools_layout_redesign.html`` mockup — cream paper background,
|
||||||
|
# stone ink, burnt-orange accent. Streamlit reads these on startup and
|
||||||
|
# threads them through its widget chrome (file uploader, focus rings,
|
||||||
|
# primary buttons, links). Heavier visual restyling rides on the CSS
|
||||||
|
# in ``_legacy.py:_DESIGN_TOKENS_CSS``.
|
||||||
|
[theme]
|
||||||
|
base = "light"
|
||||||
|
primaryColor = "#c2410c"
|
||||||
|
backgroundColor = "#fafaf7"
|
||||||
|
secondaryBackgroundColor = "#f5f4ef"
|
||||||
|
textColor = "#1c1917"
|
||||||
|
font = "sans serif"
|
||||||
|
|||||||
220
LICENSE_TESSERACT.txt
Normal file
220
LICENSE_TESSERACT.txt
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
This license applies to the bundled Tesseract OCR binary distributed
|
||||||
|
inside DataTools installer artifacts (Windows .exe, macOS .dmg, Linux
|
||||||
|
.AppImage) and the corresponding portable .zip downloads.
|
||||||
|
|
||||||
|
Tesseract OCR upstream: https://github.com/tesseract-ocr/tesseract
|
||||||
|
Copyright (C) 2006-2024 Google Inc. and the Tesseract OCR contributors
|
||||||
|
|
||||||
|
The Tesseract OCR binary is distributed under the Apache License,
|
||||||
|
Version 2.0, the full text of which is reproduced verbatim below.
|
||||||
|
|
||||||
|
The bundled `eng.traineddata` data file is the "best" English model
|
||||||
|
from https://github.com/tesseract-ocr/tessdata_best and is licensed
|
||||||
|
under the Apache License, Version 2.0 as well.
|
||||||
|
|
||||||
|
DataTools itself is proprietary and is NOT covered by this license;
|
||||||
|
see LICENSE.txt at the repository root for DataTools' own license.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for reasonable and customary use in describing the origin of the Work and
|
||||||
|
reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer, and charge a
|
||||||
|
fee for, acceptance of support, warranty, indemnity, or other
|
||||||
|
liability obligations and/or rights consistent with this License.
|
||||||
|
However, in accepting such obligations, You may act only on Your
|
||||||
|
own behalf and on Your sole responsibility, not on behalf of any
|
||||||
|
other Contributor, and only if You agree to indemnify, defend,
|
||||||
|
and hold each Contributor harmless for any liability incurred by,
|
||||||
|
or claims asserted against, such Contributor by reason of your
|
||||||
|
accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied. See the License for the specific language governing
|
||||||
|
permissions and limitations under the License.
|
||||||
103
README.es.md
Normal file
103
README.es.md
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
> 🌐 **Idioma:** Español · [English](README.md)
|
||||||
|
|
||||||
|
# DataTools
|
||||||
|
|
||||||
|
Limpieza local de CSV / Excel. CLI + GUI en el navegador, sin nube, sin ceremonias de instalación. La GUI incluye paquetes de idioma en inglés y español.
|
||||||
|
|
||||||
|
## Herramientas
|
||||||
|
|
||||||
|
| # | Herramienta | Estado |
|
||||||
|
|---|------|--------|
|
||||||
|
| 01 | **Buscar duplicados** — coincidencia exacta + difusa, 5 normalizadores, reglas de superviviente, auditoría | Listo |
|
||||||
|
| 02 | **Limpiar texto** — espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo |
|
||||||
|
| 03 | **Estandarizar formatos** — fechas, teléfonos, correos, direcciones, nombres, monedas, booleanos | Listo |
|
||||||
|
| 04 | **Corregir valores faltantes** — detección de nulos disfrazados, perfil, media/mediana/moda/ffill/bfill/interpolación, estrategias de descarte | Listo |
|
||||||
|
| 05 | **Mapear columnas** — autodetección difusa de renombrados, esquema objetivo con coerción de tipos, campos requeridos con valores por defecto, descartar/reordenar | Listo |
|
||||||
|
| 06 | Detectar valores atípicos | Próximamente |
|
||||||
|
| 07 | Combinar archivos | Próximamente |
|
||||||
|
| 08 | Verificación de calidad | Próximamente |
|
||||||
|
| 09 | **Flujos automatizados** — encadena herramientas en un orden recomendado (no forzado), guarda/carga JSON, automatiza limpiezas semanales | Listo |
|
||||||
|
|
||||||
|
Cada página de herramienta incluye una ventana emergente de **Help** (a la derecha del título) con una guía compacta de Cuándo usarla / Pasos / Ejemplos / Consejo. El texto vive en los paquetes de idioma (`tools.<id>.help_md`).
|
||||||
|
|
||||||
|
## Descarga (usuarios no técnicos)
|
||||||
|
|
||||||
|
Paquetes precompilados — sin instalar Python, sin permisos de administrador, sin internet en ejecución. Cada versión ofrece dos formatos por sistema operativo: un **instalador** que crea accesos directos en el escritorio + menú Inicio / Launchpad, y un **.zip portable** que descomprimes y haces doble clic. Elige el que te permita tu política de TI.
|
||||||
|
|
||||||
|
| Plataforma | Instalador (recomendado) | Portable (sin instalar) |
|
||||||
|
|---|---|---|
|
||||||
|
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — ábrelo, arrastra DataTools.app a /Applications, ejecútalo desde Launchpad. | `DataTools-X.Y.Z-mac-portable.zip` — descomprime donde quieras, doble clic en `DataTools.app`. |
|
||||||
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. | `DataTools-X.Y.Z-win-portable.zip` — descomprime donde quieras, doble clic en `DataTools.exe`. |
|
||||||
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. | El AppImage ya es portable. |
|
||||||
|
|
||||||
|
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~300 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
||||||
|
|
||||||
|
**Tesseract OCR viene incluido.** El soporte para PDFs escaneados del Extractor de PDF funciona sin configuración adicional en las tres plataformas — no hace falta instalar Tesseract por separado. Atribución de licencia: ver [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
|
**Avisos del primer arranque (una sola vez):**
|
||||||
|
- **macOS** sin firma: clic derecho → **Abrir** → confirma. (Las compilaciones firmadas se lo saltan.)
|
||||||
|
- **Windows** SmartScreen: pulsa **Más información** → **Ejecutar de todas formas**.
|
||||||
|
|
||||||
|
Guía detallada de instalación y resolución de problemas: [Guía del usuario §1](docs/USER-GUIDE.es.md#1-instalaci%C3%B3n).
|
||||||
|
|
||||||
|
## Instalar desde el código (desarrolladores)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Requiere Python 3.10+.
|
||||||
|
|
||||||
|
## Ejecutar
|
||||||
|
|
||||||
|
**GUI** (recomendado):
|
||||||
|
```bash
|
||||||
|
streamlit run src/gui/app.py
|
||||||
|
```
|
||||||
|
|
||||||
|
**CLI** — siete puntos de entrada:
|
||||||
|
```bash
|
||||||
|
python -m src.cli customers.csv [--apply] # deduplicación
|
||||||
|
python -m src.cli_text_clean messy.csv [--apply] # limpieza de texto
|
||||||
|
python -m src.cli_format intl.csv [--apply] # estandarización de formatos (auto-stream si >100 MB)
|
||||||
|
python -m src.cli_missing holes.csv [--apply] # valores faltantes
|
||||||
|
python -m src.cli_column_map vendor.csv [--apply] # mapeador de columnas
|
||||||
|
python -m src.cli_pipeline any_file.csv [--apply] # encadena herramientas de extremo a extremo
|
||||||
|
python -m src.cli_analyze any_file.csv [--json] # solo escanea
|
||||||
|
```
|
||||||
|
|
||||||
|
Cada CLI ejecuta solo previsualización por defecto; añade `--apply` para escribir la salida.
|
||||||
|
|
||||||
|
## Idioma
|
||||||
|
|
||||||
|
La barra lateral de la GUI tiene un selector de idioma. Se incluyen paquetes para **English** y **Español** (`src/i18n/packs/`); la elección persiste durante la sesión. Para añadir un idioma: coloca un `<código>.json` junto a `en.json` reproduciendo el árbol de claves, y luego añádelo a `LANGUAGES`. Ver [Guía del desarrollador §i18n](docs/DEVELOPER.md#i18n--language-packs) (solo en inglés).
|
||||||
|
|
||||||
|
## Verificación de Revisar y Normalizar
|
||||||
|
|
||||||
|
Cada archivo subido pasa por una verificación de normalización CSV antes de que cualquier herramienta lo toque. El analizador detecta ~15 tipos de problemas (espacios, caracteres NBSP / de ancho cero, BOM, codificación, puntuación tipográfica, encabezados sucios, centinelas nulos, mojibake, …) etiquetados por **confianza** (alta / media / baja) y **acción de corrección**. La GUI muestra cada hallazgo con Corregir auto / Saltar / Personalizar, una previsualización antes/después en vivo, y un selector para anular la codificación. Las páginas de herramientas se niegan a cargar hasta que se pase la verificación.
|
||||||
|
|
||||||
|
## Salida
|
||||||
|
|
||||||
|
Cada ejecución escribe:
|
||||||
|
|
||||||
|
- `{input}_<tool>.csv` — los datos limpios
|
||||||
|
- `{input}_changes.csv` (limpiador de texto) o `{input}_match_groups.csv` (duplicados) — pista de auditoría
|
||||||
|
- `logs/<tool>_YYYYMMDD_HHMMSS.log` — registro de depuración de la ejecución
|
||||||
|
|
||||||
|
El archivo de entrada original nunca se modifica.
|
||||||
|
|
||||||
|
## Documentación
|
||||||
|
|
||||||
|
- [Guía del usuario](docs/USER-GUIDE.es.md) — instalación, flujo de la GUI, verificación
|
||||||
|
- [Referencia de la CLI](docs/CLI-REFERENCE.es.md) — cada bandera con recetas
|
||||||
|
- [Requisitos](docs/REQUIREMENTS.md) — tamaños de archivo, codificaciones, detectores, objetivos de rendimiento (solo en inglés)
|
||||||
|
- [Técnico](docs/TECHNICAL.md) — arquitectura, internos de la verificación, registro de correcciones (solo en inglés)
|
||||||
|
- [Guía del desarrollador](docs/DEVELOPER.md) — añadir correcciones / detectores / estandarizadores (solo en inglés)
|
||||||
|
|
||||||
|
## Dependencias
|
||||||
|
|
||||||
|
`pandas`, `openpyxl`, `rapidfuzz`, `phonenumbers`, `typer`, `loguru`, `charset-normalizer`, `streamlit`. Opcional: `ftfy` para reparación de mojibake.
|
||||||
|
|
||||||
|
## Licencia
|
||||||
|
|
||||||
|
Propietaria.
|
||||||
204
README.md
204
README.md
@@ -1,175 +1,103 @@
|
|||||||
|
> 🌐 **Language:** English · [Español](README.es.md)
|
||||||
|
|
||||||
# DataTools
|
# DataTools
|
||||||
|
|
||||||
A bundle of Python data-cleaning tools for CSV and Excel files. Two scripts ship today; more are in build.
|
Local CSV / Excel cleaning. CLI + browser GUI, no cloud, no install ceremony. GUI ships with English and Spanish language packs.
|
||||||
|
|
||||||
| # | Tool | What it does |
|
## Tools
|
||||||
|
|
||||||
|
| # | Tool | Status |
|
||||||
|
|---|------|--------|
|
||||||
|
| 01 | **Find Duplicates** — exact + fuzzy match, 5 normalizers, survivor rules, audit | Ready |
|
||||||
|
| 02 | **Clean Text** — whitespace, smart chars, BOM, line endings, case ops | Ready |
|
||||||
|
| 03 | **Standardize Formats** — dates, phones, emails, addresses, names, currencies, booleans | Ready |
|
||||||
|
| 04 | **Fix Missing Values** — disguised-null detection, profile, mean/median/mode/ffill/bfill/interpolate, drop strategies | Ready |
|
||||||
|
| 05 | **Map Columns** — fuzzy auto-rename, target schema with type coercion, required fields with defaults, drop/reorder | Ready |
|
||||||
|
| 06 | Find Unusual Values | Coming Soon |
|
||||||
|
| 07 | Combine Files | Coming Soon |
|
||||||
|
| 08 | Quality Check | Coming Soon |
|
||||||
|
| 09 | **Automated Workflows** — chain tools with recommended (not forced) order, save/load JSON, automate weekly cleanups | Ready |
|
||||||
|
|
||||||
|
Every tool page has an in-tool **Help** popover (right of the title) with a compact When-to-use / Steps / Examples / Tip card. Copy lives in the language packs (`tools.<id>.help_md`).
|
||||||
|
|
||||||
|
## Download (non-technical users)
|
||||||
|
|
||||||
|
Pre-built bundles — no Python install, no admin rights, no internet at runtime. Each release ships two flavors per OS: an **installer** that wires up Desktop + Start Menu / Launchpad shortcuts, and a **portable .zip** you unzip and double-click. Pick whichever your IT policy allows.
|
||||||
|
|
||||||
|
| Platform | Installer (recommended) | Portable (no install) |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| 01 | **Deduplicator** | Find and remove duplicate rows with exact + fuzzy matching, smart normalization, and interactive review. |
|
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — open, drag DataTools.app into /Applications, launch from Launchpad. | `DataTools-X.Y.Z-mac-portable.zip` — unzip anywhere, double-click `DataTools.app`. |
|
||||||
| 02 | **Text Cleaner** | Trim whitespace, fold smart quotes, strip invisible / control characters, normalize Unicode, normalize line endings, optional case conversion. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. | `DataTools-X.Y.Z-win-portable.zip` — unzip anywhere, double-click `DataTools.exe`. |
|
||||||
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. | The AppImage is already portable. |
|
||||||
|
|
||||||
## Deduplicator
|
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~300 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
||||||
|
|
||||||
## Features
|
**Tesseract OCR is bundled.** Scanned-PDF support in the PDF Extractor works out of the box on all three platforms — no separate Tesseract install required. License attribution: see [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
- **Zero-config start** — auto-detects encoding, delimiters, headers, and match columns
|
**First-launch warnings (one-time):**
|
||||||
- **Fuzzy matching** — Jaro-Winkler, Levenshtein, and token set ratio algorithms
|
- **macOS** unsigned builds: right-click → **Open** → confirm. (Signed builds skip this.)
|
||||||
- **5 built-in normalizers** — email (Gmail dot/plus), phone (E.164), name (titles/suffixes), address (USPS), string (whitespace/case)
|
- **Windows** SmartScreen: click **More info** → **Run anyway**.
|
||||||
- **Merge mode** — fill missing fields in the surviving row from removed duplicates
|
|
||||||
- **4 survivor rules** — keep first, last, most complete, or most recent row per group
|
|
||||||
- **Interactive review** — inspect match groups with inline checkboxes and column dropdowns, cherry-pick values, preview surviving rows live
|
|
||||||
- **Config profiles** — save and reload your settings as JSON for repeatable runs
|
|
||||||
- **Dual interface** — full CLI for automation, Streamlit GUI for visual review
|
|
||||||
- **Dry-run by default** — preview what would change before writing anything
|
|
||||||
- **Audit trail** — every run produces a match groups report and timestamped log
|
|
||||||
|
|
||||||
## Quick Start
|
Detailed install + troubleshooting walkthrough: [User Guide §1](docs/USER-GUIDE.md#1-install).
|
||||||
|
|
||||||
### Install
|
## Install from source (developers)
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### CLI
|
Python 3.10+ required.
|
||||||
|
|
||||||
```bash
|
## Run
|
||||||
# Preview duplicates (dry run — no files written)
|
|
||||||
python -m src.cli customers.csv
|
|
||||||
|
|
||||||
# Remove duplicates and save the result
|
|
||||||
python -m src.cli customers.csv --apply
|
|
||||||
|
|
||||||
# Fuzzy-match names at 80% similarity, merge missing fields
|
|
||||||
python -m src.cli customers.csv --fuzzy name --threshold 80 --merge --apply
|
|
||||||
|
|
||||||
# Interactively review each match group
|
|
||||||
python -m src.cli customers.csv --review --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
### GUI
|
|
||||||
|
|
||||||
|
**GUI** (recommended):
|
||||||
```bash
|
```bash
|
||||||
streamlit run src/gui/app.py
|
streamlit run src/gui/app.py
|
||||||
```
|
```
|
||||||
|
|
||||||
Upload a file, click **Find Duplicates**, review match groups side-by-side, then download the cleaned result.
|
**CLI** — seven entry points:
|
||||||
|
|
||||||
## CLI Usage Summary
|
|
||||||
|
|
||||||
```
|
|
||||||
python -m src.cli INPUT_FILE [OPTIONS]
|
|
||||||
|
|
||||||
Options:
|
|
||||||
--apply Write output files (default: preview only)
|
|
||||||
--output, -o PATH Output file path
|
|
||||||
--subset, -s COLS Columns to match on (comma-separated)
|
|
||||||
--key, -k COLS Strong-key columns for exact matching
|
|
||||||
--fuzzy COLS Columns to fuzzy-match
|
|
||||||
--algorithm, -a ALG levenshtein | jaro_winkler | token_set_ratio
|
|
||||||
--threshold, -t N Similarity threshold 0-100 (default: 85)
|
|
||||||
--normalize COL:TYPE Per-column normalizers (e.g., email:email,phone:phone)
|
|
||||||
--survivor RULE first | last | most-complete | most-recent
|
|
||||||
--merge Fill missing fields from removed duplicates
|
|
||||||
--review Interactively review each match group
|
|
||||||
--config PATH Load settings from a JSON config file
|
|
||||||
--save-config PATH Save current settings to JSON
|
|
||||||
--sheet NAME Excel sheet name or 0-based index
|
|
||||||
--encoding ENC Override auto-detected encoding
|
|
||||||
--header-row N 0-based header row index
|
|
||||||
--help Show full help
|
|
||||||
```
|
|
||||||
|
|
||||||
## Sample Output
|
|
||||||
|
|
||||||
```
|
|
||||||
$ python -m src.cli samples/messy_sales.csv
|
|
||||||
|
|
||||||
Reading messy_sales.csv...
|
|
||||||
50 rows, 8 columns
|
|
||||||
Finding duplicates...
|
|
||||||
|
|
||||||
──────────────────────────────────────────────────
|
|
||||||
File: messy_sales.csv
|
|
||||||
Rows in: 50
|
|
||||||
Rows out: 28
|
|
||||||
Removed: 22
|
|
||||||
Groups: 22
|
|
||||||
──────────────────────────────────────────────────
|
|
||||||
|
|
||||||
Match groups:
|
|
||||||
Group 1: rows [1, 2] → keep row 1 (confidence: 100.0%, matched on: email)
|
|
||||||
Group 2: rows [3, 4] → keep row 3 (confidence: 92.3%, matched on: name, phone)
|
|
||||||
...
|
|
||||||
|
|
||||||
This was a preview. Add --apply to write the output files.
|
|
||||||
```
|
|
||||||
|
|
||||||
## Output Files
|
|
||||||
|
|
||||||
When `--apply` is used, three files are produced:
|
|
||||||
|
|
||||||
| File | Contents |
|
|
||||||
|------|----------|
|
|
||||||
| `{input}_deduplicated.csv` | Cleaned data with duplicates removed |
|
|
||||||
| `{input}_removed.csv` | Rows that were removed |
|
|
||||||
| `{input}_match_groups.csv` | Audit trail: group ID, confidence, matched columns, survivor flag |
|
|
||||||
|
|
||||||
## Text Cleaner
|
|
||||||
|
|
||||||
Character-level hygiene for messy CSV / Excel input. Solves the dirty-data failure modes that silently break VLOOKUPs, dedup runs, and downstream imports:
|
|
||||||
|
|
||||||
- Trailing / leading whitespace and tabs in cells
|
|
||||||
- Non-breaking spaces (`U+00A0`) hiding inside text where regular spaces should be
|
|
||||||
- Smart quotes pasted from Word (`“` `”` `‘` `’` → `"` `"` `'` `'`)
|
|
||||||
- Em / en dashes, ellipsis, other typographic Unicode
|
|
||||||
- Zero-width and bidi-mark characters (`U+200B`, `U+200C`, `U+200D`, etc.)
|
|
||||||
- BOMs from Excel "Save As CSV UTF-8"
|
|
||||||
- Mixed line endings (`\r\n`, bare `\r`) inside multi-line cells
|
|
||||||
- Control characters (`U+0000`-`U+001F` minus `\t \n \r`)
|
|
||||||
- Optional Unicode NFC / NFKC normalization
|
|
||||||
- Optional per-column case conversion (UPPER / lower / smart Title / Sentence)
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Preview what would change (dry-run)
|
python -m src.cli customers.csv [--apply] # dedup
|
||||||
python -m src.cli_text_clean samples/messy_text.csv
|
python -m src.cli_text_clean messy.csv [--apply] # text clean
|
||||||
|
python -m src.cli_format intl.csv [--apply] # format standardize (auto-streams >100 MB)
|
||||||
# Apply the safe defaults
|
python -m src.cli_missing holes.csv [--apply] # missing values
|
||||||
python -m src.cli_text_clean samples/messy_text.csv --apply
|
python -m src.cli_column_map vendor.csv [--apply] # column mapper
|
||||||
|
python -m src.cli_pipeline any_file.csv [--apply] # chain tools end-to-end
|
||||||
# Title-case the name column, upper-case the SKU column
|
python -m src.cli_analyze any_file.csv [--json] # scan only
|
||||||
python -m src.cli_text_clean products.csv --case title:name,upper:sku --apply
|
|
||||||
|
|
||||||
# Just trim and collapse — nothing fancy
|
|
||||||
python -m src.cli_text_clean messy.csv --preset minimal --apply
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Three presets: `minimal` (trim + collapse only), `excel-hygiene` (default; everything safe ON), `paranoid` (adds lossy NFKC fold).
|
Every CLI runs preview-only by default; add `--apply` to write output.
|
||||||
|
|
||||||
Outputs `{input}_cleaned.csv` plus a per-cell `{input}_changes.csv` audit (row, column, old, new, ops applied).
|
## Language
|
||||||
|
|
||||||
See [docs/CLI-REFERENCE.md](docs/CLI-REFERENCE.md#text-cleaner-cli) for every flag.
|
The GUI sidebar has a language picker. Packs ship for **English** and **Español** (`src/i18n/packs/`); the choice persists for the session. Adding a language: drop a `<code>.json` next to `en.json` mirroring its key tree, then list it in `LANGUAGES`. See [Developer Guide §i18n](docs/DEVELOPER.md#i18n--language-packs).
|
||||||
|
|
||||||
## Review & Normalize gate
|
## Review & Normalize gate
|
||||||
|
|
||||||
Every uploaded file passes through a CSV-normalization gate before any tool page sees it. The analyzer scans for ~15 issue types — whitespace pollution, NBSP / zero-width chars, mixed line endings, BOM artifacts, encoding misdetections, smart punctuation, dirty headers, null sentinels, mojibake, and more — and tags each finding by **confidence** (high / medium / low) and **fix action** (the algorithm in `src/core/fixes.py` that resolves it).
|
Every uploaded file passes through a CSV-normalization gate before any tool sees it. The analyzer flags ~15 issue types (whitespace, NBSP / zero-width chars, BOM, encoding, smart punct, dirty headers, null sentinels, mojibake, …) tagged by **confidence** (high / medium / low) and **fix action**. The GUI shows each finding with Auto-fix / Skip / Customize, a live before/after preview, and an encoding-override picker. Tool pages refuse to load until the gate passes.
|
||||||
|
|
||||||
In the GUI, the **Review & Normalize** page renders one expandable card per finding with a decision control (Auto-fix / Skip / Customize), a live before-and-after preview, an encoding-override picker for misdetected codepages, and an Advanced output options block (encoding, delimiter, line terminator) for the download. Tool pages refuse to load until the gate passes.
|
## Output
|
||||||
|
|
||||||
See [docs/USER-GUIDE.md §3.3](docs/USER-GUIDE.md) for the user-facing walkthrough and [docs/TECHNICAL.md §10.2.1–10.2.4](docs/TECHNICAL.md) for the developer-facing API.
|
Every run writes:
|
||||||
|
|
||||||
## Documentation
|
- `{input}_<tool>.csv` — the cleaned data
|
||||||
|
- `{input}_changes.csv` (text cleaner) or `{input}_match_groups.csv` (dedup) — audit trail
|
||||||
|
- `logs/<tool>_YYYYMMDD_HHMMSS.log` — debug-level run log
|
||||||
|
|
||||||
- [Requirements](docs/REQUIREMENTS.md) — short-form numbered list: file size, codepages, delimiters, detectors, performance targets
|
Original input file is never modified.
|
||||||
- [User Guide](docs/USER-GUIDE.md) — installation, GUI workflow, the Review & Normalize gate
|
|
||||||
- [CLI Reference](docs/CLI-REFERENCE.md) — every flag with examples and recipe sections
|
|
||||||
- [Technical](docs/TECHNICAL.md) — architecture, gate internals, finding schema, fix registry
|
|
||||||
- [Developer Guide](docs/DEVELOPER.md) — extending the bundle, adding fixes / detectors
|
|
||||||
|
|
||||||
## Requirements
|
## Docs
|
||||||
|
|
||||||
- Python 3.10+
|
- [User Guide](docs/USER-GUIDE.md) — install, GUI workflow, gate
|
||||||
- Dependencies: pandas, openpyxl, rapidfuzz, typer, phonenumbers, loguru, tqdm, charset-normalizer
|
- [CLI Reference](docs/CLI-REFERENCE.md) — every flag with recipes
|
||||||
|
- [Requirements](docs/REQUIREMENTS.md) — file sizes, encodings, detectors, perf targets
|
||||||
|
- [Technical](docs/TECHNICAL.md) — architecture, gate internals, fix registry
|
||||||
|
- [Developer Guide](docs/DEVELOPER.md) — adding fixes / detectors / standardizers
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
`pandas`, `openpyxl`, `rapidfuzz`, `phonenumbers`, `typer`, `loguru`, `charset-normalizer`, `streamlit`. Optional: `ftfy` for mojibake repair.
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
Proprietary. All rights reserved.
|
Proprietary.
|
||||||
|
|||||||
383
build/README.md
Normal file
383
build/README.md
Normal file
@@ -0,0 +1,383 @@
|
|||||||
|
# Build — DataTools desktop installer
|
||||||
|
|
||||||
|
> Cross-platform PyInstaller bundle for Mac / Windows / Linux. The
|
||||||
|
> single deliverable the buyer downloads from Gumroad.
|
||||||
|
> **Owner**: Michael · **Updated**: 2026-05-01
|
||||||
|
|
||||||
|
This directory is the build pipeline. Source of truth for the bundle
|
||||||
|
shape, hidden-import lists, per-platform recipes, and the launcher
|
||||||
|
that boots Streamlit inside the bundle.
|
||||||
|
|
||||||
|
## Files
|
||||||
|
|
||||||
|
```
|
||||||
|
build/
|
||||||
|
├── launcher.py Entry point PyInstaller wraps. Boots a local
|
||||||
|
│ Streamlit server, opens browser, locks server
|
||||||
|
│ to 127.0.0.1 so the privacy claim holds.
|
||||||
|
├── datatools.spec PyInstaller spec — hidden imports, data files,
|
||||||
|
│ Mac .app bundle config. Reads the version
|
||||||
|
│ from src/__init__.py.
|
||||||
|
├── installer.iss Inno Setup script — Windows .exe installer.
|
||||||
|
│ Adds Start Menu + Desktop + App Paths entries.
|
||||||
|
├── generate_icons.py Builds icon.ico / icon.icns / icon.png from
|
||||||
|
│ src/gui/assets/datatools_icon_256.png. Run
|
||||||
|
│ once before pyinstaller (CI does this).
|
||||||
|
├── build_portable_zip.py Cross-platform: zips dist/DataTools/ into a
|
||||||
|
│ no-install portable download. Used by the
|
||||||
|
│ Windows + Linux portable artifacts.
|
||||||
|
├── macos/
|
||||||
|
│ ├── build_dmg.sh Wraps dist/DataTools.app into a .dmg with a
|
||||||
|
│ │ drag-to-/Applications layout (installer).
|
||||||
|
│ └── build_zip.sh Wraps dist/DataTools.app into a portable
|
||||||
|
│ .zip via ditto (preserves bundle metadata).
|
||||||
|
├── appimage/
|
||||||
|
│ ├── AppRun Entry point invoked when the AppImage runs.
|
||||||
|
│ ├── datatools.desktop Linux desktop-entry metadata.
|
||||||
|
│ └── build.sh Wraps dist/DataTools/ into an .AppImage.
|
||||||
|
├── hooks/ PyInstaller hooks for libs the static analyser
|
||||||
|
│ └── hook-streamlit.py misses (Streamlit's dynamic imports).
|
||||||
|
├── icon.{ico,icns,png} Generated by generate_icons.py — gitignored.
|
||||||
|
└── README.md this file
|
||||||
|
```
|
||||||
|
|
||||||
|
## Distribution outputs per platform
|
||||||
|
|
||||||
|
Each CI run produces two downloads per platform — an installer for
|
||||||
|
buyers who want shortcuts wired automatically, and a portable .zip
|
||||||
|
for buyers (or IT-locked-down machines) that can't run installers:
|
||||||
|
|
||||||
|
| Platform | Installer | Portable |
|
||||||
|
|----------|----------------------------------------|------------------------------------------------|
|
||||||
|
| macOS | `DataTools-<ver>-mac.dmg` | `DataTools-<ver>-mac-portable.zip` (ditto .app)|
|
||||||
|
| Windows | `DataTools-<ver>-win-setup.exe` | `DataTools-<ver>-win-portable.zip` |
|
||||||
|
| Linux | `DataTools-<ver>-linux-x86_64.AppImage`| (the AppImage IS the portable) |
|
||||||
|
|
||||||
|
All five outputs are self-contained: every dependency (Python, pandas,
|
||||||
|
streamlit, pdfplumber, **Tesseract OCR + `eng.traineddata`**, the lot)
|
||||||
|
is frozen into the bundle. The buyer does not need to install Python,
|
||||||
|
pip, Tesseract, or anything else first. With Tesseract bundled, each
|
||||||
|
artifact is roughly **250–300 MB** on disk (up from ~120 MB pre-OCR);
|
||||||
|
unpacked installs run ~300–400 MB once scratch space is counted.
|
||||||
|
|
||||||
|
## Easy-launch surface
|
||||||
|
|
||||||
|
| Affordance | Windows | macOS |
|
||||||
|
|------------------|--------------------------------------------------|------------------------------------------------------|
|
||||||
|
| Desktop shortcut | Inno Setup `desktopicon` task (checked default) | The .app bundle in /Applications is the icon |
|
||||||
|
| App menu | Start Menu → DataTools (always installed) | Launchpad + Spotlight (auto from /Applications) |
|
||||||
|
| Taskbar / Dock | User pins manually (OS forbids programmatic pin) | User pins manually after first launch |
|
||||||
|
| Run from terminal| `DataTools` (registered via App Paths) | `open -a DataTools` (auto from .app bundle) |
|
||||||
|
|
||||||
|
CI: `.github/workflows/build.yml` runs the full pipeline on tag push
|
||||||
|
(matrix: macos-latest, windows-latest, ubuntu-latest) and attaches
|
||||||
|
the resulting installers to a GitHub Release. Manual
|
||||||
|
`workflow_dispatch` runs upload them as workflow artifacts only.
|
||||||
|
|
||||||
|
## Releasing
|
||||||
|
|
||||||
|
### Single-command local build (recommended for one-developer workflow)
|
||||||
|
|
||||||
|
PyInstaller can't cross-compile, so a single machine produces one
|
||||||
|
platform's packages. Run this on each target OS:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# One-time setup per machine:
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller pillow
|
||||||
|
# Windows only: install Inno Setup from https://jrsoftware.org/isdl.php
|
||||||
|
# Linux only: drop appimagetool onto PATH (see preflight output)
|
||||||
|
|
||||||
|
# Build everything for the current OS:
|
||||||
|
python build/make_release.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Outputs land in `dist/`:
|
||||||
|
- Windows host → `DataTools-<ver>-win-setup.exe` + `DataTools-<ver>-win-portable.zip`
|
||||||
|
- macOS host → `DataTools-<ver>-mac.dmg` + `DataTools-<ver>-mac-portable.zip`
|
||||||
|
- Linux host → `DataTools-<ver>-linux-x86_64.AppImage`
|
||||||
|
|
||||||
|
Useful flags:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python build/make_release.py --preflight # check tooling, build nothing
|
||||||
|
python build/make_release.py --clean # wipe dist/ first
|
||||||
|
python build/make_release.py --skip-installer # just the portable zip
|
||||||
|
python build/make_release.py --skip-portable # just the installer
|
||||||
|
```
|
||||||
|
|
||||||
|
### CI build (push tag → GitHub Release)
|
||||||
|
|
||||||
|
If you have CI runners for all three OSes:
|
||||||
|
|
||||||
|
1. Bump `__version__` in `src/__init__.py`.
|
||||||
|
2. `git commit -am "release: vX.Y.Z" && git tag vX.Y.Z`.
|
||||||
|
3. `git push && git push --tags`.
|
||||||
|
4. CI builds all three platforms and creates a Release with the
|
||||||
|
installers + portable zips attached.
|
||||||
|
5. Mirror the Release assets to Gumroad (manual until v2).
|
||||||
|
|
||||||
|
## Signing (Phase 2 — needs accounts/credentials)
|
||||||
|
|
||||||
|
Both code-signing steps are intentionally not in CI yet because they
|
||||||
|
require credentials the owner sets up first.
|
||||||
|
|
||||||
|
**macOS** — Apple Developer Program enrollment ($99/yr). Once enrolled,
|
||||||
|
add these GitHub Secrets and uncomment the `codesign` + `notarytool`
|
||||||
|
steps in `build.yml`:
|
||||||
|
|
||||||
|
| Secret | Value |
|
||||||
|
|---|---|
|
||||||
|
| `MACOS_DEVELOPER_ID_CERT_P12_BASE64` | base64-encoded `.p12` cert |
|
||||||
|
| `MACOS_DEVELOPER_ID_CERT_PASSWORD` | password for the .p12 |
|
||||||
|
| `MACOS_NOTARY_APPLE_ID` | Apple ID email |
|
||||||
|
| `MACOS_NOTARY_TEAM_ID` | 10-char team ID |
|
||||||
|
| `MACOS_NOTARY_PASSWORD` | app-specific password |
|
||||||
|
|
||||||
|
**Windows** — Code-signing cert from Sectigo / DigiCert (~$200-400/yr,
|
||||||
|
or ~$300-500 for an EV cert that bypasses SmartScreen). Add:
|
||||||
|
|
||||||
|
| Secret | Value |
|
||||||
|
|---|---|
|
||||||
|
| `WINDOWS_CERT_PFX_BASE64` | base64-encoded `.pfx` cert |
|
||||||
|
| `WINDOWS_CERT_PASSWORD` | password for the .pfx |
|
||||||
|
|
||||||
|
Until those are wired, buyers will see:
|
||||||
|
- macOS: "DataTools is damaged and can't be opened" — fix by removing
|
||||||
|
the quarantine attribute (`xattr -cr /Applications/DataTools.app`).
|
||||||
|
Acceptable for the technical buyer; **blocking** for the
|
||||||
|
non-technical buyer. Don't ship to non-technical without notarization.
|
||||||
|
- Windows: SmartScreen "Windows protected your PC" — buyer clicks
|
||||||
|
"More info → Run anyway". Friction but not blocking.
|
||||||
|
- Linux: AppImage runs without complaint (Linux has no equivalent
|
||||||
|
trust-store).
|
||||||
|
|
||||||
|
## Per-platform recipe
|
||||||
|
|
||||||
|
Each platform builds on its own machine — PyInstaller does **not**
|
||||||
|
cross-compile. Pick the platform that matches the bundle you need.
|
||||||
|
GitHub Actions matrix runners are the simplest way to produce all
|
||||||
|
three from one push (see "CI build" under "Releasing").
|
||||||
|
|
||||||
|
### Mac (Intel + Apple Silicon, universal2)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# One-time:
|
||||||
|
pyenv install 3.12
|
||||||
|
pyenv local 3.12
|
||||||
|
python -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller
|
||||||
|
|
||||||
|
# Build:
|
||||||
|
pyinstaller build/datatools.spec --clean
|
||||||
|
|
||||||
|
# Output:
|
||||||
|
# dist/DataTools/ — folder mode (faster cold start)
|
||||||
|
# dist/DataTools.app/ — macOS .app bundle (drag-drop into /Applications)
|
||||||
|
|
||||||
|
# Sign + notarize (after Apple Developer Program enrollment per BUSINESS.md §10):
|
||||||
|
codesign --deep --force --options runtime \
|
||||||
|
--sign "Developer ID Application: <YOUR-NAME> (<TEAMID>)" \
|
||||||
|
dist/DataTools.app
|
||||||
|
|
||||||
|
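# Notarize. notarytool only accepts .zip / .dmg / .pkg uploads, so zip
|
||||||
|
# the signed .app first:
|
||||||
|
ditto -c -k --keepParent dist/DataTools.app dist/DataTools-notarize.zip
|
||||||
|
xcrun notarytool submit dist/DataTools-notarize.zip \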
|
||||||
|
--apple-id "<YOUR-APPLE-ID>" \
|
||||||
|
--team-id "<TEAMID>" \
|
||||||
|
--password "<APP-SPECIFIC-PASSWORD>" \
|
||||||
|
--wait
|
||||||
|
|
||||||
|
# Staple the notarization ticket so Gatekeeper sees it offline:
|
||||||
|
xcrun stapler staple dist/DataTools.app
|
||||||
|
|
||||||
|
# Wrap for distribution:
|
||||||
|
hdiutil create -volname "DataTools" -srcfolder dist/DataTools.app \
|
||||||
|
-ov -format UDZO dist/DataTools-1.0.0-mac.dmg
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
# One-time:
|
||||||
|
py -3.12 -m venv .venv
|
||||||
|
.venv\Scripts\activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller
|
||||||
|
|
||||||
|
# Build:
|
||||||
|
pyinstaller build\datatools.spec --clean
|
||||||
|
|
||||||
|
# Output:
|
||||||
|
# dist\DataTools\ — folder mode
|
||||||
|
# dist\DataTools\DataTools.exe
|
||||||
|
|
||||||
|
# Wrap with Inno Setup (free):
|
||||||
|
# 1. Install Inno Setup (https://jrsoftware.org/isdl.php)
|
||||||
|
# 2. Create installer.iss next to this README:
|
||||||
|
# [Setup]
|
||||||
|
# AppName=DataTools
|
||||||
|
# AppVersion=1.0.0
|
||||||
|
# DefaultDirName={autopf}\DataTools
|
||||||
|
# OutputDir=..\..\dist
|
||||||
|
# OutputBaseFilename=DataTools-1.0.0-win-setup
|
||||||
|
# Compression=lzma
|
||||||
|
# SolidCompression=yes
|
||||||
|
# [Files]
|
||||||
|
# Source: "..\..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs
|
||||||
|
# [Icons]
|
||||||
|
# Name: "{autoprograms}\DataTools"; Filename: "{app}\DataTools.exe"
|
||||||
|
# 3. Compile: ISCC.exe build\installer.iss
|
||||||
|
|
||||||
|
# Code-sign (optional but reduces SmartScreen warnings):
|
||||||
|
# Use signtool with a code-signing cert (Sectigo / DigiCert).
|
||||||
|
# Without signing, buyer sees "Windows protected your PC" once;
|
||||||
|
# they click "More info → Run anyway." Acceptable for v1.
|
||||||
|
```
|
||||||
|
|
||||||
|
### Linux (AppImage)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3.12 -m venv .venv
|
||||||
|
source .venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller
|
||||||
|
|
||||||
|
pyinstaller build/datatools.spec --clean
|
||||||
|
# dist/DataTools/ — folder mode
|
||||||
|
|
||||||
|
# Wrap as AppImage (single-file portable app):
|
||||||
|
# 1. Download appimagetool from https://appimage.org/
|
||||||
|
# 2. Set up the AppDir layout:
|
||||||
|
# DataTools.AppDir/
|
||||||
|
# ├── AppRun (launcher script, copied from build/appimage/AppRun)
|
||||||
|
# ├── datatools.desktop (icon + entry config)
|
||||||
|
# ├── datatools.png (name must match Icon= in the .desktop)
|
||||||
|
# └── usr/bin/DataTools/ (copy of dist/DataTools/)
|
||||||
|
# 3. ./appimagetool DataTools.AppDir dist/DataTools-1.0.0-linux-x86_64.AppImage
|
||||||
|
```
|
||||||
|
|
||||||
|
## CI build (recommended once the spec is stable)
|
||||||
|
|
||||||
|
`.github/workflows/build.yml` (template):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
name: Build installers
|
||||||
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
|
push:
|
||||||
|
tags: [ 'v*' ]
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
os: [macos-latest, windows-latest, ubuntu-latest]
|
||||||
|
runs-on: ${{ matrix.os }}
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- uses: actions/setup-python@v5
|
||||||
|
with: { python-version: '3.12' }
|
||||||
|
- run: pip install -r requirements.txt pyinstaller
|
||||||
|
- run: pyinstaller build/datatools.spec --clean
|
||||||
|
- uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: DataTools-${{ matrix.os }}
|
||||||
|
path: dist/
|
||||||
|
```
|
||||||
|
|
||||||
|
Mac code-signing in CI requires the cert + private key as a GitHub
|
||||||
|
secret (encoded with `base64`). Detailed walkthrough belongs in a
|
||||||
|
later doc — for v1, sign locally and upload to GitHub Releases.
|
||||||
|
|
||||||
|
## Tesseract bundling (PDF Extractor OCR)
|
||||||
|
|
||||||
|
Frozen artifacts ship a per-platform Tesseract binary plus the English
|
||||||
|
`eng.traineddata` model so scanned-PDF support in the PDF Extractor
|
||||||
|
works out of the box — no separate user install. Source / pip
|
||||||
|
developer setups still need system Tesseract on `PATH`.
|
||||||
|
|
||||||
|
**Layout inside the bundle**:
|
||||||
|
|
||||||
|
```
|
||||||
|
DataTools/ (or DataTools.app/Contents/MacOS/)
|
||||||
|
└── tesseract/
|
||||||
|
├── tesseract (Linux/macOS binary; tesseract.exe on Windows)
|
||||||
|
└── tessdata/
|
||||||
|
└── eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
The runtime resolver (in `src/`, owned by the runtime team) walks:
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||||
|
2. `Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"` — frozen
|
||||||
|
bundles only.
|
||||||
|
3. `tesseract` on `PATH`.
|
||||||
|
4. Windows well-known paths.
|
||||||
|
|
||||||
|
**Where the bytes come from**:
|
||||||
|
|
||||||
|
- **Tessdata** — vendored in-repo at `build/vendor/tessdata/eng.traineddata`
|
||||||
|
(sourced from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best)).
|
||||||
|
`datatools.spec` copies it into `tesseract/tessdata/`.
|
||||||
|
- **Binary** — fetched per-platform at build time by
|
||||||
|
`build/make_release.py` from pinned upstream URLs. Current pin:
|
||||||
|
**Tesseract 5.5.0**.
|
||||||
|
|
||||||
|
**Updating Tesseract**:
|
||||||
|
|
||||||
|
1. Bump the version pin and the per-platform fetch URLs in
|
||||||
|
`build/make_release.py`.
|
||||||
|
2. If the model schema changed upstream, refresh
|
||||||
|
`build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the
|
||||||
|
matching tag.
|
||||||
|
3. Rebuild on each platform (`python build/make_release.py`) and
|
||||||
|
smoke-test a scanned PDF through the PDF Extractor.
|
||||||
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if upstream license
|
||||||
|
terms change (Apache-2.0 today).
|
||||||
|
|
||||||
|
License attribution for the bundled binary lives at
|
||||||
|
`LICENSE_TESSERACT.txt` at the repo root — it must ship alongside any
|
||||||
|
binary that contains Tesseract.
|
||||||
|
|
||||||
|
## Common pitfalls
|
||||||
|
|
||||||
|
| Symptom | Fix |
|
||||||
|
|---|---|
|
||||||
|
| Bundle is 800+ MB | Check the ``excludes`` list in ``datatools.spec``. ``matplotlib`` / ``scipy`` / ``tkinter`` are the usual suspects. |
|
||||||
|
| App launches, browser opens, page is blank | Streamlit's static assets aren't bundled. Re-run with `--log-level=DEBUG` and confirm the static dir was collected by `collect_data_files('streamlit')`. |
|
||||||
|
| App launches but logs ``ImportError: streamlit.runtime.X`` | Add ``X`` to ``hidden_imports`` in the spec or to ``hook-streamlit.py``. |
|
||||||
|
| Mac Gatekeeper says "DataTools is damaged and can't be opened" | The bundle wasn't signed + notarized. Don't ship to buyers without these — see the Mac recipe above. |
|
||||||
|
| Windows SmartScreen blocks first launch | Buyer clicks "More info → Run anyway". Code-signing reduces but doesn't eliminate this; for v1 it's an accepted friction. |
|
||||||
|
| Bundle works on dev machine but crashes on a clean machine | Likely a missing C runtime. On Windows, bundle the [VC++ redistributable](https://aka.ms/vs/17/release/vc_redist.x64.exe) into the installer so it is installed alongside the app. |
|
||||||
|
|
||||||
|
## Testing the bundle
|
||||||
|
|
||||||
|
Smoke-test on a **clean** machine (or VM) — your dev machine has too
|
||||||
|
much state to trust:
|
||||||
|
|
||||||
|
```
|
||||||
|
1. Boot a clean Mac / Win / Linux VM.
|
||||||
|
2. Copy the .dmg / .exe / .AppImage onto it.
|
||||||
|
3. Install / drag-drop into Applications / chmod +x.
|
||||||
|
4. Double-click the app icon.
|
||||||
|
5. Browser should open to http://127.0.0.1:850x within 5 seconds.
|
||||||
|
6. Drop samples/demo/shopify_pet_customers.csv into the
|
||||||
|
Automated Workflows page; click Run; AFTER preview should appear.
|
||||||
|
7. Confirm in the network tab: zero outbound calls except to
|
||||||
|
127.0.0.1 and the Streamlit static asset paths (also local).
|
||||||
|
```
|
||||||
|
|
||||||
|
Step 7 is the privacy-claim integrity check from
|
||||||
|
`docs/POST-LAUNCH.md` §6 — do this once per release, then trust it.
|
||||||
|
|
||||||
|
## Versioning
|
||||||
|
|
||||||
|
Bump the version string in three places per release:
|
||||||
|
|
||||||
|
- `__version__` in `src/__init__.py` (`datatools.spec` reads it for CFBundleVersion + CFBundleShortVersionString)
|
||||||
|
- the Inno Setup `AppVersion` line
|
||||||
|
- the AppImage filename (the `<version>` argument to `build/appimage/build.sh`)
|
||||||
|
|
||||||
|
Driving all three from `src/__init__.py` is a future
|
||||||
|
refactor — for v1 the three-spot update is fine.
|
||||||
8
build/appimage/AppRun
Executable file
8
build/appimage/AppRun
Executable file
@@ -0,0 +1,8 @@
|
|||||||
|
#!/usr/bin/env bash
# AppImage entry point. The AppImage runtime mounts the bundle and runs
# this script. We resolve the directory this script lives in (following
# symlinks) and exec the PyInstaller launcher under usr/bin/DataTools/
# by absolute path. No chdir happens, so the caller's working directory
# is left as-is; all command-line arguments are forwarded unchanged.

set -e

HERE="$(dirname -- "$(readlink -f -- "${0}")")"
exec "${HERE}/usr/bin/DataTools/DataTools" "$@"
|
||||||
67
build/appimage/build.sh
Executable file
67
build/appimage/build.sh
Executable file
@@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env bash
# Wrap dist/DataTools/ (PyInstaller folder mode) into a distributable
# AppImage.
#
# Usage (run from the repo root; every path below is relative to it):
#   bash build/appimage/build.sh <version>
#
# <version> defaults to 0.0.0-dev when omitted.
#
# Requires ``appimagetool`` on PATH (CI installs it; locally grab the
# latest release from https://github.com/AppImage/AppImageKit/releases).
#
# Output: dist/DataTools-<version>-linux-x86_64.AppImage
#
# Tesseract bundling: no-op here. The PyInstaller bundle in
# dist/DataTools/ already contains tesseract/{tesseract, *.so,
# tessdata/eng.traineddata} from the spec's datas; ``cp -R``
# below carries it along into the AppDir.

set -euo pipefail

VERSION="${1:-0.0.0-dev}"
DIST="dist/DataTools"
OUT="dist/DataTools-${VERSION}-linux-x86_64.AppImage"

# Fail early with actionable messages when prerequisites are missing.
if [[ ! -d "$DIST" ]]; then
    echo "Error: $DIST not found. Run pyinstaller build/datatools.spec first." >&2
    exit 1
fi

if ! command -v appimagetool >/dev/null 2>&1; then
    echo "Error: appimagetool not on PATH. See build/appimage/build.sh header." >&2
    exit 1
fi

# Lay out the AppDir inside a fresh temp directory that is removed on exit.
APPDIR="$(mktemp -d)/DataTools.AppDir"
trap 'rm -rf "$(dirname -- "$APPDIR")"' EXIT
# Export before anything else runs: the placeholder-icon Python snippet
# below reads APPDIR from its environment, and an un-exported shell
# variable is invisible to child processes.
export APPDIR
mkdir -p "$APPDIR/usr/bin"

cp -R "$DIST" "$APPDIR/usr/bin/"
cp build/appimage/AppRun "$APPDIR/AppRun"
chmod +x "$APPDIR/AppRun"
cp build/appimage/datatools.desktop "$APPDIR/datatools.desktop"

# Icon. AppImage requires a top-level <appname>.png next to the
# .desktop. Use build/icon.png if present, otherwise generate a
# blank placeholder so the build doesn't fail on a fresh checkout.
if [[ -f build/icon.png ]]; then
    cp build/icon.png "$APPDIR/datatools.png"
else
    # 256x256 single-colour PNG written by an inline Python script —
    # appimagetool needs *some* icon present. Replace with a real
    # 1024x1024 PNG before launch.
    python3 - <<'PY'
import os
import struct
import zlib


def chunk(tag, data):
    """Serialize one PNG chunk: length, tag, payload, CRC-32 of tag+payload."""
    crc = zlib.crc32(tag + data) & 0xffffffff
    return struct.pack(">I", len(data)) + tag + data + struct.pack(">I", crc)


W = H = 256
ihdr = struct.pack(">IIBBBBB", W, H, 8, 2, 0, 0, 0)  # 8-bit RGB
# Each scanline: filter byte 0 followed by W dark pixels.
raw = b"".join(b"\x00" + b"\x16\x19\x22" * W for _ in range(H))
idat = zlib.compress(raw, 9)
png = b"\x89PNG\r\n\x1a\n" + chunk(b"IHDR", ihdr) + chunk(b"IDAT", idat) + chunk(b"IEND", b"")
out = os.environ["APPDIR"] + "/datatools.png"
with open(out, "wb") as fh:
    fh.write(png)
PY
fi

ARCH=x86_64 appimagetool "$APPDIR" "$OUT"
echo "Built $OUT"
|
||||||
8
build/appimage/datatools.desktop
Normal file
8
build/appimage/datatools.desktop
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
[Desktop Entry]
|
||||||
|
Type=Application
|
||||||
|
Name=DataTools
|
||||||
|
Comment=Local CSV / Excel cleaning suite
|
||||||
|
Exec=DataTools
|
||||||
|
Icon=datatools
|
||||||
|
Categories=Office;Utility;
|
||||||
|
Terminal=false
|
||||||
69
build/build_portable_zip.py
Normal file
69
build/build_portable_zip.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
"""Wrap the PyInstaller folder build into a portable .zip.
|
||||||
|
|
||||||
|
Self-contained download: unzip → double-click the launcher → app runs.
|
||||||
|
No installer, no Python install, no admin rights required.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python build/build_portable_zip.py <platform> <version>
|
||||||
|
|
||||||
|
Where ``platform`` is one of ``win`` / ``mac`` / ``linux``. The
|
||||||
|
script just produces a generic ``dist/DataTools/`` zip; on macOS the
|
||||||
|
preferred portable format is the ``ditto``-wrapped .app — see
|
||||||
|
``build/macos/build_zip.sh`` for that flow. This helper exists mainly
|
||||||
|
for Windows + Linux, where there's no .app bundle to wrap.
|
||||||
|
|
||||||
|
Output:
|
||||||
|
dist/DataTools-<version>-<platform>-portable.zip
|
||||||
|
|
||||||
|
The zip root is the ``DataTools/`` folder so an unzip produces a
|
||||||
|
self-contained dir the user can drop anywhere (Desktop, USB stick,
|
||||||
|
network share). On Windows, the launcher is ``DataTools.exe`` inside
|
||||||
|
that folder; on Linux, ``DataTools``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
REPO = Path(__file__).resolve().parent.parent
|
||||||
|
DIST_DIR = REPO / "dist"
|
||||||
|
BUNDLE_DIR = DIST_DIR / "DataTools"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Zip the PyInstaller folder build into a portable archive.

    Reads ``<platform>`` and ``<version>`` from ``sys.argv[1]`` and
    ``sys.argv[2]`` and writes
    ``DataTools-<version>-<platform>-portable.zip`` into ``DIST_DIR``,
    with a single top-level ``DataTools/`` directory inside the archive.

    Returns:
        0 on success; 1 if ``BUNDLE_DIR`` does not exist; 2 on a usage
        error (missing arguments, or a platform other than
        ``win`` / ``mac`` / ``linux``).
    """
    usage = "usage: python build/build_portable_zip.py <platform> <version>\n"
    if len(sys.argv) < 3:
        sys.stderr.write(usage)
        return 2
    platform = sys.argv[1]
    version = sys.argv[2]

    # The platform only ends up in the output filename, so a typo would
    # otherwise "succeed" and publish a misnamed artifact. Reject it here,
    # before any filesystem work.
    valid_platforms = ("win", "mac", "linux")
    if platform not in valid_platforms:
        sys.stderr.write(
            f"unknown platform {platform!r}; expected one of "
            f"{' / '.join(valid_platforms)}\n"
        )
        sys.stderr.write(usage)
        return 2

    if not BUNDLE_DIR.is_dir():
        sys.stderr.write(
            f"Bundle dir not found at {BUNDLE_DIR}.\n"
            "Run ``pyinstaller build/datatools.spec --clean --noconfirm`` first.\n"
        )
        return 1

    out_stem = DIST_DIR / f"DataTools-{version}-{platform}-portable"
    # ``make_archive`` takes a base name (no extension) and produces
    # ``<base>.zip``. ``root_dir`` = parent of what we want compressed,
    # ``base_dir`` = the folder name inside the archive root. This
    # combo yields a single top-level ``DataTools/`` directory inside
    # the .zip rather than dumping its contents loose.
    archive = shutil.make_archive(
        base_name=str(out_stem),
        format="zip",
        root_dir=str(DIST_DIR),
        base_dir="DataTools",
    )
    size_mb = Path(archive).stat().st_size / (1024 * 1024)
    print(f"wrote {archive} ({size_mb:.1f} MB)")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
258
build/datatools.spec
Normal file
258
build/datatools.spec
Normal file
@@ -0,0 +1,258 @@
|
|||||||
|
# PyInstaller spec for DataTools.
|
||||||
|
#
|
||||||
|
# Build (from the repo root, after ``pip install pyinstaller``):
|
||||||
|
#
|
||||||
|
# pyinstaller build/datatools.spec
|
||||||
|
#
|
||||||
|
# Output: ``dist/DataTools/`` (folder mode; the launcher inside it is
|
||||||
|
# ``DataTools.exe`` on Windows, ``DataTools`` elsewhere), plus
|
||||||
|
# ``dist/DataTools.app`` on macOS from the BUNDLE step at the end of this
|
||||||
|
# file. See ``build/README.md`` for the full per-platform recipe.
|
||||||
|
#
|
||||||
|
# Why folder-mode (one-dir) is the default:
|
||||||
|
# * Streamlit's static assets + Python interpreter + ~300 MB of deps
|
||||||
|
# compress poorly into onefile. Onefile mode unpacks every launch
|
||||||
|
# to a temp dir — adds 5-15 s startup latency that confuses
|
||||||
|
# non-technical buyers ("did it crash?").
|
||||||
|
# * Folder mode lets the installer (Inno Setup on Win, .dmg on Mac)
|
||||||
|
# run a one-time copy. Subsequent launches are instant.
|
||||||
|
#
|
||||||
|
# Cross-platform note: this single spec file is built ON each target
|
||||||
|
# platform. Cross-compilation isn't supported — Mac builds need a
|
||||||
|
# Mac, Windows builds need a Windows machine (or a Windows GitHub
|
||||||
|
# Actions runner). See build/README.md for the matrix recipe.
|
||||||
|
|
||||||
|
# -*- mode: python ; coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from PyInstaller.utils.hooks import (
|
||||||
|
collect_all,
|
||||||
|
collect_data_files,
|
||||||
|
collect_submodules,
|
||||||
|
)
|
||||||
|
|
||||||
|
# PyInstaller injects SPECPATH (the directory holding this spec); the
# repo root is its parent.
REPO = Path(SPECPATH).resolve().parent

# Version string, scraped from the ``__version__ = "..."`` assignment in
# src/__init__.py with a regex instead of importing ``src`` — importing
# pulls in heavy deps (pandas etc) that the spec doesn't need. Falls
# back to "0.0.0" when no such assignment is found.
import re as _re

_version_match = _re.search(
    r'__version__\s*=\s*["\']([^"\']+)["\']',
    (REPO / "src" / "__init__.py").read_text(encoding="utf-8"),
)
if _version_match:
    VERSION = _version_match.group(1)
else:
    VERSION = "0.0.0"
|
||||||
|
|
||||||
|
# ----- Hidden imports ------------------------------------------------
|
||||||
|
# PyInstaller's static analyser misses everything Streamlit reaches
|
||||||
|
# through ``importlib`` and the per-tool registries our app uses. We
|
||||||
|
# exhaustively pull every submodule of the libraries that bridge
|
||||||
|
# user code to runtime — better a 50 MB-bigger bundle than a runtime
|
||||||
|
# ImportError on the buyer's machine.
|
||||||
|
|
||||||
|
# Every package listed here has ALL of its submodules added to
# ``hidden_imports``, in this order.
_HIDDEN_IMPORT_PACKAGES = (
    # Core app dependencies.
    "streamlit",
    "pandas",
    "phonenumbers",
    "rapidfuzz",
    "charset_normalizer",
    "openpyxl",
    "loguru",
    # PDF Extractor stack. ``pypdfium2`` also has its own PyInstaller
    # hook under ``build/hooks/`` for the native PDFium binary; listing
    # it here too is deliberate belt-and-braces.
    "pdfplumber",
    "pdfminer",
    "pypdfium2",
    "PIL",
    "pytesseract",
    # Our own engine + GUI modules. They are imported directly at the top
    # of ``launcher.py`` / ``app.py``, but Streamlit's session-state and
    # per-page discovery layers re-import via names PyInstaller doesn't see.
    "src",
)

hidden_imports: list[str] = []
for _hidden_pkg in _HIDDEN_IMPORT_PACKAGES:
    hidden_imports += collect_submodules(_hidden_pkg)
|
||||||
|
|
||||||
|
# ----- Data files ---------------------------------------------------
|
||||||
|
# Streamlit's static assets (the JS / CSS / fonts the browser fetches
|
||||||
|
# from the bundled HTTP server) are NOT Python files; PyInstaller
|
||||||
|
# can't auto-find them.
|
||||||
|
|
||||||
|
datas: list[tuple[str, str]] = []

# Non-Python resources that live inside third-party packages
# (``include_py_files=False`` keeps the .py sources out of the data set):
#   streamlit    — runtime assets
#   phonenumbers — country/area-code metadata shipped as resources
#   pypdfium2    — native PDFium shared library (.dll / .so / .dylib)
#                  under its package dir
#   pdfminer     — Adobe CMap tables used for character mapping
# The drawable-canvas frontend bundle is no longer collected now that
# the visual picker was removed.
for _data_pkg in ("streamlit", "phonenumbers", "pypdfium2", "pdfminer"):
    datas += collect_data_files(_data_pkg, include_py_files=False)

# Our application files. PyInstaller treats source as code (.pyc) by
# default; ``src`` is added again as data so the launcher's
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
# Each entry is (source path on disk, destination dir inside the bundle).
_app_datas = [
    (str(REPO / "src"), "src"),
    (str(REPO / "samples" / "demo"), "samples/demo"),
    (str(REPO / ".streamlit" / "config.toml"), ".streamlit"),
]
datas.extend(_app_datas)
|
||||||
|
|
||||||
|
# ----- Tesseract OCR bundle ----------------------------------------
|
||||||
|
# ``build/make_release.py`` stages the per-platform Tesseract binary
|
||||||
|
# + its runtime libs (DLLs/dylibs/sos) into
|
||||||
|
# ``build/_tesseract/<target>/`` and the shared eng.traineddata into
|
||||||
|
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
|
||||||
|
# drops them at the path the runtime expects:
|
||||||
|
#
|
||||||
|
# <bundle>/tesseract/tesseract[.exe]
|
||||||
|
# <bundle>/tesseract/<all dll/dylib/so deps>
|
||||||
|
# <bundle>/tesseract/tessdata/eng.traineddata
|
||||||
|
#
|
||||||
|
# The runtime discovery code in src/pdf_extract.py reads this layout
|
||||||
|
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
|
||||||
|
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
|
||||||
|
#
|
||||||
|
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
|
||||||
|
# the right per-platform dir before invoking PyInstaller. For ad-hoc
|
||||||
|
# `pyinstaller build/datatools.spec` runs without the orchestrator,
|
||||||
|
# fall back to the canonical staging path.
|
||||||
|
# Resolve the Tesseract staging dir: the DATATOOLS_TESS_STAGING env var
# wins when set to a non-empty value; otherwise guess the per-host dir
# under build/_tesseract/ from sys.platform so spec-only builds still
# work in dev.
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
if not _tess_staging_env:
    import sys as _sys_for_target

    if _sys_for_target.platform.startswith("win"):
        _target_guess = "win"
    elif _sys_for_target.platform == "darwin":
        _target_guess = "mac"
    else:
        _target_guess = "linux"
    _tess_staging = REPO / "build" / "_tesseract" / _target_guess
else:
    _tess_staging = Path(_tess_staging_env)

_tessdata = REPO / "build" / "vendor" / "tessdata"

# Binary + sibling runtime libs: the whole staging dir is mapped to
# ``<bundle>/tesseract/``. A missing or empty dir only warns instead of
# failing the spec, so PyInstaller can still run before the binaries
# have been fetched.
_staging_populated = _tess_staging.is_dir() and any(_tess_staging.iterdir())
if not _staging_populated:
    print(
        f"WARNING: {_tess_staging} is empty or missing — OCR will be "
        "disabled in the bundle. Run build/make_release.py (which "
        "calls fetch_tesseract_for_platform) before pyinstaller, or "
        "pre-stage the binary manually."
    )
else:
    datas.append((str(_tess_staging), "tesseract"))

# Language data: the vendored tessdata dir is mapped to
# ``<bundle>/tesseract/tessdata/`` when eng.traineddata is present.
_eng_model = _tessdata / "eng.traineddata"
if not _eng_model.exists():
    print(
        f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
        "have no language data at runtime. Run build/make_release.py "
        "or fetch manually per build/vendor/README.md."
    )
else:
    datas.append((str(_tessdata), "tesseract/tessdata"))

# Apache-2.0 license text for Tesseract: copied to the bundle root
# (destination ".") when the file exists at the repo root; otherwise
# the build continues with a warning.
_tess_license = REPO / "LICENSE_TESSERACT.txt"
if not _tess_license.exists():
    print(
        "WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
        "by Apache-2.0 for redistribution; the docs agent should "
        "create it. Continuing without it for now."
    )
else:
    datas.append((str(_tess_license), "."))
|
||||||
|
|
||||||
|
# ----- Analysis ------------------------------------------------------
|
||||||
|
|
||||||
|
# Entry script handed to PyInstaller's analysis.
_entry_script = str(REPO / "build" / "launcher.py")

# Ship-trim — PyInstaller pulls these in but we never need them, and
# they add ~80 MB combined.
_excluded_modules = [
    "tkinter",
    "matplotlib",
    "scipy",
    "IPython",
    "jupyter",
    "notebook",
    "test",
    "tests",
]

# Analysis takes the entry script plus the ``datas`` and
# ``hidden_imports`` lists assembled above, and looks for extra hooks
# under build/hooks/.
a = Analysis(
    [_entry_script],
    pathex=[str(REPO)],
    binaries=[],
    datas=datas,
    hiddenimports=hidden_imports,
    hookspath=[str(REPO / "build" / "hooks")],
    hooksconfig={},
    runtime_hooks=[],
    excludes=_excluded_modules,
    noarchive=False,
)

# Archive built from the analysis's pure-Python modules.
pyz = PYZ(a.pure)
|
||||||
|
|
||||||
|
# Launcher icon, in the platform's native format. generate_icons.py
# writes build/icon.ico (Windows, multi-resolution) and build/icon.icns
# (macOS). Prefer the .ico on Windows; everywhere else, and on Windows
# when only the .icns exists, keep the previous behaviour of passing
# the .icns if present. ``None`` means "use PyInstaller's default icon".
import sys as _sys_for_icon

_icns_file = REPO / "build" / "icon.icns"
_ico_file = REPO / "build" / "icon.ico"
if _sys_for_icon.platform.startswith("win") and _ico_file.exists():
    _exe_icon = str(_ico_file)
elif _icns_file.exists():
    _exe_icon = str(_icns_file)
else:
    _exe_icon = None

# Folder-mode launcher: ``exclude_binaries=True`` keeps binaries out of
# the executable itself; COLLECT below gathers them next to it.
exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name="DataTools",
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=False,  # GUI app — no terminal window on Win/Mac
    disable_windowed_traceback=False,
    icon=_exe_icon,
)

# Gather the launcher, binaries and data files into dist/DataTools/.
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name="DataTools",
)
|
||||||
|
|
||||||
|
# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
|
||||||
|
# this block is a no-op on Win/Linux.
|
||||||
|
#
|
||||||
|
# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire
|
||||||
|
# COLLECT output (binaries + datas) into the .app's
|
||||||
|
# Contents/Resources tree, so the ``tesseract/`` subdir we built up
|
||||||
|
# in ``datas`` lands at ``DataTools.app/Contents/Resources/tesseract/``
|
||||||
|
# and the runtime ``sys._MEIPASS`` resolves there. No extra plumbing
|
||||||
|
# needed.
|
||||||
|
import sys as _sys

# The .app wrapper is only built when the spec runs on macOS.
if _sys.platform == "darwin":
    _bundle_icns = REPO / "build" / "icon.icns"

    _info_plist = {
        "CFBundleDisplayName": "DataTools",
        # Both version keys carry the VERSION string parsed near the top
        # of this spec.
        "CFBundleVersion": VERSION,
        "CFBundleShortVersionString": VERSION,
        "NSHighResolutionCapable": True,
        # Buyer's macOS will not show the app in the dock if this is
        # True. We want the dock icon so the buyer can see the app is
        # running while the browser tab is open.
        "LSUIElement": False,
    }

    app = BUNDLE(
        coll,
        name="DataTools.app",
        icon=str(_bundle_icns) if _bundle_icns.exists() else None,
        bundle_identifier="com.datatools.desktop",
        info_plist=_info_plist,
    )
|
||||||
78
build/generate_icons.py
Normal file
78
build/generate_icons.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
"""Generate platform-specific app icons from the source PNG asset.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
build/icon.ico Windows multi-resolution icon (16..256 px sizes).
|
||||||
|
build/icon.icns macOS icon bundle (16..1024 px scaled tiers).
|
||||||
|
build/icon.png Plain 256x256 PNG used by the Linux AppImage.
|
||||||
|
|
||||||
|
Source: ``src/gui/assets/datatools_icon_256.png`` (the same icon
|
||||||
|
``st.set_page_config`` uses, so the installer / Dock / Taskbar match
|
||||||
|
the in-app tab favicon).
|
||||||
|
|
||||||
|
Run manually:
|
||||||
|
python build/generate_icons.py
|
||||||
|
|
||||||
|
CI runs this automatically before invoking PyInstaller (see
|
||||||
|
``.github/workflows/build.yml``). Both files are .gitignored — they
|
||||||
|
are build artifacts derived from the committed PNG.
|
||||||
|
|
||||||
|
Self-contained: pulls only Pillow (already a transitive dep of
|
||||||
|
``pdfplumber``) so no extra installs are required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
# Repo layout: this script lives at <REPO>/build/. The source PNG is at
|
||||||
|
# <REPO>/src/gui/assets/datatools_icon_256.png.
|
||||||
|
BUILD_DIR = Path(__file__).resolve().parent
|
||||||
|
REPO = BUILD_DIR.parent
|
||||||
|
SOURCE_PNG = REPO / "src" / "gui" / "assets" / "datatools_icon_256.png"
|
||||||
|
|
||||||
|
# Windows ICO needs every size the OS might render at: taskbar (16/24),
|
||||||
|
# Start Menu (32/48), tile (64/128), shell properties dialog (256).
|
||||||
|
ICO_SIZES = [(16, 16), (24, 24), (32, 32), (48, 48), (64, 64),
|
||||||
|
(128, 128), (256, 256)]
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Write ``icon.ico``, ``icon.icns`` and ``icon.png`` into ``BUILD_DIR``.

    The source image is ``SOURCE_PNG``, converted to RGBA. A source
    smaller than 256 px on either side only triggers a warning on
    stderr; generation still proceeds.

    Returns:
        0 once all three files are written, 1 if ``SOURCE_PNG`` does
        not exist.
    """
    if not SOURCE_PNG.exists():
        sys.stderr.write(
            f"Source icon not found at {SOURCE_PNG}.\n"
            "Add a 256x256 (or larger) RGBA PNG there and re-run.\n"
        )
        return 1

    src = Image.open(SOURCE_PNG).convert("RGBA")
    if min(src.size) < 256:
        sys.stderr.write(
            f"Source icon is {src.size}; recommend 256x256 or larger "
            "so downscaled tiers look crisp.\n"
        )

    # (file name, Pillow format, extra save() kwargs), written in order.
    # Only the ICO gets an explicit size list; for ICNS no ``sizes`` is
    # passed, so Pillow picks the tiers itself. The PNG is the RGBA
    # source re-saved as-is, so the AppImage build script doesn't have
    # to know the asset path.
    targets = (
        ("icon.ico", "ICO", {"sizes": ICO_SIZES}),
        ("icon.icns", "ICNS", {}),
        ("icon.png", "PNG", {}),
    )
    for file_name, fmt, save_kwargs in targets:
        out_path = BUILD_DIR / file_name
        src.save(out_path, format=fmt, **save_kwargs)
        print(f"wrote {out_path} ({out_path.stat().st_size:,} bytes)")

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
31
build/hooks/hook-pypdfium2.py
Normal file
31
build/hooks/hook-pypdfium2.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
"""PyInstaller hook for pypdfium2.

``pypdfium2`` carries the native PDFium shared library as a file inside
its own package directory (a ``pdfium``-prefixed ``.dll`` on Windows,
``.so`` on Linux, ``.dylib`` on macOS). PyInstaller's default discovery
finds the Python ``.py``/``.pyc`` files but can miss that binary when
the package is wheel-installed and the shared library is not on the
module-level path of ``__init__`` that it scans.

The hook is belt-and-braces: the main spec already calls
``collect_data_files("pypdfium2")`` and ``collect_submodules``, but
hook-discovery-by-name is PyInstaller's documented escape hatch for
native-bundled libraries. Without it, the visual picker (which renders
PDF pages via ``pypdfium2.PdfDocument(...).render(...)``) silently
fails on installed builds with a ``FileNotFoundError`` for the PDFium
shared library.
"""

from PyInstaller.utils.hooks import (
    collect_all,
    collect_data_files,
    collect_dynamic_libs,
)

# Start from everything PyInstaller can find for the package ...
datas, binaries, hiddenimports = collect_all("pypdfium2")

# ... then explicitly add the bundled PDFium .dll/.so/.dylib, which
# PyInstaller classifies as a dynamic library rather than as data ...
binaries += collect_dynamic_libs("pypdfium2")

# ... and the package's non-Python data files (type stubs + metadata).
datas += collect_data_files("pypdfium2", include_py_files=False)
|
||||||
30
build/hooks/hook-streamlit.py
Normal file
30
build/hooks/hook-streamlit.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
"""PyInstaller hook for Streamlit.

PyInstaller's static analyser misses three things the runtime needs:

1. All submodules of ``streamlit`` — the framework reaches into
   ``streamlit.runtime`` / ``streamlit.web`` / ``streamlit.elements``
   through dynamic imports.
2. The static front-end assets (JS / CSS / fonts) that live under
   ``streamlit/static/``.
3. The vendored config / proto schemas under
   ``streamlit/runtime/scriptrunner/`` etc.

Because the main spec already calls ``collect_all('streamlit')`` this
hook is mostly belt-and-braces. PyInstaller picks hooks up by name,
though, and a missing hook can produce confusing runtime errors when
Streamlit upgrades; keeping the hook explicit documents the dependency.
"""

from PyInstaller.utils.hooks import collect_all, collect_data_files, collect_submodules

datas, binaries, hiddenimports = collect_all("streamlit")

# Belt-and-braces: name the static directory explicitly.
datas += collect_data_files(
    "streamlit",
    subdir="static",
    include_py_files=False,
)

# Some Streamlit components are loaded by name from the registry, so
# pull in every submodule of the packages that host them.
hiddenimports += collect_submodules("streamlit.elements")
hiddenimports += collect_submodules("streamlit.runtime")
hiddenimports += collect_submodules("streamlit.web")
|
||||||
93
build/installer.iss
Normal file
93
build/installer.iss
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
; Inno Setup script for DataTools — Windows installer.
;
; Compile from the repo root:
;   iscc /DAppVersion=3.0 build\installer.iss
;
; CI passes the version via /DAppVersion to keep src/__init__.py the
; single source of truth. Local manual builds: pass /DAppVersion or
; let the "0.0.0-dev" default below kick in.
;
; What this installer wires up (covers the "easy launch" surface):
;   * Start Menu group: Start → DataTools → DataTools / Uninstall
;   * Desktop shortcut: optional, checked by default during install
;   * Quick Launch:     optional, off by default, and only offered on
;                       Windows versions older than 7 (the task carries
;                       OnlyBelowVersion: 6.1). Windows 10/11 users pin
;                       to the taskbar manually via right-click —
;                       OS security policy forbids programmatic pinning.
;   * App Paths entry:  so ``DataTools`` typed into Win+R (or
;                       ``start DataTools`` from cmd) works.
;
; Self-contained: the installer contains a frozen PyInstaller bundle
; (Python + every runtime dep). No pre-install or post-install steps
; on the buyer's machine. UAC is NOT required because we install
; per-user by default; the prompt only fires if the buyer asks for an
; all-users install.

#ifndef AppVersion
  #define AppVersion "0.0.0-dev"
#endif

[Setup]
; NOTE(review): "{{" is the escape for a literal "{", but the trailing
; "}}" is two literal braces, so the stored AppId ends in "}}". Do not
; "fix" this after a release has shipped — AppId is what ties upgrades
; and uninstalls to an existing installation.
AppId={{D4A07001-DA7A-4001-8001-DA7A70013700}}
AppName=DataTools
AppVersion={#AppVersion}
AppVerName=DataTools {#AppVersion}
AppPublisher=DataTools
AppPublisherURL=https://datatools.app
AppSupportURL=https://datatools.app/support
AppUpdatesURL=https://datatools.app/releases
DefaultDirName={autopf}\DataTools
DefaultGroupName=DataTools
DisableProgramGroupPage=yes
OutputDir=..\dist
OutputBaseFilename=DataTools-{#AppVersion}-win-setup
SetupIconFile=icon.ico
UninstallDisplayIcon={app}\DataTools.exe
Compression=lzma2/max
SolidCompression=yes
WizardStyle=modern
ArchitecturesInstallIn64BitMode=x64
; Allow per-user install (no UAC prompt) when admin isn't available.
; Buyers without admin rights can still install without IT involvement.
; The override dialog lets the buyer opt into an all-users install.
PrivilegesRequired=lowest
PrivilegesRequiredOverridesAllowed=dialog

ChangesAssociations=no
CloseApplications=force
RestartApplications=no

[Languages]
Name: "english"; MessagesFile: "compiler:Default.isl"

[Tasks]
Name: "desktopicon"; Description: "Create a &desktop shortcut"; GroupDescription: "Additional shortcuts:"
; OnlyBelowVersion: 6.1 hides this task on Windows 7 (6.1) and newer.
Name: "quicklaunchicon"; Description: "Create a &Quick Launch shortcut"; GroupDescription: "Additional shortcuts:"; Flags: unchecked; OnlyBelowVersion: 6.1

[Files]
; PyInstaller's dist/DataTools/ tree includes:
;   * DataTools.exe + frozen Python runtime
;   * tesseract/tesseract.exe + DLLs + tessdata/eng.traineddata
;     (bundled via build/datatools.spec datas; runtime discovery in
;     src/pdf_extract.py reads sys._MEIPASS / "tesseract" / ...).
;   * LICENSE_TESSERACT.txt at the bundle root (Apache-2.0).
; The recursesubdirs flag below picks all of those up — no separate
; Files: entry needed for tesseract/.
Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion

[Icons]
; Start Menu entries — created unconditionally so the app is always
; discoverable via Start search.
Name: "{group}\DataTools"; Filename: "{app}\DataTools.exe"; IconFilename: "{app}\DataTools.exe"
Name: "{group}\Uninstall DataTools"; Filename: "{uninstallexe}"
; Desktop shortcut — controlled by the desktopicon task on the Tasks page.
Name: "{autodesktop}\DataTools"; Filename: "{app}\DataTools.exe"; IconFilename: "{app}\DataTools.exe"; Tasks: desktopicon
; Quick Launch (legacy) — the task is only offered on pre-Windows 7 systems.
Name: "{userappdata}\Microsoft\Internet Explorer\Quick Launch\DataTools"; Filename: "{app}\DataTools.exe"; IconFilename: "{app}\DataTools.exe"; Tasks: quicklaunchicon

[Registry]
; App Paths — lets the buyer launch from Win+R with just "DataTools"
; instead of a full path. Root HKA follows the install mode, like
; {autopf} and {autodesktop} above: HKCU for the default per-user
; install (no admin needed), HKLM when the buyer chose an all-users
; install. A hard-coded HKCU would register an all-users install only
; for the account that elevated the installer.
Root: HKA; Subkey: "Software\Microsoft\Windows\CurrentVersion\App Paths\DataTools.exe"; ValueType: string; ValueName: ""; ValueData: "{app}\DataTools.exe"; Flags: uninsdeletekey

[Run]
Filename: "{app}\DataTools.exe"; Description: "Launch DataTools"; Flags: nowait postinstall skipifsilent
|
||||||
138
build/launcher.py
Normal file
138
build/launcher.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
"""DataTools desktop launcher.
|
||||||
|
|
||||||
|
This is the entry point PyInstaller wraps for Mac / Windows / Linux
|
||||||
|
installers. Double-clicking the produced binary boots a local
|
||||||
|
Streamlit server (``127.0.0.1:<random-free-port>``), opens the user's
|
||||||
|
default browser at that URL, and keeps the server alive until the
|
||||||
|
window is closed or the binary is killed.
|
||||||
|
|
||||||
|
Why a launcher instead of pointing PyInstaller at ``src/gui/app.py``:
|
||||||
|
|
||||||
|
* Streamlit's CLI normally bootstraps the server via the
|
||||||
|
``streamlit run`` command. PyInstaller-bundled apps can't shell
|
||||||
|
out to ``streamlit`` because the CLI script lives inside the
|
||||||
|
bundle. We invoke Streamlit's bootstrap directly via
|
||||||
|
:func:`streamlit.web.bootstrap.run`.
|
||||||
|
* A free port has to be picked at runtime — buyers will have other
|
||||||
|
services running on 8501.
|
||||||
|
* The "open browser" step is the buyer's only feedback that
|
||||||
|
something happened; without it they'd see a black terminal flash
|
||||||
|
on Windows and conclude the app didn't start.
|
||||||
|
|
||||||
|
Local-dev equivalent (no installer):
|
||||||
|
|
||||||
|
streamlit run src/gui/app.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import webbrowser
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def _find_free_port(start: int = 8501, span: int = 50) -> int:
|
||||||
|
"""Return a TCP port that's free on the loopback interface.
|
||||||
|
|
||||||
|
Prefer 8501 (Streamlit's traditional default — buyer recognises
|
||||||
|
the URL from any docs they've read) and fall back to the next
|
||||||
|
free port in a small range. We don't fall back to OS-allocated
|
||||||
|
(port=0) because the buyer's URL should look stable across
|
||||||
|
restarts within one session.
|
||||||
|
"""
|
||||||
|
for offset in range(span):
|
||||||
|
port = start + offset
|
||||||
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||||
|
try:
|
||||||
|
s.bind(("127.0.0.1", port))
|
||||||
|
return port
|
||||||
|
except OSError:
|
||||||
|
continue
|
||||||
|
# Last resort: kernel-assigned ephemeral port.
|
||||||
|
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||||
|
s.bind(("127.0.0.1", 0))
|
||||||
|
return s.getsockname()[1]
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_app_path() -> Path:
|
||||||
|
"""Locate ``src/gui/app.py`` whether running from source or a frozen bundle.
|
||||||
|
|
||||||
|
PyInstaller's ``onefile`` mode unpacks resources into a temp
|
||||||
|
directory pointed at by ``sys._MEIPASS``. Bundled mode uses that
|
||||||
|
directory; source mode walks up from this file.
|
||||||
|
"""
|
||||||
|
if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
|
||||||
|
# Frozen: app.py was bundled as a data file (see datatools.spec).
|
||||||
|
return Path(sys._MEIPASS) / "src" / "gui" / "app.py" # type: ignore[attr-defined]
|
||||||
|
return Path(__file__).resolve().parent.parent / "src" / "gui" / "app.py"
|
||||||
|
|
||||||
|
|
||||||
|
def _open_browser_when_ready(url: str, delay: float = 1.5) -> None:
|
||||||
|
"""Open the buyer's default browser to *url* after a short delay.
|
||||||
|
|
||||||
|
The delay gives Streamlit's HTTP server time to bind. Without it,
|
||||||
|
the browser races the server and renders a "couldn't connect"
|
||||||
|
page that confuses non-technical buyers. 1.5 s is conservative
|
||||||
|
on slow Windows machines; faster machines will see a brief
|
||||||
|
blank tab.
|
||||||
|
"""
|
||||||
|
def _open() -> None:
|
||||||
|
time.sleep(delay)
|
||||||
|
webbrowser.open(url, new=2)
|
||||||
|
threading.Thread(target=_open, daemon=True).start()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Boot the local Streamlit server and open the browser.

    Returns:
        ``2`` when the UI script cannot be found; otherwise ``0`` once
        Streamlit's bootstrap call returns.
    """
    app_path = _resolve_app_path()
    if not app_path.exists():
        sys.stderr.write(
            f"DataTools could not find its UI script at {app_path}.\n"
            "This is usually a bundle-build error. Re-install or "
            "contact support@datatools.app.\n"
        )
        return 2

    port = _find_free_port()
    url = f"http://127.0.0.1:{port}/"

    # Options the bundle ships locked. ``server.address`` = 127.0.0.1
    # enforces "no network exposure" — Streamlit's default is 0.0.0.0
    # which would expose the GUI to the LAN. The privacy claim on the
    # landing pages depends on this.
    #
    # These are assigned unconditionally rather than via ``setdefault``:
    # an inherited STREAMLIT_SERVER_ADDRESS would otherwise re-open the
    # server to the network, and an inherited STREAMLIT_SERVER_PORT
    # would make the server listen on a different port than the URL we
    # print and open below.
    os.environ["STREAMLIT_SERVER_ADDRESS"] = "127.0.0.1"
    os.environ["STREAMLIT_SERVER_PORT"] = str(port)
    os.environ["STREAMLIT_SERVER_HEADLESS"] = "true"
    os.environ["STREAMLIT_BROWSER_GATHER_USAGE_STATS"] = "false"

    # Print before opening the browser so the terminal log doesn't
    # scroll behind the new browser tab on macOS.
    print(f"DataTools is running at {url}")
    print("Close this window or press Ctrl+C to stop.")

    _open_browser_when_ready(url)

    # Streamlit's bootstrap entry point — equivalent to running
    # ``streamlit run app.py`` but in-process so PyInstaller's bundled
    # interpreter handles it without shelling out to a separate script.
    # Imported here rather than at module top, so the missing-script
    # check above runs before Streamlit is loaded.
    from streamlit.web import bootstrap

    bootstrap.run(
        str(app_path),
        is_hello=False,
        args=[],
        flag_options={
            "server.address": "127.0.0.1",
            "server.port": port,
            "server.headless": True,
            "browser.gatherUsageStats": False,
        },
    )
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
46
build/macos/build_dmg.sh
Executable file
46
build/macos/build_dmg.sh
Executable file
@@ -0,0 +1,46 @@
|
|||||||
|
#!/usr/bin/env bash
# Package dist/DataTools.app as a distributable .dmg.
#
# Usage:
#   bash build/macos/build_dmg.sh <version>
#
# Prerequisite: ``pyinstaller build/datatools.spec --clean --noconfirm``
# has already produced ``dist/DataTools.app``. The result is written to
# ``dist/DataTools-<version>-mac.dmg``; <version> defaults to 0.0.0-dev.
#
# Code signing + notarization are separate steps (see build/README.md
# "Signing"); this script only does the packaging.
#
# Tesseract bundling: nothing to do here. The .app already contains
# Contents/Resources/tesseract/{tesseract, *.dylib, tessdata/} because
# PyInstaller's BUNDLE() carries the spec's datas through, so wrapping
# the finished .app needs no extra OCR steps.

set -euo pipefail

version="${1:-0.0.0-dev}"
app_bundle="dist/DataTools.app"
dmg_path="dist/DataTools-${version}-mac.dmg"

# Paths are relative, so a missing bundle also covers "wrong cwd".
[[ -d "$app_bundle" ]] || {
    echo "Error: $app_bundle not found. Run pyinstaller build/datatools.spec first." >&2
    exit 1
}

# Stage the image contents in a throwaway directory that is removed on
# exit. Besides the app it holds a symlink to /Applications, giving the
# buyer a drag target without leaving the DMG window.
staging_dir="$(mktemp -d)"
trap 'rm -rf "$staging_dir"' EXIT

cp -R "$app_bundle" "$staging_dir/"
ln -s /Applications "$staging_dir/Applications"

# UDZO = compressed read-only DMG, the standard distribution format.
# -ov overwrites an existing image of the same name.
hdiutil create -volname "DataTools" -srcfolder "$staging_dir" -ov -format UDZO "$dmg_path"

echo "Built $dmg_path"
|
||||||
43
build/macos/build_zip.sh
Executable file
43
build/macos/build_zip.sh
Executable file
@@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env bash
# Package dist/DataTools.app as a no-install portable .zip.
#
# Usage:
#   bash build/macos/build_zip.sh <version>
#
# Why ship a portable .zip next to the .dmg:
#   * Buyers who don't want an installer can unzip and double-click the
#     .app directly — no drag-to-/Applications step, no installer
#     chrome. Self-contained: the .app holds Python + every dep.
#   * IT-locked-down machines often block .dmg auto-mount but allow
#     .zip download + extraction.
#
# Prerequisite: ``pyinstaller build/datatools.spec --clean --noconfirm``
# has already produced ``dist/DataTools.app``. The result is written to
# ``dist/DataTools-<version>-mac-portable.zip``; <version> defaults to
# 0.0.0-dev.
#
# Tesseract bundling: nothing to do here. The bundled Tesseract binary,
# dylibs and tessdata already live inside
# DataTools.app/Contents/Resources/tesseract/ (placed there by
# PyInstaller's BUNDLE/datas mechanism), and ``ditto -c -k`` archives
# the whole .app tree.

set -euo pipefail

version="${1:-0.0.0-dev}"
app_bundle="dist/DataTools.app"
archive="dist/DataTools-${version}-mac-portable.zip"

# Paths are relative, so a missing bundle also covers "wrong cwd".
[[ -d "$app_bundle" ]] || {
    echo "Error: $app_bundle not found. Run pyinstaller build/datatools.spec first." >&2
    exit 1
}

# ``ditto`` is used instead of plain ``zip`` because it preserves the
# bundle's extended attributes and resource forks; stripping them can
# break code signatures + Info.plist resolution on the buyer's machine.
#
# --sequesterRsrc stores the AppleDouble metadata inside the archive
# instead of leaving parallel ._ files on disk after extraction.
# --keepParent makes DataTools.app itself the archive's top-level entry.
rm -f "$archive"
ditto -c -k --sequesterRsrc --keepParent "$app_bundle" "$archive"

echo "Built $archive ($(du -h "$archive" | cut -f1))"
|
||||||
757
build/make_release.py
Normal file
757
build/make_release.py
Normal file
@@ -0,0 +1,757 @@
|
|||||||
|
"""Single-command release builder for DataTools.
|
||||||
|
|
||||||
|
PyInstaller can't cross-compile — to produce a Windows .exe you run
|
||||||
|
this on Windows, for a Mac .dmg you run it on macOS, for a Linux
|
||||||
|
AppImage you run it on Linux. One script, one OS at a time.
|
||||||
|
|
||||||
|
What this script does (in order):
|
||||||
|
1. Preflight — checks PyInstaller, Pillow, and the platform's
|
||||||
|
packager (Inno Setup on Win / hdiutil + ditto on Mac /
|
||||||
|
appimagetool on Linux) are reachable. Bails with install
|
||||||
|
instructions if anything is missing.
|
||||||
|
2. Generates icon.ico / icon.icns / icon.png from the PNG asset.
|
||||||
|
3. Runs PyInstaller against build/datatools.spec.
|
||||||
|
4. Wraps the PyInstaller output into:
|
||||||
|
* Windows: DataTools-<ver>-win-setup.exe (Inno Setup)
|
||||||
|
+ DataTools-<ver>-win-portable.zip
|
||||||
|
* macOS: DataTools-<ver>-mac.dmg
|
||||||
|
+ DataTools-<ver>-mac-portable.zip
|
||||||
|
* Linux: DataTools-<ver>-linux-x86_64.AppImage
|
||||||
|
5. Prints what landed in dist/ and the byte sizes.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python build/make_release.py # build everything for this OS
|
||||||
|
python build/make_release.py --preflight # check tooling, don't build
|
||||||
|
python build/make_release.py --skip-installer # only the portable zip
|
||||||
|
python build/make_release.py --skip-portable # only the installer
|
||||||
|
python build/make_release.py --clean # wipe dist/ first
|
||||||
|
|
||||||
|
Run from the repo root or from build/ — either works.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Repository layout, anchored on this file's location (build/ sits
# directly under the repo root).
REPO = Path(__file__).resolve().parent.parent
BUILD = REPO.joinpath("build")
DIST = REPO.joinpath("dist")

# Tesseract bundling. At runtime ``src/pdf_extract.py`` expects the
# binary at ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and
# the language data at ``... / "tesseract" / "tessdata" /
# "eng.traineddata"``. Everything is staged under
# ``build/_tesseract/<platform>/`` (gitignored); the PyInstaller spec
# lists that staging dir in ``datas=`` so the files land in the right
# place inside the frozen bundle.
TESSERACT_VERSION = "5.5.0"
TESSDATA_DIR = BUILD.joinpath("vendor", "tessdata")
TESSDATA_URL = "https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata"
TESSERACT_STAGING = BUILD.joinpath("_tesseract")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Output helpers — colourless so logs stay readable in any terminal/CI tail.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _step(msg: str) -> None:
|
||||||
|
print(f"\n==> {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _ok(msg: str) -> None:
|
||||||
|
print(f" ok: {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _warn(msg: str) -> None:
|
||||||
|
print(f" warn: {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _err(msg: str) -> None:
|
||||||
|
print(f" ERROR: {msg}", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _run(cmd: list[str], cwd: Path | None = None, env: dict | None = None) -> None:
    """Echo and execute *cmd*, terminating the script if it fails.

    The child inherits this process's stdout/stderr, so its output
    streams straight through.

    Args:
        cmd: Program and arguments.
        cwd: Working directory; the repo root is used when omitted.
        env: Environment for the child, passed to ``subprocess.run``
            unchanged (``None`` means inherit).

    Exits:
        With the child's return code when it exits non-zero, or with
        127 when the program cannot be found.
    """
    shown = " ".join(str(part) for part in cmd)
    print(f" $ {shown}", flush=True)

    workdir = cwd or REPO
    try:
        subprocess.run(cmd, check=True, cwd=workdir, env=env)
    except FileNotFoundError:
        _err(f"command not found: {cmd[0]}")
        sys.exit(127)
    except subprocess.CalledProcessError as failure:
        _err(f"command failed (exit {failure.returncode}): {shown}")
        sys.exit(failure.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Platform detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_platform() -> str:
|
||||||
|
"""Return ``win`` / ``mac`` / ``linux`` based on sys.platform."""
|
||||||
|
p = sys.platform
|
||||||
|
if p.startswith("win"):
|
||||||
|
return "win"
|
||||||
|
if p == "darwin":
|
||||||
|
return "mac"
|
||||||
|
if p.startswith("linux"):
|
||||||
|
return "linux"
|
||||||
|
_err(f"unsupported platform {p!r}; this script handles win/mac/linux only.")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Version — single source of truth in src/__init__.py
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _read_version() -> str:
    """Return the ``__version__`` string declared in ``src/__init__.py``.

    The file is read as text and matched with a regex — it is not
    imported. Exits with status 1 when no ``__version__ = "..."``
    assignment is found.
    """
    source = REPO.joinpath("src", "__init__.py").read_text(encoding="utf-8")
    found = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', source)
    if found is None:
        _err("could not parse __version__ from src/__init__.py")
        sys.exit(1)
    return found.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preflight — check tooling before doing anything destructive
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _have_module(name: str) -> bool:
|
||||||
|
try:
|
||||||
|
__import__(name)
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _have_command(name: str) -> bool:
|
||||||
|
return shutil.which(name) is not None
|
||||||
|
|
||||||
|
|
||||||
|
# Per-platform install hints. The error messages quote these so a buyer
|
||||||
|
# building from source isn't left guessing what to install next.
|
||||||
|
_INSTALL_HINTS = {
|
||||||
|
"pyinstaller": "pip install pyinstaller",
|
||||||
|
"pil": "pip install pillow",
|
||||||
|
"iscc": "Inno Setup (Windows): https://jrsoftware.org/isdl.php — install, then re-open the shell so iscc lands on PATH.",
|
||||||
|
"hdiutil": "ships with macOS — if it's missing your Mac install is broken.",
|
||||||
|
"ditto": "ships with macOS — if it's missing your Mac install is broken.",
|
||||||
|
"appimagetool": "Linux: download appimagetool-x86_64.AppImage from https://github.com/AppImage/AppImageKit/releases, chmod +x, drop on PATH.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def preflight(target: str) -> None:
    """Check that every tool the *target* build needs is reachable.

    Args:
        target: ``win``, ``mac`` or ``linux``; any other value skips
            the packager checks.

    Exits:
        With status 1, after listing each missing prerequisite and its
        install hint on stderr, when anything required is absent.
    """
    _step(f"preflight ({target})")

    missing: list[tuple[str, str]] = []

    # Python-side deps — identical on every platform. Hints are keyed
    # by lowercase name so module capitalisation doesn't need to match.
    for mod in ("PyInstaller", "PIL"):
        if _have_module(mod):
            _ok(f"{mod} importable")
            continue
        key = mod.lower()
        missing.append((key, _INSTALL_HINTS.get(key, f"pip install {mod}")))

    # The ``pyinstaller`` executable is only warned about: when it is
    # absent, ``python -m PyInstaller`` serves as the fallback.
    if not _have_command("pyinstaller"):
        _warn("pyinstaller binary not on PATH — will fall back to `python -m PyInstaller`")
    else:
        _ok("pyinstaller on PATH")

    # Platform-specific packagers: (command, message when found).
    packagers = {
        "win": (("iscc", "Inno Setup (iscc) on PATH"),),
        "mac": (("hdiutil", "hdiutil on PATH"), ("ditto", "ditto on PATH")),
        "linux": (("appimagetool", "appimagetool on PATH"),),
    }
    for tool, found_msg in packagers.get(target, ()):
        if _have_command(tool):
            _ok(found_msg)
        else:
            missing.append((tool, _INSTALL_HINTS[tool]))

    if not missing:
        _ok("all prerequisites present")
        return

    _err("missing prerequisites:")
    for name, hint in missing:
        print(f" - {name}: {hint}", file=sys.stderr)
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tesseract bundling — fetch the binary + tessdata at build time.
|
||||||
|
#
|
||||||
|
# We download (not vendor) because:
|
||||||
|
# * Binaries are large (5-40 MB per platform) and license-encumbered
|
||||||
|
# to keep current in git.
|
||||||
|
# * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
|
||||||
|
# bloats clones for contributors who don't touch OCR.
|
||||||
|
#
|
||||||
|
# Caching layout:
|
||||||
|
# build/_tesseract/win/tesseract.exe + DLLs
|
||||||
|
# build/_tesseract/mac/tesseract + dylibs
|
||||||
|
# build/_tesseract/linux/tesseract + libs
|
||||||
|
# build/vendor/tessdata/eng.traineddata (shared across platforms)
|
||||||
|
#
|
||||||
|
# The PyInstaller spec reads ``build/_tesseract/<platform>/`` and the
|
||||||
|
# tessdata dir, then bundles them under ``<bundle>/tesseract/``.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _download(url: str, dest: Path, *, expected_min_bytes: int = 1024) -> None:
    """Fetch *url* into *dest*, replacing *dest* only after a size check.

    The payload is streamed into a sibling ``<dest>.part`` file and
    moved over *dest* once it has passed the minimum-size check, so a
    failed or truncated transfer never leaves a bad *dest* behind.

    Args:
        url: Address to fetch (120 s timeout).
        dest: Final location; parent directories are created.
        expected_min_bytes: Smallest acceptable payload size.

    Raises:
        Exception: Any error raised while fetching or writing is logged
            and re-raised after the partial file is removed.
        RuntimeError: The payload is smaller than *expected_min_bytes*.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_suffix(dest.suffix + ".part")
    print(f" GET {url}", flush=True)

    try:
        with urllib.request.urlopen(url, timeout=120) as response:
            with open(partial, "wb") as sink:
                shutil.copyfileobj(response, sink)
    except Exception as exc:  # noqa: BLE001 — bubble any network error up
        if partial.exists():
            partial.unlink()
        _err(f"download failed: {url}\n {exc}")
        raise

    received = partial.stat().st_size
    if received < expected_min_bytes:
        # Suspiciously small: discard rather than promote to *dest*.
        partial.unlink()
        raise RuntimeError(
            f"downloaded file too small ({received} bytes < {expected_min_bytes}); "
            f"the URL probably 404'd into an HTML error page."
        )

    partial.replace(dest)
    _ok(f"downloaded {dest.name} ({received / (1024 * 1024):.1f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tessdata() -> Path:
    """Make sure ``build/vendor/tessdata/eng.traineddata`` exists.

    The file is shared by all platforms, downloaded once and reused
    afterwards. The runtime expects it at
    ``<bundle>/tesseract/tessdata/eng.traineddata``; putting it there
    is the PyInstaller spec's job, not this function's.

    Returns:
        Path of the cached (or freshly downloaded) ``eng.traineddata``.
    """
    _step("fetch tessdata (eng.traineddata)")
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    target = TESSDATA_DIR / "eng.traineddata"

    # A cached copy counts only if it is larger than 1 MB.
    cached = target.exists() and target.stat().st_size > 1_000_000
    if not cached:
        # The "best" model is ~16 MB on disk. The 3 MB floor leaves
        # slack while still catching an HTML 404 page.
        _download(TESSDATA_URL, target, expected_min_bytes=3 * 1024 * 1024)
        return target

    _ok(f"already cached: {target.relative_to(REPO)} "
        f"({target.stat().st_size / (1024 * 1024):.1f} MB)")
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage ``tesseract.exe`` and its DLLs into *staging*.

    UB-Mannheim publishes the Windows builds only as installer
    executables, so there is no plain archive to unpack. The steps are:

    1. Download the pinned installer from the UB-Mannheim mirror
       (skipped when it is already cached under ``TESSERACT_STAGING``).
    2. Unpack it with 7-Zip, looked up on PATH first and then at the
       default ``Program Files`` install location.
    3. Walk the extraction tree and copy ``tesseract.exe`` plus every
       ``*.dll`` found into *staging* (flat, no sub-directories).
       Language data is NOT copied here.

    DLLs the original author lists as required at runtime:
        libtesseract-5.dll, libleptonica-6.dll, libgomp-1.dll,
        libstdc++-6.dll, libwinpthread-1.dll, libgcc_s_seh-1.dll,
        liblz4.dll, libjpeg-8.dll, libpng16-16.dll, libtiff-6.dll,
        libwebp-7.dll, libwebpmux-3.dll, libopenjp2-7.dll, zlib1.dll

    Raises:
        FileNotFoundError: 7-Zip could not be located.
        RuntimeError: the extracted installer contains no ``tesseract.exe``.
    """
    # Pin the exact mirror build so reproducible builds don't drift.
    build_rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    installer_name = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{build_rev}.exe"
    installer_url = f"https://digi.bib.uni-mannheim.de/tesseract/{installer_name}"

    installer = TESSERACT_STAGING / installer_name
    if not installer.exists():
        _download(installer_url, installer, expected_min_bytes=20 * 1024 * 1024)

    # Prefer whatever is on PATH; fall back to the stock install path.
    seven_zip = (
        shutil.which("7z")
        or shutil.which("7z.exe")
        or r"C:\Program Files\7-Zip\7z.exe"
    )
    if not (Path(seven_zip).exists() or shutil.which("7z")):
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{installer} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    # Always extract into a fresh directory so stale files from an
    # earlier installer revision can't leak into the staging dir.
    unpack_dir = TESSERACT_STAGING / "win_extract"
    if unpack_dir.exists():
        shutil.rmtree(unpack_dir)
    unpack_dir.mkdir(parents=True)
    _run([str(seven_zip), "x", "-y", f"-o{unpack_dir}", str(installer)])

    staging.mkdir(parents=True, exist_ok=True)

    # The payload's location inside the extraction isn't fixed, so search
    # the whole tree. A later match with the same file name overwrites an
    # earlier one.
    exe_seen = False
    for folder, _subdirs, names in os.walk(unpack_dir):
        for name in names:
            lowered = name.lower()
            origin = Path(folder) / name
            if lowered == "tesseract.exe":
                shutil.copy2(origin, staging / "tesseract.exe")
                exe_seen = True
            elif lowered.endswith(".dll"):
                shutil.copy2(origin, staging / name)

    if not exe_seen:
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {unpack_dir}"
        )
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage tesseract + dylibs into *staging* on macOS.

    Strategy: ``brew install tesseract``, then copy the binary and every
    Homebrew-provided dylib it needs (directly or transitively) into
    *staging* as a flat directory. ``install_name_tool`` then rewrites

    * each copied dylib's own install name (``-id``), and
    * every Homebrew reference inside the binary AND inside each copied
      dylib (``-change``)

    to ``@loader_path/<file name>``, so the staged files resolve each
    other and no longer point back into the Homebrew prefix.

    ``brew`` must be on PATH (it is on ``macos-latest`` runners).

    Raises:
        FileNotFoundError: ``brew`` is not on PATH.
        RuntimeError: ``tesseract`` is not on PATH after the install, or
            ``otool`` fails on the staged binary.
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")

    # ``brew install`` is idempotent — fine to run on every build. The
    # version is not pinned through brew, and this function does not
    # verify it against TESSERACT_VERSION.
    _run(["brew", "install", "tesseract"])

    # Find the binary brew just installed.
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)

    # Only libraries under a Homebrew prefix are bundled: /opt/homebrew/
    # (Apple Silicon) or /usr/local/ (Intel). /usr/lib/* and /System/*
    # are Apple-shipped and left alone.
    brew_prefixes = ("/opt/homebrew/", "/usr/local/")

    def _make_writable(path: Path) -> None:
        # copy2 preserves the source mode; add the owner-write bit so a
        # read-only source can't make install_name_tool fail on the copy.
        path.chmod(path.stat().st_mode | 0o200)

    def _brew_refs(macho: str) -> list[str]:
        # Parse ``otool -L``: first line is a header, every following
        # line starts with the referenced path.
        listing = subprocess.run(
            ["otool", "-L", macho], check=True, capture_output=True, text=True,
        )
        refs = []
        for row in listing.stdout.splitlines()[1:]:
            ref = row.strip().split(" ", 1)[0]
            if ref.startswith(brew_prefixes):
                refs.append(ref)
        return refs

    def _retarget(macho: Path, refs: list[str]) -> None:
        # Point each absolute Homebrew reference at its staged sibling.
        for ref in refs:
            try:
                subprocess.run(
                    ["install_name_tool", "-change", ref,
                     f"@loader_path/{Path(ref).name}", str(macho)],
                    check=True, capture_output=True, text=True,
                )
            except subprocess.CalledProcessError as e:
                # Best-effort, but never silent: a missed rewrite means
                # the bundle still depends on the build machine's brew.
                _warn(
                    f"install_name_tool -change failed for {macho.name} "
                    f"({ref}): {e.stderr.strip()}"
                )

    _make_writable(bin_path)
    try:
        deps = _brew_refs(str(bin_path))
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"otool failed: {e.stderr}") from e

    # File names already staged. Keyed by name (not full path) because
    # staging is flat and the same library can be referenced through
    # more than one path.
    copied: set[str] = set()

    def _copy_with_deps(libpath: str) -> None:
        name = Path(libpath).name
        if name in copied or not Path(libpath).exists():
            return
        copied.add(name)  # mark first so dependency cycles terminate
        dest = staging / name
        shutil.copy2(libpath, dest)
        _make_writable(dest)

        try:
            subprocess.run(
                ["install_name_tool", "-id", f"@loader_path/{name}", str(dest)],
                check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            _warn(f"install_name_tool -id failed for {name}: {e.stderr.strip()}")

        try:
            refs = _brew_refs(libpath)
        except subprocess.CalledProcessError as e:
            _warn(f"otool failed for {libpath}: {e.stderr.strip()}")
            return
        # A dylib's listing includes its own install name; skip it.
        refs = [r for r in refs if Path(r).name != name]
        for ref in refs:
            _copy_with_deps(ref)
        # Rewrite only references whose target was actually staged.
        _retarget(dest, [r for r in refs if Path(r).name in copied])

    for dep in deps:
        _copy_with_deps(dep)

    # Rewrite the tesseract binary's own references so it finds its
    # deps next to itself.
    _retarget(bin_path, [d for d in deps if Path(d).name in copied])

    # NOTE(review): editing load commands invalidates existing code
    # signatures — confirm a later packaging step re-signs these files.
    _ok(f"staged macOS tesseract + {len(copied)} dylibs into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage tesseract + .so files into *staging* on Linux.

    If ``tesseract`` is not already on PATH and ``apt-get`` is available,
    ``tesseract-ocr`` and ``libtesseract5`` are installed via
    ``sudo apt-get``. The binary is then copied into *staging* together
    with every shared object ``ldd`` resolves for it, except the entries
    on the skip list below. When ``patchelf`` is available the staged
    binary's RPATH is set to ``$ORIGIN``.

    Raises:
        FileNotFoundError: neither ``apt-get`` nor ``tesseract`` exists.
        RuntimeError: ``tesseract`` is still missing after the install,
            or ``ldd`` fails on the staged binary.
    """
    have_apt = shutil.which("apt-get")
    have_tess = shutil.which("tesseract")
    if not have_tess:
        if not have_apt:
            _err(
                "Neither apt-get nor a pre-installed tesseract found. On "
                "ubuntu-latest runners both are present. On other distros "
                "install tesseract-ocr via your package manager and re-run "
                "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
            )
            raise FileNotFoundError("tesseract")
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])

    # Look it up again: the install above may have just put it on PATH.
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    staged_bin = staging / "tesseract"
    shutil.copy2(tess_path, staged_bin)

    # Libraries that are never bundled: the vDSO, the dynamic loader and
    # the core glibc pieces. Shipping those can cause GLIBC mismatch
    # errors on older distros. libstdc++ and libgcc_s are NOT on this
    # list, so they are copied whenever ldd resolves them.
    SKIP_PREFIXES = (
        "linux-vdso", "/lib64/ld-linux", "/lib/ld-linux",
        "libc.so", "libdl.so", "libpthread.so", "libm.so",
        "librt.so", "libnsl.so", "libutil.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(staged_bin)],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    copied = 0
    for row in ldd.stdout.splitlines():
        # Expected shape: " libfoo.so.N => /path/to/libfoo.so.N (0x...)".
        # Rows without exactly one "=>" (e.g. the loader line) are ignored.
        if row.count("=>") != 1:
            continue
        lhs, _arrow, rhs = row.partition("=>")
        if lhs.strip().startswith(SKIP_PREFIXES):
            continue
        resolved = rhs.strip().split(" ", 1)[0]
        # Unresolved entries ("not found") fail the exists() test and
        # are skipped without a message.
        if resolved and Path(resolved).exists():
            shutil.copy2(resolved, staging / Path(resolved).name)
            copied += 1

    # patchelf is optional. Without it (or when it fails) the bundle
    # relies on LD_LIBRARY_PATH being set at runtime.
    if shutil.which("patchelf"):
        try:
            _run(["patchelf", "--set-rpath", "$ORIGIN", str(staged_bin)])
        except SystemExit:
            _warn("patchelf rpath rewrite failed — relying on LD_LIBRARY_PATH at runtime")

    _ok(f"staged Linux tesseract + {copied} .so files into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tesseract_for_platform(target: str) -> Path:
    """Stage the Tesseract binary + libs for *target*; return the staging dir.

    The staging dir is ``TESSERACT_STAGING / target``. The executable is
    expected there as ``tesseract.exe`` for ``"win"`` and ``tesseract``
    for every other target.

    Order of checks:

    1. ``TESSERACT_SKIP_FETCH=1`` — never fetch; the executable must
       already be staged, otherwise the process exits with status 1.
    2. Executable already staged — reuse it.
    3. Otherwise run the platform fetcher (``win`` / ``mac`` / ``linux``);
       an unknown target exits with status 2.
    4. If the executable is still missing after the fetch, exit with
       status 1.
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_path = staging / ("tesseract.exe" if target == "win" else "tesseract")

    skip_fetch = os.environ.get("TESSERACT_SKIP_FETCH") == "1"
    if skip_fetch and not exe_path.exists():
        _err(
            f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
            "Pre-stage the binary + its libs into that dir, then re-run."
        )
        sys.exit(1)
    if skip_fetch:
        _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
        return staging

    if exe_path.exists():
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    fetchers = {
        "win": _fetch_tesseract_windows,
        "mac": _fetch_tesseract_macos,
        "linux": _fetch_tesseract_linux,
    }
    fetcher = fetchers.get(target)
    if fetcher is None:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)
    fetcher(staging)

    # A fetcher can finish without raising yet still not produce the
    # executable; treat that as a hard failure.
    if not exe_path.exists():
        _err(
            f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
            "Inspect the logs above; you may need to pre-stage the binary manually."
        )
        sys.exit(1)
    return staging
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Build steps
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def step_generate_icons() -> None:
    """Run ``generate_icons.py`` from the build dir with the current interpreter."""
    _step("generate icons")
    icon_script = BUILD / "generate_icons.py"
    _run([sys.executable, str(icon_script)])
|
||||||
|
|
||||||
|
|
||||||
|
def step_pyinstaller(clean: bool, *, target: str | None = None) -> None:
    """Run PyInstaller on ``datatools.spec``.

    Args:
        clean: append ``--clean`` to the PyInstaller command line.
        target: when truthy, export ``DATATOOLS_TESS_STAGING`` (the
            per-platform tesseract staging dir) to the child process so
            the spec file does not have to detect the platform itself.
    """
    _step("pyinstaller bundle")

    # ``python -m PyInstaller`` avoids depending on the ``pyinstaller``
    # launcher being on PATH (pip's Scripts/ dir often isn't on Windows).
    spec_file = BUILD / "datatools.spec"
    cmd = [sys.executable, "-m", "PyInstaller", str(spec_file), "--noconfirm"]
    cmd += ["--clean"] if clean else []

    child_env = dict(os.environ)
    if target:
        child_env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target)

    _run(cmd, env=child_env)
|
||||||
|
|
||||||
|
|
||||||
|
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested Windows artifacts; return their expected paths.

    The installer is compiled with ``iscc`` from ``installer.iss``; the
    portable zip is produced by ``build_portable_zip.py``. The returned
    paths are where the artifacts are expected under ``DIST`` — their
    existence is not checked here.
    """
    artifacts: list[Path] = []

    if do_installer:
        _step("Windows installer (Inno Setup)")
        iss_script = BUILD / "installer.iss"
        _run(["iscc", f"/DAppVersion={version}", str(iss_script)])
        artifacts.append(DIST / f"DataTools-{version}-win-setup.exe")

    if do_portable:
        _step("Windows portable .zip")
        zip_script = BUILD / "build_portable_zip.py"
        _run([sys.executable, str(zip_script), "win", version])
        artifacts.append(DIST / f"DataTools-{version}-win-portable.zip")

    return artifacts
|
||||||
|
|
||||||
|
|
||||||
|
def step_package_mac(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
|
||||||
|
out: list[Path] = []
|
||||||
|
if do_installer:
|
||||||
|
_step("macOS DMG (installer)")
|
||||||
|
_run(["bash", str(BUILD / "macos" / "build_dmg.sh"), version])
|
||||||
|
out.append(DIST / f"DataTools-{version}-mac.dmg")
|
||||||
|
if do_portable:
|
||||||
|
_step("macOS portable .zip")
|
||||||
|
_run(["bash", str(BUILD / "macos" / "build_zip.sh"), version])
|
||||||
|
out.append(DIST / f"DataTools-{version}-mac-portable.zip")
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def step_package_linux(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the Linux AppImage; return its expected path in a list.

    On Linux the AppImage serves as both installer and portable build,
    so the two flags are not distinguished: if either one is set the
    single AppImage is built, and only when both are unset is nothing
    produced (empty list).
    """
    if not do_installer and not do_portable:
        return []

    _step("Linux AppImage")
    build_script = BUILD / "appimage" / "build.sh"
    _run(["bash", str(build_script), version])
    appimage = DIST / f"DataTools-{version}-linux-x86_64.AppImage"
    return [appimage]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Orchestration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _summarise(outputs: list[Path]) -> None:
    """Print one line per produced artifact, with its size in MB.

    Warns when *outputs* is empty, and warns individually for every
    listed path that does not exist on disk.
    """
    _step("done — outputs")

    if len(outputs) == 0:
        _warn("no files produced (everything skipped via flags)")
        return

    for artifact in outputs:
        if not artifact.exists():
            _warn(f"expected output missing: {artifact.relative_to(REPO)}")
            continue
        size_mb = artifact.stat().st_size / (1024 * 1024)
        print(f" {artifact.relative_to(REPO)} ({size_mb:.1f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point: parse flags, then run the build pipeline.

    Pipeline: preflight → (optional) wipe ``DIST`` → icons → tessdata →
    per-platform tesseract staging → PyInstaller → platform packaging →
    summary. With ``--preflight`` it stops right after the tooling check.

    Returns:
        ``0`` on completion; failures surface as exceptions or
        ``sys.exit`` calls from the individual steps.
    """
    cli = argparse.ArgumentParser(
        prog="make_release.py",
        description=(
            "Build the installer + portable zip for the current OS. "
            "Cross-compilation isn't supported by PyInstaller — run "
            "this once per platform you want to target."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument(
        "--platform", choices=("auto", "win", "mac", "linux"), default="auto",
        help="Override OS detection (mostly for testing). Default: auto.",
    )
    # The remaining options are plain on/off switches; registration
    # order is kept because it determines the order in ``--help``.
    switches = (
        ("--preflight", "Check tooling and exit without building."),
        ("--clean", "Wipe dist/ before building."),
        ("--skip-installer", "Don't build the OS installer (.exe / .dmg)."),
        ("--skip-portable", "Don't build the portable .zip."),
    )
    for flag, blurb in switches:
        cli.add_argument(flag, action="store_true", help=blurb)
    args = cli.parse_args()

    if args.platform == "auto":
        target = _detect_platform()
    else:
        target = args.platform
    version = _read_version()
    do_installer = not args.skip_installer
    do_portable = not args.skip_portable

    yes_no = {True: "yes", False: "no"}
    print("DataTools release builder")
    print(f" target: {target} (host: {platform.platform()})")
    print(f" version: {version}")
    print(f" installer: {yes_no[do_installer]}")
    print(f" portable: {yes_no[do_portable]}")
    print(f" dist dir: {DIST}")

    if target != _detect_platform():
        _warn(
            f"--platform {target} but host is {_detect_platform()}. "
            "PyInstaller can't cross-compile — the bundle will be for "
            "the HOST, only the packaging step will follow your override. "
            "Useful only for testing the packager paths."
        )

    preflight(target)
    if args.preflight:
        return 0

    if args.clean and DIST.exists():
        _step(f"cleaning {DIST}")
        shutil.rmtree(DIST)

    step_generate_icons()

    # Tesseract must be staged before PyInstaller runs: the spec picks
    # up the staged binary dir and the tessdata dir and bundles both
    # under ``<bundle>/tesseract/``.
    fetch_tessdata()
    fetch_tesseract_for_platform(target)

    step_pyinstaller(clean=args.clean, target=target)

    # Anything that is not win/mac is packaged as Linux.
    packagers = {"win": step_package_win, "mac": step_package_mac}
    package = packagers.get(target, step_package_linux)
    outputs = package(version, do_installer, do_portable)

    _summarise(outputs)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||||
62
build/vendor/README.md
vendored
Normal file
62
build/vendor/README.md
vendored
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# build/vendor/ — third-party bundle inputs (fetched at build time)
|
||||||
|
|
||||||
|
This tree holds the third-party assets that get bundled into the
|
||||||
|
PyInstaller artifacts but that we deliberately do **not** keep in git
|
||||||
|
(too large / license-encumbered / re-fetchable on demand).
|
||||||
|
|
||||||
|
The build pipeline (`build/make_release.py`) populates everything in
|
||||||
|
here before the PyInstaller step. The contents are git-ignored except
|
||||||
|
for this README.
|
||||||
|
|
||||||
|
## tessdata/
|
||||||
|
|
||||||
|
Holds the Tesseract language data file(s) used by the PDF Extractor
|
||||||
|
OCR fallback. Only English is bundled today.
|
||||||
|
|
||||||
|
### Canonical source
|
||||||
|
|
||||||
|
We use the **"best" model** from `tesseract-ocr/tessdata_best` (LSTM,
|
||||||
|
slower but higher accuracy than the legacy `tessdata` set, and only
|
||||||
|
~12 MB compressed → ~16 MB uncompressed):
|
||||||
|
|
||||||
|
```
|
||||||
|
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
There is also `tessdata_fast/` (~4 MB, lower accuracy) if you ever
want to optimise for bundle size over recognition quality. For bank
statements (the only OCR use case so far), the extra accuracy of the
`_best` model is worth the additional ~12 MB (~16 MB vs. ~4 MB).
|
||||||
|
|
||||||
|
### Why we don't vendor it in git
|
||||||
|
|
||||||
|
* ~16 MB binary file — bloats clone times for everyone, including
|
||||||
|
contributors who never touch the OCR code path.
|
||||||
|
* Apache-2.0-licensed and stable; the file rarely changes upstream
|
||||||
|
(last touched 2021), so a build-time fetch is safe.
|
||||||
|
* The Tesseract project explicitly distributes these via GitHub
|
||||||
|
raw URLs — they're meant to be downloaded, not redistributed
|
||||||
|
through other repos.
|
||||||
|
|
||||||
|
### How it gets populated
|
||||||
|
|
||||||
|
`build/make_release.py::fetch_tessdata()` checks for
`build/vendor/tessdata/eng.traineddata` on every run. If it's
missing (or implausibly small), the script downloads it from the
canonical URL above and caches it here. Subsequent builds reuse the
cached file.
|
||||||
|
|
||||||
|
On CI, the directory is restored from the GitHub Actions cache so we
|
||||||
|
don't pay the download cost on every run (`.github/workflows/build.yml`
|
||||||
|
caches `build/vendor/tessdata/` keyed on the URL above).
|
||||||
|
|
||||||
|
## Manual one-time fetch (if you're offline or behind a proxy)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p build/vendor/tessdata
|
||||||
|
curl -L -o build/vendor/tessdata/eng.traineddata \
|
||||||
|
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify the file is non-empty and roughly the expected size (~16 MB for
the `_best` model) rather than a small HTML error page. Note that the
build script's sanity check is size-based only — it does not inspect
the file's contents — so a wrong file of plausible size will not be
caught automatically.
|
||||||
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
481
docs/ADMIN.md
Normal file
481
docs/ADMIN.md
Normal file
@@ -0,0 +1,481 @@
|
|||||||
|
# ADMIN — Internal license operations
|
||||||
|
|
||||||
|
Creator/operator-only reference. End users should read `USER-GUIDE.md` instead.
|
||||||
|
|
||||||
|
This doc covers everything the creator does that buyers never see: minting
|
||||||
|
through the live server, where state lives on the box, how to rotate secrets,
|
||||||
|
generating the signing keypair, the dev vs. production key story, and how to
|
||||||
|
recover from key loss.
|
||||||
|
|
||||||
|
For the end-to-end system + tech stack diagrams, see `ARCHITECTURE.md`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Live deployment (PR 1)
|
||||||
|
|
||||||
|
The license server is running at:
|
||||||
|
|
||||||
|
| URL | What it serves |
|
||||||
|
|---|---|
|
||||||
|
| `https://datatools.unalogix.com/` | Marketing site (placeholder — "DataTools — coming soon") |
|
||||||
|
| `https://licenses.datatools.unalogix.com/health` | Liveness + DB reachability probe |
|
||||||
|
| `https://licenses.datatools.unalogix.com/internal/*` | nginx-blocked on the public side — accessible only via SSH tunnel |
|
||||||
|
| Postgres @ `127.0.0.1:5433` (localhost) | DB containing the authoritative `licenses` table |
|
||||||
|
|
||||||
|
**Host**: `46.225.166.142` (Ubuntu 24.04), nginx 1.24, Postgres 16-alpine + FastAPI in Docker.
|
||||||
|
|
||||||
|
**Cert**: Let's Encrypt, covers both subdomains, expires 2026-08-12, auto-renews via `certbot.timer`.
|
||||||
|
|
||||||
|
### On-box state
|
||||||
|
|
||||||
|
| Path | Contents |
|
||||||
|
|---|---|
|
||||||
|
| `/srv/datatools-license/` | Deploy root, mode 750, owned by `datatools-api` |
|
||||||
|
| `/srv/datatools-license/compose.yml` | Production docker-compose definition |
|
||||||
|
| `/srv/datatools-license/app/` | Git clone of this repo (re-clone or `git pull` to update) |
|
||||||
|
| `/srv/datatools-license/secrets/` | Mode 750 dir holding `pg_password`, `admin_token`. Files are mode 400, owned UID 10001 (container app user) |
|
||||||
|
| `/srv/datatools-license/backups/` | Postgres dumps land here (cron not yet wired — see §"Backups" below) |
|
||||||
|
| `/etc/nginx/sites-available/unalogix` | nginx config for both subdomains |
|
||||||
|
| `/etc/letsencrypt/live/datatools.unalogix.com/` | TLS cert + key |
|
||||||
|
|
||||||
|
Container names: `datatools-api`, `datatools-postgres`. Both use
|
||||||
|
`restart: unless-stopped`.
|
||||||
|
|
||||||
|
### Get the admin token
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh michael@46.225.166.142 'sudo cat /srv/datatools-license/secrets/admin_token'
|
||||||
|
```
|
||||||
|
|
||||||
|
The token is **never** in git, in environment-variable dumps, or in
|
||||||
|
`docker inspect`. It lives on disk under mode 400 / UID 10001 (so only
|
||||||
|
root and the container app user can read it).
|
||||||
|
|
||||||
|
### Rotate the admin token
|
||||||
|
|
||||||
|
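Do this any time the token has been exposed somewhere it shouldn't be, or as routine hygiene. Run the commands on the box as root (e.g. via `sudo -i`), because the secret file is mode 400 and owned by UID 10001: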
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
openssl rand -hex 32 > secrets/admin_token
|
||||||
|
chown 10001:10001 secrets/admin_token
|
||||||
|
chmod 400 secrets/admin_token
|
||||||
|
docker compose restart api # ~3 seconds; old token stops working immediately
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mint a license from your laptop
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Open the SSH tunnel (leave running in a background terminal)
|
||||||
|
ssh -L 8090:127.0.0.1:8090 michael@46.225.166.142 -N &
|
||||||
|
|
||||||
|
# 2. Set the auth env
|
||||||
|
export DATATOOLS_ADMIN_TOKEN="$(ssh michael@46.225.166.142 'sudo cat /srv/datatools-license/secrets/admin_token')"
|
||||||
|
export DATATOOLS_ADMIN_URL=http://127.0.0.1:8090
|
||||||
|
|
||||||
|
# 3. Mint
|
||||||
|
python3 -m src.admin_cli mint \
|
||||||
|
--name "Buyer Name" \
|
||||||
|
--email buyer@example.com \
|
||||||
|
--tier core
|
||||||
|
|
||||||
|
# 4. (optional) List or revoke
|
||||||
|
python3 -m src.admin_cli list --email buyer@example.com
|
||||||
|
python3 -m src.admin_cli revoke DT1-CORE-xxxx-yyyy --reason "refund"
|
||||||
|
```
|
||||||
|
|
||||||
|
The blob lands in the response (and in the `licenses` table). Deliver it
|
||||||
|
to the buyer however suits — copy-paste into email, attach as `.dtlic`.
|
||||||
|
|
||||||
|
### Inspect / debug
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Container status + recent logs
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose ps && docker compose logs api --tail 30'
|
||||||
|
|
||||||
|
# Query the licenses table directly
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose exec -T postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses -c "SELECT license_key, email, tier, source, expires_at FROM licenses ORDER BY created_at DESC LIMIT 20;"'
|
||||||
|
|
||||||
|
# Public-side health
|
||||||
|
curl https://licenses.datatools.unalogix.com/health
|
||||||
|
```
|
||||||
|
|
||||||
|
### Bring it down / back up / rebuild
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
|
||||||
|
# Restart just the API (e.g. after rotating a secret)
|
||||||
|
docker compose restart api
|
||||||
|
|
||||||
|
# Restart everything
|
||||||
|
docker compose restart
|
||||||
|
|
||||||
|
# Bring down (DB volume PRESERVED)
|
||||||
|
docker compose down
|
||||||
|
|
||||||
|
# Bring up
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# Rebuild the image after a git pull
|
||||||
|
cd app && git pull
|
||||||
|
cd ..
|
||||||
|
docker compose build && docker compose up -d
|
||||||
|
docker compose exec api alembic upgrade head # if new migrations
|
||||||
|
```
|
||||||
|
|
||||||
|
### Backups (not yet automated)
|
||||||
|
|
||||||
|
Postgres state is the system of record for the customer list — once PR 2
|
||||||
|
auto-mints from Gumroad webhooks, losing the DB would mean losing every
|
||||||
|
buyer record. Schedule a daily dump:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# /etc/cron.daily/datatools-license-backup — see SETUP-LICENSE-SERVER.md §9
|
||||||
|
```
|
||||||
|
|
||||||
|
Until that's in place, dump manually before any risky operation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec -T postgres \
|
||||||
|
pg_dump -U datatools_api datatools_licenses \
|
||||||
|
| gzip > backups/db-$(date -u +%Y%m%dT%H%M%SZ).sql.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
### Production signing key (not yet rotated)
|
||||||
|
|
||||||
|
The server currently signs with the in-tree dev keypair (no
|
||||||
|
`DATATOOLS_LICENSE_PRIVKEY_FILE` configured → falls back to
|
||||||
|
`src/license/_dev_keypair.py`). That matches what the desktop currently
|
||||||
|
verifies against, so existing buyers continue to work.
|
||||||
|
|
||||||
|
**Before shipping v1.0 to paying buyers**, rotate to a production keypair:
|
||||||
|
|
||||||
|
1. `python scripts/generate_keypair.py` (on a trusted machine).
|
||||||
|
2. Save the private hex to `/srv/datatools-license/secrets/license_privkey`,
|
||||||
|
chmod 400, chown 10001:10001.
|
||||||
|
3. Bake the public hex into the PyInstaller build's
|
||||||
|
`DATATOOLS_LICENSE_PUBKEY` env.
|
||||||
|
4. Wire `DATATOOLS_LICENSE_PRIVKEY_FILE` + `DATATOOLS_LICENSE_PUBKEY`
|
||||||
|
into compose.yml's `api.environment` and add `license_privkey` to
|
||||||
|
the secrets block.
|
||||||
|
5. `docker compose restart api`.
|
||||||
|
|
||||||
|
### What's deployed (PR 1) vs queued (PR 2 / 3)
|
||||||
|
|
||||||
|
| Capability | Status |
|
||||||
|
|---|---|
|
||||||
|
| Mint API + Postgres + auth | **Live** |
|
||||||
|
| `datatools-admin` CLI (manual mints) | **Live** |
|
||||||
|
| `licenses.datatools.unalogix.com/health` public | **Live** |
|
||||||
|
| Gumroad webhook receiver | **PR 2 — code merged, deploy pending** |
|
||||||
|
| Postmark transactional email | **PR 2 — code merged, deploy pending** |
|
||||||
|
| Buyer renewal / re-delivery portal | **PR 3** |
|
||||||
|
| Cloudflare in front (DDoS / WAF) | Deferred (DNS at supercp/cPanel) |
|
||||||
|
| Production signing keypair | Deferred (still using dev key) |
|
||||||
|
| Automated DB backups | **Pending** — see §"Backups" |
|
||||||
|
|
||||||
|
### Running a Gumroad webhook (PR 2)
|
||||||
|
|
||||||
|
Once PR 2 is deployed, each sale fires a `POST` to
|
||||||
|
`https://licenses.datatools.unalogix.com/webhooks/gumroad?secret=<gumroad_secret>`.
|
||||||
|
Auth is the URL secret (Gumroad's recommended pattern). The handler
|
||||||
|
audit-logs the raw payload, mints idempotently keyed on `sale_id`,
|
||||||
|
sends the buyer their blob via Postmark, and returns 200 (always —
|
||||||
|
non-2xx would trigger 3-day retry storms).
|
||||||
|
|
||||||
|
**Adding a new SKU:**
|
||||||
|
|
||||||
|
1. Create the product in Gumroad and copy its `product_id`.
|
||||||
|
2. Edit `/srv/datatools-license/app/server/config/products.yaml`,
|
||||||
|
add a row under `gumroad:` with that ID + the tier you sold.
|
||||||
|
3. `cd /srv/datatools-license && docker compose restart api` — the
|
||||||
|
config is read at startup and cached.
|
||||||
|
|
||||||
|
**Inspecting webhook activity:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recent webhook deliveries (all storefronts share this table)
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose exec -T postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses -c \
|
||||||
|
"SELECT received_at, order_id, processed, error FROM gumroad_events ORDER BY received_at DESC LIMIT 20;"'
|
||||||
|
|
||||||
|
# Failures only (replay candidates)
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose exec -T postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses -c \
|
||||||
|
"SELECT id, received_at, order_id, error FROM gumroad_events WHERE processed=false ORDER BY received_at DESC;"'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Replaying a failed webhook** (after fixing the products.yaml mapping
|
||||||
|
or whatever surfaced the error): the safest path is to ask the buyer
|
||||||
|
to re-trigger via Gumroad's "Send Test Ping" button in their order
|
||||||
|
record, *or* mint manually via `datatools-admin mint --source manual`
|
||||||
|
and add a note linking to the original `gumroad_events.id`.
|
||||||
|
|
||||||
|
**Testing without buyers:** Gumroad's seller dashboard has a "Send
|
||||||
|
Test Ping" button. It sets `test=true` in the payload; the adapter
|
||||||
|
tags the resulting license with `notes='gumroad test ping'` so it's
|
||||||
|
trivially filterable later.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## TL;DR — I just need a license for my dev machine
|
||||||
|
|
||||||
|
You're running from source, so the repo's embedded dev keypair signs and
|
||||||
|
verifies. No env vars needed.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_license.py \
|
||||||
|
--name "Michael Dombaugh" \
|
||||||
|
--email michael.dombaugh@gmail.com \
|
||||||
|
--tier core
|
||||||
|
```
|
||||||
|
|
||||||
|
Copy the `DTLIC2:…` blob from stdout, then activate:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli activate "DTLIC2:..." \
|
||||||
|
--name "Michael Dombaugh" \
|
||||||
|
--email michael.dombaugh@gmail.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli status
|
||||||
|
```
|
||||||
|
|
||||||
|
License lands at `~/.datatools/license.json`, valid 1 year.
|
||||||
|
|
||||||
|
> The `--name` / `--email` you pass to `activate` **must** match the values
|
||||||
|
> the blob was minted with — they're part of the signed payload.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key model (Ed25519, asymmetric)
|
||||||
|
|
||||||
|
| Key | Lives where | Used for |
|
||||||
|
|-----|------------|---------|
|
||||||
|
| **Private** (32 bytes hex) | Creator's password manager / KMS only | Signing license blobs |
|
||||||
|
| **Public** (32 bytes hex) | Baked into the shipped binary | Verifying blobs at activation |
|
||||||
|
|
||||||
|
The split is the whole point: an attacker with a copy of the binary still
|
||||||
|
can't mint blobs — they'd need the private key, which never ships.
|
||||||
|
|
||||||
|
There's also an in-tree **dev keypair** (`src/license/_dev_keypair.py`)
|
||||||
|
derived deterministically from a seed. It's used when no env vars are set,
|
||||||
|
so devs/tests can sign and verify locally without juggling secrets. Frozen
|
||||||
|
builds that still use it are rejected at startup by
|
||||||
|
`assert_production_safe` — see `src/license/crypto.py:84`.
|
||||||
|
|
||||||
|
Blob format prefix: `DTLIC2:` (v1 was HMAC; v2 is Ed25519).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## One-time setup — generating the production keypair
|
||||||
|
|
||||||
|
Run once, before the first paid release.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_keypair.py --output keypair.env
|
||||||
|
```
|
||||||
|
|
||||||
|
You'll get:
|
||||||
|
|
||||||
|
```
|
||||||
|
DATATOOLS_LICENSE_PRIVKEY=<64 hex chars> # KEEP SECRET
|
||||||
|
DATATOOLS_LICENSE_PUBKEY=<64 hex chars> # BAKE INTO BUILD
|
||||||
|
```
|
||||||
|
|
||||||
|
Then:
|
||||||
|
|
||||||
|
1. **Stash the private key** in a password manager / KMS / hardware token.
|
||||||
|
Losing it means no more renewals — see "Recovery" below.
|
||||||
|
2. **Delete `keypair.env`** from disk once stored.
|
||||||
|
3. **Set the public key** as `DATATOOLS_LICENSE_PUBKEY` in the PyInstaller
|
||||||
|
build environment. The shipped binary embeds it via the env at freeze time.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Minting a buyer license (production)
|
||||||
|
|
||||||
|
With the production private key loaded:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export DATATOOLS_LICENSE_PRIVKEY=<your-private-hex>
|
||||||
|
|
||||||
|
python scripts/generate_license.py \
|
||||||
|
--name "Buyer Name" \
|
||||||
|
--email buyer@example.com \
|
||||||
|
--tier core \
|
||||||
|
--years 1 \
|
||||||
|
--output buyer.dtlic
|
||||||
|
```
|
||||||
|
|
||||||
|
Flags:
|
||||||
|
|
||||||
|
| Flag | Default | Notes |
|
||||||
|
|------|---------|-------|
|
||||||
|
| `--name` | required | Buyer's full name. Goes into signed payload. |
|
||||||
|
| `--email` | required | Buyer's email. Goes into signed payload. |
|
||||||
|
| `--tier` | `core` | One of: `lite`, `core`, `pro` |
|
||||||
|
| `--years` | `1` | Lifetime in years |
|
||||||
|
| `--key` | random | Override the auto-generated license key |
|
||||||
|
| `--output` / `-o` | stdout | Write blob to file instead of printing |
|
||||||
|
|
||||||
|
Deliver the blob to the buyer either inline in the purchase email or as
|
||||||
|
the attached `.dtlic` file.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tiers
|
||||||
|
|
||||||
|
| Tier | Features |
|
||||||
|
|------|---------|
|
||||||
|
| **lite** | Find Duplicates, Clean Text, Standardize Formats |
|
||||||
|
| **core** | All 9 tools |
|
||||||
|
| **pro** | All 9 tools + future Pro-only features |
|
||||||
|
|
||||||
|
Source of truth: `src/license/features.py::all_features_for_tier`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Useful one-liners
|
||||||
|
|
||||||
|
Mint a free internal/team license (dev key, no env needed):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_license.py --name "QA Bot" --email qa@datatools.app --tier core --years 5
|
||||||
|
```
|
||||||
|
|
||||||
|
Mint with a stable, human-readable key:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_license.py --name "Acme Corp" --email ops@acme.com \
|
||||||
|
--tier pro --key "DT1-PRO-ACME-2026"
|
||||||
|
```
|
||||||
|
|
||||||
|
Renew an existing buyer (just re-mint with the same email; they paste the
|
||||||
|
new blob):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli renew "DTLIC2:..."
|
||||||
|
```
|
||||||
|
|
||||||
|
Check what's active locally:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli status
|
||||||
|
```
|
||||||
|
|
||||||
|
Wipe a local license (move to a new machine, debug a buyer issue):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli deactivate
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Customer record-keeping — the issuance log
|
||||||
|
|
||||||
|
Every successful `scripts/generate_license.py` run appends one JSON
|
||||||
|
line to a local **issuance log**. This is the creator-side system of
|
||||||
|
record for "who has a license" until the server-side flow in
|
||||||
|
`docs/LICENSE-SERVER.md` lands.
|
||||||
|
|
||||||
|
**Path:** `~/.datatools-creator/issued.jsonl` (override with
|
||||||
|
`$DATATOOLS_ISSUANCE_LOG`). Mode 600. Outside the buyer-facing
|
||||||
|
`~/.datatools/` dir so it never gets bundled into a shipped install.
|
||||||
|
|
||||||
|
**Format** — one record per line:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"license_key": "DT1-CORE-5dd8e1db-d90c4656",
|
||||||
|
"name": "Michael Dombaugh",
|
||||||
|
"email": "michael.dombaugh@gmail.com",
|
||||||
|
"tier": "core",
|
||||||
|
"issued_at": "2026-05-13T22:10:27Z",
|
||||||
|
"expires_at": "2031-05-13T22:10:27Z",
|
||||||
|
"blob": "DTLIC2:..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The full blob is stored so you can re-deliver to a buyer who lost
|
||||||
|
their email without re-minting (the re-minted blob would have a
|
||||||
|
different signature and would invalidate any device they'd already
|
||||||
|
activated against the old one).
|
||||||
|
|
||||||
|
**Useful operations:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Full list of issued licenses
|
||||||
|
cat ~/.datatools-creator/issued.jsonl | jq
|
||||||
|
|
||||||
|
# Find by buyer email
|
||||||
|
jq -r 'select(.email == "buyer@example.com")' ~/.datatools-creator/issued.jsonl
|
||||||
|
|
||||||
|
# Count by tier
|
||||||
|
jq -r .tier ~/.datatools-creator/issued.jsonl | sort | uniq -c
|
||||||
|
|
||||||
|
# Licenses expiring within 30 days (also matches already-expired ones; needs GNU date)
|
||||||
|
jq -r 'select(.expires_at < "'"$(date -u -d '+30 days' +%Y-%m-%dT%H:%M:%SZ)"'") | .email' \
|
||||||
|
~/.datatools-creator/issued.jsonl
|
||||||
|
|
||||||
|
# Re-deliver a buyer's blob
|
||||||
|
jq -r 'select(.email == "buyer@example.com") | .blob' \
|
||||||
|
~/.datatools-creator/issued.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
**Skipping the log** for test mints: pass `--no-log`. Never use this
|
||||||
|
for real buyer fulfillment — an unlogged mint is invisible to every
|
||||||
|
future query and to the eventual server-side migration.
|
||||||
|
|
||||||
|
**Backup:** treat this file like a small business ledger. Copy it
|
||||||
|
into your password manager / encrypted cloud sync alongside the
|
||||||
|
private key. Losing it doesn't break anything cryptographically (you
|
||||||
|
can still mint new licenses) but it does lose the customer list.
|
||||||
|
|
||||||
|
**Migrating to the server:** the JSONL schema is intentionally close
|
||||||
|
to the planned `licenses` table in `docs/LICENSE-SERVER.md`. Once the
|
||||||
|
server is up, a one-shot import script will read the JSONL and
|
||||||
|
insert each row.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recovery — what if the private key is lost?
|
||||||
|
|
||||||
|
Existing licenses keep working until they expire (the public key in the
|
||||||
|
shipped binary still verifies them). What breaks:
|
||||||
|
|
||||||
|
- **Renewals** — you can't mint a new blob for an existing buyer.
|
||||||
|
- **New sales** — you can't mint anything.
|
||||||
|
|
||||||
|
Path forward:
|
||||||
|
|
||||||
|
1. Generate a new keypair (`scripts/generate_keypair.py`).
|
||||||
|
2. Ship a new build with the new public key.
|
||||||
|
3. Re-issue every active buyer a new blob signed by the new private key.
|
||||||
|
4. Communicate the upgrade path to buyers.
|
||||||
|
|
||||||
|
Treat the private key like a code-signing cert — back it up to two
|
||||||
|
independent secure locations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files & code pointers
|
||||||
|
|
||||||
|
| Path | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `scripts/generate_keypair.py` | One-time keypair generation |
|
||||||
|
| `scripts/generate_license.py` | Mint a signed blob |
|
||||||
|
| `src/license/crypto.py` | Sign / verify / dev-key detection |
|
||||||
|
| `src/license/_dev_keypair.py` | In-tree dev keypair (never ships in prod) |
|
||||||
|
| `src/license/manager.py` | `assert_production_safe` startup check |
|
||||||
|
| `src/license/features.py` | Tier → features mapping |
|
||||||
|
| `src/license_cli.py` | End-user `activate` / `status` / `renew` / `deactivate` |
|
||||||
|
| `~/.datatools/license.json` | Where activated licenses are stored on each machine |
|
||||||
|
| `~/.datatools-creator/issued.jsonl` | Creator-side issuance log (one JSON line per mint) |
|
||||||
|
| `docs/LICENSE-SERVER.md` | Design for the future online issuance + record-keeping system |
|
||||||
|
| `docs/SETUP-LICENSE-SERVER.md` | Self-hosted server install runbook (DNS, Docker, nginx, TLS, backups) |
|
||||||
241
docs/ARCHITECTURE.md
Normal file
241
docs/ARCHITECTURE.md
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
# ARCHITECTURE — end-to-end view
|
||||||
|
|
||||||
|
Stitches the desktop app (`TECHNICAL.md`) and the license server
|
||||||
|
(`LICENSE-SERVER.md`) into a single picture. Read this first for "how
|
||||||
|
does it all fit together"; drill into the per-component docs for
|
||||||
|
detail.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. System diagram
|
||||||
|
|
||||||
|
```
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ OPERATOR / DEVELOPER LAPTOP │
|
||||||
|
│ │
|
||||||
|
│ git clone / push ←─── code lives in git.invixiom.com │
|
||||||
|
│ datatools-admin CLI ─── manual mints, list, revoke ─────┐ │
|
||||||
|
│ ssh -L 8090:127.0.0.1:8090 ───── tunnel for /internal/* ─────┤ │
|
||||||
|
└────────────────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
┌─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
│ internal Bearer-auth API (over SSH tunnel only)
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ LICENSE SERVER — 46.225.166.142 │
|
||||||
|
│ ───────────────────────────────────────────────────────────────── │
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ nginx 1.24 (TLS termination, public reverse proxy) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ datatools.unalogix.com → static placeholder │ │
|
||||||
|
│ │ licenses.datatools.unalogix.com → 127.0.0.1:8090 (FastAPI) │ │
|
||||||
|
│ │ /internal/* on public surface → blocked (404) │ │
|
||||||
|
│ └────────────────────────────┬─────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ┌────────────────────────────▼─────────────────────────────────────┐ │
|
||||||
|
│ │ FastAPI app — datatools-api (Docker container, UID 10001) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ┌──────────────────┐ ┌──────────────────┐ ┌───────────────┐ │ │
|
||||||
|
│ │ │ /webhooks/* │ │ /internal/* │ │ /health │ │ │
|
||||||
|
│ │ │ (storefronts) │ │ (Bearer-auth) │ │ (public) │ │ │
|
||||||
|
│ │ └────────┬─────────┘ └────────┬─────────┘ └───────────────┘ │ │
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ ▼ ▼ │ │
|
||||||
|
│ │ ┌────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ SourceAdapter (Protocol) — normalized │ │ │
|
||||||
|
│ │ │ • ManualAdapter • GumroadAdapter │ │ │
|
||||||
|
│ │ │ • (LemonSqueezy, Stripe — future) │ │ │
|
||||||
|
│ │ └────────────────┬───────────────────────┘ │ │
|
||||||
|
│ │ │ SaleEvent / RefundEvent │ │
|
||||||
|
│ │ ▼ │ │
|
||||||
|
│ │ ┌────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ mint_from_sale() │ │ │
|
||||||
|
│ │ │ • Ed25519 sign via PyCA cryptography │ │ │
|
||||||
|
│ │ │ • idempotent on (source, order_id) │ │ │
|
||||||
|
│ │ └────────────────┬───────────────────────┘ │ │
|
||||||
|
│ └────────────────────┼─────────────────────────────────────────────┘ │
|
||||||
|
│ │ SQL │
|
||||||
|
│ ┌────────────────────▼─────────────────────────────────────────────┐ │
|
||||||
|
│ │ Postgres 16 — datatools-postgres (container, vol pg_data) │ │
|
||||||
|
│ │ • licenses — authoritative customer record │ │
|
||||||
|
│ │ • gumroad_events — webhook audit log (idempotency, replay) │ │
|
||||||
|
│ └──────────────────────────────────────────────────────────────────┘ │
|
||||||
|
└───────────────────────┬────────────────────────────────┬───────────────┘
|
||||||
|
│ │
|
||||||
|
┌───────────┘ └──────────┐
|
||||||
|
│ POST /email (httpx) Gumroad Ping│
|
||||||
|
▼ POST │
|
||||||
|
┌───────────────────┐ ┌─────────────▼──┐
|
||||||
|
│ Postmark │ │ Gumroad │
|
||||||
|
│ (transactional │ │ (storefront, │
|
||||||
|
│ email) │ │ payments) │
|
||||||
|
└───────┬───────────┘ └────────────────┘
|
||||||
|
│ DKIM-signed email with license blob ▲
|
||||||
|
▼ │
|
||||||
|
┌────────────────────────────────────────────────────────────────┴───────┐
|
||||||
|
│ BUYER'S MACHINE │
|
||||||
|
│ │
|
||||||
|
│ Receives email ──► copies DTLIC2: blob ──► pastes into desktop app │
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ DataTools desktop (Python 3.12 + Streamlit + Typer CLIs) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ┌────────────────────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ Activate screen — verifies blob signature │ │ │
|
||||||
|
│ │ │ against EMBEDDED Ed25519 public key │ │ │
|
||||||
|
│ │ │ (NO network call to the license server, ever) │ │ │
|
||||||
|
│ │ └─────────────────────────┬──────────────────────────────────┘ │ │
|
||||||
|
│ │ ▼ │ │
|
||||||
|
│ │ ~/.datatools/license.json (signed blob, mode 644, on disk) │ │
|
||||||
|
│ └──────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ Pays via web browser ─────► Gumroad ────► (kicks off the Ping) │
|
||||||
|
└────────────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Three primary flows**, distinguishable by where the arrows
|
||||||
|
start in the diagram:
|
||||||
|
|
||||||
|
1. **Sale → fulfillment** (the automated path)
|
||||||
|
Buyer pays at Gumroad → Gumroad fires Ping to
|
||||||
|
`licenses.datatools.unalogix.com/webhooks/gumroad?secret=…` → nginx
|
||||||
|
→ FastAPI → audit-log row → adapter normalizes payload → `mint_from_sale`
|
||||||
|
writes the `licenses` row + Ed25519-signs the blob → Postmark emails
|
||||||
|
the buyer their blob. End-to-end latency: a few hundred milliseconds.
|
||||||
|
|
||||||
|
2. **Manual mint** (operator path — comps, support replacements)
|
||||||
|
Operator opens SSH tunnel → `datatools-admin mint` → `/internal/mint`
|
||||||
|
(Bearer-authed, never publicly reachable) → same `mint_from_sale`
|
||||||
|
path → blob returned in HTTP response. Operator delivers to buyer
|
||||||
|
out-of-band.
|
||||||
|
|
||||||
|
3. **Activation** (buyer path — fully offline)
|
||||||
|
Buyer pastes blob into desktop's Activate screen → desktop verifies
|
||||||
|
the Ed25519 signature against the public key **embedded in the
|
||||||
|
shipped binary** → license written to `~/.datatools/license.json`.
|
||||||
|
The desktop app makes **no network calls** to the license server at
|
||||||
|
any point. This preserves the "your data never leaves your computer"
|
||||||
|
promise (`DECISIONS.md §9b`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Tech stack
|
||||||
|
|
||||||
|
Layered view of what technology lives where. "External SaaS" entries
|
||||||
|
are services we depend on but don't operate.
|
||||||
|
|
||||||
|
```
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ DESKTOP APP (shipped binary, runs on buyer's box) │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ GUI │ Streamlit 1.35 — local web server, browser opens │
|
||||||
|
│ CLI │ Typer 0.12 — per-tool entry points │
|
||||||
|
│ Core logic │ pandas 2.x, numpy, rapidfuzz, charset-normalizer │
|
||||||
|
│ Crypto (verify) │ PyCA cryptography — Ed25519 public-key verify only │
|
||||||
|
│ Storage │ ~/.datatools/license.json (file, mode 644) │
|
||||||
|
│ i18n             │ JSON catalogs in src/i18n/ (internationalization)   │
|
||||||
|
│ Build │ PyInstaller — one-file binary, per OS │
|
||||||
|
│ Runtimes │ Python 3.12 (bundled into installer) │
|
||||||
|
│ Platforms │ Windows · macOS · Linux │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ LICENSE SERVER (this box; non-buyer-facing) │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ Edge │ nginx 1.24 + Let's Encrypt (auto-renew via timer) │
|
||||||
|
│ HTTP framework │ FastAPI 0.119 + Starlette + Pydantic v2 │
|
||||||
|
│ ASGI server │ uvicorn 0.39 (+uvloop, +httptools, +watchfiles) │
|
||||||
|
│ Form parsing │ python-multipart (for Gumroad form-encoded Pings) │
|
||||||
|
│ ORM │ SQLAlchemy 2.0 │
|
||||||
|
│ Migrations │ Alembic 1.18 (one initial migration so far) │
|
||||||
|
│ Database │ Postgres 16-alpine (containerized, single node) │
|
||||||
|
│ Database driver │ psycopg 3.3 (with binary wheel) │
|
||||||
|
│ Crypto (sign) │ PyCA cryptography — Ed25519 private-key sign │
|
||||||
|
│ HTTP client │ httpx 0.28 (Postmark calls, test mocking) │
|
||||||
|
│ Config │ Pydantic Settings + YAML (products.yaml) │
|
||||||
|
│ Container │ Docker + Docker Compose v2 plugin │
|
||||||
|
│ Image base │ python:3.12-slim │
|
||||||
|
│ Process user │ UID 10001 (non-root `app` user defined in image) │
|
||||||
|
│ Logging │ stdlib `logging` to container stdout → docker logs │
|
||||||
|
│ Host OS │ Ubuntu 24.04 LTS │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ OPERATOR / DEVELOPER MACHINE │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ Source control │ git → self-hosted Gitea (git.invixiom.com) │
|
||||||
|
│ Admin CLI │ Typer (src/admin_cli.py) │
|
||||||
|
│ Server access │ SSH tunnel for /internal/* (no public exposure) │
|
||||||
|
│ Break-glass │ scripts/generate_license.py (offline-only mints, │
|
||||||
|
│ │ used when the license server is unreachable) │
|
||||||
|
│ Test runner │ pytest 8.3 + SQLite in-memory (no docker required) │
|
||||||
|
│ Smoke test │ bash + docker compose (server/scripts/smoke.sh) │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ EXTERNAL SaaS / dependencies │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ Storefront │ Gumroad — Ping webhook to /webhooks/gumroad │
|
||||||
|
│ Transactional │ Postmark — HTTP API for license-delivery emails │
|
||||||
|
│ email │ (LoggingEmailService fallback when token unset) │
|
||||||
|
│ TLS CA │ Let's Encrypt — ACME HTTP-01 challenge via certbot │
|
||||||
|
│ Authoritative │ supercp / cPanel (your DNS host for unalogix.com) │
|
||||||
|
│ DNS │ — Cloudflare front-door deferred │
|
||||||
|
│ Source hosting │ Self-hosted Gitea (git.invixiom.com) — not on the │
|
||||||
|
│ │ datatools box; shares the same physical host │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Trust + isolation boundaries
|
||||||
|
|
||||||
|
Worth tracing explicitly because the threat model differs at each
|
||||||
|
boundary:
|
||||||
|
|
||||||
|
| Boundary | What crosses it | Trust model |
|
||||||
|
|---|---|---|
|
||||||
|
| Buyer ↔ Gumroad | Payment, buyer details | Out of scope — Gumroad's problem |
|
||||||
|
| Gumroad → license server (webhook) | POST authenticated by a shared secret in the URL | URL secret check; non-matching = 404 (no info leak); audit-log everything regardless |
|
||||||
|
| License server → Postmark | DKIM-signed transactional mail | Postmark verified-sender domain; HTTP API auth via server token |
|
||||||
|
| License server → Postgres | SQL over local docker bridge | Same compose project; password from on-disk secret file |
|
||||||
|
| Operator → license server (`/internal/*`) | Bearer token over SSH tunnel | Token only on disk + in the operator's env; nginx blocks `/internal/*` publicly as defense-in-depth |
|
||||||
|
| License server → buyer (email) | Plaintext blob in inbox | Buyer's email account hygiene; we deliberately don't encrypt — blob is self-protecting (signature) |
|
||||||
|
| Buyer → desktop app (activation) | Signed blob pasted in | Verified against pubkey **embedded in the shipped binary**; no network call |
|
||||||
|
|
||||||
|
The single most important property to preserve: **the desktop app
|
||||||
|
never talks to the license server.** All trust in the desktop comes
|
||||||
|
from the embedded public key + the signed blob. This is what makes
|
||||||
|
the offline activation guarantee real, and what keeps a license-server
|
||||||
|
outage from breaking buyers who've already activated.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Where things are stored
|
||||||
|
|
||||||
|
| Lives on… | Path / location | Contents |
|
||||||
|
|---|---|---|
|
||||||
|
| Buyer's machine | `~/.datatools/license.json` | Activated license blob |
|
||||||
|
| Buyer's email inbox | Email delivered via Postmark | Delivery copy of the blob |
|
||||||
|
| License server | `licenses` table (Postgres) | Authoritative customer record — name, email, tier, blob, source, order ID, promotion, amount paid |
|
||||||
|
| License server | `gumroad_events` table | Append-only webhook delivery audit log |
|
||||||
|
| License server | `/srv/datatools-license/secrets/` | Postgres password, admin Bearer token, (PR 2) Postmark token + Gumroad secret |
|
||||||
|
| License server | `/etc/letsencrypt/live/datatools.unalogix.com/` | TLS cert + key |
|
||||||
|
| Operator's laptop | `~/.datatools-creator/issued.jsonl` | Creator-side issuance log (pre-server era, kept as a break-glass backup) |
|
||||||
|
| Operator's laptop | Git clone of this repo | Source code, including `server/config/products.yaml` |
|
||||||
|
| Gitea | This repo's commits | Everything except secrets |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Related docs
|
||||||
|
|
||||||
|
| Doc | Scope |
|
||||||
|
|---|---|
|
||||||
|
| `TECHNICAL.md` | Desktop app internals (core libs, GUI, CLIs) |
|
||||||
|
| `LICENSE-SERVER.md` | Server architecture rationale + DB schema |
|
||||||
|
| `SETUP-LICENSE-SERVER.md` | Server install runbook (DNS, packages, nginx, TLS, Postgres) |
|
||||||
|
| `ADMIN.md` | Day-2 operations (minting, rotation, inspection) |
|
||||||
|
| `DECISIONS.md` | Architecture decision records — `§9b` = no online activation check |
|
||||||
|
| `USER-GUIDE.md` | Buyer-facing documentation |
|
||||||
347
docs/BUSINESS.md
347
docs/BUSINESS.md
@@ -1,278 +1,225 @@
|
|||||||
# BUSINESS.md - Business Case & Marketing Strategy
|
# Business
|
||||||
|
|
||||||
> **Creator-only document. Do not ship to buyers.**
|
> Creator-only. Do not ship to buyers.
|
||||||
|
> **Version**: 1.6 · **Updated**: 2026-05-01 · **Owner**: Michael
|
||||||
|
|
||||||
**Version**: 1.6
|
## 1. Executive summary
|
||||||
**Last updated**: April 28, 2026
|
|
||||||
**Owner**: Michael
|
|
||||||
|
|
||||||
---
|
Sell niche Python automation tools as one-time downloadable digital products. Target non-technical users who hate Excel/CSV grunt work but can't code. Distribute via Gumroad / Lemon Squeezy with automated delivery. Cross-platform from launch. Each bundle ships GUI (primary, browser-local) + CLI.
|
||||||
|
|
||||||
## 1. Executive Summary
|
- **Pricing**: $49-79 per bundle · $149 full suite (when 3+ exist).
|
||||||
|
- **Goal**: lifestyle cashflow. No saleable-asset exit required.
|
||||||
|
|
||||||
Sell niche-specific Python automation tools as one-time downloadable digital products. Target non-technical users who hate repetitive Excel/CSV work but cannot code. Distribute via Gumroad / Lemon Squeezy / Stripe with automated instant delivery. Cross-platform from launch (Windows, macOS, Linux). Each bundle ships with both a GUI (primary surface for non-technical buyers, runs in the buyer's browser locally) and a CLI (for power users and automation).
|
## 2. Market opportunity
|
||||||
|
|
||||||
**Pricing model**: One-time purchase. Individual bundles $49-$79. Full suite $149.
|
- Persistent, evergreen pain: data cleaning is universal.
|
||||||
|
- Low competition in vertical niches (Shopify pet-supplies feeds vs. generic CSV cleaners).
|
||||||
|
- ~100% gross margin after creation.
|
||||||
|
- Hosted browser demo as try-before-buy conversion lever (added v1.3).
|
||||||
|
|
||||||
**Goal**: Lifestyle cashflow. No saleable-asset exit required.
|
**Timing reality**: marketplaces + community posts → days/weeks to first sale. Own-domain SEO is a 6-18 month compounding asset, not an early channel.
|
||||||
|
|
||||||
---
|
## 3. Target customers
|
||||||
|
|
||||||
## 2. Market Opportunity
|
**Primary**:
|
||||||
|
- Shopify owners (Pet Supplies = priority niche).
|
||||||
- Persistent, evergreen pain: manual data cleaning is universal across small business and freelance work.
|
- Small business owners needing reporting + finance.
|
||||||
- Low competition in highly vertical niches (e.g., Shopify pet supplies feeds vs. generic CSV cleaners).
|
- Freelancers / consultants handling client data.
|
||||||
- High margin: near-100% gross margin after initial creation.
|
|
||||||
- Distribution leverage: marketplace search + community presence + programmatic SEO + **hosted browser demo as a try-before-buy conversion surface** (added v1.3, see Section 7).
|
|
||||||
|
|
||||||
**Realistic distribution timeline note**: Marketplace listings (Gumroad, Lemon Squeezy directory) and niche community posts can produce paying customers within days to weeks. New-domain SEO will not produce traction inside 90 days. Plan early-stage distribution around marketplaces, communities, and a hosted demo; treat owned-domain SEO as a 6-18 month compounding asset.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. Target Customers
|
|
||||||
|
|
||||||
Primary:
|
|
||||||
- Shopify store owners (Pet Supplies niche identified as priority).
|
|
||||||
- Small business owners needing reporting and finance automation.
|
|
||||||
- Freelancers and consultants who handle client data.
|
|
||||||
- Local marketing agencies.
|
- Local marketing agencies.
|
||||||
|
|
||||||
Anti-personas (do not waste effort here):
|
**Anti-personas**:
|
||||||
- Enterprise data teams (will build it themselves).
|
- Enterprise data teams (build their own).
|
||||||
- Pure technical buyers (will pip install something free).
|
- Pure technical buyers (`pip install` something free).
|
||||||
|
|
||||||
---
|
## 4. Product strategy
|
||||||
|
|
||||||
## 4. Product Strategy
|
**Lead**: Excel & CSV Data Cleaning Mastery Bundle (highest pain, broadest demand).
|
||||||
|
|
||||||
**Lead product**: Excel & CSV Data Cleaning Mastery Bundle (highest-pain, broadest demand).
|
**Roadmap**:
|
||||||
|
1. Data Cleaning Mastery (in progress)
|
||||||
|
2. Automated Business Reporting
|
||||||
|
3. Ecommerce Data Pipeline
|
||||||
|
4. Small Business Finance
|
||||||
|
5. Marketing Public Data Aggregation
|
||||||
|
6. AI Ecommerce Aggregation — Shopify Pet Supplies
|
||||||
|
|
||||||
**Bundle roadmap**:
|
**Sequence rule**: don't start bundle 2 until bundle 1 has paying customers + one external review. Building five skeleton bundles in parallel is a known failure mode.
|
||||||
1. Data Cleaning Mastery (lead, in progress).
|
|
||||||
2. Automated Business Reporting.
|
|
||||||
3. Ecommerce Data Pipeline.
|
|
||||||
4. Small Business Finance.
|
|
||||||
5. Marketing Public Data Aggregation.
|
|
||||||
6. AI Ecommerce Aggregation - Shopify Pet Supplies (vertical niche play).
|
|
||||||
|
|
||||||
**Sequence rule**: Do not start bundle 2 until bundle 1 has paying customers and at least one external review. Building five skeleton bundles in parallel is a known failure mode.
|
**Surface**: desktop install per OS (PyInstaller) with Streamlit GUI + CLI. Constrained demo on Streamlit Community Cloud.
|
||||||
|
|
||||||
**Product surface (locked v1.3)**: Each bundle ships as a desktop install (Windows / macOS / Linux) with both a Streamlit-based GUI and a CLI. A constrained version of the GUI is also deployed publicly to Streamlit Community Cloud as a free browser demo. See TECHNICAL.md Sections 1-3 and DECISIONS.md Section 4c for the full architecture.
|
## 4a. Lead bundle — Find Duplicates
|
||||||
|
|
||||||
---
|
Highest pain density across all 4 personas. Feeds landing copy, demo design, feature priority. Tech spec: TECHNICAL.md §11.1.
|
||||||
|
|
||||||
## 4a. Lead Bundle Deep Dive: Deduplicator Use Cases & Competitive Position (added v1.6)
|
### Use cases by persona
|
||||||
|
|
||||||
The deduplicator is the lead because it has the highest pain density across all four target personas. This section captures the use-case map, competitive landscape, and market gap statement. It feeds landing page copy, demo dataset design, and feature prioritization. Companion technical spec is in TECHNICAL.md Section 10.1.
|
**Shopify**:
|
||||||
|
1. Customer list cleanup (`john@gmail.com` vs `John@Gmail.com` vs `j.ohn@gmail.com`).
|
||||||
|
2. Product catalog dedup (SKU whitespace, near-identical names).
|
||||||
|
3. Abandoned-cart cleanup before re-engagement.
|
||||||
|
4. Order export consolidation across channels.
|
||||||
|
5. Subscriber-list hygiene before Klaviyo / Mailchimp import (per-contact pricing).
|
||||||
|
|
||||||
### Use cases by buyer persona
|
**Bookkeeper**:
|
||||||
|
6. Bank export reconciliation across overlapping date ranges.
|
||||||
|
7. Vendor list consolidation across QB + spreadsheets + email.
|
||||||
|
8. Customer master cleanup pre-invoicing migration.
|
||||||
|
9. Expense report dedup (same receipt twice).
|
||||||
|
|
||||||
**Shopify store owner (priority niche)**
|
**Freelancer**:
|
||||||
1. Customer list cleanup: same person with `john@gmail.com` and `John@Gmail.com` and `j.ohn@gmail.com` (Gmail ignores dots), or with two phone formats.
|
10. Pre-analysis cleanup of client dumps.
|
||||||
2. Product catalog dedup: same SKU listed with trailing whitespace, case differences, or near-identical names ("Dog Collar - Red - Large" vs "Dog Collar Red L").
|
11. Survey response dedup (same respondent, multiple devices).
|
||||||
3. Abandoned cart cleanup before re-engagement campaign (don't email the same person 3 times).
|
|
||||||
4. Order export consolidation when pulling from Shopify + Amazon + manual entry.
|
|
||||||
5. Subscriber list hygiene before importing to Klaviyo / Mailchimp (every duplicate costs money on per-contact pricing).
|
|
||||||
|
|
||||||
**Small business / bookkeeper**
|
|
||||||
6. Bank export reconciliation: same transaction appearing in two exports across overlapping date ranges.
|
|
||||||
7. Vendor list consolidation across QuickBooks, spreadsheets, and email.
|
|
||||||
8. Customer master record cleanup before invoicing migration.
|
|
||||||
9. Expense report dedup where employees submit the same receipt twice.
|
|
||||||
|
|
||||||
**Freelancer / consultant**
|
|
||||||
10. Pre-analysis cleanup of client-supplied data dumps (almost always have dupes).
|
|
||||||
11. Survey response dedup (same respondent submitting twice from different devices).
|
|
||||||
12. Lead list cleanup before client handoff.
|
12. Lead list cleanup before client handoff.
|
||||||
|
|
||||||
**Marketing agency**
|
**Marketing agency**:
|
||||||
13. Email list deduplication across multiple lead sources before campaign send.
|
13. Email-list dedup across lead sources.
|
||||||
14. Audience reconciliation when running multi-platform campaigns (Facebook + Google + organic forms).
|
14. Multi-platform audience reconciliation.
|
||||||
15. Suppression-list management (combine unsubscribes across lists).
|
15. Suppression-list management.
|
||||||
|
|
||||||
**Highest-pain, highest-frequency**: 1, 5, 6, 13. Build the feature set, sample dataset, and demo around these first. Landing page copy should lead with these scenarios. The hosted demo's pre-loaded dataset should make at least two of these cases obvious within ten seconds.
|
**Highest pain × frequency**: 1, 5, 6, 13. Build feature set + demo dataset + landing copy around these.
|
||||||
|
|
||||||
### Competitive landscape
|
### Competitive landscape
|
||||||
|
|
||||||
| Tool | Price | Strength | Weakness vs. this product |
|
| Tool | Price | Strength | Weakness |
|
||||||
|---|---|---|---|
|
|------|-------|----------|----------|
|
||||||
| Excel "Remove Duplicates" | Free | Universal, zero install | Exact match only. No fuzzy. No audit log. |
|
| Excel Remove Duplicates | Free | Universal, zero install | Exact only. No fuzzy. No audit. |
|
||||||
| Pandas `drop_duplicates` | Free | Powerful | Requires Python skill. Buyer doesn't have it. |
|
| Pandas `drop_duplicates` | Free | Powerful | Requires Python. |
|
||||||
| OpenRefine | Free | Powerful clustering, fuzzy | Steep learning curve, dated GUI, intimidating for non-technical users. |
|
| OpenRefine | Free | Powerful clustering | Steep curve, dated GUI. |
|
||||||
| Dedupe.io | ~$30+/mo SaaS | ML-based fuzzy | Recurring cost, cloud upload (privacy concern for client data), overkill for small jobs. |
|
| Dedupe.io | $30+/mo | ML fuzzy | Recurring + cloud upload. |
|
||||||
| WinPure / Data Ladder | $300-2000+ | Enterprise-grade | Wrong price tier and complexity for solo operators. |
|
| WinPure / Data Ladder | $300-2000+ | Enterprise | Wrong tier. |
|
||||||
| Power Query (Excel) | Free | Integrated | Exact match only, no fuzzy without M-code skill. |
|
| Power Query | Free | Integrated | Exact only without M-code. |
|
||||||
|
|
||||||
### The market gap this product fills
|
### Market gap
|
||||||
|
|
||||||
The market has a hole between "Excel (too basic)" and "OpenRefine / Dedupe.io (too complex or expensive or cloud-bound)." That hole is:
|
> Fuzzy match quality of OpenRefine, with the zero-learning UX of Excel, sold once for under $100, runs locally.
|
||||||
|
|
||||||
> Fuzzy match quality of OpenRefine, with the zero-learning-curve UX of Excel, sold once for under $100, runs locally on the buyer's machine.
|
Defensible **only if** fuzzy matching works without docs. Mediocre fuzzy → loses to free Excel. Learning required → loses to free OpenRefine. Tier 1 spec (TECHNICAL.md §11.1) is the minimum viable feature set to occupy this gap.
|
||||||
|
|
||||||
This is a defensible position **only if** the product delivers fuzzy match quality the buyer can trust without reading documentation. If fuzzy is mediocre, the product loses to free Excel. If UX requires learning, it loses to free OpenRefine. The Tier 1 functional spec in TECHNICAL.md Section 10.1 is the minimum viable feature set to occupy this gap.
|
|
||||||
|
|
||||||
### Pricing sanity check (lead bundle specifically)
|
|
||||||
|
|
||||||
$49-$79 is correct for this position. Above $99 the buyer expects SaaS support (which conflicts with the no-touch constraint). Below $30 it competes with free and signals "toy." See Section 5 for full pricing rationale.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 5. Pricing
|
## 5. Pricing
|
||||||
|
|
||||||
| Tier | Price | Notes |
|
| Tier | Price | Notes |
|
||||||
|---|---|---|
|
|------|-------|-------|
|
||||||
| Single bundle | $49 - $79 | Sweet spot for individual buyer impulse purchase |
|
| Single bundle | $49-79 | Impulse-purchase sweet spot for solo operators |
|
||||||
| Full suite (when 3+ bundles exist) | $149 | Anchor price, drives bundle attach |
|
| Full suite (3+ bundles) | $149 | Anchor; drives bundle attach |
|
||||||
|
|
||||||
Rationale: $49-$79 is below the threshold that triggers procurement / approval friction for solo operators. Above $99 the buyer expects a SaaS or human support.
|
**Why**: < $99 avoids procurement friction. > $99 triggers SaaS-support expectations that conflict with no-touch. < $30 competes with free, signals "toy".
|
||||||
|
|
||||||
---
|
## 6. Revenue targets
|
||||||
|
|
||||||
## 6. Revenue Targets (realistic, tiered)
|
|
||||||
|
|
||||||
Replacing the original "$50k/mo ceiling" target with evidence-based tiers for solo digital product sellers in this category:
|
|
||||||
|
|
||||||
| Horizon | Target | Notes |
|
| Horizon | Target | Notes |
|
||||||
|---|---|---|
|
|---------|--------|-------|
|
||||||
| First 90 days | First paying customer | Validates the funnel, not the business |
|
| 90 days | First paying customer | Validates funnel, not business |
|
||||||
| 6 months | $1,000 - $3,000 / mo | Achievable with working lead bundle + marketplace presence + hosted demo |
|
| 6 months | $1k-3k/mo | Lead bundle + marketplace + demo |
|
||||||
| 12 months | $5,000 / mo | Realistic 12-month goal. Triggers re-evaluation of the "fully async" constraint (see Section 8) |
|
| 12 months | $5k/mo | Triggers "fully async" revisit |
|
||||||
| 24 months | $10,000 / mo | Stretch target. Requires either a hit product or 3+ bundles compounding |
|
| 24 months | $10k/mo | Stretch. Needs hit product or 3+ bundles compounding |
|
||||||
|
|
||||||
$20k+/mo is achievable but requires a channel asset (audience, brand, content) that the current operator constraints exclude. Not a target.
|
$20k+/mo is achievable, but it requires an audience/brand asset that the operator constraints exclude.
|
||||||
|
|
||||||
---
|
## 7. Marketing
|
||||||
|
|
||||||
## 7. Marketing Strategy
|
### Channels (priority order, early stage)
|
||||||
|
|
||||||
**Channels (in priority order, early stage)**:
|
1. **Hosted browser demo** — free Streamlit Community Cloud, linked from every listing. Direct conversion lever for digital downloads where buyers can't evaluate quality otherwise.
|
||||||
1. **Hosted browser demo** (added v1.3). Free, public Streamlit Community Cloud deployment of a constrained version of each bundle. Linked prominently from every Gumroad / Lemon Squeezy listing and the landing page as "Try it free in your browser." Direct conversion lever: prospects can validate quality before purchase, which is otherwise impossible for digital downloads at this price.
|
2. Marketplace listings — Gumroad search, Lemon Squeezy directory, GitHub.
|
||||||
2. Marketplace listings (Gumroad search, Lemon Squeezy directory, GitHub).
|
3. Niche communities — value-first posts in subreddits, Indie Hackers, niche Slack/Discord. Demo doubles as the shareable asset.
|
||||||
3. Niche community presence (subreddits, Indie Hackers, niche Slack/Discord) - value-first posts, not promotion. The hosted demo doubles as the asset shared in these posts.
|
4. Programmatic SEO — long-tail landing pages (compounds over months).
|
||||||
4. Programmatic SEO landing pages targeting long-tail keywords (compounds over months).
|
|
||||||
5. Strong GitHub README as discovery surface.
|
5. Strong GitHub README as discovery surface.
|
||||||
|
|
||||||
**Hosted demo design**:
|
### Demo design
|
||||||
- Same core engine as the paid product, GUI front-end only.
|
|
||||||
- Constrained: row limit (e.g., 100 rows on output), watermark on output files, sample dataset preloaded plus optional small-file upload (capped size).
|
|
||||||
- Persistent CTA on every page: "Like what you see? Get the full version for $49 ->" linking to Gumroad.
|
|
||||||
- No login or signup required to use the demo. Friction kills conversion.
|
|
||||||
- Hosted on Streamlit Community Cloud (free) at launch. Migrate to $5/mo VPS if rate limits or branding constraints become an issue.
|
|
||||||
|
|
||||||
**Target keywords (long-tail, low competition)**:
|
- Same core engine as paid product, GUI-only.
|
||||||
- python csv cleaner bundle
|
- Constraints: row limit (100), output watermark, sample dataset preloaded + small upload (capped).
|
||||||
- excel data cleaning scripts
|
- Persistent CTA: *"Like what you see? Get the full version for $49 →"*.
|
||||||
- automated data deduplicator python
|
- No login. Friction kills conversion.
|
||||||
- csv duplicate removal tool
|
- Streamlit Community Cloud (free) at launch. $5/mo VPS if rate-limited.
|
||||||
- shopify product feed cleaner
|
|
||||||
|
|
||||||
**Funnel**:
|
### Target keywords
|
||||||
- Discovery (marketplace search / community post / SEO) -> Hosted demo (try-before-buy) -> Landing page -> Gumroad checkout -> Stripe payment -> automated email delivery -> upsell sequence to next bundle.
|
|
||||||
|
|
||||||
**Support model**: Self-serve documentation in every download. Email support only, no live chat, no calls.
|
`python csv cleaner bundle` · `excel data cleaning scripts` · `automated data deduplicator python` · `csv duplicate removal tool` · `shopify product feed cleaner`.
|
||||||
|
|
||||||
---
|
### Funnel
|
||||||
|
|
||||||
## 8. The "Fully Async, No-Touch" Constraint
|
Discovery → Demo (try-before-buy) → Landing page → Gumroad → Stripe → automated email delivery → upsell sequence to next bundle.
|
||||||
|
|
||||||
The locked criteria require fully automated, no-touch marketing and sales. This is preserved as the long-term steady state. However:
|
### Support
|
||||||
|
|
||||||
**Revisit trigger**: When monthly recurring revenue reaches $5,000/mo.
|
Self-serve docs in every download. Email only. No live chat, no calls.
|
||||||
|
|
||||||
**Why revisit**: At early stage, the no-touch constraint rules out the channels most likely to produce first traction (direct outreach to 50 Shopify pet operators, founder-led community participation, customer interviews). These are time-bounded activities, not permanent commitments. Strict adherence to "no-touch" before product-market fit may cost more revenue than it saves time.
|
## 8. The "fully async, no-touch" constraint
|
||||||
|
|
||||||
**Action at trigger**: Re-evaluate whether selective non-async activity (e.g., 2 hours/week of community participation, or a small founder audience build) would compound revenue faster than additional bundle development. Decision is yours; this document only flags the trigger.
|
Locked criteria require automated, no-touch marketing + sales. Long-term steady state.
|
||||||
|
|
||||||
Until $5k/mo, operate under the locked async-only rule.
|
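**Revisit trigger**: monthly revenue reaches $5k/mo (sales are one-time purchases, so this is monthly sales revenue, not recurring revenue).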
|
||||||
|
|
||||||
---
|
**Why**: pre-PMF, the no-touch rule excludes the channels most likely to produce first traction (founder outreach to 50 Shopify pet operators, community participation, customer interviews). Strict adherence may cost more revenue than it saves time.
|
||||||
|
|
||||||
## 9. Cost Structure
|
**Action at trigger**: re-evaluate selective non-async (e.g., 2 hr/wk community participation) vs. additional bundle dev. Decision lives with the operator; this just flags the trigger.
|
||||||
|
|
||||||
**Recurring (monthly budget cap: $1,200)**:
|
## 9. Cost structure
|
||||||
|
|
||||||
| Item | Cost | Notes |
|
Recurring monthly cap: **$1,200**.
|
||||||
|---|---|---|
|
|
||||||
| Gumroad / Lemon Squeezy fees | ~10% of revenue | Net of merchant fees, no flat cost |
|
|
||||||
| Domain | ~$15/yr | One-time annual |
|
|
||||||
| Hosting (landing pages) | $0 - $20/mo | Static hosting via Cloudflare Pages, Netlify, or GitHub Pages is free |
|
|
||||||
| Hosting (browser demos) | $0 at launch | Streamlit Community Cloud free tier. Plan for $5-10/mo VPS migration if scale or branding requires |
|
|
||||||
| Email service (transactional + sequences) | $0 - $30/mo | Free tier covers early volume |
|
|
||||||
| Apple Developer Program | $99/yr (~$8/mo) | Required for macOS code signing - see Section 10 |
|
|
||||||
| Inno Setup (Windows installer) | Free | One-time download |
|
|
||||||
| PyInstaller, Streamlit, Python tooling | Free | All open source |
|
|
||||||
| **Total fixed monthly** | **~$30-70/mo** | Well under $1,200 cap |
|
|
||||||
|
|
||||||
Headroom in the budget allows for optional ad spend ($100-200/mo) once a bundle has proven conversion data.
|
| Item | Cost |
|
||||||
|
|------|------|
|
||||||
|
| Gumroad / Lemon Squeezy fees | ~10% of revenue |
|
||||||
|
| Domain | ~$15/yr |
|
||||||
|
| Landing-page hosting | $0-20/mo (static via Cloudflare/Netlify/GH Pages) |
|
||||||
|
| Demo hosting | $0 at launch (Streamlit Community Cloud); plan $5-10/mo VPS migration |
|
||||||
|
| Email service | $0-30/mo |
|
||||||
|
| Apple Developer Program | $99/yr (~$8/mo) |
|
||||||
|
| Inno Setup, PyInstaller, Python | Free |
|
||||||
|
| **Total fixed monthly** | **~$30-70/mo** |
|
||||||
|
|
||||||
---
|
Headroom enables optional ad spend ($100-200/mo) once a bundle has proven conversion data.
|
||||||
|
|
||||||
## 10. macOS Code Signing (Apple Developer Program)
|
## 10. macOS code signing
|
||||||
|
|
||||||
**Required cost**: $99/year, paid to Apple.
|
**Cost**: $99/yr to Apple Developer Program. **Decision: pay it.**
|
||||||
|
|
||||||
**Why it's required**:
|
**Why required**: macOS Gatekeeper hard-blocks unsigned apps with *"This app cannot be opened because the developer cannot be verified"* — the only obvious button is "Move to Trash." The bypass (right-click → Open) exists but the target buyer won't perform it.
|
||||||
macOS includes a security layer (Gatekeeper) that blocks unsigned applications by default. When a non-technical buyer downloads an unsigned `.app` or `.dmg`, macOS shows a hard-block dialog: *"This app cannot be opened because the developer cannot be verified."* The only obvious button is "Move to Trash."
|
|
||||||
|
|
||||||
The bypass exists (right-click > Open, then confirm in a second dialog), but the target buyer persona will not perform it. The likely outcomes for unsigned Mac builds: refund requests, support tickets, or silent abandonment.
|
**What $99 buys**: code-signing certificate (removes hard block) + notarization service (removes "downloaded from internet" warning). Result: clean double-click experience.
|
||||||
|
|
||||||
**What the $99/yr buys**:
|
**Setup**: Apple ID + government ID (individuals) or D-U-N-S number (orgs). First approval takes 1-2 weeks. Once approved, sign + notarize is automated in CI.
|
||||||
- A code signing certificate. Removes the hard block.
|
|
||||||
- Notarization service (included). Apple scans the binary and stamps it; this removes the secondary "downloaded from internet" warning too. Result: clean double-click-to-run experience.
|
|
||||||
|
|
||||||
**Setup notes**:
|
## 11. Risks & mitigation
|
||||||
- Requires Apple ID + government ID (for individuals) or D-U-N-S number (for organizations).
|
|
||||||
- First-time approval takes 1-2 weeks. Plan accordingly.
|
|
||||||
- Once approved, signing and notarization is automated in the build pipeline (see TECHNICAL.md).
|
|
||||||
|
|
||||||
**Decision**: Pay for it. The cost is trivial relative to the conversion-rate impact for the non-technical buyer persona.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 11. Risks & Mitigation
|
|
||||||
|
|
||||||
| Risk | Mitigation |
|
| Risk | Mitigation |
|
||||||
|---|---|
|
|------|------------|
|
||||||
| Commoditization (free scripts on GitHub) | Niche verticals + polished GUI + cross-platform installers + hosted demo |
|
| Free GitHub scripts commoditize | Niche verticals + polished GUI + cross-platform installers + hosted demo |
|
||||||
| Slow early traction | Lead with hosted demo + marketplaces + communities, not own-domain SEO |
|
| Slow early traction | Lead with demo + marketplaces + communities, not own-domain SEO |
|
||||||
| Refund chargebacks | Clear scope on landing page, hosted demo lets buyers validate before purchase, working samples included |
|
| Refunds / chargebacks | Clear scope on landing page; demo lets buyers validate before purchase; working samples included |
|
||||||
| macOS install friction | Apple Developer Program ($99/yr), code signing + notarization |
|
| macOS install friction | Apple Dev Program ($99/yr), code sign + notarize |
|
||||||
| Browser-launch UX confusion (GUI opens in browser locally) | Single sentence in installer welcome and email; persistent in-app "runs locally, no internet used" message; pywebview native-window wrap as v1.1 enhancement if needed |
|
| Browser-launch UX confusion | One sentence in installer + email; persistent in-app "runs locally" message; pywebview wrap as v1.1 if needed |
|
||||||
| Customer support burden | Robust installers, idiot-proof docs, sample data included, hosted demo lets prospects self-evaluate |
|
| Support burden | Robust installers, idiot-proof docs, sample data included |
|
||||||
| IP theft / resale | License file. Accept this is partial protection; focus on staying ahead via updates |
|
| IP theft / resale | License file. Accept partial protection; focus on staying ahead via updates |
|
||||||
| Platform risk (Gumroad / Lemon Squeezy policy change) | Multi-marketplace from day one; own domain as fallback |
|
| Marketplace policy change | Multi-marketplace day 1; own domain as fallback |
|
||||||
| Streamlit project direction change breaks desktop packaging | Low probability; flagged as criteria-relock trigger in DECISIONS.md Section 8 |
|
| Streamlit direction change | Low probability; flagged as criteria-relock trigger in DECISIONS §8 |
|
||||||
|
|
||||||
---
|
## 12. Success metrics (monthly)
|
||||||
|
|
||||||
## 12. Success Metrics
|
|
||||||
|
|
||||||
Tracked monthly:
|
|
||||||
- Units sold per bundle.
|
- Units sold per bundle.
|
||||||
- Conversion rate (landing page -> purchase).
|
- Conversion rate (landing → purchase).
|
||||||
- **Demo-to-purchase conversion rate** (added v1.3): hosted demo visits -> Gumroad clicks -> purchases.
|
- **Demo-to-purchase rate** (added v1.3): demo visits → Gumroad clicks → purchases.
|
||||||
- Refund rate (target < 5%).
|
- Refund rate (target < 5%).
|
||||||
- Support tickets per 100 sales (target < 10).
|
- Support tickets / 100 sales (target < 10).
|
||||||
- Organic traffic to product pages.
|
- Organic traffic to product pages.
|
||||||
- Per-platform install success rate (Windows, macOS, Linux).
|
- Per-platform install success.
|
||||||
|
|
||||||
---
|
## 13. Honest status (2026-05-01)
|
||||||
|
|
||||||
## 13. Honest Status (April 28, 2026)
|
- 3 of 9 tools shipped (Find Duplicates, Clean Text, Standardize Formats).
|
||||||
|
- Cross-platform build pipeline designed, not yet built.
|
||||||
- 1 of 9 scripts is real and tested (`01_deduplicator.py`). The other 8 are skeletons. **Expected at project start.**
|
- macOS code signing not yet set up.
|
||||||
- Cross-platform build pipeline (PyInstaller-based) designed but not yet built.
|
- Streamlit GUI shipped for the 3 ready tools.
|
||||||
- macOS code signing not yet set up (Apple Developer Program enrollment pending).
|
|
||||||
- Streamlit GUI not yet built (locked as the framework as of v1.3).
|
|
||||||
- Hosted demo not yet deployed.
|
- Hosted demo not yet deployed.
|
||||||
- No paying customers yet.
|
- No paying customers.
|
||||||
- No live landing page yet.
|
- No live landing page.
|
||||||
|
|
||||||
**Next concrete steps before any marketing spend**:
|
**Next concrete steps before marketing spend**:
|
||||||
1. Build the Streamlit GUI for the lead script (`01_deduplicator.py`). Apply UX standards from DECISIONS.md Section 4b.
|
1. Stand up the PyInstaller pipeline with Streamlit launcher (1-3 days first time).
|
||||||
2. Stand up the PyInstaller cross-platform build pipeline with Streamlit launcher (see TECHNICAL.md Sections 3.3 and 3.4). Budget 1-3 days for first-time Streamlit-PyInstaller integration.
|
2. Deploy constrained demo to Streamlit Community Cloud.
|
||||||
3. Deploy the constrained demo version to Streamlit Community Cloud.
|
3. Enroll in Apple Developer Program (start in parallel — 1-2 wk lead time).
|
||||||
4. Enroll in Apple Developer Program (1-2 week lead time - start in parallel with the above).
|
4. Single landing page for the lead bundle, demo prominently linked.
|
||||||
5. Stand up a single landing page for the lead bundle, with the hosted demo prominently linked.
|
5. Finish 2 more tools to Ready state (CLI + GUI).
|
||||||
6. Finish at least 2 more of the 9 scripts to working state with both CLI and GUI.
|
6. List on Gumroad with sample output proof, per-platform installers, demo link.
|
||||||
7. List on Gumroad with sample output proof, per-platform installer downloads, and hosted demo link.
|
|
||||||
|
|||||||
239
docs/CLI-REFERENCE.es.md
Normal file
239
docs/CLI-REFERENCE.es.md
Normal file
@@ -0,0 +1,239 @@
|
|||||||
|
> 🌐 **Idioma:** Español · [English](CLI-REFERENCE.md)
|
||||||
|
|
||||||
|
# Referencia de la CLI
|
||||||
|
|
||||||
|
> ⚠️ Los comandos, banderas y valores de las opciones son **idénticos en ambos idiomas**. La CLI emite todos sus mensajes en inglés; este documento traduce las explicaciones, no los comandos.
|
||||||
|
|
||||||
|
Tres módulos de CLI, uno por cada herramienta Lista:
|
||||||
|
|
||||||
|
| Módulo | Comando | Propósito |
|
||||||
|
|--------|---------|---------|
|
||||||
|
| `src.cli` | `python -m src.cli FILE` | Buscar duplicados |
|
||||||
|
| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Limpiar texto |
|
||||||
|
| `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analizador (escaneo de solo lectura) |
|
||||||
|
|
||||||
|
Cada comando es **previsualización por defecto** — añade `--apply` para escribir la salida.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Buscar duplicados
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m src.cli ARCHIVO_ENTRADA [OPCIONES]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Opciones
|
||||||
|
|
||||||
|
### Generales
|
||||||
|
- `--apply` — escribe los archivos de salida (por defecto: previsualización).
|
||||||
|
- `-o, --output RUTA` — ruta de salida (por defecto `{input}_deduplicated.csv`).
|
||||||
|
|
||||||
|
### Selección de columnas
|
||||||
|
- `-s, --subset COLS` — columnas separadas por comas en las que hacer la coincidencia (por defecto: detección automática).
|
||||||
|
- `-k, --key COLS` — columnas de clave fuerte; cada una se convierte en una estrategia independiente de coincidencia exacta (`fb_id`, `ein`, `sku`).
|
||||||
|
|
||||||
|
### Coincidencia difusa
|
||||||
|
- `--fuzzy COLS` — columnas separadas por comas para coincidencia difusa.
|
||||||
|
- `-a, --algorithm ALG` — `levenshtein` / `jaro_winkler` (por defecto) / `token_set_ratio`.
|
||||||
|
- `-t, --threshold N` — similitud 0-100 (por defecto 85).
|
||||||
|
|
||||||
|
### Normalización
|
||||||
|
- `--normalize COL:TIPO` — pares `col:tipo` separados por comas. Tipos: `email`, `phone`, `name`, `address`, `string`.
|
||||||
|
|
||||||
|
| Tipo | Efecto | Ejemplo |
|
||||||
|
|------|--------|---------|
|
||||||
|
| `email` | minúsculas, elimina puntos de Gmail, elimina `+etiqueta` | `John.Doe+x@gmail.com` → `johndoe@gmail.com` |
|
||||||
|
| `phone` | E.164 (extensión preservada) | `(555) 123-4567 ext 100` → `+15551234567;ext=100` |
|
||||||
|
| `name` | elimina títulos + sufijos + partículas, baja a minúsculas | `Dr. Charles de Gaulle Jr.` → `charles gaulle` |
|
||||||
|
| `address` | abreviaturas USPS + nombre de estado → 2 letras, minúsculas | `123 Main Street, California` → `123 main st ca` |
|
||||||
|
| `string` | recorta + colapsa + minúsculas | ` HELLO WORLD ` → `hello world` |
|
||||||
|
|
||||||
|
### Selección del superviviente
|
||||||
|
- `--survivor REGLA` — `first` (por defecto) / `last` / `most-complete` / `most-recent`.
|
||||||
|
- `--date-column COL` — obligatoria para `most-recent`.
|
||||||
|
- `--merge` — rellena los huecos del superviviente desde las filas eliminadas.
|
||||||
|
|
||||||
|
### Revisión interactiva
|
||||||
|
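- `--review` — pregunta `y`/`n`/`s` (fusionar / conservar ambas / saltar el resto) por cada grupo de coincidencias, con diff lado a lado. Las teclas son las de la CLI en inglés: `s` significa «skip», no «sí».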
|
||||||
|
|
||||||
|
### Configuración
|
||||||
|
- `--config RUTA` — carga toda la configuración desde un JSON.
|
||||||
|
- `--save-config RUTA` — guarda la configuración actual en un JSON.
|
||||||
|
|
||||||
|
### Manejo de archivos
|
||||||
|
- `--sheet NOMBRE|N` — nombre de hoja de Excel o índice base 0.
|
||||||
|
- `--encoding ENC` — anula la codificación autodetectada.
|
||||||
|
- `--header-row N` — fila de encabezado en base 0.
|
||||||
|
|
||||||
|
## Recetas
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Deduplicación básica con autodetección
|
||||||
|
python -m src.cli customers.csv [--apply]
|
||||||
|
|
||||||
|
# Coincidencia difusa de nombres al 80%
|
||||||
|
python -m src.cli customers.csv --fuzzy name --threshold 80 --apply
|
||||||
|
|
||||||
|
# Múltiples claves fuertes (lógica OR)
|
||||||
|
python -m src.cli donors.csv --key fb_id,ein --apply
|
||||||
|
|
||||||
|
# Fila más completa + fusionar campos faltantes
|
||||||
|
python -m src.cli contacts.csv --survivor most-complete --merge --apply
|
||||||
|
|
||||||
|
# Más reciente + fusión
|
||||||
|
python -m src.cli contacts.csv --survivor most-recent --date-column updated_at --merge --apply
|
||||||
|
|
||||||
|
# Revisión interactiva
|
||||||
|
python -m src.cli customers.csv --review --apply
|
||||||
|
|
||||||
|
# Guardar / cargar perfil
|
||||||
|
python -m src.cli customers.csv --fuzzy name --threshold 80 --save-config dedup.json
|
||||||
|
python -m src.cli new.csv --config dedup.json --apply
|
||||||
|
|
||||||
|
# Excel
|
||||||
|
python -m src.cli data.xlsx --sheet "Sales" --apply
|
||||||
|
```
|
||||||
|
|
||||||
|
## Algoritmos
|
||||||
|
|
||||||
|
- **`jaro_winkler`** (por defecto) — el mejor para cadenas cortas (nombres); pondera los primeros caracteres.
|
||||||
|
- **`levenshtein`** — ratio de distancia de edición; errores tipográficos y transposiciones.
|
||||||
|
- **`token_set_ratio`** — el mejor para direcciones; ignora el orden de las palabras.
|
||||||
|
|
||||||
|
## Detección automática
|
||||||
|
|
||||||
|
Cuando no se pasan banderas `--subset` / `--fuzzy`, las columnas se detectan por su nombre:
|
||||||
|
|
||||||
|
| Patrón | Algoritmo | Umbral | Normalizador | Clave |
|
||||||
|
|---------|-----------|-----------|------------|-----|
|
||||||
|
| Email | exacto | 100% | email | fuerte |
|
||||||
|
| Teléfono | exacto | 100% | phone | fuerte |
|
||||||
|
| Nombre | jaro_winkler | 85% | name | débil |
|
||||||
|
| Dirección | token_set_ratio | 80% | address | débil |
|
||||||
|
|
||||||
|
**Reglas de estrategia**: claves fuertes → OR independiente; claves débiles → AND emparejadas con cada clave fuerte; sin claves fuertes → las débiles se promueven a independientes; sin patrones → coincidencia exacta en todas las columnas.
|
||||||
|
|
||||||
|
## Archivos de salida (con `--apply`)
|
||||||
|
|
||||||
|
| Archivo | Contenido |
|
||||||
|
|------|----------|
|
||||||
|
| `{stem}_deduplicated.csv` | Datos limpios |
|
||||||
|
| `{stem}_removed.csv` | Filas eliminadas |
|
||||||
|
| `{stem}_match_groups.csv` | `_group_id`, `_is_survivor`, `_confidence`, `_matched_on`, `_original_row` + columnas originales |
|
||||||
|
|
||||||
|
Registro: `logs/dedup_YYYYMMDD_HHMMSS.log`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Limpiar texto
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m src.cli_text_clean ARCHIVO_ENTRADA [OPCIONES]
|
||||||
|
```
|
||||||
|
|
||||||
|
Higiene a nivel de carácter. Ver [TECHNICAL.md §10.2](TECHNICAL.md) (solo en inglés) para la especificación.
|
||||||
|
|
||||||
|
## Opciones
|
||||||
|
|
||||||
|
### Generales
|
||||||
|
- `--apply` — escribe la salida (por defecto: previsualización).
|
||||||
|
- `-o, --output RUTA` — ruta de salida (por defecto `{input}_cleaned.csv`).
|
||||||
|
- `--preset NOMBRE` — `minimal` / `excel-hygiene` (por defecto) / `paranoid`.
|
||||||
|
|
||||||
|
### Alcance
|
||||||
|
- `--columns COLS` — columnas separadas por comas a limpiar (por defecto: todas las columnas de texto).
|
||||||
|
- `--skip COLS` — excluye estas columnas.
|
||||||
|
|
||||||
|
### Anulaciones por operación (anulan el preset activo)
|
||||||
|
- `--no-trim`, `--no-collapse`, `--no-nfc`, `--nfkc`, `--no-smart-chars`, `--no-zero-width`, `--no-bom`, `--no-control`, `--no-line-endings`.
|
||||||
|
|
||||||
|
### Mayúsculas / minúsculas
|
||||||
|
- `--case MODO` — `upper` / `lower` / `title` / `sentence`. O por columna: `--case title:name,upper:sku`.
|
||||||
|
- El modo título preserva los tokens en mayúsculas (`USA`) y deja en minúsculas las partículas internas (`of`, `and`).
|
||||||
|
|
||||||
|
### Auditoría + configuración
|
||||||
|
- `--full-changelog` — escribe todos los cambios (por defecto se limita a los primeros 1000).
|
||||||
|
- `--config RUTA` / `--save-config RUTA`.
|
||||||
|
|
||||||
|
### Archivo
|
||||||
|
- `--sheet`, `--encoding`, `--header-row` — iguales que en Buscar duplicados.
|
||||||
|
|
||||||
|
## Presets
|
||||||
|
|
||||||
|
| Preset | Qué hace |
|
||||||
|
|--------|--------------|
|
||||||
|
| `minimal` | Solo recorte y colapso. |
|
||||||
|
| `excel-hygiene` (por defecto) | Recorte, colapso, NFC, plegado de caracteres tipográficos, eliminación de caracteres invisibles, eliminación de BOM, eliminación de caracteres de control, normalización de finales de línea. |
|
||||||
|
| `paranoid` | `excel-hygiene` + plegado de compatibilidad NFKC (con pérdida). |
|
||||||
|
|
||||||
|
## Recetas
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Valores por defecto seguros (previsualiza, luego aplica)
|
||||||
|
python -m src.cli_text_clean messy.csv [--apply]
|
||||||
|
|
||||||
|
# Solo recorte y colapso, sin tocar Unicode
|
||||||
|
python -m src.cli_text_clean messy.csv --preset minimal --apply
|
||||||
|
|
||||||
|
# Nombres en formato título, SKUs en mayúsculas
|
||||||
|
python -m src.cli_text_clean people.csv --case title:name,upper:sku --apply
|
||||||
|
|
||||||
|
# Limpiar solo columnas concretas
|
||||||
|
python -m src.cli_text_clean orders.csv --columns vendor,product --apply
|
||||||
|
|
||||||
|
# Excluir una columna de notas en texto libre
|
||||||
|
python -m src.cli_text_clean tickets.csv --skip notes --apply
|
||||||
|
```
|
||||||
|
|
||||||
|
## Archivos de salida (con `--apply`)
|
||||||
|
|
||||||
|
| Archivo | Contenido |
|
||||||
|
|------|----------|
|
||||||
|
| `{stem}_cleaned.csv` | Datos limpios |
|
||||||
|
| `{stem}_changes.csv` | `row`, `column`, `old`, `new`, `ops_applied` (limitado a 1000; `--full-changelog` quita el límite) |
|
||||||
|
|
||||||
|
Registro: `logs/text_clean_YYYYMMDD_HHMMSS.log`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# Analizador
|
||||||
|
|
||||||
|
```
|
||||||
|
python -m src.cli_analyze ARCHIVO_ENTRADA [OPCIONES]
|
||||||
|
```
|
||||||
|
|
||||||
|
Escaneo de solo lectura; muestra todos los hallazgos del detector sin modificar el archivo.
|
||||||
|
|
||||||
|
## Opciones
|
||||||
|
- `--sample-rows N` — límite de filas escaneadas (por defecto 1000).
|
||||||
|
- `--json` — imprime los hallazgos como un array JSON en stdout.
|
||||||
|
- `--strict` — sale con un código distinto de cero ante cualquier hallazgo `warn`/`error`.
|
||||||
|
|
||||||
|
## Esquema JSON (un objeto por hallazgo)
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"id": "smart_punctuation_in_data",
|
||||||
|
"severity": "warn",
|
||||||
|
"confidence": "high",
|
||||||
|
"fix_action": "fold_smart_punctuation",
|
||||||
|
"pre_applied": false,
|
||||||
|
"tool": "02_text_cleaner",
|
||||||
|
"count": 17,
|
||||||
|
"description": "17 cell(s) contain curly quotes…",
|
||||||
|
"column": null,
|
||||||
|
"samples": [{"row": 3, "column": "name", "value": "“Alice”"}]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Significado de los campos
|
||||||
|
- `severity` — `info` / `warn` / `error`. Solo `error` bloquea la verificación de la GUI.
|
||||||
|
- `confidence` — `high` (un clic), `medium` (previsualiza), `low` (opt-in).
|
||||||
|
- `fix_action` — id del algoritmo en `src/core/fixes.py`. Vacío si es solo informativo.
|
||||||
|
- `pre_applied` — `true` para correcciones ya aplicadas durante la lectura a nivel de bytes.
|
||||||
|
|
||||||
|
## Detectores
|
||||||
|
|
||||||
|
Puntuación tipográfica, espacios NBSP / Unicode, caracteres de ancho cero, encabezados sucios, relleno con espacios, centinelas tipo null, huellas de mojibake, columnas de email con mayúsculas/minúsculas mezcladas, formatos de fecha inconsistentes, filas casi duplicadas, identificadores con ceros a la izquierda, finales de línea mezclados, fallo de decodificación de codificación, presencia de U+FFFD.
|
||||||
|
|
||||||
|
Agregar un detector: añade la entrada en `analyze.py` y la corrección correspondiente en `fixes.py`. Ningún otro punto de llamada cambia.
|
||||||
@@ -1,431 +1,213 @@
|
|||||||
|
> 🌐 **Language:** English · [Español](CLI-REFERENCE.es.md)
|
||||||
|
|
||||||
# CLI Reference
|
# CLI Reference
|
||||||
|
|
||||||
Complete command-line reference for the DataTools bundle.
|
Three CLI modules, one per Ready tool:
|
||||||
|
|
||||||
DataTools ships two CLI modules so each script can be invoked independently:
|
|
||||||
|
|
||||||
| Module | Command | Purpose |
|
| Module | Command | Purpose |
|
||||||
|---|---|---|
|
|--------|---------|---------|
|
||||||
| `src.cli` | `python -m src.cli INPUT_FILE [OPTIONS]` | Deduplicator (script 01) |
|
| `src.cli` | `python -m src.cli FILE` | Find Duplicates |
|
||||||
| `src.cli_text_clean` | `python -m src.cli_text_clean INPUT_FILE [OPTIONS]` | Text cleaner (script 02) |
|
| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Clean Text |
|
||||||
|
| `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analyzer (read-only scan) |
|
||||||
|
|
||||||
The deduplicator section is below; the text cleaner reference is in [Section: Text Cleaner CLI](#text-cleaner-cli).
|
Every command is **preview-only by default** — add `--apply` to write output.
|
||||||
|
|
||||||
## Deduplicator
|
---
|
||||||
|
|
||||||
|
# Find Duplicates
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli INPUT_FILE [OPTIONS]
|
python -m src.cli INPUT_FILE [OPTIONS]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Arguments
|
|
||||||
|
|
||||||
| Argument | Required | Description |
|
|
||||||
|----------|----------|-------------|
|
|
||||||
| `INPUT_FILE` | Yes | Path to the CSV, delimited text, or Excel file to deduplicate |
|
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
|
|
||||||
### Core
|
### Core
|
||||||
|
- `--apply` — write output files (default: preview).
|
||||||
|
- `-o, --output PATH` — output path (default `{input}_deduplicated.csv`).
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
### Column selection
|
||||||
|------|-------|---------|-------------|
|
- `-s, --subset COLS` — comma-separated columns to match on (default: auto-detect).
|
||||||
| `--apply` | | `false` | Write output files. Without this flag, only a preview is shown. |
|
- `-k, --key COLS` — strong-key columns; each becomes an independent exact-match strategy (`fb_id`, `ein`, `sku`).
|
||||||
| `--output` | `-o` | `{input}_deduplicated.csv` | Output file path. |
|
|
||||||
|
|
||||||
### Column Selection
|
### Fuzzy matching
|
||||||
|
- `--fuzzy COLS` — comma-separated columns to fuzzy-match.
|
||||||
| Flag | Short | Default | Description |
|
- `-a, --algorithm ALG` — `levenshtein` / `jaro_winkler` (default) / `token_set_ratio`.
|
||||||
|------|-------|---------|-------------|
|
- `-t, --threshold N` — similarity 0-100 (default 85).
|
||||||
| `--subset` | `-s` | auto-detect | Comma-separated columns to match on. When omitted, columns are auto-detected by name pattern (email, phone, name, address). |
|
|
||||||
| `--key` | `-k` | none | Comma-separated strong-key columns. Each becomes an independent exact-match strategy. Use for identifiers like `fb_id`, `ein`, `sku`. |
|
|
||||||
|
|
||||||
### Fuzzy Matching
|
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
|
||||||
|------|-------|---------|-------------|
|
|
||||||
| `--fuzzy` | | none | Comma-separated columns to fuzzy-match. Other columns in the strategy use exact matching. |
|
|
||||||
| `--algorithm` | `-a` | `jaro_winkler` | Fuzzy algorithm: `levenshtein`, `jaro_winkler`, or `token_set_ratio`. |
|
|
||||||
| `--threshold` | `-t` | `85` | Similarity threshold 0-100. Lower values find more matches but increase false positives. |
|
|
||||||
|
|
||||||
### Normalization
|
### Normalization
|
||||||
|
- `--normalize COL:TYPE` — comma-separated `col:type` pairs. Types: `email`, `phone`, `name`, `address`, `string`.
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
| Type | Effect | Example |
|
||||||
|------|-------|---------|-------------|
|
|------|--------|---------|
|
||||||
| `--normalize` | | auto-detect | Column normalizers as `col:type` pairs, comma-separated. Types: `email`, `phone`, `name`, `address`, `string`. |
|
| `email` | lowercase, strip Gmail dots, strip `+tag` | `John.Doe+x@gmail.com` → `johndoe@gmail.com` |
|
||||||
|
| `phone` | E.164 (+ ext preserved) | `(555) 123-4567 ext 100` → `+15551234567;ext=100` |
|
||||||
|
| `name` | strip titles + suffixes + particles, case-fold | `Dr. Charles de Gaulle Jr.` → `charles gaulle` |
|
||||||
|
| `address` | USPS abbrevs + state name → 2-letter, case-fold | `123 Main Street, California` → `123 main st ca` |
|
||||||
|
| `string` | trim + collapse + case-fold | ` HELLO WORLD ` → `hello world` |
|
||||||
|
|
||||||
**Normalizer details:**
|
### Survivor selection
|
||||||
|
- `--survivor RULE` — `first` (default) / `last` / `most-complete` / `most-recent`.
|
||||||
|
- `--date-column COL` — required for `most-recent`.
|
||||||
|
- `--merge` — fill blanks in survivor from removed rows.
|
||||||
|
|
||||||
| Type | What it does | Example |
|
### Interactive review
|
||||||
|------|-------------|---------|
|
- `--review` — prompt y/n/s per match group with side-by-side diff.
|
||||||
| `email` | Lowercase, strip Gmail dots, strip `+tag` suffixes | `John.Doe+tag@gmail.com` → `johndoe@gmail.com` |
|
|
||||||
| `phone` | Parse to E.164 format; fallback: digits only | `(555) 123-4567` → `+15551234567` |
|
|
||||||
| `name` | Strip titles (Dr., Mr.) and suffixes (Jr., PhD), case-fold | `Dr. John Smith Jr.` → `john smith` |
|
|
||||||
| `address` | USPS abbreviations (Street→St, Avenue→Ave), case-fold | `123 Main Street, Suite 4` → `123 main st ste 4` |
|
|
||||||
| `string` | Trim, collapse whitespace, case-fold | ` HELLO WORLD ` → `hello world` |
|
|
||||||
|
|
||||||
### Survivor Selection
|
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
|
||||||
|------|-------|---------|-------------|
|
|
||||||
| `--survivor` | | `first` | Which row to keep per duplicate group. |
|
|
||||||
| `--date-column` | | none | Date column for the `most-recent` rule. |
|
|
||||||
| `--merge` | | `false` | Fill missing fields in the surviving row from removed duplicates. |
|
|
||||||
|
|
||||||
**Survivor rules:**
|
|
||||||
|
|
||||||
| Rule | Behavior |
|
|
||||||
|------|----------|
|
|
||||||
| `first` | Keep the first row encountered (lowest row number) |
|
|
||||||
| `last` | Keep the last row encountered (highest row number) |
|
|
||||||
| `most-complete` | Keep the row with the fewest blank/empty cells |
|
|
||||||
| `most-recent` | Keep the row with the latest date (requires `--date-column`) |
|
|
||||||
|
|
||||||
### Interactive Review
|
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
|
||||||
|------|-------|---------|-------------|
|
|
||||||
| `--review` | | `false` | Interactively review each match group. For each group, choose: merge (y), keep both (n), or skip remaining (s). |
|
|
||||||
|
|
||||||
### Configuration
|
### Configuration
|
||||||
|
- `--config PATH` — load all settings from JSON.
|
||||||
|
- `--save-config PATH` — save current settings to JSON.
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
### File handling
|
||||||
|------|-------|---------|-------------|
|
- `--sheet NAME|N` — Excel sheet name or 0-based index.
|
||||||
| `--config` | | none | Load all settings from a saved JSON config file. |
|
- `--encoding ENC` — override auto-detected encoding.
|
||||||
| `--save-config` | | none | Save current settings to a JSON config file for reuse. |
|
- `--header-row N` — 0-based header row.
|
||||||
|
|
||||||
### File Handling
|
|
||||||
|
|
||||||
| Flag | Short | Default | Description |
|
|
||||||
|------|-------|---------|-------------|
|
|
||||||
| `--sheet` | | first sheet | Excel sheet name or 0-based index. Ignored for CSV files. |
|
|
||||||
| `--encoding` | | auto-detect | Override auto-detected file encoding (e.g., `utf-8`, `windows-1252`). |
|
|
||||||
| `--header-row` | | auto-detect | 0-based row index for the header row. |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Recipes
|
## Recipes
|
||||||
|
|
||||||
### 1. Basic Dedup (Auto-Detect)
|
|
||||||
|
|
||||||
Let the engine detect email, phone, name, and address columns automatically.
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Preview
|
# Basic auto-detect dedup
|
||||||
python -m src.cli customers.csv
|
python -m src.cli customers.csv [--apply]
|
||||||
|
|
||||||
# Apply
|
# Fuzzy name match at 80%
|
||||||
python -m src.cli customers.csv --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
The engine scans column names for patterns like `email`, `phone`, `name`, `address` and builds strategies automatically. Strong keys (email, phone) become standalone strategies; weak keys (name, address) are paired with strong keys.
|
|
||||||
|
|
||||||
### 2. Fuzzy Name Matching
|
|
||||||
|
|
||||||
Match rows where names are similar but not identical — catches typos, nickname variations, and formatting differences.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Fuzzy-match on the "name" column at 80% similarity
|
|
||||||
python -m src.cli customers.csv --fuzzy name --threshold 80 --apply
|
python -m src.cli customers.csv --fuzzy name --threshold 80 --apply
|
||||||
|
|
||||||
# Fuzzy-match on multiple columns
|
# Multiple strong keys (OR logic)
|
||||||
python -m src.cli customers.csv --fuzzy name,address --threshold 85 --apply
|
|
||||||
|
|
||||||
# Use Levenshtein distance instead of Jaro-Winkler
|
|
||||||
python -m src.cli customers.csv --fuzzy name --algorithm levenshtein --threshold 80 --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
**Algorithm comparison:**
|
|
||||||
- `jaro_winkler` (default) — best for short strings like names; weights early characters more heavily
|
|
||||||
- `levenshtein` — edit-distance ratio; works well for typos and transpositions
|
|
||||||
- `token_set_ratio` — best for addresses and long strings; ignores word order
|
|
||||||
|
|
||||||
### 3. Custom Strong Keys
|
|
||||||
|
|
||||||
Use specific identifier columns to find exact duplicates.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Deduplicate by Facebook ID
|
|
||||||
python -m src.cli donors.csv --key fb_id --apply
|
|
||||||
|
|
||||||
# Multiple strong keys (each is independent — matched with OR)
|
|
||||||
python -m src.cli donors.csv --key fb_id,ein --apply
|
python -m src.cli donors.csv --key fb_id,ein --apply
|
||||||
```
|
|
||||||
|
|
||||||
Strong keys are OR'd: a match on `fb_id` alone OR `ein` alone marks rows as duplicates.
|
# Most-complete row + merge missing fields
|
||||||
|
|
||||||
### 4. Merge Mode
|
|
||||||
|
|
||||||
Keep the most complete row and fill any remaining blanks from the duplicates.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Most complete row + merge missing fields
|
|
||||||
python -m src.cli contacts.csv --survivor most-complete --merge --apply
|
python -m src.cli contacts.csv --survivor most-complete --merge --apply
|
||||||
|
|
||||||
# Keep most recent row and merge
|
# Most-recent + merge
|
||||||
python -m src.cli contacts.csv --survivor most-recent --date-column updated_at --merge --apply
|
python -m src.cli contacts.csv --survivor most-recent --date-column updated_at --merge --apply
|
||||||
```
|
|
||||||
|
|
||||||
**How merge works:** The survivor row keeps all its non-empty fields. For any blank/null fields, the engine fills from the removed rows (in row order). The result is a single row with maximum data retention.
|
# Interactive review
|
||||||
|
|
||||||
### 5. Multi-Column Subset
|
|
||||||
|
|
||||||
Match on a specific combination of columns rather than auto-detecting.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Exact match on email + phone only
|
|
||||||
python -m src.cli customers.csv --subset email,phone --apply
|
|
||||||
|
|
||||||
# Mix exact and fuzzy within a subset
|
|
||||||
python -m src.cli customers.csv --subset email,name --fuzzy name --threshold 85 --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
When using `--subset`, all listed columns must match (AND logic) for a pair to be considered duplicates.
|
|
||||||
|
|
||||||
### 6. Save and Load Config Profiles
|
|
||||||
|
|
||||||
Save your settings for repeatable runs on similar files.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Save settings to a file
|
|
||||||
python -m src.cli customers.csv --fuzzy name --threshold 80 --merge \
|
|
||||||
--survivor most-complete --save-config customer_dedup.json
|
|
||||||
|
|
||||||
# Load saved settings
|
|
||||||
python -m src.cli new_customers.csv --config customer_dedup.json --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
Config files are JSON. Example:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"strategies": [],
|
|
||||||
"survivor_rule": "most_complete",
|
|
||||||
"merge": true,
|
|
||||||
"default_algorithm": "jaro_winkler",
|
|
||||||
"default_threshold": 80.0,
|
|
||||||
"fuzzy_columns": ["name"]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### 7. Interactive Review
|
|
||||||
|
|
||||||
Step through each match group and decide whether to merge.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m src.cli customers.csv --review --apply
|
python -m src.cli customers.csv --review --apply
|
||||||
|
|
||||||
|
# Save / load profile
|
||||||
|
python -m src.cli customers.csv --fuzzy name --threshold 80 --save-config dedup.json
|
||||||
|
python -m src.cli new.csv --config dedup.json --apply
|
||||||
|
|
||||||
|
# Excel
|
||||||
|
python -m src.cli data.xlsx --sheet "Sales" --apply
|
||||||
```
|
```
|
||||||
|
|
||||||
For each group, the CLI displays both rows side-by-side and prompts:
|
## Algorithms
|
||||||
|
|
||||||
```
|
- **`jaro_winkler`** (default) — best for short strings (names); weights early chars.
|
||||||
============================================================
|
- **`levenshtein`** — edit-distance ratio; typos and transpositions.
|
||||||
Match Group 1 — Confidence: 92.3%
|
- **`token_set_ratio`** — best for addresses; ignores word order.
|
||||||
Matched on: name, phone
|
|
||||||
============================================================
|
|
||||||
|
|
||||||
Row 1:
|
## Auto-detection
|
||||||
name: John Smith
|
|
||||||
email: john@example.com
|
|
||||||
phone: (555) 123-4567
|
|
||||||
|
|
||||||
Row 2:
|
When no `--subset` / `--fuzzy` flags are given, columns are detected by name:
|
||||||
name: Jon Smith
|
|
||||||
email:
|
|
||||||
phone: 555-123-4567
|
|
||||||
|
|
||||||
[y] Merge [n] Keep both [s] Skip remaining:
|
| Pattern | Algorithm | Threshold | Normalizer | Key |
|
||||||
```
|
|---------|-----------|-----------|------------|-----|
|
||||||
|
| Email | exact | 100% | email | strong |
|
||||||
|
| Phone | exact | 100% | phone | strong |
|
||||||
|
| Name | jaro_winkler | 85% | name | weak |
|
||||||
|
| Address | token_set_ratio | 80% | address | weak |
|
||||||
|
|
||||||
- **y** — accept the match; merge/remove duplicate
|
**Strategy rules**: strong keys → standalone OR; weak keys → AND-paired with each strong key; no strong keys → weak promoted to standalone; no patterns → exact match on all columns.
|
||||||
- **n** — reject the match; keep both rows
|
|
||||||
- **s** — skip all remaining groups (keep both for all)
|
|
||||||
|
|
||||||
### 8. Excel Files and Multi-Sheet
|
## Output files (with `--apply`)
|
||||||
|
|
||||||
Work with Excel files directly — no CSV conversion needed.
|
| File | Contents |
|
||||||
|
|------|----------|
|
||||||
|
| `{stem}_deduplicated.csv` | Cleaned data |
|
||||||
|
| `{stem}_removed.csv` | Removed rows |
|
||||||
|
| `{stem}_match_groups.csv` | `_group_id`, `_is_survivor`, `_confidence`, `_matched_on`, `_original_row` + originals |
|
||||||
|
|
||||||
```bash
|
Log: `logs/dedup_YYYYMMDD_HHMMSS.log`.
|
||||||
# Deduplicate first sheet (default)
|
|
||||||
python -m src.cli data.xlsx --apply
|
|
||||||
|
|
||||||
# Specify sheet by name
|
|
||||||
python -m src.cli data.xlsx --sheet "Sales Data" --apply
|
|
||||||
|
|
||||||
# Specify sheet by index (0-based)
|
|
||||||
python -m src.cli data.xlsx --sheet 1 --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
Output is always CSV by default. To write Excel output, use `-o`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m src.cli data.xlsx -o cleaned.xlsx --apply
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Auto-Detection Details
|
# Clean Text
|
||||||
|
|
||||||
When no `--subset` or `--fuzzy` flags are provided, the engine scans column names and builds strategies:
|
|
||||||
|
|
||||||
| Column pattern | Detection regex | Algorithm | Threshold | Normalizer | Key type |
|
|
||||||
|---------------|----------------|-----------|-----------|------------|----------|
|
|
||||||
| Email | `e[-_]?mail` | exact | 100% | email | strong |
|
|
||||||
| Phone | `phone\|telephone\|mobile\|cell` | exact | 100% | phone | strong |
|
|
||||||
| Name | `^(name\|full_name\|customer_name\|...)$` | jaro_winkler | 85% | name | weak |
|
|
||||||
| Address | `address\|street\|addr` | token_set_ratio | 80% | address | weak |
|
|
||||||
|
|
||||||
**Strategy building rules:**
|
|
||||||
- Strong keys → standalone OR strategies (email match alone is enough)
|
|
||||||
- Weak keys → paired with each strong key via AND (name match requires email or phone match too)
|
|
||||||
- No strong keys found → weak keys promoted to standalone
|
|
||||||
- No patterns matched → exact match on all columns (equivalent to `drop_duplicates`)
|
|
||||||
|
|
||||||
## Output Files
|
|
||||||
|
|
||||||
When `--apply` is set, three files are written:
|
|
||||||
|
|
||||||
| File | Description |
|
|
||||||
|------|-------------|
|
|
||||||
| `{stem}_deduplicated.csv` | Cleaned DataFrame with duplicates removed |
|
|
||||||
| `{stem}_removed.csv` | Rows that were removed |
|
|
||||||
| `{stem}_match_groups.csv` | Audit trail with `_group_id`, `_is_survivor`, `_confidence`, `_matched_on`, `_original_row`, plus all original columns |
|
|
||||||
|
|
||||||
## Logging
|
|
||||||
|
|
||||||
Every run writes a timestamped log to `logs/dedup_YYYYMMDD_HHMMSS.log` with full debug-level details: strategies used, pair comparisons, survivor decisions, and merge actions.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
# Text Cleaner CLI
|
|
||||||
|
|
||||||
Character-level hygiene for CSV / Excel files: whitespace trim and collapse, smart-character folding, Unicode normalization, BOM strip, control-char strip, line-ending normalization, optional case conversion. See TECHNICAL.md Section 10.2 for the full functional spec.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli_text_clean INPUT_FILE [OPTIONS]
|
python -m src.cli_text_clean INPUT_FILE [OPTIONS]
|
||||||
```
|
```
|
||||||
|
|
||||||
## Arguments
|
Character-level hygiene. See [TECHNICAL.md §10.2](TECHNICAL.md) for the spec.
|
||||||
|
|
||||||
| Argument | Required | Description |
|
|
||||||
|----------|----------|-------------|
|
|
||||||
| `INPUT_FILE` | Yes | Path to the CSV, TSV, or Excel file to clean |
|
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
|
|
||||||
### Core
|
### Core
|
||||||
|
- `--apply` — write output (default: preview).
|
||||||
| Flag | Short | Default | Description |
|
- `-o, --output PATH` — output path (default `{input}_cleaned.csv`).
|
||||||
|------|-------|---------|-------------|
|
- `--preset NAME` — `minimal` / `excel-hygiene` (default) / `paranoid`.
|
||||||
| `--apply` | | `false` | Write output files. Without this flag, only a preview is shown. |
|
|
||||||
| `--output` | `-o` | `{input}_cleaned.csv` | Output file path. |
|
|
||||||
| `--preset` | | `excel-hygiene` | Preset bundle of safe defaults. See [Presets](#presets). |
|
|
||||||
|
|
||||||
### Scope
|
### Scope
|
||||||
|
- `--columns COLS` — comma-separated columns to clean (default: all string columns).
|
||||||
|
- `--skip COLS` — exclude these columns.
|
||||||
|
|
||||||
| Flag | Default | Description |
|
### Per-op overrides (override the active preset)
|
||||||
|------|---------|-------------|
|
- `--no-trim`, `--no-collapse`, `--no-nfc`, `--nfkc`, `--no-smart-chars`, `--no-zero-width`, `--no-bom`, `--no-control`, `--no-line-endings`.
|
||||||
| `--columns` | all string columns | Comma-separated columns to clean. |
|
|
||||||
| `--skip` | none | Comma-separated columns to skip even if they look like text. Useful for free-text notes columns you don't want touched. |
|
|
||||||
|
|
||||||
### Per-operation toggles
|
### Case
|
||||||
|
- `--case MODE` — `upper` / `lower` / `title` / `sentence`. Or per-column: `--case title:name,upper:sku`.
|
||||||
|
- Title case preserves all-caps tokens (`USA`) and lowercases mid-string particles (`of`, `and`).
|
||||||
|
|
||||||
These override the active preset.
|
### Audit + config
|
||||||
|
- `--full-changelog` — write every change (default caps to first 1000).
|
||||||
|
- `--config PATH` / `--save-config PATH`.
|
||||||
|
|
||||||
| Flag | Effect |
|
### File
|
||||||
|------|--------|
|
- `--sheet`, `--encoding`, `--header-row` — same as Find Duplicates.
|
||||||
| `--no-trim` | Disable leading/trailing whitespace strip |
|
|
||||||
| `--no-collapse` | Disable internal whitespace collapse |
|
|
||||||
| `--no-nfc` | Disable Unicode NFC normalization |
|
|
||||||
| `--nfkc` | Enable NFKC compatibility fold (lossy: `①` → `1`, `ﬁ` → `fi`) |
|
|
||||||
| `--no-smart-chars` | Disable smart-character folding (curly quotes, em/en-dash, NBSP, ellipsis) |
|
|
||||||
| `--no-zero-width` | Disable zero-width / invisible character strip |
|
|
||||||
| `--no-bom` | Disable leading BOM strip |
|
|
||||||
| `--no-control` | Disable control-character strip |
|
|
||||||
| `--no-line-endings` | Disable line-ending normalization |
|
|
||||||
|
|
||||||
### Case conversion
|
|
||||||
|
|
||||||
| Flag | Forms | Description |
|
|
||||||
|------|-------|-------------|
|
|
||||||
| `--case` | `upper`, `lower`, `title`, `sentence` | Apply this case to every selected column |
|
|
||||||
| `--case` | `mode:col[,mode:col]` | Per-column case (e.g., `--case title:name,upper:code`) |
|
|
||||||
|
|
||||||
Title case preserves all-caps tokens (`USA` stays `USA`) and lowercases mid-string particles (`of`, `and`, `the`, etc.).
|
|
||||||
|
|
||||||
### Audit and config
|
|
||||||
|
|
||||||
| Flag | Default | Description |
|
|
||||||
|------|---------|-------------|
|
|
||||||
| `--full-changelog` | `false` | Write every cell change to the audit CSV (default caps to first 1000). |
|
|
||||||
| `--config` | none | Load options from a saved JSON config file. |
|
|
||||||
| `--save-config` | none | Save the current options to a JSON config file. |
|
|
||||||
|
|
||||||
### File format / encoding
|
|
||||||
|
|
||||||
| Flag | Default | Description |
|
|
||||||
|------|---------|-------------|
|
|
||||||
| `--sheet` | `0` | Excel sheet name or 0-based index. |
|
|
||||||
| `--encoding` | auto-detect | Override auto-detected file encoding. |
|
|
||||||
| `--header-row` | auto-detect | 0-based row index for the header. |
|
|
||||||
|
|
||||||
## Presets
|
## Presets
|
||||||
|
|
||||||
| Preset | What it does |
|
| Preset | What it does |
|
||||||
|---|---|
|
|--------|--------------|
|
||||||
| `minimal` | Trim + collapse whitespace only. Nothing else. |
|
| `minimal` | Trim + collapse only. |
|
||||||
| `excel-hygiene` (default) | Trim, collapse, NFC, smart-character fold, zero-width strip, BOM strip, control strip, line-ending normalize. NFKC off. |
|
| `excel-hygiene` (default) | Trim, collapse, NFC, smart-char fold, zero-width strip, BOM strip, control strip, line-ending normalize. |
|
||||||
| `paranoid` | All of `excel-hygiene` plus NFKC compatibility fold (lossy). |
|
| `paranoid` | `excel-hygiene` + NFKC compatibility fold (lossy). |
|
||||||
|
|
||||||
## Output Files
|
|
||||||
|
|
||||||
When `--apply` is set:
|
|
||||||
|
|
||||||
| File | Description |
|
|
||||||
|------|-------------|
|
|
||||||
| `{stem}_cleaned.csv` | Cleaned DataFrame |
|
|
||||||
| `{stem}_changes.csv` | Per-cell audit: `row`, `column`, `old`, `new`, `ops_applied` (capped to 1000 rows by default; use `--full-changelog` for all) |
|
|
||||||
|
|
||||||
A timestamped log is always written to `logs/text_clean_YYYYMMDD_HHMMSS.log`.
|
|
||||||
|
|
||||||
## Recipes
|
## Recipes
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Preview what would change with the safe defaults
|
# Safe defaults (preview, then apply)
|
||||||
python -m src.cli_text_clean messy.csv
|
python -m src.cli_text_clean messy.csv [--apply]
|
||||||
|
|
||||||
# Apply the safe defaults
|
# Just trim + collapse, leave Unicode alone
|
||||||
python -m src.cli_text_clean messy.csv --apply
|
|
||||||
|
|
||||||
# Just the basics — only trim and collapse, leave Unicode/quotes alone
|
|
||||||
python -m src.cli_text_clean messy.csv --preset minimal --apply
|
python -m src.cli_text_clean messy.csv --preset minimal --apply
|
||||||
|
|
||||||
# Title-case the name column, upper-case the SKU column, leave others alone for case
|
# Title-case names, upper-case SKUs
|
||||||
python -m src.cli_text_clean people.csv --case title:name,upper:sku --apply
|
python -m src.cli_text_clean people.csv --case title:name,upper:sku --apply
|
||||||
|
|
||||||
# Clean only specific columns
|
# Clean only specific columns
|
||||||
python -m src.cli_text_clean orders.csv --columns vendor,product --apply
|
python -m src.cli_text_clean orders.csv --columns vendor,product --apply
|
||||||
|
|
||||||
# Skip a free-text notes column from cleaning
|
# Skip a free-text notes column
|
||||||
python -m src.cli_text_clean tickets.csv --skip notes --apply
|
python -m src.cli_text_clean tickets.csv --skip notes --apply
|
||||||
|
|
||||||
# Save the current settings as a profile and reload it later
|
|
||||||
python -m src.cli_text_clean messy.csv --preset minimal --case upper --save-config my.json
|
|
||||||
python -m src.cli_text_clean other.csv --config my.json --apply
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Output files (with `--apply`)
|
||||||
|
|
||||||
|
| File | Contents |
|
||||||
|
|------|----------|
|
||||||
|
| `{stem}_cleaned.csv` | Cleaned data |
|
||||||
|
| `{stem}_changes.csv` | `row`, `column`, `old`, `new`, `ops_applied` (capped to 1000; `--full-changelog` removes cap) |
|
||||||
|
|
||||||
|
Log: `logs/text_clean_YYYYMMDD_HHMMSS.log`.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Analyzer (upload-time scan)
|
# Analyzer
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli_analyze INPUT_FILE [OPTIONS]
|
python -m src.cli_analyze INPUT_FILE [OPTIONS]
|
||||||
|
|
||||||
--sample-rows N Cap on rows scanned (default 1000)
|
|
||||||
--json Print findings as a JSON array on stdout
|
|
||||||
--strict Exit non-zero on any warn/error finding
|
|
||||||
```
|
```
|
||||||
|
|
||||||
JSON output schema (one object per finding):
|
Read-only scan; surfaces every detector finding without modifying the file.
|
||||||
|
|
||||||
|
## Options
|
||||||
|
- `--sample-rows N` — cap on rows scanned (default 1000).
|
||||||
|
- `--json` — print findings as a JSON array on stdout.
|
||||||
|
- `--strict` — exit non-zero on any warn/error finding.
|
||||||
|
|
||||||
|
## JSON schema (one object per finding)
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
@@ -442,10 +224,14 @@ JSON output schema (one object per finding):
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
- `severity` — `info` / `warn` / `error`. Only `error` blocks the GUI normalization gate.
|
## Field meanings
|
||||||
- `confidence` — `high` (round-trip-safe, eligible for one-click auto-fix), `medium` (preview before applying), `low` (heuristic, opt-in only).
|
- `severity` — `info` / `warn` / `error`. Only `error` blocks the GUI gate.
|
||||||
- `fix_action` — stable id naming the algorithm in `src/core/fixes.py` that resolves the finding. Empty string for informational-only findings.
|
- `confidence` — `high` (one-click), `medium` (preview), `low` (opt-in).
|
||||||
- `pre_applied` — `true` for fixes already applied during the byte-level read pass (BOM strip, NUL strip, line-ending normalize, byte-level smart-quote fold, transcode-to-UTF-8 from UTF-16/32). The GUI gate treats these as already-resolved; the CLI emits them so callers can audit what changed during read.
|
- `fix_action` — id of the algorithm in `src/core/fixes.py`. Empty for informational-only.
|
||||||
|
- `pre_applied` — `true` for fixes already applied during the byte-level read pass.
|
||||||
|
|
||||||
The detector set covers smart punctuation, NBSP / Unicode whitespace, zero-width characters, dirty headers, whitespace padding, null-like sentinels, mojibake fingerprints (UTF-8-as-cp1252), mixed-case email columns, near-duplicate rows (case-and-padding stripped), leading-zero IDs (Excel hazard), mixed line endings, encoding decode failure (`encoding_decode_failed`), and U+FFFD presence in the loaded text (`encoding_uncertain`). New detectors plug in by appending one entry to `analyze.py` and one matching fix in `fixes.py`.
|
## Detectors
|
||||||
|
|
||||||
|
Smart punctuation, NBSP / Unicode whitespace, zero-width chars, dirty headers, whitespace padding, null-like sentinels, mojibake fingerprints, mixed-case email columns, inconsistent date formats, near-duplicate rows, leading-zero IDs, mixed line endings, encoding decode failure, U+FFFD presence.
|
||||||
|
|
||||||
|
Add a detector: append entry in `analyze.py` + matching fix in `fixes.py`. No other call sites change.
|
||||||
|
|||||||
@@ -1,269 +1,239 @@
|
|||||||
# DECISIONS.md - Locked Criteria, Scoring Rubric, Decision Log
|
# Decisions
|
||||||
|
|
||||||
> **Creator-only document. Do not ship to buyers.**
|
> Creator-only. Locked criteria, scoring rubric, decision log.
|
||||||
|
> **Version**: 1.6 · **Updated**: 2026-05-01
|
||||||
|
|
||||||
**Version**: 1.6
|
## 1. Locked operating criteria
|
||||||
**Last updated**: April 28, 2026
|
|
||||||
|
|
||||||
This document captures the original locked operating criteria, the scoring framework used to select the product category, the platform-model evaluation, and key decisions with rationale. It exists so future-you (or a recovery rebuild) can reconstruct *why* the project is what it is, not just *what* it is.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. Locked Operating Criteria
|
|
||||||
|
|
||||||
These are the constraints, targets, and goals the product strategy must satisfy. Established at project start. Any change to these requires an explicit re-lock.
|
|
||||||
|
|
||||||
### Constraints
|
### Constraints
|
||||||
|
1. Cash budget ≤ $1,200/mo recurring. No external funding.
|
||||||
| # | Criterion | Notes |
|
2. Time ≤ 10 hr/wk. Build-once assets preferred.
|
||||||
|---|---|---|
|
3. Skill set: database design, data pipelines, programming. Every opportunity must leverage these.
|
||||||
| 1 | Cash budget ≤ $1,200/month | Recurring monthly only; no large one-time capital, no external funding |
|
4. Network: none. Zero reliance on personal connections.
|
||||||
| 2 | Time available ≤ 10 hours/week | Strong preference for build-once assets generating revenue for years with minimal maintenance |
|
|
||||||
| 3 | Skill set: Database Design, Data Pipelines, Data Aggregation, Programming | Every opportunity must directly leverage these |
|
|
||||||
| 4 | Existing network: none | Zero reliance on personal connections for acquisition, sales, or operations |
|
|
||||||
|
|
||||||
### Targets
|
### Targets
|
||||||
|
5. First revenue: 15 days preferred, 90 days hard stop.
|
||||||
| # | Target | Notes |
|
6. Revenue ceiling: tiered (BUSINESS §6). Realistic 12-mo: $5k/mo.
|
||||||
|---|---|---|
|
7. Lifestyle cashflow goal. No saleable-asset exit required.
|
||||||
| 5 | Time to first revenue: 15 days preferred, 90 days hard stop | |
|
8. Distribution: fully async, no-touch. Revisit at $5k/mo.
|
||||||
| 6 | Revenue ceiling: tiered (see BUSINESS.md Section 6) | Revised from original $50k/mo. Realistic 12-month target: $5k/mo |
|
9. Work pattern: deep work + recovery. No real-time on-call.
|
||||||
| 7 | Lifestyle cashflow goal | Sustainable for several years, no saleable-asset exit required |
|
|
||||||
| 8 | Distribution: fully async, no-touch, automated | Revisit at $5k/mo (see BUSINESS.md Section 8) |
|
|
||||||
| 9 | Day-to-day work pattern: deep work + recovery periods | No real-time on-call or customer-facing constraints |
|
|
||||||
|
|
||||||
### Goals
|
### Goals
|
||||||
|
10. Escape 9-5 W2 employment without stability concerns. (Primary)
|
||||||
|
11. Free up time for retirement lifestyle, optional enjoyable work. (Secondary)
|
||||||
|
|
||||||
| # | Goal | Priority |
|
### Internal contradictions
|
||||||
|---|---|---|
|
|
||||||
| 10 | Escape 9-5 W2 employment without stability concerns | Primary |
|
|
||||||
| 11 | Free up time for retirement lifestyle, optional enjoyable work | Secondary |
|
|
||||||
|
|
||||||
### No Internal Contradictions
|
"Fully async + 15-day-to-revenue + no network" is tight but workable. Caveat in BUSINESS §8: revisit async at $5k/mo.
|
||||||
|
|
||||||
The original criteria were checked for tension. The "fully async + 15-day to first revenue + no network" combination is tight but workable, with the caveat documented in BUSINESS.md Section 8 (revisit async constraint at $5k/mo).
|
## 2. Scoring rubric
|
||||||
|
|
||||||
---
|
Each candidate scored 1-5 on 6 dimensions. Total /30 → verdict.
|
||||||
|
|
||||||
## 2. Scoring Rubric
|
|
||||||
|
|
||||||
Every business candidate was scored 1-5 on six dimensions. Total /30, then mapped to verdict.
|
|
||||||
|
|
||||||
| Dimension | What it measures |
|
| Dimension | What it measures |
|
||||||
|---|---|
|
|-----------|------------------|
|
||||||
| Fit to locked criteria | Direct match to constraints 1-4 and targets 5-9. **Any 1 is a hard kill.** |
|
| Fit to locked criteria | Direct match to constraints 1-4 + targets 5-9. **Any 1 = hard kill.** |
|
||||||
| Demand durability | Structural shift vs. trend peak. Will this still pay in 3 years? |
|
| Demand durability | Structural shift vs. trend peak. Pays in 3 yr? |
|
||||||
| Defensibility | What stops the next entrant from copying it. |
|
| Defensibility | What stops the next entrant. |
|
||||||
| Unit economics realism | CAC, payback period, gross margin, working capital. |
|
| Unit economics realism | CAC, payback, gross margin, working capital. |
|
||||||
| Operator fit | Skills, capital, time, stomach for the work. |
|
| Operator fit | Skills, capital, time, stomach. |
|
||||||
| Exit / cash-flow optionality | Multiple paths to revenue, optionality on later changes. |
|
| Exit / cash-flow optionality | Multiple revenue paths. |
|
||||||
|
|
||||||
**Verdict mapping**: PURSUE / INVESTIGATE / PASS / KILL based on total score and any hard-kill dimension.
|
**Verdict**: PURSUE / INVESTIGATE / PASS / KILL.
|
||||||
|
|
||||||
**Calibration note added in v1.1**: The original scoring inflated unit economics for the lead candidate by treating near-100% gross margin as 5/5 without accounting for CAC under the "no network" constraint. A more honest score for the Python Bundles category is 7.0-7.5/10, not 8.7/10. The strategy is still sound; the optimism just needed deflating.
|
**v1.1 calibration**: original scoring inflated unit economics by treating ~100% gross margin as 5/5 without accounting for CAC under "no network." Honest score: 7.0-7.5/10 (was 8.7). Strategy still sound; optimism deflated.
|
||||||
|
|
||||||
---
|
## 3. Candidate evaluation
|
||||||
|
|
||||||
## 3. Candidate Evaluation Summary
|
|
||||||
|
|
||||||
Five candidates were evaluated against the locked criteria. Top three:
|
|
||||||
|
|
||||||
| Rank | Candidate | Score | Verdict |
|
| Rank | Candidate | Score | Verdict |
|
||||||
|---|---|---|---|
|
|------|-----------|-------|---------|
|
||||||
| 1 | Niche Python Automation Script Bundles | 8.7/10 (original) / ~7.5/10 (calibrated) | **PURSUE** |
|
| 1 | Niche Python Automation Script Bundles | 8.7/10 → 7.5/10 (calibrated) | **PURSUE** |
|
||||||
| 2 | Curated Datasets | 8.7/10 | PURSUE (deferred) |
|
| 2 | Curated Datasets | 8.7/10 | PURSUE (deferred) |
|
||||||
| 3 | Hosted Data Pipeline Micro-Tool | 8.3/10 | INVESTIGATE |
|
| 3 | Hosted Data Pipeline Micro-Tool | 8.3/10 | INVESTIGATE |
|
||||||
|
|
||||||
**Why #1 was selected over #2**:
|
**Why #1 over #2**: faster path to first revenue (digital download vs. ongoing curation pipeline). Lower ongoing maintenance. Direct programming leverage. Better fit for "build once, sell many."
|
||||||
- Faster path to first revenue (digital download vs. ongoing data curation pipeline).
|
|
||||||
- Lower ongoing maintenance after launch.
|
|
||||||
- Direct leverage of programming skills, not just data acquisition.
|
|
||||||
- Better fit for the "build once, sell many times" preference in criterion 2.
|
|
||||||
|
|
||||||
**Why others were ranked lower**:
|
**Rejected**: Notion Templates (weak skill leverage), Query Optimizer SaaS (recurring infra conflicts with lifestyle/maintenance constraint).
|
||||||
- Notion Templates: weaker leverage of programming skills.
|
|
||||||
- Query Optimizer (SaaS): introduces hosting, support, and recurring infrastructure costs that conflict with the lifestyle / minimal maintenance constraint.
|
|
||||||
|
|
||||||
---
|
## 4. Platform model
|
||||||
|
|
||||||
## 4. Platform Model Decision (How to Sell)
|
| Model | Verdict |
|
||||||
|
|-------|---------|
|
||||||
|
| **Standalone tools, dual CLI + GUI (chosen)** | **CHOSEN** (revised v1.2). Build once, no hosting, no SaaS support. GUI captures non-tech buyer; CLI captures power users. |
|
||||||
|
| SaaS web app | Rejected. Recurring hosting + support conflicts with minimal-maintenance constraint. |
|
||||||
|
| CLI-only | Rejected (revised v1.2). Wrong fit for non-tech buyer; produces refunds. |
|
||||||
|
| Browser extension | Rejected. Sandbox limits, wrong tool for files. |
|
||||||
|
| Notion / Airtable templates | Rejected. Doesn't leverage programming. |
|
||||||
|
|
||||||
Models considered for the lead bundle:
|
**v1.2 rationale**:
|
||||||
|
- Buyer persona ("hates Excel work but can't code") won't learn a CLI. Refunds at this price.
|
||||||
|
- Find Duplicates needs interactive review — not viable in pure CLI.
|
||||||
|
- Dual interface keeps CLI for automation without sacrificing primary buyer surface.
|
||||||
|
|
||||||
| Model | Pros | Cons | Verdict |
|
## 4a. Functional scope principle (v1.2)
|
||||||
|---|---|---|---|
|
|
||||||
| **Standalone tools, dual CLI + GUI interface (chosen)** | Build once, sell forever. No hosting. No SaaS support burden. Direct skill match. GUI captures non-technical buyer; CLI captures power users and automation use cases. | Requires installer for non-technical buyers. Some platform friction (signing, etc.). GUI adds build cost vs. CLI-only. | **CHOSEN (revised v1.2)** |
|
|
||||||
| SaaS web app | Recurring revenue. Easy install. | Ongoing hosting cost, support burden, SaaS scrutiny. Conflicts with "minimal maintenance" criterion. | Rejected |
|
|
||||||
| CLI-only | Lowest build cost | Wrong fit for non-technical buyer persona. Will produce refunds. | Rejected (revised v1.2) |
|
|
||||||
| Browser extension | Easy install | Limited by browser sandbox. Wrong tool for data file processing. | Rejected |
|
|
||||||
| Notion / Airtable templates | Fast to ship | Doesn't leverage programming skills. Low defensibility. | Rejected |
|
|
||||||
|
|
||||||
**Decision (revised v1.2)**: Ship as standalone tools with **both** a CLI and a GUI front-end sharing the same core logic. Packaged with cross-platform installers (PyInstaller-based) so the buyer experience approximates a native app. GUI is no longer "deferred"; it is required at v1 launch.
|
**Decision**: each script ships **complete coverage of the workflow it names**, including features Excel does free.
|
||||||
|
|
||||||
**Rationale for the v1.2 revision**:
|
**Why**: one-stop shopping is the value. Forcing buyers to bounce between this product and Excel/OpenRefine for parts of one task defeats the value prop.
|
||||||
- The buyer persona ("hate repetitive Excel work but cannot code") will not learn a CLI. CLI-only at this price point produces refunds.
|
|
||||||
- The deduplicator specifically requires interactive review of fuzzy-match candidates. That UX is not viable in pure CLI.
|
|
||||||
- A dual-interface design keeps the CLI for power users and future automation/scheduling use cases without sacrificing the primary buyer experience.
|
|
||||||
|
|
||||||
---
|
**Anti-rule**: not license to scope-creep. Boundary = the named workflow. Dedup includes normalization + survivor + audit. NOT format conversion or charting (those belong to other scripts).
|
||||||
|
|
||||||
## 4a. Functional Scope Principle (added v1.2)
|
## 4b. UX standards for GUI (v1.2 — load-bearing)
|
||||||
|
|
||||||
**Decision**: Each script ships with **complete functional coverage of the problem it names**, including features available for free elsewhere (e.g., Excel's built-in exact-match dedup).
|
| Standard | What it means |
|
||||||
|
|----------|---------------|
|
||||||
|
| Works out of the box | Drop file → useful result, zero config. |
|
||||||
|
| Sensible defaults visible | Every option has a default that works for the common case. |
|
||||||
|
| Progressive disclosure | Default view = file uploader + go button + results. Advanced in expander panes. |
|
||||||
|
| Plain-English labels | "Find duplicates" not "Apply Levenshtein at 0.85". Tooltips carry technical detail. |
|
||||||
|
| Visible safety | Dry-run / preview by default. Original input never modified. |
|
||||||
|
| No multi-step setup | Single window for the basic task. |
|
||||||
|
| Errors name problem + fix | "Column 'email' not found. Available: name, phone. Did you mean 'phone'?" not `KeyError`. |
|
||||||
|
| Identical core to CLI | No drift. Anything CLI does, GUI does (minus interactive review = GUI-natural). |
|
||||||
|
|
||||||
**Rationale**: The product is "one-stop shopping" for the buyer's data-cleaning workflow. Forcing a buyer to bounce between this product and Excel/OpenRefine/etc. for parts of a single task defeats the value proposition. A buyer cleaning a customer list expects exact dedup, fuzzy dedup, normalization, and survivor-merge in one tool. Splitting that across products is what they paid to avoid.
|
**"Intuitive enough" test**: a non-technical user who's never seen the tool can complete the lead use case on first launch with no docs read.
|
||||||
|
|
||||||
**Consequence for design**: Do not omit a feature on the grounds that "Excel does this for free." If it belongs to the workflow, it belongs in the script.
|
## 4c. GUI framework: Streamlit (v1.3)
|
||||||
|
|
||||||
**Anti-rule**: This is not license to scope-creep. The boundary is "the workflow this script names." A deduplicator includes everything dedup-adjacent (normalization, survivor selection, audit). It does not include format conversion, charting, or anything outside the dedup workflow. Those belong to other scripts in the bundle.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 4b. UX Standards for GUI Front-End (added v1.2)
|
|
||||||
|
|
||||||
The GUI is the primary buyer surface. These standards are load-bearing.
|
|
||||||
|
|
||||||
| Standard | What it means in practice |
|
|
||||||
|---|---|
|
|
||||||
| **Works out of the box** | Dropping any reasonable CSV / XLSX onto the GUI must produce a useful result with zero configuration. The buyer should never see a config screen on first run. |
|
|
||||||
| **Sensible defaults everywhere** | Every option has a default that works for the most common case. Defaults are visible (so the user understands what is being applied) but not blocking. |
|
|
||||||
| **Progressive disclosure** | Advanced options exist but are tucked behind an "Advanced" or "Settings" pane. The default view shows the minimum needed for a first run. |
|
|
||||||
| **Plain-English labels** | No technical jargon in primary UI. "Find duplicates" not "Apply Levenshtein matching with token_set_ratio threshold". Tooltips can carry the technical detail for users who want it. |
|
|
||||||
| **Visible safety** | Dry-run / preview by default. The user sees what *would* change before any file is written. Original input is never modified. |
|
|
||||||
| **No multi-step setup** | If the GUI requires more than a single window (file picker + go button + results view) to complete a basic task, it has failed this standard. |
|
|
||||||
| **Errors that name the problem and the fix** | "Column 'email' not found in this file. Available columns: name, phone, address. Did you mean 'phone'?" not "KeyError: 'email'". |
|
|
||||||
| **Identical core to CLI** | The GUI and CLI are two front-ends over the same library code. Anything the CLI can do, the GUI can do. Anything the GUI can do, the CLI can do (possibly minus interactive review). No drift. |
|
|
||||||
|
|
||||||
**Test for "intuitive enough"**: A non-technical person who has never seen the tool can complete the lead use case (dedup a customer list with one or more confidence levels) on first launch with no documentation read. If that test fails on real users, the GUI is not yet shippable.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 4c. GUI Framework Decision: Streamlit (added v1.3)
|
|
||||||
|
|
||||||
**Chosen**: Streamlit.
|
|
||||||
|
|
||||||
### Frameworks evaluated
|
|
||||||
|
|
||||||
| Framework | Verdict |
|
| Framework | Verdict |
|
||||||
|---|---|
|
|-----------|---------|
|
||||||
| **Streamlit** | **CHOSEN** |
|
| **Streamlit** | **CHOSEN** |
|
||||||
| Tkinter + CustomTkinter | Rejected (CustomTkinter maintenance status confirmed inactive: last release Jan 2024, ~28 months old as of decision date; Snyk classifies it as an Inactive project) |
|
| Tkinter + CustomTkinter | Rejected — maintainer absent (last release Jan 2024, ~28 mo). Snyk: Inactive. |
|
||||||
| Plain Tkinter | Rejected (UX quality below what a $49-79 product justifies in 2026 without significant hand-styling work) |
|
| Plain Tkinter | Rejected — UX gap unacceptable at $49-79 in 2026 without heavy hand-styling. |
|
||||||
| Flet | Rejected (ecosystem too young for a build-once-maintain-for-years product) |
|
| Flet | Rejected — ecosystem too young for build-once-maintain-for-years. |
|
||||||
| PySide6 / Qt | Rejected (overkill for this product tier; steepest learning curve, largest bundles) |
|
| PySide6 / Qt | Rejected — overkill, steepest learning curve, biggest bundles. |
|
||||||
| NiceGUI | Rejected (similar pattern to Streamlit but smaller community and less mature data-tool ergonomics) |
|
| NiceGUI | Rejected — same browser tradeoff as Streamlit, smaller community + ecosystem. |
|
||||||
|
|
||||||
### Full evaluation matrix (added v1.6)
|
### Scored matrix (1-5, 5 = best for this product)
|
||||||
|
|
||||||
Promoted from chat-history-only into docs in v1.6 to lock the rejection reasoning against re-litigation. Scored 1-5 where 5 is best for *this specific product*.
|
| Dimension | Tk | Tk+CTk | Streamlit | Flet | PySide6 | NiceGUI |
|
||||||
|
|-----------|----|----|-----------|------|---------|---------|
|
||||||
| Dimension | Tkinter | Tk+CTk | Streamlit | Flet | PySide6 | NiceGUI |
|
| Non-tech UX | 1 | 3 | 4 | 4 | 5 | 4 |
|
||||||
|---|---|---|---|---|---|---|
|
| Native window (no browser) | 5 | 5 | 1 | 5 | 5 | 1 |
|
||||||
| Non-tech UX quality (look + feel) | 1 | 3 | 4 | 4 | 5 | 4 |
|
| Build speed v1 | 3 | 3 | 5 | 4 | 2 | 4 |
|
||||||
| "Native window opens" (no browser) | 5 | 5 | 1 | 5 | 5 | 1 |
|
| Build speed per feature | 3 | 3 | 5 | 4 | 2 | 4 |
|
||||||
| Build speed for v1 | 3 | 3 | 5 | 4 | 2 | 4 |
|
| PyInstaller compat | 5 | 4 | 2 | 3 | 3 | 2 |
|
||||||
| Build speed per added feature | 3 | 3 | 5 | 4 | 2 | 4 |
|
| Bundle size (smaller better) | 5 | 4 | 1 | 3 | 2 | 1 |
|
||||||
| PyInstaller compatibility (low friction) | 5 | 4 | 2 | 3 | 3 | 2 |
|
| Maintenance burden | 4 | 3 | 4 | 3 | 4 | 3 |
|
||||||
| Bundle size (smaller = better) | 5 | 4 | 1 | 3 | 2 | 1 |
|
| Ecosystem maturity | 5 | 3 | 4 | 2 | 5 | 3 |
|
||||||
| Maintenance burden over time | 4 | 3 | 4 | 3 | 4 | 3 |
|
| Solo-dev learning curve | 4 | 4 | 5 | 4 | 2 | 4 |
|
||||||
| Ecosystem maturity / longevity bet | 5 | 3 | 4 | 2 | 5 | 3 |
|
| Drop-file-see-result fit | 3 | 3 | 5 | 4 | 4 | 5 |
|
||||||
| Solo dev learning curve | 4 | 4 | 5 | 4 | 2 | 4 |
|
|
||||||
| Suits "drop file, see result" pattern | 3 | 3 | 5 | 4 | 4 | 5 |
|
|
||||||
| **Total /50** | **38** | **37** | **38** | **36** | **34** | **35** |
|
| **Total /50** | **38** | **37** | **38** | **36** | **34** | **35** |
|
||||||
|
|
||||||
**The total is misleading on purpose.** Equal totals hide that these options fail differently. Tkinter ties Streamlit on the sum but loses on look-and-feel and data-app fit (the dimensions that matter most for this product). The verdict is in the per-dimension story, not the sum.
|
**Sums lie.** Tk ties Streamlit but loses on look-and-feel + data-app fit (the dimensions that matter). Verdict is per-dimension, not total.
|
||||||
|
|
||||||
**Per-option summary** (the substance behind the verdicts):
|
|
||||||
|
|
||||||
- **Plain Tkinter**: Smallest bundle (~30-50 MB added), most predictable PyInstaller behavior, will work in 10 years. Default widgets look like 1998. A non-technical buyer paying $49-79 and seeing a default Tk UI will feel cheated. Don't ship.
|
|
||||||
- **Tkinter + CustomTkinter**: Native window, ~50-80 MB added, modern look, mature PyInstaller story. Maintainer absent (last release Jan 2024). A multi-year product cannot bet its UI layer on a library classified as Inactive. The probable failure mode is a future macOS or Python update breaking the Tk layer with no upstream fix.
|
|
||||||
- **Streamlit**: Fastest to build for data tools. Tables, file uploads, dataframes are first-class. Mature ecosystem. Browser-launch UX is the real liability, mitigated by in-app messaging and the optional pywebview wrap (v1.1). Bundle size 300-500 MB. PyInstaller packaging fiddly first time, reusable after.
|
|
||||||
- **Flet**: Modern Flutter-based UI, native windows, looks great. Ecosystem too young for a build-once-maintain-for-years product. Breaking API changes between minor versions still happening. PyInstaller story less battle-tested.
|
|
||||||
- **PySide6 / Qt**: Industrial-grade, best widget set, native everything. Steepest learning curve, largest bundles, licensing care needed. Overkill for the $49-79 product tier; it would burn the 10 hr/wk time budget on UI scaffolding instead of script features.
|
|
||||||
- **NiceGUI**: Similar pattern to Streamlit (Python-to-web). Smaller community, less mature data-tool ergonomics. Same browser-launch tradeoff without Streamlit's velocity advantage.
|
|
||||||
|
|
||||||
### Why Streamlit won
|
### Why Streamlit won
|
||||||
|
|
||||||
1. **Fastest build velocity for v1 and every subsequent bundle.** "Drop a CSV, see results" is the native Streamlit interaction pattern. Tables, filters, dataframes display well with minimal code. This compounds across the 9-script lead bundle and the future 5 bundles in the roadmap.
|
1. **Fastest build velocity** — "drop CSV, see results" is native. Tables, file uploads, dataframes are first-class. Compounds across 9-script lead + 5 future bundles.
|
||||||
2. **Lowest maintenance burden per added feature.** Active framework, large community, mature ecosystem. Bug fixes happen upstream, not on this project's time.
|
2. **Lowest maintenance burden** — active, large community, mature ecosystem. Bugs fixed upstream.
|
||||||
3. **Hosted browser demo as a marketing asset from day one.** A Streamlit app deploys to Streamlit Community Cloud (free) or a $5/mo VPS. The Gumroad landing page can offer "Try it free in your browser" with a sample dataset. For a $49-79 product where buyers cannot evaluate quality before purchase, a working demo can move conversion meaningfully. Tkinter family options cannot provide this.
|
3. **Hosted demo as marketing asset** — Streamlit Community Cloud (free) lets the landing page offer "Try free in browser" with sample data. Tk-family options can't.
|
||||||
4. **Future SaaS optionality** (expanded v1.6). Not a driver of this decision; the locked criteria reject SaaS. But if criteria ever evolve, Streamlit code converts to a hosted multi-user app in hours rather than weeks. Streamlit's session-state model, component patterns, and HTTP-server architecture are SaaS-native by default; the same code that runs the desktop bundle's local browser GUI runs unchanged on a hosted server (modulo authentication and per-user file isolation). Tkinter code, by contrast, would require a complete rewrite to become a hosted product. This is low-cost optionality: zero implementation effort now, meaningful flexibility later if the lifestyle-cashflow constraint ever lifts in favor of recurring revenue.
|
4. **Future SaaS optionality** — same code runs unchanged on a hosted server (modulo auth + per-user isolation). Tk would require rewrite. Zero implementation now, meaningful flexibility later.
|
||||||
|
|
||||||
### Tradeoffs accepted
|
### Tradeoffs accepted
|
||||||
|
|
||||||
1. **Browser-launch UX on the desktop install.** When a buyer double-clicks the desktop shortcut, their default browser opens to a localhost URL. This may briefly confuse non-technical buyers. **Mitigation**: a single sentence in the welcome dialog and install email explains that the data tool runs in the browser locally and uses no internet. If support tickets show this is a meaningful confusion driver, evaluate wrapping with pywebview (native window around the local Streamlit server) in v1.1.
|
1. **Browser-launch UX** — buyer double-click → default browser opens to localhost. Mitigated: install email + welcome dialog + persistent in-app message. Pywebview wrap is the v1.1 fallback if confusing.
|
||||||
2. **Larger bundle size**, ~300-500 MB vs. ~50 MB for Tkinter. Acceptable for marketplace download in 2026 with typical broadband.
|
2. **Bundle size** — ~300-500 MB vs. ~50 MB for Tk. Acceptable in 2026.
|
||||||
3. **PyInstaller packaging is fiddly** the first time. Budget 1-3 days for the one-time setup, then it's reusable across all subsequent bundles via a shared template.
|
3. **PyInstaller fiddly first time** — budget 1-3 days. Reusable across all bundles after.
|
||||||
4. **Streamlit's session re-run model is unusual.** Manageable for single-user data tools; would matter more if the SaaS optionality were exercised at scale.
|
4. **Streamlit's session re-run model** is unusual but manageable.
|
||||||
|
|
||||||
### Why CustomTkinter was rejected (the previously-favored option)
|
## 5. Distribution
|
||||||
|
|
||||||
A web check during this decision found that CustomTkinter's last PyPI release was 5.2.2 in January 2024. As of April 2026, that's roughly 28 months without a release, and Snyk classifies the project as Inactive. The library still works and remains popular (~115k weekly downloads, 13k+ GitHub stars), but the maintainer is effectively absent. For a product intended to ship to non-technical buyers and remain functional for years with minimal touch from the operator, betting the UI layer on an unmaintained library is an unacceptable risk: any future Python or macOS update that breaks the Tk underpinnings becomes the operator's problem to fix or fork.
|
**Primary**: Marketplaces (Gumroad, Lemon Squeezy). Built-in traffic, async payments/delivery/refunds, listing in days.
|
||||||
|
|
||||||
This is the kind of dependency risk that matters most in a "build once, sell forever" product, where every hour spent firefighting a dependency break is an hour stolen from the next bundle.
|
Own-domain SEO: long-term compounding asset (6-18 mo), not early-stage channel.
|
||||||
|
|
||||||
---
|
**v1.3 addition**: hosted browser demo as secondary distribution + primary conversion lever.
|
||||||
|
|
||||||
## 5. Distribution Channel Decision
|
## 6. Pricing
|
||||||
|
|
||||||
**Chosen primary**: Marketplace listings (Gumroad, Lemon Squeezy).
|
$49-79/bundle · $149 full suite (when 3+ exist).
|
||||||
|
|
||||||
**Rationale**: Under the "no network + fully async + 90-day hard stop" constraints, marketplaces are the only channel that:
|
- < $99 → no procurement friction for solo operators.
|
||||||
- Has built-in buyer traffic (no audience-building required).
|
- > $99 → triggers SaaS-support expectations conflicting with no-touch.
|
||||||
- Handles payments, delivery, refunds asynchronously.
|
- $49-79 → right unit economics + impulse-purchase territory.
|
||||||
- Allows listing in days, not months.
|
|
||||||
|
|
||||||
Own-domain SEO is treated as a long-term compounding asset (6-18 months to traction), not an early-stage channel.
|
## 7. Decision log
|
||||||
|
|
||||||
**Added v1.3**: A **hosted browser demo** of each bundle (deployed via Streamlit Community Cloud) becomes a secondary distribution surface and a primary conversion-rate lever on the landing page. Marketing details in BUSINESS.md Section 7.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 6. Pricing Decision
|
|
||||||
|
|
||||||
**Chosen**: $49-$79 per bundle, $149 for full suite (when 3+ bundles exist).
|
|
||||||
|
|
||||||
**Rationale**:
|
|
||||||
- Below $99 threshold avoids procurement / approval friction for solo operator buyers.
|
|
||||||
- Above $99 raises buyer expectations (SaaS, human support) that conflict with the no-touch constraint.
|
|
||||||
- $49-$79 produces the right unit economics for marketplace fees + Stripe fees while remaining impulse-purchase territory.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 7. Decision Log (Chronological)
|
|
||||||
|
|
||||||
| Date | Decision | Rationale |
|
| Date | Decision | Rationale |
|
||||||
|
|------|----------|-----------|
|
||||||
|
| Apr 2026 | Lock operating criteria | Project kickoff |
|
||||||
|
| Apr 2026 | Python Bundles selected | Highest score |
|
||||||
|
| Apr 2026 | Excel/CSV Cleaning as lead bundle | Highest pain, broadest demand |
|
||||||
|
| Apr 2026 (v1.1) | PyInstaller cross-platform pipeline | Eliminates "install Python" friction |
|
||||||
|
| Apr 2026 (v1.1) | Apple Developer Program ($99/yr) | Required for clean macOS install |
|
||||||
|
| Apr 2026 (v1.1) | Tiered revenue targets ($5k @ 12mo, $10k @ 24mo) | Original $50k unsupported by evidence |
|
||||||
|
| Apr 2026 (v1.1) | Tag "no-touch" for revisit at $5k/mo | Strict adherence pre-PMF may cost more revenue than it saves |
|
||||||
|
| Apr 28 (v1.2) | Functional scope: include workflow features even if free elsewhere | One-stop shopping is the value prop. See §4a. |
|
||||||
|
| Apr 28 (v1.2) | Promote GUI to required at v1; ship dual CLI + GUI | Buyer persona won't use CLI. See §4. |
|
||||||
|
| Apr 28 (v1.2) | Lock UX standards (works OOTB, sensible defaults, progressive disclosure, dry-run) | Load-bearing for non-tech buyer. See §4b. |
|
||||||
|
| Apr 28 (v1.3) | Lock GUI framework as Streamlit | Fastest velocity, lowest maintenance, hosted demo, SaaS optionality. See §4c. |
|
||||||
|
| Apr 28 (v1.3) | Add hosted browser demo as conversion lever | Direct consequence of Streamlit choice. See §5. |
|
||||||
|
| Apr 28 (v1.4) | Re-apply 04/06 boundary work (silent-drift recovery) | Stream B v1.2 content overwritten in parallel v1.3 work. Restored per no-silent-drift rule. |
|
||||||
|
| Apr 28 (v1.5) | Add `02_text_cleaner.py`; renumber 02-08 → 03-09 | Character-level hygiene had no clear owner. See TECHNICAL §10. |
|
||||||
|
| Apr 29 (v1.7) | Adopt Clean Text Tier 1/2/3 spec; lock `excel-hygiene` default | Promotes from stub to buildable v1 target. Full spec in TECHNICAL §11.2. |
|
||||||
|
| Apr 28 (v1.6) | Fold conversation-history content into docs (deduplicator spec, lead bundle use cases, full GUI matrix, 04/06 examples, Streamlit-to-SaaS reasoning) | No new decisions; promote at-risk analysis from chat history per no-silent-drift rule. |
|
||||||
|
| May 1 (v1.6) | Mark Standardize Formats **Ready** | 199-row buyer corpus passing; Tier 1 + most Tier 2 built. |
|
||||||
|
| May 1 (v1.6) | Add `src/core/errors.py` structured hierarchy | Uniform helpful messages across CLI + GUI. See TECHNICAL §7. |
|
||||||
|
| May 13 (v1.6) | Ship in-house JSON i18n + EN/ES packs | Expand addressable market (Spanish-first buyers, LatAm bookkeepers) without a `gettext` build step. JSON packs editable by non-devs; parity test prevents drift. See TECHNICAL §10b. |
|
||||||
|
| May 13 (v1.6) | Ship licensing: 1-year HMAC-signed blobs, name+email registration, offline verification, tier-scaffolded for future SKUs | Unlock the lifetime-update business model without recurring infra. Honor-system DRM (HMAC + 30-day refund) — sufficient at $49. See §9b below. |
|
||||||
|
| May 13 (v1.6) | Add Lite SKU (Find Duplicates + Clean Text + Standardize Formats) | Lower-priced entry point for buyers who only need the three universal tools. Per-tool feature gating + lock badges on the home grid surface the upgrade path. See §9b. |
|
||||||
|
| May 13 (v1.6) | Remove user-facing free trial | A 1-year all-features trial undercut the paid Lite SKU. Paid-only keeps tier economics clean. Internal ``_mint`` API still exists for tests and the seller's key generator. See §9b. |
|
||||||
|
| May 13 (v1.6) | Upgrade license crypto: HMAC → Ed25519 (asymmetric) | HMAC's symmetric secret was extractable from the shipped binary — anyone with the binary could mint blobs. Ed25519 splits sign (seller) from verify (binary), so binary compromise doesn't let an attacker forge licenses. Blob prefix bumped DTLIC1 → DTLIC2. See §9b. |
|
||||||
|
| May 13 (v1.6) | Add ``assert_production_safe`` tripwire | A shipped build with ``DATATOOLS_DEV_MODE=1`` or the in-source dev pubkey would silently defeat licensing. The tripwire refuses to boot such a build. No-op in source / pytest runs. See §9b. |
|
||||||
|
|
||||||
|
## 9b. Licensing model
|
||||||
|
|
||||||
|
**Decision (v1.6)**: offline Ed25519-signed license blobs (originally HMAC-signed; upgraded within v1.6 — see the threat model below), 1-year lifetime, name + email registration required. Tier-scaffolded so future SKUs (PRO, ENTERPRISE) can carve per-tool feature sets without code changes.
|
||||||
|
|
||||||
|
| Option | Verdict |
|
||||||
|
|---|---|
|
||||||
|
| **Offline signed blob (chosen; Ed25519, originally HMAC)** | **CHOSEN.** No server, no internet, fits the no-touch constraint. Honor-system at this price point. |
|
||||||
|
| Online activation check | Rejected. Conflicts with the "your data never leaves your computer" promise; introduces support load (server downtime, network issues). |
|
||||||
|
| No license at all | Rejected. The lifetime-update value prop requires *some* gating to make renewal meaningful. |
|
||||||
|
| Time-bombed binary (PyInstaller --no-license) | Rejected. Can't deliver renewals without re-shipping the installer. |
|
||||||
|
| Hardware-locked license | Rejected. Friction on legitimate device-swaps; doesn't match the buyer persona's tolerance. |
|
||||||
|
|
||||||
|
**Threat model** (v1.6 — Ed25519): the binary ships only the public key. A motivated reverse engineer who pulls everything out of the binary has the verification key but not the signing key — they can't mint new licenses. The earlier HMAC scheme had this hole; the asymmetric upgrade closes it. The remaining attack surface is:
|
||||||
|
|
||||||
|
- Re-signing with a forked binary that ships an attacker-controlled pubkey + auto-grants licenses. Costs more effort than the price of a legitimate copy and the result is per-fork, not shareable.
|
||||||
|
- Hooking the verification call to always return True. Defeats DRM entirely but only on the attacker's own machine — they could just write down "I unlocked DataTools" and skip the work.
|
||||||
|
- Setting ``DATATOOLS_DEV_MODE=1`` to bypass checks. **Refused in shipped builds** by ``assert_production_safe``; works in source/test runs only.
|
||||||
|
|
||||||
|
The 30-day refund window covers casual blob sharing from a different angle (anyone who shares their blob is implicitly authorizing the buyer to issue them a refund-on-demand).
|
||||||
|
|
||||||
|
**What's enforced**:
|
||||||
|
- License blob signature must verify (Ed25519, checked against the public key shipped in the binary).
|
||||||
|
- Buyer-entered name + email must match the values embedded in the blob.
|
||||||
|
- Expiry date must be in the future.
|
||||||
|
- Tier must include the requested feature.
|
||||||
|
|
||||||
|
**What's NOT enforced**:
|
||||||
|
- Number of devices the same blob is used on (no concurrent-use detection).
|
||||||
|
- Reverse-engineered re-signing of expired blobs (would require RSA / online check).
|
||||||
|
|
||||||
|
**Future SKUs**: the ``FEATURES_BY_TIER`` table in ``src/license/features.py`` is the single source of truth for "which tools each tier unlocks". Adding a PRO SKU that excludes Automated Workflows is a 1-line edit there + a 1-line edit at the gate site. No consumer-code churn.
|
||||||
|
|
||||||
|
**v1.6 SKU lineup**:
|
||||||
|
|
||||||
|
| Tier | Tools unlocked | Notes |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| April 2026 | Lock operating criteria | Project kickoff |
|
| LITE | Find Duplicates, Clean Text, Standardize Formats | Entry SKU. Three universal tools that handle the most common bookkeeping / RevOps / Klaviyo prep workflows. |
|
||||||
| April 2026 | Select Python Automation Script Bundles as the product category | Highest score against locked criteria |
|
| CORE | All 9 tools | Full v1 suite. |
|
||||||
| April 2026 | Choose CLI standalone over SaaS / GUI | Best fit for minimal maintenance + skill leverage |
|
| PRO | All 9 tools (scaffolded) | Reserved for future per-feature carve-outs (e.g., scheduled pipelines, API access). |
|
||||||
| April 2026 | Pick Excel & CSV Data Cleaning Mastery as lead bundle | Highest pain, broadest demand, easiest demonstration |
|
| ENTERPRISE | All 9 tools (scaffolded) | Reserved for future bulk / multi-seat SKUs. |
|
||||||
| April 2026 | Initial install path: Inno Setup (Windows-only) | First-pass design |
|
| TRIAL | Same as LITE | Deprecated — no longer issuable. Mapping kept for any legacy on-disk trial licenses to load without error. |
|
||||||
| April 2026 (revised v1.1) | **Switch to PyInstaller-based cross-platform pipeline** | Eliminates "install Python first" friction; expands TAM to Mac and Linux users |
|
|
||||||
| April 2026 (revised v1.1) | **Enroll in Apple Developer Program ($99/yr)** | Required for clean macOS install experience for non-technical buyers |
|
|
||||||
| April 2026 (revised v1.1) | **Replace $50k/mo target with tiered realistic targets** | Original target was unsupported by evidence base; tiered targets hit $5k at 12mo, $10k at 24mo |
|
|
||||||
| April 2026 (revised v1.1) | **Tag "fully async no-touch" for revisit at $5k/mo** | Strict adherence pre-PMF may cost more revenue than it saves time |
|
|
||||||
| April 28, 2026 (v1.2) | **Functional scope: include all workflow-relevant features even if available free elsewhere** | One-stop shopping is the value proposition. Forcing buyers to bounce between products defeats the purpose. See Section 4a. |
|
|
||||||
| April 28, 2026 (v1.2) | **Promote GUI from "deferred" to required at v1 launch; ship dual CLI + GUI interface** | Buyer persona will not use CLI. Deduplicator specifically requires interactive review UX that CLI cannot deliver well. See Section 4. |
|
|
||||||
| April 28, 2026 (v1.2) | **Lock UX standards for GUI: works out of the box, sensible defaults, progressive disclosure, plain-English labels, dry-run by default** | These are load-bearing for the non-technical buyer. Without them the GUI may exist but won't justify the price. See Section 4b. |
|
|
||||||
| April 28, 2026 (v1.3) | **Lock GUI framework as Streamlit; reject CustomTkinter (maintenance inactive), plain Tkinter (UX gap), Flet/PySide6/NiceGUI (each fails on a dimension that matters)** | Fastest build velocity, lowest maintenance burden, hosted browser demo as marketing asset, future SaaS optionality. Browser-launch UX accepted as a tradeoff with documented mitigation. See Section 4c. |
|
|
||||||
| April 28, 2026 (v1.3) | **Add hosted browser demo as secondary distribution surface and conversion lever** | Direct consequence of Streamlit choice. See Section 5 and BUSINESS.md Section 7. |
|
|
||||||
| April 28, 2026 (v1.4) | **Re-apply 03/05 script boundary work dropped during v1.3 merge (silent drift recovery)** | Stream B v1.2 content (sharpened 03/05 descriptions in USER-GUIDE, run-order rule, TECHNICAL.md Section 9 boundary spec, RECOVERY.md pointer) was overwritten when Stream A's parallel v1.3 Streamlit work was saved to project. Restoring per the doc's own no-silent-drift rule. 03 owns "what's not there" (missing values, sentinel codes, imputation), 05 owns "what shouldn't be there" (statistical outliers, domain rules, winsorization). 03 runs before 05 because outlier statistics on data containing NaN or sentinel codes are mathematically poisoned. See TECHNICAL.md Section 9. |
|
|
||||||
| April 28, 2026 (v1.5) | **Add `02_text_cleaner.py` as new script; renumber 02-08 → 03-09** | Audit revealed character-level hygiene (whitespace trimming, multi-space collapse, Unicode normalization, BOM handling, line-ending normalization, special-character handling) had no clear owner. Was implicitly scattered: `01_deduplicator` normalizes internally for matching only (doesn't write back), `02_format_standardizer` (now 03) implies it but its named scope is dates/currencies/names/phones/addresses, `03_missing_value_handler` (now 04) only handles whitespace-only as disguised null. A buyer with trailing-space pollution had no obvious script to run. Per Section 4a (functional scope principle: one-stop shopping for the workflow), this was a real gap. Added as 02 because text cleaning is a pre-processing step that should run before format standardization, missing-value handling, and outlier detection. Kept 01 (deduplicator) at position 1 as the lead/working/marketing-flagship script; numbering does not strictly equal pipeline order, the orchestrator manages execution order. Renumber consequence: TECHNICAL.md Section 9 boundary references updated 03→04, 05→06; orchestrator references updated 08→09. New contested case documented in Section 9.3: whitespace-only cells (02 trims first, leaving empty string; 04 then detects empty strings as disguised null). Master orchestrator now 09. |
|
|
||||||
| April 29, 2026 (v1.7) | **Adopt `02_text_cleaner.py` Tier 1/2/3 functional spec; lock `excel-hygiene` as default preset** | Promotes character-level hygiene from a stub to a buildable v1 target. Strategic framing: Excel/Power Query/OpenRefine fail this category for non-technical buyers; the gap is "one-click correctness for dirty-CSV failure modes that cause silent VLOOKUP misses." Spec covers 10 toggleable ops (trim, collapse, NFC, smart-char fold, zero-width strip, BOM strip, control strip, line-ending normalize, NFKC opt-in, per-column case), per-column scope control, dry-run-by-default, per-cell change audit, idempotency, three presets (`minimal`/`excel-hygiene`/`paranoid`), and JSON config save/load. Output shape mirrors deduplicator: `{input}_cleaned.csv`, `{input}_changes.csv`, `logs/text_clean_{ts}.log`. Boundary with adjacent scripts re-asserted: 02 trims whitespace-only cells to empty (04 then detects empty as null per Section 9.3); 02 is *write-time* and stays distinct from `01_deduplicator`'s match-time `normalize_string` helper. Smart-character fold defaults ON in `excel-hygiene` because demo value is highest there and dry-run preview makes the change visible before commit. NFKC stays opt-in (lossy). `ftfy` mojibake repair deferred to Tier 2 to avoid the 5MB dep without buyer demand. CLI ships as separate `src/cli_text_clean.py` module per the one-CLI-per-script pattern in TECHNICAL Section 3.2. Full spec in TECHNICAL.md Section 10.2. |
|
|
||||||
| April 28, 2026 (v1.6) | **Fold conversation-history content into docs: deduplicator functional spec, lead bundle use cases, competitive landscape, full GUI framework comparison matrix, concrete 04/06 boundary examples, expanded Streamlit-to-SaaS reasoning** | None of this represents new decisions; all of it represents prior analysis that lived only in chat history and was at risk of evaporating. Per the doc's own no-silent-drift rule (Section 8) and the v1.4 recovery story, valuable analysis must be promoted to docs to survive. Specifically: TECHNICAL.md gains Section 10 (per-script functional specs, starting with the deduplicator's 36-item tiered spec) which is the buildable target for the v1 launch GUI port; this also makes the gap between "currently working" (exact + basic fuzzy) and "v1 launch best-of-class" (Tier 1) explicit so the docs don't quietly overstate where the code is. Section 9.3 gains three concrete distinguishing examples (bank-export blank fees / $1M outlier / "999=refused") that prove 04 and 06 are distinct concerns. BUSINESS.md gains Section 4a (Lead Bundle Deep Dive: 15 use cases by persona, 6-row competitive landscape table, market gap statement) which feeds landing page copy and demo design. Section 4c gains a 10-dimension scored framework matrix and per-option summaries (locks the rejection reasoning against re-litigation), plus expanded point 4 on Streamlit-to-SaaS migration cost. RECOVERY.md updated to reference Section 10 in rebuild and priority steps. No structural decisions changed; this is pure capture work. |
|
|
||||||
|
|
||||||
---
|
**Trial removed (v1.6)**: a 1-year free trial that unlocked every tool would undercut the paid Lite SKU (why pay for Lite when trial gives more for longer?). Paid-only keeps the funnel clean. The internal ``LicenseManager._mint`` API still exists for tests and for the seller's ``scripts/generate_license.py`` key generator; there's no user-facing way to self-issue a license.
|
||||||
|
|
||||||
## 8. What Would Trigger Re-Locking the Criteria
|
## 8. Re-lock triggers
|
||||||
|
|
||||||
These criteria are load-bearing and not casually changed. Triggers for explicit re-evaluation:
|
These criteria are load-bearing. Triggers for explicit re-evaluation:
|
||||||
|
|
||||||
- Hitting the $5k/mo revenue tier (revisit async constraint).
|
- $5k/mo revenue (revisit async constraint).
|
||||||
- Hitting the $10k/mo revenue tier (revisit time-budget allocation).
|
- $10k/mo revenue (revisit time-budget allocation).
|
||||||
- A platform shutting down (Gumroad / Lemon Squeezy policy change forcing channel migration).
|
- Marketplace shutdown (Gumroad / Lemon Squeezy policy).
|
||||||
- A new skill acquired that opens a higher-leverage product category.
|
- New skill that opens a higher-leverage product category.
|
||||||
- A burnout signal indicating the time / recovery balance is broken.
|
- Burnout signal — time/recovery balance broken.
|
||||||
- Streamlit project taking a hard direction change that breaks the desktop-packaging path (low probability, but worth flagging).
|
- Streamlit hard direction change breaking desktop packaging (low probability).
|
||||||
|
|
||||||
Any re-lock requires writing the new criteria here with a date and rationale. No silent drift.
|
Any re-lock writes new criteria here with date + rationale. **No silent drift.**
|
||||||
|
|||||||
332
docs/DEMO-PLAN.md
Normal file
332
docs/DEMO-PLAN.md
Normal file
@@ -0,0 +1,332 @@
|
|||||||
|
# Demo Plan — DataTools
|
||||||
|
|
||||||
|
> Creator-only. Implements PLAN.md §2.2 (the demo IS the product) and
|
||||||
|
> §2.3 (niche down — three landing pages, one engine).
|
||||||
|
> **Version**: 1.0 · **Adopted**: 2026-05-01 · **Owner**: Michael
|
||||||
|
|
||||||
|
The hosted demo is the single highest-leverage marketing asset in the
|
||||||
|
plan. This document defines exactly what loads, in what order, with
|
||||||
|
what data, for which buyer — so the operator builds it once and never
|
||||||
|
rebuilds it from a stale headline.
|
||||||
|
|
||||||
|
## 1. Goals
|
||||||
|
|
||||||
|
- Convert a cold visitor to a paid buyer in **under three minutes** of
|
||||||
|
active interaction.
|
||||||
|
- Demonstrate the *full pipeline* (not one tool) on a dataset that
|
||||||
|
*looks like the visitor's own work* — not a toy CSV.
|
||||||
|
- Survive zero attention to maintenance — once running, the demo
|
||||||
|
should keep working as the engine evolves (the pre-saved pipeline
|
||||||
|
JSONs use the same code path the paid product uses).
|
||||||
|
- Provide a shareable artifact for niche-community posts (a public URL
|
||||||
|
the operator can drop into a subreddit reply with one sentence).
|
||||||
|
|
||||||
|
## 2. Constraints (non-negotiable)
|
||||||
|
|
||||||
|
| Constraint | Source | Implication |
|
||||||
|
|---|---|---|
|
||||||
|
| Free hosting at launch | BUSINESS.md §9 | Streamlit Community Cloud (1 GB RAM, sleeps after 7 days idle) |
|
||||||
|
| No login | BUSINESS.md §7 | No email gate, no signup wall, no "create account to continue" |
|
||||||
|
| Async / no-touch | DECISIONS.md §1 #8 | Cannot offer "schedule a demo with us" CTA |
|
||||||
|
| Runs locally on paid product | BUSINESS.md §11 | Demo can't expose the same engine to abuse — needs row caps |
|
||||||
|
| Friction kills conversion | BUSINESS.md §7 | Demo dataset preloaded; no "select a file" first-step |
|
||||||
|
| < $1,200/mo recurring | BUSINESS.md §9 | Migration plan to $5/mo VPS only after rate-limit signal |
|
||||||
|
|
||||||
|
## 3. The three personas (per PLAN.md §2.3)
|
||||||
|
|
||||||
|
| Tag | Persona | Top-of-funnel keyword | Demo dataset | Pre-saved pipeline |
|
||||||
|
|---|---|---|---|---|
|
||||||
|
| `shopify-pet` | Shopify operator (priority: pet supplies) | "shopify customer cleanup" | `samples/demo/shopify_pet_customers.csv` | `shopify_pet_pipeline.json` |
|
||||||
|
| `bookkeeper` | Bookkeeper / freelance accountant | "reconcile bank export csv" | `samples/demo/bookkeeper_bank_reconcile.csv` | `bookkeeper_bank_pipeline.json` |
|
||||||
|
| `revops` | Marketing / RevOps agency | "dedupe lead list across vendors" | `samples/demo/agency_combined_leads.csv` | `agency_leads_pipeline.json` |
|
||||||
|
|
||||||
|
Each persona gets its **own landing page URL**, its **own demo dataset
|
||||||
|
loaded by default**, and its **own H1 + below-the-fold copy.** The
|
||||||
|
engine is identical; only positioning differs.
|
||||||
|
|
||||||
|
## 4. Demo dataset specifications
|
||||||
|
|
||||||
|
Each dataset is intentionally small (20–30 rows) so the full pipeline
|
||||||
|
runs in well under one second on Streamlit Community Cloud's free
|
||||||
|
hardware. Each row is a *plausible-looking* export from that
|
||||||
|
persona's tooling. Each contains every kind of pollution the bundle's
|
||||||
|
five tools fix, so a single demo run shows every tool earning its
|
||||||
|
keep.
|
||||||
|
|
||||||
|
### 4.0 Pain-point coverage map
|
||||||
|
|
||||||
|
Each demo dataset is engineered so the buyer sees their **own top
|
||||||
|
pain** demonstrated in the AFTER preview. The mapping below pairs
|
||||||
|
each pain from PLAN.md §2.3a with the rows / columns that exercise
|
||||||
|
it. Refresh the dataset only when this coverage drops.
|
||||||
|
|
||||||
|
| Persona | Pain (from PLAN §2.3a) | Demo coverage |
|
||||||
|
|---|---|---|
|
||||||
|
| Shopify pet | S1 — Klaviyo per-contact dupes | 5 dup pairs across rows 1–15 (case + format + address-twin variants) |
|
||||||
|
| Shopify pet | S2 — feed-rejection chars | smart-quote / NBSP / BOM in rows 1–6, 9, 11 |
|
||||||
|
| Shopify pet | S3 — multi-channel | partner-style customer IDs (`SHOP-`); demonstration of column-level mapping covered in RevOps demo |
|
||||||
|
| Shopify pet | S4 — subscription identity | rows 1+2, 7+8, 9+10 — same person, different format |
|
||||||
|
| Shopify pet | S5 — VAT-MOSS country drift | rows 16–18 (`United Kingdom` / `U.K.` / `UK`) + rows 19–20 (`Germany`/`Italia`) |
|
||||||
|
| Bookkeeper | B1 — month-overlap re-import | 7 dup pairs spanning Jan↔Feb and Mar boundaries |
|
||||||
|
| Bookkeeper | B2 — 1099 vendor consolidation | Amazon × 3 spellings, Verizon × 2, Acme Realty × 2, Adobe × 2, Costco × 2, Zoom × 2, Stripe × 4 |
|
||||||
|
| Bookkeeper | B3 — audit trail | every cell change in the run logged with old/new/rule — surface in the demo's audit tab |
|
||||||
|
| Bookkeeper | B4 — per-license economics | demonstrated by pricing copy, not data |
|
||||||
|
| Bookkeeper | B5 — multi-currency | rows 26 (EUR), 27 (GBP), 28 (BRL with comma decimal), 29 (parens-negative) |
|
||||||
|
| RevOps | R1 — per-contact tier | 6 cross-source dup pairs (HubSpot × LinkedIn × Manual Scrape) |
|
||||||
|
| RevOps | R2 — deliverability | rows 26–27 (`uma at uniform dot com`, `victor@@victorco.com` invalid emails) |
|
||||||
|
| RevOps | R3 — GDPR / privacy | demonstrated by the network-tab moat panel + zero-upload claim |
|
||||||
|
| RevOps | R4 — vendor unification | 3 source values (HubSpot / LinkedIn / Manual Scrape), 13 country codes, mixed-shape headers |
|
||||||
|
| RevOps | R5 — suppression list | rows 29–30 (`Suppressed`, `Opted Out` tags) |
|
||||||
|
|
||||||
|
### 4.1 `shopify_pet_customers.csv` (20 rows)
|
||||||
|
|
||||||
|
**Looks like**: a Shopify customer export filtered for "Pet Supplies"
|
||||||
|
sales channel, 12 months activity.
|
||||||
|
|
||||||
|
**Pollution included**:
|
||||||
|
- Whitespace padding (" Alice ", "Sydney Opera House Drive ")
|
||||||
|
- Mixed phone formats: `(415) 555-1234`, `415.555.1234`, `5559876543`,
|
||||||
|
`+1 555-111-1111`
|
||||||
|
- International phones: GB, ES, DE, AU, JP (15 demo rows span 6
|
||||||
|
countries)
|
||||||
|
- Currency variants: `$1,240.50`, `£890.25`, `€2.410,75` (EU comma
|
||||||
|
decimal), `A$ 1,299.00`, `¥75000`
|
||||||
|
- Date formats: `2025-12-04`, `12/15/2025`, `?`, `(blank)`, `(none)`,
|
||||||
|
`#N/A`
|
||||||
|
- Disguised nulls: `N/A`, blank, `(blank)`, `?`, `#N/A`, `(none)`,
|
||||||
|
`unknown`
|
||||||
|
- Name casing: `EVE MARTINEZ`, `henry`, `O'NEIL`, `noah`, mixed Title /
|
||||||
|
ALL CAPS / lower
|
||||||
|
- Email case variants that *should* dedup: `Bob@PetShop.com` vs
|
||||||
|
`alice@petshop.com`
|
||||||
|
- 4 fuzzy duplicates (Alice/Bob same address, Grace/Henry same phone,
|
||||||
|
Carlos/Olivia same address, Ivy/Jack same address)
|
||||||
|
|
||||||
|
**After running the pipeline**: 20 rows → 15, ~29 cells canonicalized,
|
||||||
|
~45 sentinels standardized, 5 cross-row duplicates merged. The
|
||||||
|
customer table is now Klaviyo-import-ready and the country column
|
||||||
|
(previously `UK` / `U.K.` / `United Kingdom` / `Germany` / `Italia`)
|
||||||
|
is GB / DE / IT — VAT MOSS report won't break.
|
||||||
|
|
||||||
|
### 4.2 `bookkeeper_bank_reconcile.csv` (30 rows)
|
||||||
|
|
||||||
|
**Looks like**: two months of business checking + credit-card activity
|
||||||
|
exported from a bank portal, with the Feb export accidentally
|
||||||
|
overlapping the Jan export at the month boundary.
|
||||||
|
|
||||||
|
**Pollution included**:
|
||||||
|
- Mixed date formats: `01/15/2025`, `2025-01-15`, `Jan 18 2025`,
|
||||||
|
`1/27/25`, `Feb 5 2025`
|
||||||
|
- Currency formats: `-$129.99`, `($89.50)` parens-negative,
|
||||||
|
`+$3,450.00`, `- $599.88` space, bare `-129.99`, `(50.00)`
|
||||||
|
- Header trailing whitespace: `"Date "`
|
||||||
|
- Smart quotes around descriptions: `"autopay"`
|
||||||
|
- Em-dash sentinels in Vendor: `—`
|
||||||
|
- Smart-em-dash inside descriptions: `STAPLES #4422 — paper, toner`
|
||||||
|
- Vendor casing inconsistency: `Amazon` / `amazon.com` / `AMAZON.COM`,
|
||||||
|
`Verizon` / `verizon`
|
||||||
|
- 7 duplicate transactions (same date+amount+vendor recorded twice
|
||||||
|
with different formats)
|
||||||
|
|
||||||
|
**After running the pipeline**: 30 rows → 23, ~84 cells normalized, 7
|
||||||
|
duplicates removed (month-overlap + VAT-MOSS dups). All dates
|
||||||
|
ISO-formatted, all amounts numeric (including EUR/GBP/BRL with comma
|
||||||
|
decimal), vendor casing canonical, parens-negative resolved.
|
||||||
|
|
||||||
|
### 4.3 `agency_combined_leads.csv` (30 rows)
|
||||||
|
|
||||||
|
**Looks like**: a marketing-ops worksheet combining lead exports from
|
||||||
|
HubSpot + LinkedIn Sales Navigator + manual scraping, ready for
|
||||||
|
campaign targeting.
|
||||||
|
|
||||||
|
**Pollution included**:
|
||||||
|
- Phone formats per region: US, UK, Spain, Germany, China, India,
|
||||||
|
Australia, Mexico, Israel, Singapore, Hong Kong, Italy, South
|
||||||
|
Korea — 13 country codes
|
||||||
|
- Country column inconsistent: `USA` / `US` / `United States`
|
||||||
|
- Disguised nulls: `N/A`, `unknown`, `(unknown)`, `(blank)`, `(none)`,
|
||||||
|
`?`, `—`, `#N/A`, `TBD`
|
||||||
|
- Source column tags origin (`HubSpot` / `LinkedIn` / `Manual Scrape`)
|
||||||
|
- Email duplicates across sources with case variants: `alice@acme.com`
|
||||||
|
+ `Alice.Johnson@acme.com`, `bob@beta.com` + `Bob@Beta.com`,
|
||||||
|
`diana@delta.com` from two sources, `carlos@gamma.io` from two
|
||||||
|
sources, `Frank@Foxtrot.de` + `frank@foxtrot.de`
|
||||||
|
- Name casing: `DIANA LEE`, `henry`, `IVY CHEN`, mixed
|
||||||
|
- 6 fuzzy / cross-source duplicates designed to survive a naive exact-match dedup
|
||||||
|
- Score column with sentinel pollution that needs coercion to integer
|
||||||
|
|
||||||
|
**After running the pipeline**: 30 rows → 24, ~43 cells canonicalized,
|
||||||
|
14 sentinels resolved, 6 cross-source duplicates merged with `merge=true`
|
||||||
|
so each survivor inherits the most-complete picture. Invalid-email
|
||||||
|
rows (deliverability stress) and `Suppressed`/`Opted Out` tags
|
||||||
|
(suppression-list use case) survive as flagged rows the operator
|
||||||
|
manually reviews.
|
||||||
|
|
||||||
|
## 5. UX flow (per persona)
|
||||||
|
|
||||||
|
The demo is a single Streamlit page (likely
|
||||||
|
`src/gui/pages/0_Review.py` repurposed for demo mode, or a
|
||||||
|
dedicated `app_demo.py` for the cloud build).
|
||||||
|
|
||||||
|
```
|
||||||
|
┌──────────────────────────────────────────────────────────┐
|
||||||
|
│ DataTools — for {Persona} │
|
||||||
|
│ "{Persona-specific H1}" │
|
||||||
|
├──────────────────────────────────────────────────────────┤
|
||||||
|
│ │
|
||||||
|
│ Sample dataset preloaded: shopify_pet_customers.csv │
|
||||||
|
│ [Replace with your own file (capped 100 rows)] │
|
||||||
|
│ │
|
||||||
|
│ ┌─ BEFORE preview (15 rows) ─────────────────────────┐ │
|
||||||
|
│ │ Alice | (415) 555-1234 | $1,240.50 | … │ │
|
||||||
|
│ │ Bob | 415.555.1234 | $1,240.50 | … │ │
|
||||||
|
│ │ ... │ │
|
||||||
|
│ └──────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ Pipeline (saved): │
|
||||||
|
│ 1. Text Clean → 2. Format Standardize → │
|
||||||
|
│ 3. Missing → 4. Deduplicate │
|
||||||
|
│ │
|
||||||
|
│ [▶ Run pipeline] │
|
||||||
|
│ │
|
||||||
|
│ ┌─ AFTER preview ───────────────────────────────────┐ │
|
||||||
|
│ │ 15 rows → 11 (4 duplicates merged) │ │
|
||||||
|
│ │ 27 cells canonicalized · 33 sentinels resolved │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ Alice Johnson | +14155551234 | 1240.50 | … │ │
|
||||||
|
│ │ ... │ │
|
||||||
|
│ └──────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ [Download cleaned CSV (sample, watermarked)] │
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────────────────────────────────────────┐ │
|
||||||
|
│ │ Like what you see? │ │
|
||||||
|
│ │ Run this on YOUR 50,000-row export — locally. │ │
|
||||||
|
│ │ No upload. Your data never leaves your machine. │ │
|
||||||
|
│ │ [Get DataTools — $49 →] │ │
|
||||||
|
│ └──────────────────────────────────────────────────┘ │
|
||||||
|
└──────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Critical UX points**:
|
||||||
|
- Sample dataset is *already loaded* on page paint. Visitor never
|
||||||
|
sees an empty state.
|
||||||
|
- BEFORE table is shown side-by-side with AFTER once the run
|
||||||
|
completes. Hidden-character toggle on by default so the visitor
|
||||||
|
*sees* what was hidden in their data.
|
||||||
|
- "Replace with your own file" is a secondary action below the BEFORE
|
||||||
|
table — not the headline.
|
||||||
|
- Per-step metrics are shown in the AFTER block: "27 cells
|
||||||
|
canonicalized, 33 sentinels resolved, 4 duplicates merged." Numbers
|
||||||
|
sell more than narrative.
|
||||||
|
- Buy button is **inside** the AFTER block and **above the fold** when
|
||||||
|
the run completes. Friction kills.
|
||||||
|
|
||||||
|
## 6. Free vs paid boundary
|
||||||
|
|
||||||
|
The demo runs the **same code** as the paid product. Caps are surface,
|
||||||
|
not engine.
|
||||||
|
|
||||||
|
| Limit | Free demo | Paid (downloaded) |
|
||||||
|
|---|---|---|
|
||||||
|
| Input rows | 100 | unlimited (1 GB+ via streaming) |
|
||||||
|
| File size | 5 MB | unlimited |
|
||||||
|
| Output | watermarked CSV ("DataTools demo — buy at <url>" appended as last row) | clean CSV |
|
||||||
|
| Pipeline editor | locked to the persona-saved pipeline | full edit / save / load JSON |
|
||||||
|
| Save pipeline JSON | disabled | enabled |
|
||||||
|
| International | enabled | enabled |
|
||||||
|
| Audit log download | disabled | enabled |
|
||||||
|
| Tool 06–09 | as they ship | as they ship |
|
||||||
|
|
||||||
|
The watermark is a **single trailing row**, not an in-cell tag — so
|
||||||
|
the demo's AFTER preview *visibly* reads as production-quality data,
|
||||||
|
not "demo crippled" data.
|
||||||
|
|
||||||
|
## 7. CTA copy (per persona)
|
||||||
|
|
||||||
|
### 7.1 Shopify pet operator
|
||||||
|
|
||||||
|
- **H1**: *Clean your customer / vendor / subscriber exports — locally.*
|
||||||
|
- **Sub**: *Klaviyo-import-ready in 30 seconds. Catches duplicates Excel
|
||||||
|
misses. Your data never leaves your computer.*
|
||||||
|
- **CTA**: *Get DataTools for Shopify — $49 →*
|
||||||
|
|
||||||
|
### 7.2 Bookkeeper / freelance accountant
|
||||||
|
|
||||||
|
- **H1**: *Reconcile messy bank exports. Hand your client an audit
|
||||||
|
trail.*
|
||||||
|
- **Sub**: *Catches the duplicate transaction QuickBooks imported twice.
|
||||||
|
Standardizes dates, amounts, vendor casing. Every change auditable.*
|
||||||
|
- **CTA**: *Get DataTools for Bookkeepers — $49 →*
|
||||||
|
|
||||||
|
### 7.3 Marketing / RevOps agency
|
||||||
|
|
||||||
|
- **H1**: *Dedupe leads across HubSpot, LinkedIn, and manual scrapes.*
|
||||||
|
- **Sub**: *International phones, country normalization, fuzzy dedup
|
||||||
|
with merge — one tool, one schema, no upload.*
|
||||||
|
- **CTA**: *Get DataTools for RevOps — $49 →*
|
||||||
|
|
||||||
|
## 8. Telemetry / conversion tracking
|
||||||
|
|
||||||
|
Async + no-touch + free hosting limits what we can instrument. Use
|
||||||
|
event-only counters, no PII:
|
||||||
|
|
||||||
|
| Event | Source | Aggregate-only field |
|
||||||
|
|---|---|---|
|
||||||
|
| `demo.page_view` | landing page | persona tag |
|
||||||
|
| `demo.run_clicked` | demo page | persona tag |
|
||||||
|
| `demo.run_completed` | demo page | persona tag, rows_processed |
|
||||||
|
| `demo.cta_clicked` | demo page | persona tag |
|
||||||
|
| `gumroad.purchase` | Gumroad webhook | landing-page-source query param (`?from=shopify-pet`) |
|
||||||
|
|
||||||
|
Conversion = `cta_clicked / run_completed`. Demo-quality issue surfaces
|
||||||
|
when `run_completed / page_view` < 30 % (visitors not engaging).
|
||||||
|
|
||||||
|
Self-host counters on Cloudflare Pages (free, GDPR-friendly). No
|
||||||
|
Google Analytics — adds privacy banner, conflicts with the "your data
|
||||||
|
never leaves your computer" message.
|
||||||
|
|
||||||
|
## 9. Maintenance plan
|
||||||
|
|
||||||
|
**Recurring**: zero. The demo runs on the same engine the paid
|
||||||
|
product ships, so any improvement to the engine improves the demo
|
||||||
|
automatically. The pre-saved pipeline JSONs reference column names
|
||||||
|
and tool names, both stable APIs.
|
||||||
|
|
||||||
|
**Triggers for revisit**:
|
||||||
|
|
||||||
|
| Trigger | Action |
|
||||||
|
|---|---|
|
||||||
|
| Streamlit Community Cloud rate-limits / sleeps too aggressively | Migrate to a $5–10/mo VPS (BUSINESS.md §9 contingency) |
|
||||||
|
| Demo dataset becomes stale (e.g. all phones standardize to no-op) | Refresh with a new pollution batch — *don't change the persona* |
|
||||||
|
| `run_completed / page_view < 30 %` for 4 consecutive weeks | Audit the demo: is the BEFORE preview showing the mess clearly? Is the AFTER too small to notice? |
|
||||||
|
| `cta_clicked / run_completed < 5 %` for 4 consecutive weeks | The demo is impressive but the CTA isn't earning trust — revise copy + add a screenshot of the network tab showing zero outbound calls (PLAN.md §2.4) |
|
||||||
|
| New tool ships (06–09) | Decide *per persona* whether to add it to that persona's saved pipeline. Not all tools belong on all personas |
|
||||||
|
|
||||||
|
## 10. Build sequence (drops into PLAN.md week 2)
|
||||||
|
|
||||||
|
| Day | Action |
|
||||||
|
|---|---|
|
||||||
|
| 1 | Demo build of Streamlit app: 3 personas, switch via query param `?p=shopify-pet` |
|
||||||
|
| 2 | Pipeline JSONs wired in; row cap + watermark applied; download button |
|
||||||
|
| 3 | Deploy to Streamlit Community Cloud · 3 sub-paths or 3 separate apps |
|
||||||
|
| 4 | Persona landing pages: 3 static HTML pages on Cloudflare Pages, each with iframe embed of its persona demo + CTA |
|
||||||
|
| 5 | Telemetry counters wired (Cloudflare event API) · Gumroad webhook captures `?from=` |
|
||||||
|
|
||||||
|
End of day 5: three URLs the operator can drop into three different
|
||||||
|
niche-community threads, each performing its own conversion math.
|
||||||
|
|
||||||
|
## 11. Anti-temptations (things the demo deliberately refuses)
|
||||||
|
|
||||||
|
- **No "try it on your data first" gate that requires email.** The
|
||||||
|
whole point is friction-free.
|
||||||
|
- **No "schedule a demo" CTA.** Locked by no-touch.
|
||||||
|
- **No live chat widget.** Same.
|
||||||
|
- **No A/B-test framework yet.** Single-arm copy, ship it, iterate
|
||||||
|
monthly. A/B requires statistical traffic the funnel doesn't have
|
||||||
|
pre-PMF.
|
||||||
|
- **No watermark inside cells.** The AFTER preview must look
|
||||||
|
production-quality. Watermark goes on a single trailing row that's
|
||||||
|
obviously the demo signature.
|
||||||
|
- **No animation / loader theatrics.** Pipeline runs in <1 s; a
|
||||||
|
fake-progress bar lies about speed.
|
||||||
236
docs/DEPLOYMENT.md
Normal file
236
docs/DEPLOYMENT.md
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
# Deployment — demo + landing pages
|
||||||
|
|
||||||
|
> One page. Two services. ~30 minutes from "code complete" to
|
||||||
|
> "URL the user can hit." Every step here is from-scratch reproducible
|
||||||
|
> on a clean laptop.
|
||||||
|
> **Version**: 1.0 · **Adopted**: 2026-05-01
|
||||||
|
|
||||||
|
This doc covers the **two distribution surfaces** that ship to public
|
||||||
|
URLs: the Streamlit demo (the iframe target) and the Cloudflare Pages
|
||||||
|
landing pages (the marketing surface that embeds it).
|
||||||
|
|
||||||
|
The *paid* product — PyInstaller installers, code-signing, Gumroad
|
||||||
|
listing — is covered in `docs/NEXT-STEPS.md`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Part 1 · Deploy the demo (Streamlit Community Cloud — free)
|
||||||
|
|
||||||
|
### A. Pre-flight (one-time, ~2 min)
|
||||||
|
|
||||||
|
You need a free [Streamlit Community Cloud](https://streamlit.io/cloud)
|
||||||
|
account. Sign in with the GitHub account that hosts this repo.
|
||||||
|
|
||||||
|
### B. Deploy (~5 min, mostly waiting for the Cloud build)
|
||||||
|
|
||||||
|
1. **Push the repo to GitHub** (private or public — both work). The
|
||||||
|
important files are at the **repo root**:
|
||||||
|
|
||||||
|
- `streamlit_app.py` — Cloud auto-detects this; nothing to configure
|
||||||
|
- `requirements.txt` — Cloud installs from this
|
||||||
|
- `.streamlit/config.toml` — Cloud honours this
|
||||||
|
- `samples/demo/*.csv` + `*_pipeline.json` — the demo's data
|
||||||
|
- `src/` — the engine
|
||||||
|
|
||||||
|
2. In Streamlit Community Cloud → **New app**:
|
||||||
|
- Repository: your fork
|
||||||
|
- Branch: `main`
|
||||||
|
- Main file path: `streamlit_app.py` (the default — leave it)
|
||||||
|
- App URL: `datatools-demo` (or any free subdomain)
|
||||||
|
- **Deploy**
|
||||||
|
|
||||||
|
3. First build is 2–3 min while Cloud installs `pandas`, `phonenumbers`,
|
||||||
|
`rapidfuzz`, etc. Subsequent deploys are < 30 s.
|
||||||
|
|
||||||
|
### C. Verify
|
||||||
|
|
||||||
|
Open the deployed URL. Append `?p=shopify-pet` to the URL bar —
|
||||||
|
the persona-specific demo loads. Try `?p=bookkeeper` and
|
||||||
|
`?p=revops` to confirm all three personas route correctly. Click
|
||||||
|
**Run pipeline**; the AFTER preview should appear within ~1 second.
|
||||||
|
|
||||||
|
### D. The output URL
|
||||||
|
|
||||||
|
The deployed URL is what feeds into `landing/deploy.config.json` →
|
||||||
|
`demo_base_url`. Without trailing slash. For example:
|
||||||
|
|
||||||
|
https://datatools-demo.streamlit.app
|
||||||
|
|
||||||
|
### E. Migration trigger
|
||||||
|
|
||||||
|
Per `BUSINESS.md` §9 / `DEMO-PLAN.md` §9, migrate to a $5–10/mo VPS
|
||||||
|
when:
|
||||||
|
|
||||||
|
- Streamlit Community Cloud rate-limits / sleeps too aggressively, OR
|
||||||
|
- the demo crosses ~5 k page-views/month (free-tier capacity)
|
||||||
|
|
||||||
|
The migration is one command if you containerise:
|
||||||
|
`docker run -p 8501:8501 -v $(pwd):/app python:3.12-slim …`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Part 2 · Deploy the landing pages (Cloudflare Pages — free)
|
||||||
|
|
||||||
|
### A. Pre-flight (one-time, ~5 min)
|
||||||
|
|
||||||
|
You need:
|
||||||
|
|
||||||
|
- A Cloudflare account (free) and a domain (any registrar) with
|
||||||
|
nameservers pointed at Cloudflare. **OR** skip the custom domain
|
||||||
|
step and use the auto-generated `*.pages.dev` URL.
|
||||||
|
- A Gumroad listing URL (placeholder until your account is set up —
|
||||||
|
use `https://gumroad.com/l/datatools` and update it later).
|
||||||
|
|
||||||
|
### B. Build the deploy-ready bundle (~30 sec)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# One-time: copy the template
|
||||||
|
cp landing/deploy.config.example.json landing/deploy.config.json
|
||||||
|
# Edit it with your real URLs
|
||||||
|
edit landing/deploy.config.json
|
||||||
|
# Build
|
||||||
|
python3 landing/deploy.py
|
||||||
|
# → produces landing/dist/
|
||||||
|
```
|
||||||
|
|
||||||
|
`landing/deploy.config.json` is **gitignored**; your real URLs never
|
||||||
|
hit the repo.
|
||||||
|
|
||||||
|
### C. Deploy (~3 min)
|
||||||
|
|
||||||
|
Two paths — pick one:
|
||||||
|
|
||||||
|
**Drag-and-drop (zero CLI):**
|
||||||
|
|
||||||
|
1. Cloudflare Pages dashboard → **Create project** → **Direct Upload**
|
||||||
|
2. Drag `landing/dist/` into the upload zone
|
||||||
|
3. Project name: `datatools` (becomes `datatools.pages.dev`)
|
||||||
|
4. Click **Deploy**
|
||||||
|
|
||||||
|
**Wrangler CLI (one command, scriptable):**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
npm install -g wrangler # one-time
|
||||||
|
wrangler login # one-time
|
||||||
|
wrangler pages deploy landing/dist --project-name datatools
|
||||||
|
```
|
||||||
|
|
||||||
|
### D. Custom domain (~5 min, optional)
|
||||||
|
|
||||||
|
Pages dashboard → your project → **Custom domains** → add
|
||||||
|
`datatools.app` (or whichever apex domain you registered). Cloudflare
|
||||||
|
auto-issues TLS. Once propagated:
|
||||||
|
|
||||||
|
- `https://datatools.app/` → apex chooser
|
||||||
|
- `https://datatools.app/shopify-pet/` → Shopify landing
|
||||||
|
- `https://datatools.app/bookkeeper/` → Bookkeeper landing
|
||||||
|
- `https://datatools.app/revops/` → RevOps landing
|
||||||
|
|
||||||
|
### E. Verify
|
||||||
|
|
||||||
|
For each persona:
|
||||||
|
|
||||||
|
1. Open the persona URL.
|
||||||
|
2. Confirm the demo iframe loads (the URL inside it points at the
|
||||||
|
Streamlit demo from Part 1).
|
||||||
|
3. Click "Run pipeline" inside the iframe → AFTER preview appears.
|
||||||
|
4. Click the "Get DataTools" button → opens Gumroad with the
|
||||||
|
correct `?from=<persona>` query (verify in the URL bar).
|
||||||
|
|
||||||
|
If the iframe shows "Refused to connect", check Cloudflare Pages →
|
||||||
|
**Settings** → **Functions** for any CSP that disallows Streamlit's
|
||||||
|
domain. (Default Pages config does not set CSP, so this is rarely an
|
||||||
|
issue.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Part 3 · Updates
|
||||||
|
|
||||||
|
The cycle is:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1) Edit code or copy
|
||||||
|
edit landing/<persona>/index.html
|
||||||
|
edit src/gui/app_demo.py
|
||||||
|
|
||||||
|
# 2) Rebuild landing
|
||||||
|
python3 landing/deploy.py
|
||||||
|
|
||||||
|
# 3) Re-deploy landing
|
||||||
|
wrangler pages deploy landing/dist --project-name datatools
|
||||||
|
|
||||||
|
# 4) Re-deploy demo
|
||||||
|
git push origin main
|
||||||
|
# (Streamlit Cloud auto-deploys on push)
|
||||||
|
```
|
||||||
|
|
||||||
|
Both surfaces deploy in under 5 minutes end-to-end.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Part 4 · Sanity checks (post-deploy, ~3 min)
|
||||||
|
|
||||||
|
Run these once, then trust the build (per `POST-LAUNCH.md` §6):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Landing pages serve and reference the right demo URL
|
||||||
|
curl -s https://datatools.app/ | grep -c persona-card
|
||||||
|
# → 3 (one per persona card)
|
||||||
|
|
||||||
|
curl -s https://datatools.app/shopify-pet/ | grep -c "datatools-demo"
|
||||||
|
# → ≥1 (iframe src points at your demo)
|
||||||
|
|
||||||
|
# Demo responds and routes the persona param
|
||||||
|
curl -s "https://datatools-demo.streamlit.app/?p=shopify-pet" | grep -c "Shopify"
|
||||||
|
# → ≥1
|
||||||
|
|
||||||
|
# Sitemap is valid XML and lists all 4 pages
|
||||||
|
curl -s https://datatools.app/sitemap.xml | grep -c "<url>"
|
||||||
|
# → 4
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Part 5 · Cost ceiling check
|
||||||
|
|
||||||
|
| Service | Tier | Cost | Cap |
|
||||||
|
|---|---|---|---|
|
||||||
|
| Cloudflare Pages | Free | $0 | 500 builds/month, unlimited bandwidth |
|
||||||
|
| Streamlit Community Cloud | Free | $0 | 1 GB RAM, sleeps after 7 days idle |
|
||||||
|
| Custom domain | Cloudflare or registrar | ~$15/year | n/a |
|
||||||
|
| GitHub | Free for private repos with limited collaborators | $0 | n/a |
|
||||||
|
| **Total ongoing** | | **~$1.25/mo** (domain only) | |
|
||||||
|
|
||||||
|
Well inside the `BUSINESS.md` §9 cap of $1,200/mo recurring. The
|
||||||
|
$5–10/mo VPS migration is a contingency only — don't pre-build it.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
**Streamlit Cloud build fails with "ModuleNotFoundError: src.core"**
|
||||||
|
|
||||||
|
`streamlit_app.py` puts the repo root on `sys.path` before invoking
|
||||||
|
the demo module — but only if the file is at the repo root. Confirm
|
||||||
|
`streamlit_app.py` lives at `/streamlit_app.py`, not nested in a
|
||||||
|
folder.
|
||||||
|
|
||||||
|
**Cloudflare Pages deploy succeeds but persona pages 404**
|
||||||
|
|
||||||
|
The directory layout is preserved by `deploy.py`. Confirm your
|
||||||
|
`landing/dist/` has `shopify-pet/index.html`, etc. — not just three
|
||||||
|
flat files. If you used drag-and-drop, drag the **directory**, not
|
||||||
|
its contents.
|
||||||
|
|
||||||
|
**The iframe shows "X-Frame-Options denied"**
|
||||||
|
|
||||||
|
Streamlit Community Cloud allows iframe embedding by default. If
|
||||||
|
you've migrated to a self-hosted demo with a reverse proxy, set
|
||||||
|
`Content-Security-Policy: frame-ancestors https://datatools.app` and drop the `X-Frame-Options` header (`ALLOWALL` is not a valid value) for the
|
||||||
|
demo's domain.
|
||||||
|
|
||||||
|
**Gumroad URL has no `?from=` parameter when clicked**
|
||||||
|
|
||||||
|
The `?from=` query param is added by the landing-page CTA, not by
|
||||||
|
Gumroad. If it's missing, the landing-page HTML wasn't substituted —
|
||||||
|
re-run `python3 landing/deploy.py` and re-deploy.
|
||||||
@@ -1,285 +1,404 @@
|
|||||||
# Developer Guide
|
# Developer Guide
|
||||||
|
|
||||||
Architecture, data flow, and extension guide for the DataTools Deduplicator.
|
Architecture, data flow, extension points.
|
||||||
|
|
||||||
## Architecture
|
## Architecture
|
||||||
|
|
||||||
```
|
```
|
||||||
CLI (src/cli.py) GUI (src/gui/app.py)
|
CLI (src/cli*.py) GUI (src/gui/app.py + pages/)
|
||||||
│ │
|
│ │
|
||||||
│ flags → strategies │ widgets → strategies
|
└──────────┐ ┌──────────┘
|
||||||
│ _interactive_review() │ match_group_card()
|
|
||||||
│ tqdm progress bar │ st.progress()
|
|
||||||
│ │
|
|
||||||
└──────────┐ ┌────────────────┘
|
|
||||||
│ │
|
|
||||||
▼ ▼
|
▼ ▼
|
||||||
┌─────────────────┐
|
┌────────────────┐
|
||||||
│ core.dedup │
|
│ src/core/ │
|
||||||
│ deduplicate() │
|
└────────────────┘
|
||||||
└────────┬────────┘
|
|
||||||
│
|
|
||||||
┌────────────┼────────────┐
|
|
||||||
▼ ▼ ▼
|
|
||||||
core.io core.normalizers core.config
|
|
||||||
read/write normalize_*() save/load JSON
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Key principle:** All business logic lives in `src/core/`. The CLI and GUI are thin wrappers that translate user input into `deduplicate()` arguments and display the `DeduplicationResult`.
|
**Core/UI rule**: business logic in `core/` only. CLI + GUI translate user input → core call → display result.
|
||||||
|
|
||||||
## File-by-File Reference
|
## Module map
|
||||||
|
|
||||||
### src/core/dedup.py — Deduplication Engine
|
| Module | Public surface |
|
||||||
|
|--------|----------------|
|
||||||
|
| `i18n` | `t(key, lang=None, **fmt)`, `current_language()`, `set_language()`, `render_language_selector()`, `LANGUAGES` |
|
||||||
|
| `core.dedup` | `deduplicate()`, `MatchStrategy`, `ColumnMatchStrategy`, `Algorithm`, `SurvivorRule`, `DeduplicationResult`, `MatchResult`, `build_default_strategies()` |
|
||||||
|
| `core.normalizers` | `normalize_email/phone/name/address/string`, `NormalizerType`, `get_normalizer()` |
|
||||||
|
| `core.io` | `read_file()`, `write_file()`, `list_sheets()`, `detect_encoding/delimiter/header_row`, `repair_bytes()` |
|
||||||
|
| `core.config` | `DeduplicationConfig.from_file/to_file/to_strategies/to_survivor_rule` |
|
||||||
|
| `core.analyze` | `analyze()`, `Finding`, `findings_by_tool()`, `_NULL_LIKE` |
|
||||||
|
| `core.fixes` | `@register("fix_id")` decorator, `get_fix()`, `available_actions()` |
|
||||||
|
| `core.normalize` | `auto_fix()`, `apply_decisions()`, `NormalizationResult`, `is_normalized()` |
|
||||||
|
| `core.text_clean` | `clean_dataframe()`, `CleanOptions`, `CleanResult`, `smart_title_case()` |
|
||||||
|
| `core.format_standardize` | `standardize_dataframe()`, `StandardizeOptions`, `StandardizeResult`, `FieldType`, per-cell `standardize_*()` |
|
||||||
|
| `core.errors` | `DataToolsError` hierarchy, `ensure_dataframe()`, `ensure_choice()`, `wrap_file_read/write()`, `format_for_user()` |
|
||||||
|
| `core._constants` | `US_STATE_NAMES`, `US_STATE_CODES`, `USPS_EXPANSIONS`, `USPS_COMPRESSIONS` |
|
||||||
|
|
||||||
The central module. Contains:
|
## Data flow — Find Duplicates
|
||||||
|
|
||||||
- **Enums:** `Algorithm` (4 fuzzy algorithms), `SurvivorRule` (4 selection rules)
|
|
||||||
- **Data classes:** `ColumnMatchStrategy`, `MatchStrategy`, `MatchResult`, `DeduplicationResult`
|
|
||||||
- **`deduplicate()`** — main entry point. Takes a DataFrame + optional strategies/rules, returns a `DeduplicationResult` with deduplicated DataFrame, removed rows, match groups, and log entries.
|
|
||||||
- **`build_default_strategies()`** — scans column names with regex patterns to auto-detect email, phone, name, and address columns. Builds strong/weak key strategies with appropriate algorithms and normalizers.
|
|
||||||
- **`_UnionFind`** — disjoint-set data structure for transitive closure. If A matches B and B matches C, all three end up in one group.
|
|
||||||
- **`_find_match_groups()`** — O(n^2) pairwise comparison. For each pair, tries all strategies (OR semantics). Feeds matches into union-find. Returns match groups with confidence scores.
|
|
||||||
- **`_select_survivor()`** — picks the row to keep based on the survivor rule.
|
|
||||||
- **`_merge_group()`** — fills blank fields in the survivor from loser rows.
|
|
||||||
|
|
||||||
### src/core/normalizers.py — Text Normalization
|
|
||||||
|
|
||||||
Five normalizer functions, each `str → str`, idempotent, None-safe:
|
|
||||||
|
|
||||||
- **`normalize_email()`** — lowercase, strip Gmail dots, strip `+tag` suffixes
|
|
||||||
- **`normalize_phone()`** — parse with `phonenumbers` to E.164; fallback to digits-only
|
|
||||||
- **`normalize_name()`** — strip title prefixes (Dr., Mr.) and suffixes (Jr., PhD), case-fold
|
|
||||||
- **`normalize_address()`** — USPS abbreviations (Street→St, Avenue→Ave), case-fold
|
|
||||||
- **`normalize_string()`** — trim, collapse whitespace, case-fold
|
|
||||||
|
|
||||||
The `get_normalizer()` registry function maps `NormalizerType` enum values to functions.
|
|
||||||
|
|
||||||
### src/core/io.py — File I/O
|
|
||||||
|
|
||||||
Auto-detection stack:
|
|
||||||
|
|
||||||
1. **`detect_encoding()`** — checks BOM, then uses `charset-normalizer` heuristics
|
|
||||||
2. **`detect_delimiter()`** — uses `csv.Sniffer` on first 20 lines
|
|
||||||
3. **`detect_header_row()`** — finds first row where all cells look like column names
|
|
||||||
|
|
||||||
Main functions:
|
|
||||||
- **`read_file()`** — reads CSV/TSV/Excel with full auto-detection. Returns a DataFrame.
|
|
||||||
- **`write_file()`** — writes DataFrame to CSV or Excel. Uses `utf-8-sig` by default for Windows Excel compatibility.
|
|
||||||
- **`list_sheets()`** — returns sheet names from an Excel workbook.
|
|
||||||
|
|
||||||
### src/core/config.py — Configuration Profiles
|
|
||||||
|
|
||||||
Save/load deduplication settings as JSON:
|
|
||||||
|
|
||||||
- **`DeduplicationConfig`** — flat dataclass with all settings: strategies, survivor rule, merge flag, algorithm, threshold, normalizer map.
|
|
||||||
- **`.to_file()` / `.from_file()`** — JSON serialization
|
|
||||||
- **`.to_strategies()`** — converts config back to `MatchStrategy` objects for the engine
|
|
||||||
- **`.to_survivor_rule()`** — converts string to `SurvivorRule` enum
|
|
||||||
|
|
||||||
### src/cli.py — Command-Line Interface
|
|
||||||
|
|
||||||
Typer-based CLI with 17 options. Key responsibilities:
|
|
||||||
|
|
||||||
- Parse flags into strategies, survivor rule, and other config
|
|
||||||
- Set up logging (timestamped log files in `logs/`)
|
|
||||||
- Column name validation with fuzzy suggestions on typos
|
|
||||||
- `_interactive_review()` — side-by-side row display with y/n/s prompts
|
|
||||||
- Progress bar via `tqdm` for files > 10,000 rows
|
|
||||||
- Output formatting and file writing
|
|
||||||
|
|
||||||
### src/gui/app.py — Streamlit GUI
|
|
||||||
|
|
||||||
Single-page layout:
|
|
||||||
- File upload with instant preview and configurable delimiter (comma, tab, semicolon, pipe, or custom)
|
|
||||||
- Advanced options expander (column selection, fuzzy, normalizers, survivor rule, merge, config profiles)
|
|
||||||
- Find Duplicates button → runs `deduplicate()` with `progress_callback`
|
|
||||||
- Interactive review via `st.data_editor` with inline checkboxes and column dropdowns
|
|
||||||
- Batch actions: Accept All, Reject All, Clear Decisions
|
|
||||||
- Apply review decisions and download cleaned results
|
|
||||||
- Download buttons for deduplicated CSV, removed rows, and match groups report
|
|
||||||
|
|
||||||
### src/gui/components.py — Reusable GUI Widgets
|
|
||||||
|
|
||||||
- **`match_group_card()`** — expandable card with `st.data_editor`: inline Keep checkboxes per row, `SelectboxColumn` dropdowns for differing columns, and a live surviving rows preview
|
|
||||||
- **`config_panel()`** — the advanced options expander, returns settings dict with strategies, survivor rule, merge flag
|
|
||||||
- **`results_summary()`** — summary metrics and download buttons
|
|
||||||
- **`apply_review_decisions()`** — builds final DataFrames from user review decisions (merge, split, or keep-all per group) with column override support
|
|
||||||
|
|
||||||
## Data Flow
|
|
||||||
|
|
||||||
```
|
```
|
||||||
Input File
|
read_file() # auto-detect encoding, delimiter, header
|
||||||
│
|
▼ DataFrame
|
||||||
▼
|
build_default_strategies() # if no explicit strategies
|
||||||
read_file() ← auto-detect encoding, delimiter, header
|
▼ # strong keys (email, phone) → standalone OR
|
||||||
│
|
# weak keys (name, address) → AND with strong
|
||||||
▼
|
_apply_normalizations() # add _norm_* shadow columns
|
||||||
DataFrame
|
▼
|
||||||
│
|
_find_match_groups() # O(n²) pair compare, OR strategies, union-find
|
||||||
▼
|
▼
|
||||||
build_default_strategies() ← (if no explicit strategies)
|
[review_callback()] # optional interactive review
|
||||||
│ scan column names → regex patterns
|
▼
|
||||||
│ strong keys: email, phone (standalone OR)
|
_select_survivor() # per group: first/last/most-complete/most-recent
|
||||||
│ weak keys: name, address (AND with strong)
|
▼
|
||||||
▼
|
[_merge_group()] # optional: fill blanks from losers
|
||||||
_apply_normalizations() ← add _norm_* shadow columns
|
▼
|
||||||
│ normalize_email(), normalize_phone(), etc.
|
DeduplicationResult # deduplicated_df, removed_df, match_groups, log
|
||||||
▼
|
|
||||||
_find_match_groups() ← O(n²) pairwise comparison
|
|
||||||
│ for each pair: try all strategies (OR)
|
|
||||||
│ _compute_similarity() per column
|
|
||||||
│ union-find for transitive closure
|
|
||||||
▼
|
|
||||||
[review_callback()] ← optional: interactive review per group
|
|
||||||
│ True=accept, False=reject, None=skip
|
|
||||||
▼
|
|
||||||
_select_survivor() ← per group: first/last/most-complete/most-recent
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
[_merge_group()] ← optional: fill blanks from losers
|
|
||||||
│
|
|
||||||
▼
|
|
||||||
DeduplicationResult
|
|
||||||
├── deduplicated_df ← cleaned DataFrame (shadow cols dropped)
|
|
||||||
├── removed_df ← rows that were removed
|
|
||||||
├── match_groups ← list of MatchResult with confidence, columns
|
|
||||||
└── log_entries ← human-readable audit log
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## How to Add a Normalizer
|
## Extension recipes
|
||||||
|
|
||||||
1. **Add the function** in `src/core/normalizers.py`:
|
### Add a normalizer
|
||||||
|
|
||||||
|
1. Add function to `core/normalizers.py`:
|
||||||
|
```python
|
||||||
|
def normalize_company(value: Optional[str]) -> str:
|
||||||
|
if not value or not isinstance(value, str): return ""
|
||||||
|
name = value.strip().casefold()
|
||||||
|
for sfx in ("inc", "llc", "corp", "ltd", "co"):
|
||||||
|
name = re.sub(rf"\b{sfx}\.?\s*$", "", name).strip()
|
||||||
|
return name
|
||||||
|
```
|
||||||
|
2. Register: add `COMPANY = "company"` to `NormalizerType` + entry in `_NORMALIZER_MAP`.
|
||||||
|
3. Auto-detect (optional): add a `_COLUMN_TYPE_PATTERNS` row in `core/dedup.py`.
|
||||||
|
|
||||||
|
### Add a fuzzy algorithm
|
||||||
|
|
||||||
|
1. Add value to `Algorithm` enum in `core/dedup.py`.
|
||||||
|
2. Add case in `_compute_similarity()`.
|
||||||
|
3. Document the value in CLI help text.
|
||||||
|
|
||||||
|
### Add a survivor rule
|
||||||
|
|
||||||
|
1. Add value to `SurvivorRule` enum.
|
||||||
|
2. Add branch in `_select_survivor()`.
|
||||||
|
3. Add CLI mapping.
|
||||||
|
|
||||||
|
### Add a fix + detector (analyzer/gate)
|
||||||
|
|
||||||
|
1. **Detector** in `core/analyze.py`: add `_detect_<thing>(df) -> list[Finding]`, hook into the main `analyze()` pipeline. Emit a `Finding` with a unique `fix_action` id.
|
||||||
|
2. **Fix** in `core/fixes.py`:
|
||||||
|
```python
|
||||||
|
@register("fix_id")
|
||||||
|
def my_fix(df, payload=None) -> tuple[pd.DataFrame, int]:
|
||||||
|
# ...
|
||||||
|
return out_df, cells_changed
|
||||||
|
```
|
||||||
|
3. **Constant** in `core/analyze.py`: add `FIX_<NAME> = "fix_id"` so the detector and fix can reference it.
|
||||||
|
|
||||||
|
No other call sites change. Gate auto-discovers it via the registry.
|
||||||
|
|
||||||
|
### Tool page header — `render_tool_header(tool_id)`
|
||||||
|
|
||||||
|
Every tool page renders its title block via `render_tool_header(tool_id)` in `src/gui/components/_legacy.py` — do not call `st.title()` + `st.caption()` directly. The helper renders:
|
||||||
|
|
||||||
|
- `tools.<id>.page_title` as the page title (left column).
|
||||||
|
- A **Help** popover button right of the title (icon `:material/help_outline:`, label from `help.button_label`). Clicking opens an `st.popover` containing the markdown body.
|
||||||
|
- `tools.<id>.page_caption` as the caption below.
|
||||||
|
|
||||||
|
All copy is i18n-driven; editors can tweak help text without touching Python. If a tool is missing its `help_md` key, the popover falls back to `help.missing_body`.
|
||||||
|
|
||||||
|
**`help_md` structure** (markdown, stored as a single string with `\n` line breaks in JSON):
|
||||||
|
|
||||||
|
```
|
||||||
|
**When to use**
|
||||||
|
- bullet 1
|
||||||
|
- bullet 2
|
||||||
|
|
||||||
|
**Steps**
|
||||||
|
1. numbered step
|
||||||
|
2. numbered step
|
||||||
|
|
||||||
|
**Examples**
|
||||||
|
- example 1
|
||||||
|
- example 2
|
||||||
|
|
||||||
|
**Tip** one-sentence pro tip.
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep it short — the popover is intentionally compact. Mirror the structure across every tool so the muscle memory transfers.
|
||||||
|
|
||||||
|
### i18n — language packs
|
||||||
|
|
||||||
|
The GUI's user-facing strings live in `src/i18n/packs/<code>.json`, keyed by ISO-639-1 code. English (`en.json`) is canonical; missing keys in other packs fall back to English, and missing keys in English fall back to the literal dotted key so a typo is visible rather than silent.
|
||||||
|
|
||||||
|
**Look up a string in code:**
|
||||||
|
```python
|
||||||
|
from src.i18n import t
|
||||||
|
st.button(t("upload.run_button"))
|
||||||
|
st.warning(t("gate.warning", name=filename)) # {name} interpolated via str.format
|
||||||
|
```
|
||||||
|
|
||||||
|
`t()` reads the active language from `st.session_state["ui_lang"]`. Outside a Streamlit run (tests, scripts) it falls back to English.
|
||||||
|
|
||||||
|
**Add a new language:**
|
||||||
|
1. Copy `src/i18n/packs/en.json` to `src/i18n/packs/<code>.json` and translate values in place. Keep the key tree identical.
|
||||||
|
2. Add a one-line entry to `LANGUAGES` in `src/i18n/__init__.py`: `{"code": "fr", "label": "Français"}`. The sidebar picker auto-renders.
|
||||||
|
3. Run `pytest tests/test_lang_packs.py` — the parity test fails until every key from `en.json` exists in the new pack (and orphan keys not in English are also flagged).
|
||||||
|
|
||||||
|
**Add a new key:**
|
||||||
|
1. Add it to `en.json` first (canonical pack).
|
||||||
|
2. Add it to every other registered pack in the same commit. The parity test enforces this.
|
||||||
|
3. Use the dotted key at the call site: `t("section.subsection.key")` or `t("section.key", name=value)` for placeholder interpolation.
|
||||||
|
|
||||||
|
**Authoring rules:**
|
||||||
|
- Keys live under semantic sections (`home.*`, `upload.*`, `findings.*`, `help.*`, `tools.<id>.name`). Don't nest by language or by tool unless the string is genuinely tool-specific.
|
||||||
|
- Per-tool header copy lives under `tools.<id>.{page_title, page_caption, help_md}`. `page_caption` is the one-line subtitle under the title; `help_md` is the popover body (see *Tool page header* above). Top-level `help.button_label` / `help.missing_body` are shared across every tool.
|
||||||
|
- Use `{named}` placeholders (not positional `{0}`) so translators see what's being interpolated.
|
||||||
|
- Strings can contain Streamlit markdown (`**bold**`) — pass through `st.markdown` / `st.caption` as usual.
|
||||||
|
- Do **not** put strings inside the farewell-overlay JS payload without going through `_js_html_safe()` in `src/gui/components/_legacy.py`; the helper escapes both the JS string terminator and HTML special chars. The test `TestFarewellEscape` pins that contract.
|
||||||
|
- The sidebar picker is mounted by `hide_streamlit_chrome()`, so every page that calls that helper automatically gets the picker. Pages that don't call it (rare) can call `render_language_selector()` directly.
|
||||||
|
|
||||||
|
### Licensing
|
||||||
|
|
||||||
|
The license layer lives at ``src/license/``. The public API:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def normalize_company(value: Optional[str]) -> str:
|
from src.license import (
|
||||||
"""Strip legal suffixes (Inc, LLC, Corp), case-fold."""
|
get_manager, require_feature, current_state,
|
||||||
if not value or not isinstance(value, str):
|
FeatureFlag, Tier, License,
|
||||||
return ""
|
)
|
||||||
name = value.strip().casefold()
|
|
||||||
# Strip common suffixes
|
mgr = get_manager()
|
||||||
for suffix in ("inc", "llc", "corp", "ltd", "co"):
|
if not mgr.is_valid():
|
||||||
name = re.sub(rf"\b{suffix}\.?\s*$", "", name).strip()
|
raise RuntimeError("Not licensed")
|
||||||
return name
|
require_feature(FeatureFlag.DEDUPLICATOR)
|
||||||
```
|
```
|
||||||
|
|
||||||
2. **Register it** in the same file:
|
**Storage**: ``~/.datatools/license.json`` (override via
|
||||||
|
``DATATOOLS_LICENSE_PATH``). Signed with Ed25519 (asymmetric) — the
|
||||||
|
seller's private key signs; the buyer's binary verifies with the
|
||||||
|
embedded public key.
|
||||||
|
|
||||||
|
**Key material**:
|
||||||
|
|
||||||
|
| Variable | Who has it | Where it's used |
|
||||||
|
|---|---|---|
|
||||||
|
| ``DATATOOLS_LICENSE_PRIVKEY`` | Seller only | ``scripts/generate_license.py`` (mint a buyer's blob), ``scripts/generate_keypair.py`` writes a fresh one |
|
||||||
|
| ``DATATOOLS_LICENSE_PUBKEY`` | Every shipped binary | Verification at activation time; set at build time via PyInstaller env |
|
||||||
|
|
||||||
|
If neither env var is set, ``src.license.crypto`` falls back to the
|
||||||
|
deterministic dev keypair in ``src/license/_dev_keypair.py``. The
|
||||||
|
dev key is in source on purpose (so tests work without secrets),
|
||||||
|
but a frozen build that's still using it is a build-config bug —
|
||||||
|
``assert_production_safe()`` refuses to start such a binary.
|
||||||
|
|
||||||
|
**First-time setup for shipped builds**:
|
||||||
|
|
||||||
|
1. ``python scripts/generate_keypair.py --output prod-keys.env`` —
|
||||||
|
creates a fresh keypair.
|
||||||
|
2. Stash ``DATATOOLS_LICENSE_PRIVKEY`` somewhere safe (password
|
||||||
|
manager / KMS). Lose it and you can't issue renewals without
|
||||||
|
reshipping a new build with a new public key.
|
||||||
|
3. Configure the PyInstaller build env with
|
||||||
|
``DATATOOLS_LICENSE_PUBKEY=<hex>`` so the shipped binary
|
||||||
|
verifies against the production key.
|
||||||
|
4. Mint buyer licenses with
|
||||||
|
``DATATOOLS_LICENSE_PRIVKEY=<hex> python scripts/generate_license.py ...``.
|
||||||
|
|
||||||
|
**Dev bypass**: ``DATATOOLS_DEV_MODE=1`` short-circuits every check.
|
||||||
|
The test suite's autouse fixture sets this so existing tests don't
|
||||||
|
need their own license fixtures. Tests that need the real check
|
||||||
|
explicitly use ``isolated_license_path`` /
|
||||||
|
``activated_license_manager`` / ``unactivated_license_manager``.
|
||||||
|
|
||||||
|
**Adding a feature flag**:
|
||||||
|
|
||||||
|
1. Add the enum value to ``FeatureFlag`` in ``src/license/schema.py``.
|
||||||
|
2. Add it to the relevant tier's set in
|
||||||
|
``FEATURES_BY_TIER`` in ``src/license/features.py``.
|
||||||
|
3. Gate at the call site: ``require_feature(FeatureFlag.YOUR_FLAG)``.
|
||||||
|
|
||||||
|
**Adding a new tier**:
|
||||||
|
|
||||||
|
1. Add the enum value to ``Tier``.
|
||||||
|
2. Add a row to ``FEATURES_BY_TIER`` listing the unlocked flags.
|
||||||
|
3. Add ``license.tier_<name>`` translation keys to every i18n pack.
|
||||||
|
4. The activation flow, sidebar status badge, feature gate, and home
|
||||||
|
grid lock badge all pick up the new tier automatically.
|
||||||
|
|
||||||
|
**Worked example — the Lite tier**:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class NormalizerType(str, Enum):
|
# src/license/schema.py
|
||||||
# ... existing types ...
|
class Tier(str, Enum):
|
||||||
COMPANY = "company" # ← add enum value
|
LITE = "lite" # new
|
||||||
|
CORE = "core"
|
||||||
|
...
|
||||||
|
|
||||||
_NORMALIZER_MAP: dict[NormalizerType, Callable[[str], str]] = {
|
# src/license/features.py
|
||||||
# ... existing entries ...
|
FEATURES_BY_TIER = {
|
||||||
NormalizerType.COMPANY: normalize_company, # ← add mapping
|
...
|
||||||
|
Tier.LITE: frozenset({
|
||||||
|
FeatureFlag.DEDUPLICATOR,
|
||||||
|
FeatureFlag.TEXT_CLEANER,
|
||||||
|
FeatureFlag.FORMAT_STANDARDIZER,
|
||||||
|
}),
|
||||||
|
Tier.CORE: _all(),
|
||||||
|
...
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **Add auto-detection pattern** in `src/core/dedup.py` (optional):
|
Then in en.json/es.json add ``license.tier_lite``. That's it — the
|
||||||
|
existing ``require_feature_or_render_upgrade`` (GUI) and
|
||||||
|
``guard(feature=...)`` (CLI) calls in every tool page/CLI route a
|
||||||
|
Lite user into the upgrade prompt for any tool the tier doesn't
|
||||||
|
unlock. The home grid's lock badge fires off the same feature
|
||||||
|
lookup.
|
||||||
|
|
||||||
```python
|
**Minting a license** (creator-only):
|
||||||
_COLUMN_TYPE_PATTERNS = [
|
|
||||||
# ... existing patterns ...
|
|
||||||
(re.compile(r"company|organization|org_name", re.I),
|
|
||||||
NormalizerType.COMPANY, Algorithm.TOKEN_SET_RATIO, 85.0, False),
|
|
||||||
]
|
|
||||||
```
|
|
||||||
|
|
||||||
## How to Add a Matching Algorithm
|
|
||||||
|
|
||||||
1. **Add the enum value** in `src/core/dedup.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
class Algorithm(str, Enum):
|
|
||||||
# ... existing values ...
|
|
||||||
SOUNDEX = "soundex"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Add the computation** in `_compute_similarity()`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def _compute_similarity(val_a: str, val_b: str, algorithm: Algorithm) -> float:
|
|
||||||
# ... existing cases ...
|
|
||||||
if algorithm == Algorithm.SOUNDEX:
|
|
||||||
return 100.0 if _soundex(val_a) == _soundex(val_b) else 0.0
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Add the CLI flag value** in `src/cli.py` help text for `--algorithm`.
|
|
||||||
|
|
||||||
## How to Add a Survivor Strategy
|
|
||||||
|
|
||||||
1. **Add the enum value** in `src/core/dedup.py`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
class SurvivorRule(str, Enum):
|
|
||||||
# ... existing values ...
|
|
||||||
KEEP_LONGEST = "longest"
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Add the logic** in `_select_survivor()`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
if rule == SurvivorRule.KEEP_LONGEST:
|
|
||||||
return max(indices, key=lambda i: len(str(df.iloc[i].to_dict())))
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Add to the CLI** survivor map in `src/cli.py`.
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
### Run Tests
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# All tests
|
DATATOOLS_LICENSE_PRIVKEY=<hex> \
|
||||||
pytest tests/ -q
|
python scripts/generate_license.py \
|
||||||
|
--name "Jane Doe" --email jane@example.com \
|
||||||
# Specific module
|
--tier core --years 1
|
||||||
pytest tests/test_dedup.py -q
|
|
||||||
pytest tests/test_normalizers.py -q
|
|
||||||
pytest tests/test_io.py -q
|
|
||||||
pytest tests/test_config.py -q
|
|
||||||
pytest tests/test_cli.py -q
|
|
||||||
|
|
||||||
# Verbose with output
|
|
||||||
pytest tests/ -v
|
|
||||||
|
|
||||||
# Stop on first failure
|
|
||||||
pytest tests/ -x
|
|
||||||
```
|
```
|
||||||
|
|
||||||
### Test Structure
|
The script prints a ``DTLIC1:`` blob to stdout — deliver this in the
|
||||||
|
Gumroad / purchase email. The buyer pastes it into the activation
|
||||||
|
page or runs ``python -m src.license_cli activate <blob> --name ...``.
|
||||||
|
|
||||||
|
### Add a format-standardizer field type
|
||||||
|
|
||||||
|
1. Add value to `FieldType` enum in `core/format_standardize.py`.
|
||||||
|
2. Add per-cell `standardize_<x>(value, *, …)` returning `(new_value, changed)`.
|
||||||
|
3. Add option fields to `StandardizeOptions` (with defaults that preserve existing behavior).
|
||||||
|
4. Wire into `_apply_field_type()` dispatcher (the `else` branch raises `AssertionError` — every enum value needs a branch).
|
||||||
|
5. Add validation entry in `StandardizeOptions.from_dict()` for any new enum-shaped option.
|
||||||
|
|
||||||
|
## Errors
|
||||||
|
|
||||||
|
Use `core/errors.py` instead of raw `ValueError` / `OSError`:
|
||||||
|
|
||||||
|
| Pattern | Use |
|
||||||
|
|---------|-----|
|
||||||
|
| Bad arg, wrong type, missing column | `InputValidationError` |
|
||||||
|
| Bad config / options file | `ConfigError` |
|
||||||
|
| File parses but isn't what we expected | `FileFormatError` |
|
||||||
|
| File I/O failure (perms, missing, disk full) | `FileAccessError` |
|
||||||
|
| Internal invariant broken (unreachable branch) | `AssertionError` |
|
||||||
|
|
||||||
|
Helpers:
|
||||||
|
- `ensure_dataframe(value, function="my_func")` at every public entry that takes a df.
|
||||||
|
- `ensure_choice(value, name="mode", choices=[...])` at every entry that takes a literal.
|
||||||
|
- `wrap_file_read(path, "operation", exc)` / `wrap_file_write(...)` when wrapping `OSError`.
|
||||||
|
|
||||||
|
GUI / CLI handlers: use `format_for_user(exc, context="...")` to render.
|
||||||
|
|
||||||
|
All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so existing handlers still catch them.
|
||||||
|
|
||||||
|
## PDF Extractor — bundled Tesseract
|
||||||
|
|
||||||
|
Frozen builds (installer / portable .zip / AppImage) ship Tesseract OCR inside the bundle so scanned PDFs work without a separate system install. Source / `pip` developer environments still resolve Tesseract from `PATH`.
|
||||||
|
|
||||||
|
**Runtime layout (frozen bundles)**:
|
||||||
|
|
||||||
|
| Resource | Path |
|
||||||
|
|---|---|
|
||||||
|
| Tesseract binary | `Path(sys._MEIPASS) / "tesseract" / "tesseract"` (Linux/macOS), `…/tesseract/tesseract.exe` (Windows) |
|
||||||
|
| Tessdata directory | `Path(sys._MEIPASS) / "tesseract" / "tessdata"` |
|
||||||
|
| English model | `Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"` |
|
||||||
|
|
||||||
|
**Discovery order** (PDF Extractor runtime):
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var (override — explicit path to a `tesseract` binary).
|
||||||
|
2. Bundled path under `sys._MEIPASS` (frozen bundles only — falls through to step 3 otherwise).
|
||||||
|
3. `tesseract` on `PATH` (developer setups, source checkouts).
|
||||||
|
4. Windows well-known locations (`C:\Program Files\Tesseract-OCR\tesseract.exe`, etc.).
|
||||||
|
|
||||||
|
**Where the bytes come from**:
|
||||||
|
|
||||||
|
- **Tessdata** is vendored at `build/vendor/tessdata/eng.traineddata` — the "best" English model from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best). PyInstaller's spec copies it into `tesseract/tessdata/` inside the bundle.
|
||||||
|
- **Tesseract binary** is fetched at build time by `build/make_release.py` — per-platform download URLs are pinned in that script. The current pin is **Tesseract 5.5.0**.
|
||||||
|
|
||||||
|
**To update Tesseract**:
|
||||||
|
|
||||||
|
1. Bump the version pin + the per-platform fetch URLs in `build/make_release.py`.
|
||||||
|
2. If upstream changed the `eng.traineddata` schema, refresh `build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the matching tag.
|
||||||
|
3. Rebuild on each platform (`python build/make_release.py`) and smoke-test a scanned-PDF run through the PDF Extractor before tagging the release.
|
||||||
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if the upstream license terms change (Tesseract is Apache-2.0 today).
|
||||||
|
|
||||||
|
## Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# All (core + CLI + GUI)
|
||||||
|
pytest -q
|
||||||
|
# Quick loop — skip the GUI layer
|
||||||
|
pytest -q -m 'not gui'
|
||||||
|
# Only the GUI tests
|
||||||
|
pytest -q -m gui
|
||||||
|
# By module
|
||||||
|
pytest tests/test_dedup.py
|
||||||
|
# Include slow / integration
|
||||||
|
pytest -m slow
|
||||||
|
# Single test
|
||||||
|
pytest tests/test_dedup.py::TestExactMatch::test_basic
|
||||||
|
```
|
||||||
|
|
||||||
|
Test layout:
|
||||||
```
|
```
|
||||||
tests/
|
tests/
|
||||||
├── conftest.py # Shared fixtures
|
├── conftest.py # core/CLI fixtures
|
||||||
│ ├── sample_csv_path # Path to samples/messy_sales.csv
|
├── test_dedup.py · test_normalizers.py · test_io.py · test_config.py
|
||||||
│ ├── sample_df # Loaded sample CSV as DataFrame
|
├── test_analyze.py · test_normalize.py · test_text_clean.py
|
||||||
│ ├── simple_df # Small 5-row DataFrame with obvious duplicates
|
├── test_format_standardize.py
|
||||||
│ ├── merge_df # DataFrame with partial records
|
├── test_format_standardize_corpus.py # 199-row buyer corpus
|
||||||
│ └── tmp_csv # Temporary CSV from simple_df
|
├── test_audit_fixes.py · test_errors.py · test_fixes_unit.py
|
||||||
├── test_dedup.py # Engine tests: similarity, union-find, pairs, integration
|
├── test_corpus.py · test_encodings_corpus.py · test_fixtures_sweep.py
|
||||||
├── test_normalizers.py # Normalizer tests: all 5 types with edge cases
|
├── test_cli.py · test_cli_*.py · test_e2e.py · test_install.py
|
||||||
├── test_io.py # I/O tests: encoding, delimiter, header, read/write
|
├── test_perf_regressions.py # shape pins for the perf wins
|
||||||
├── test_config.py # Config tests: serialization round-trip
|
└── gui/ # Streamlit AppTest-driven tests
|
||||||
└── test_cli.py # CLI tests: argument parsing, file handling
|
├── conftest.py # AppTest fixtures + helpers
|
||||||
|
├── _findings_panel_harness.py # isolated component test page
|
||||||
|
├── test_smoke.py # every page renders in EN + ES
|
||||||
|
├── test_chrome.py # language selector, hide_chrome
|
||||||
|
├── test_gate.py # require_normalization_gate
|
||||||
|
├── test_workflows.py # happy path per Ready tool
|
||||||
|
├── test_dedup_review.py # match-group card interactions
|
||||||
|
├── test_advanced_panels.py # config_panel widgets
|
||||||
|
├── test_errors.py # malformed-upload error paths
|
||||||
|
└── test_findings_panel.py # analyzer findings rendering
|
||||||
```
|
```
|
||||||
|
|
||||||
### Writing Tests
|
### GUI test layer
|
||||||
|
|
||||||
Follow existing patterns. Tests use pytest fixtures from `conftest.py`:
|
GUI tests drive pages with `streamlit.testing.v1.AppTest` —
|
||||||
|
in-process, no browser, no display. They pre-populate
|
||||||
|
`st.session_state` with stashed-upload bytes (via the
|
||||||
|
`stash_upload()` helper in `tests/gui/conftest.py`) and either click
|
||||||
|
buttons via `app.button[i].click().run()` or assert on the
|
||||||
|
`session_state` after the run.
|
||||||
|
|
||||||
```python
|
The `gui` marker is registered in `pytest.ini`. Default `pytest` runs everything;
|
||||||
def test_my_feature(simple_df):
|
`pytest -m 'not gui'` skips them for a faster core-only loop.
|
||||||
"""Test description."""
|
Coming-Soon stubs are pinned by the smoke tests so a regression
|
||||||
result = deduplicate(simple_df, ...)
|
("import error", "missing widget") shows up immediately.
|
||||||
assert len(result.match_groups) == expected
|
|
||||||
assert result.deduplicated_df.shape[0] == expected_rows
|
|
||||||
```
|
|
||||||
|
|
||||||
## Known Limitations
|
Fixture corpora: `test-cases/text-cleaner-corpus/` (21 files) · `test-cases/encodings-corpus/` (31 files) · `test-cases/format-cleaner-corpus/` (7 files + spec).
|
||||||
|
|
||||||
- **O(n^2) pairwise comparison** — no blocking or indexing. Works well up to ~50,000 rows. Beyond that, performance degrades quadratically. Future optimization: add blocking (partition by first letter, zip code prefix, etc.) to reduce comparison space.
|
## Known limitations
|
||||||
- **No multi-sheet dedup** — each Excel sheet is processed independently. Cross-sheet deduplication is not supported.
|
|
||||||
- **Phone normalization requires valid-length numbers** — the `phonenumbers` library rejects numbers that are too short or too long for the detected region. Fallback is digits-only, which may produce false negatives for international numbers without country codes.
|
- **Dedup pair-compare is O(n²)** for fuzzy strategies. Exact-only
|
||||||
- **Single-threaded** — no parallel comparison. Could benefit from `multiprocessing` for large files.
|
strategies (every column uses `Algorithm.EXACT` at threshold 100)
|
||||||
- **Memory-bound** — entire file is loaded into a pandas DataFrame. Files larger than available RAM will fail. Chunked reading exists but is not integrated with the dedup engine.
|
now route through an O(n) groupby fast path automatically — no API
|
||||||
|
change. Fuzzy strategies can opt into prefix blocking via
|
||||||
|
`deduplicate(..., blocking_columns=[...], blocking_prefix_len=1)`
|
||||||
|
to partition pairs by a cheap key (trades recall for speed).
|
||||||
|
- **Threading is opt-in for format_standardize** —
|
||||||
|
`StandardizeOptions.parallel_columns > 1` uses a thread pool.
|
||||||
|
On CPython 3.12 the GIL caps the win at roughly neutral; the
|
||||||
|
scaffolding is in place for free-threaded Python 3.13+.
|
||||||
|
- **Memory-bound** — entire file loaded into pandas. Streaming reads
|
||||||
|
exist but are not integrated with the dedup engine.
|
||||||
|
- **No multi-sheet dedup** — each Excel sheet processed independently.
|
||||||
|
- **Phonenumbers minimum-length** — international numbers without
|
||||||
|
country codes fall back to digits-only.
|
||||||
|
|||||||
244
docs/FUTURE-TOOLS.md
Normal file
244
docs/FUTURE-TOOLS.md
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# Future tools — design notes
|
||||||
|
|
||||||
|
> Creator-only. Specs for tools the strategic plan refuses to build right now
|
||||||
|
> but that surface repeatedly enough to be worth documenting once instead of
|
||||||
|
> re-thinking from scratch every time a customer asks.
|
||||||
|
> **Status of these tools**: post-launch, post-revenue. See `PLAN.md` §2.1 —
|
||||||
|
> new-tool development is frozen until DataTools has a paying customer and a
|
||||||
|
> repeated demand signal for the same idea. This file is the resting place
|
||||||
|
> for those ideas in the meantime; nothing here ships unless a future
|
||||||
|
> decision says it does.
|
||||||
|
|
||||||
|
Each entry follows the same shape: **What it does**, **Why someone would
|
||||||
|
want it**, **Can we ship it now?**, **Approach**, **GUI sketch**, **Effort**,
|
||||||
|
**Risks/unknowns**, **Ship criteria** (the signal that overrides the freeze).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. PDF → CSV extractor (bank statements + similar)
|
||||||
|
|
||||||
|
### What it does
|
||||||
|
|
||||||
|
Takes a PDF (typically a bank statement, expense report, paystub, invoice,
|
||||||
|
or any document where humans-but-not-computers can read a table) and turns
|
||||||
|
the tabular content into a CSV that the rest of DataTools can consume.
|
||||||
|
|
||||||
|
The user shows the tool **where** the data lives by drawing rectangles on
|
||||||
|
a rendered preview of the first page; the tool then applies those region
|
||||||
|
templates to every page of the document (and remembers the template so the
|
||||||
|
same template can be re-applied to next month's statement without
|
||||||
|
re-clicking).
|
||||||
|
|
||||||
|
### Why someone would want it
|
||||||
|
|
||||||
|
Bookkeepers, accountants, and any small-business operator who:
|
||||||
|
|
||||||
|
- Gets bank/credit-card statements only as PDFs (most US banks; many
|
||||||
|
European ones).
|
||||||
|
- Wants to import transactions into QuickBooks / Xero / a spreadsheet
|
||||||
|
without paying $10–$30/month for a SaaS converter (Docparser,
|
||||||
|
Rossum, Hubdoc) or relying on a Python script they can't maintain.
|
||||||
|
- Has 12 months × N accounts of statements to back-fill into a
|
||||||
|
ledger.
|
||||||
|
|
||||||
|
This is the most-requested DataTools adjacency in the casual feedback we
|
||||||
|
have so far. It maps tightly onto the **bookkeeper niche** identified in
|
||||||
|
`PLAN.md` §2.3 — that persona is exactly who needs PDF extraction and is
|
||||||
|
exactly the kind of operator who'd pay for a one-time desktop tool over a
|
||||||
|
recurring SaaS subscription.
|
||||||
|
|
||||||
|
### Can we ship it now?
|
||||||
|
|
||||||
|
**No.** Current state, verified 2026-05-17:
|
||||||
|
|
||||||
|
- No PDF dependency in `requirements.txt` or `requirements-dev.txt`.
|
||||||
|
- No PDF-touching code anywhere under `src/`. The single
|
||||||
|
string-mention of "PDF" in the codebase is in the **output** copy for
|
||||||
|
the Quality Check tool ("generate PDF/Excel quality reports"),
|
||||||
|
unrelated to extraction.
|
||||||
|
- No region-selection / canvas component in the Streamlit GUI today.
|
||||||
|
|
||||||
|
Building this requires net-new infrastructure on three axes (libraries,
|
||||||
|
extraction core, region-picker UI). Estimates below.
|
||||||
|
|
||||||
|
### Approach (technical)
|
||||||
|
|
||||||
|
PDFs split cleanly into two populations and the strategy differs:
|
||||||
|
|
||||||
|
1. **Native / text-layer PDFs** — text is stored as text, just laid out
|
||||||
|
visually. Most modern bank statements are this. Solvable with
|
||||||
|
coordinate-aware text extraction:
|
||||||
|
|
||||||
|
- **`pdfplumber`** (MIT, on top of `pdfminer.six`) — gives `(x0, y0,
|
||||||
|
x1, y1, text)` per character/word/line for each page. Mature, well
|
||||||
|
tested, single dependency, no native compiler. **First-choice.**
|
||||||
|
- **`pypdf`** (BSD-3) — text-only, no positions. Too coarse for
|
||||||
|
statement parsing; useful only for "the whole document as one big
|
||||||
|
string."
|
||||||
|
- **`camelot-py`** (MIT) — purpose-built for table extraction.
|
||||||
|
Heavier (needs `ghostscript` and `tk`/`opencv` for some modes),
|
||||||
|
and assumes the table grid is already visible. Worth evaluating
|
||||||
|
as a fallback for documents with explicit ruled tables.
|
||||||
|
|
||||||
|
2. **Scanned / image-only PDFs** — pixels from a scanner; no text layer.
|
||||||
|
Less common from major banks today but still happens with old PDFs
|
||||||
|
and receipts. Needs OCR:
|
||||||
|
|
||||||
|
- **`pytesseract`** wrapping the **Tesseract** binary (Apache-2). The
|
||||||
|
OCR is good for English on clean scans, mediocre on receipts.
|
||||||
|
Detect with `pdfplumber`: a page where every character is in a
|
||||||
|
glyph "image" object means the page is image-only → OCR fallback.
|
||||||
|
|
||||||
|
The extraction core would be a state machine:
|
||||||
|
|
||||||
|
1. Render page to an image (`pdfplumber.Page.to_image()` returns a PIL
|
||||||
|
image at a chosen DPI).
|
||||||
|
2. User draws a header region and per-row regions (or marks a single
|
||||||
|
table bounding box + column dividers) on the preview.
|
||||||
|
3. For each PDF page, crop the corresponding pixel region (or pdf
|
||||||
|
coordinate region), pull the text in that crop, and apply per-region
|
||||||
|
parsing (date, amount, description).
|
||||||
|
4. Emit one CSV row per detected statement row.
|
||||||
|
|
||||||
|
Bank-statement-specific niceties — implementable as templates on top of
|
||||||
|
the generic engine:
|
||||||
|
|
||||||
|
- Recurring-template store: save "Chase visa October layout" once, the
|
||||||
|
next month's PDF lands on the same template automatically. JSON file
|
||||||
|
in `~/.datatools/templates/` keyed by a layout fingerprint (page
|
||||||
|
size + header text hash).
|
||||||
|
- Multi-page row stitching: a row that wraps across pages gets merged
|
||||||
|
back together based on date-column continuity.
|
||||||
|
- Currency / sign inference: a column that mixes `$1,234.56` and
|
||||||
|
`($45.00)` — already handled by the (now-existing) Standardize
|
||||||
|
Formats analyzer rules.
|
||||||
|
|
||||||
|
### GUI sketch
|
||||||
|
|
||||||
|
The hardest part of the whole project. Streamlit doesn't ship a native
|
||||||
|
"draw rectangles on an image" widget. Options:
|
||||||
|
|
||||||
|
- **`streamlit-drawable-canvas`** — community component (MIT-licensed).
|
||||||
|
Lets the user draw freehand rectangles on top of a background image.
|
||||||
|
Returns the rectangle coordinates as JSON. Active maintenance.
|
||||||
|
**First-choice for the region picker.**
|
||||||
|
- **`streamlit-cropper`** — single-rectangle crop tool. Good if we only
|
||||||
|
needed the table bbox; too limited for "header region + column
|
||||||
|
dividers + repeating-row template."
|
||||||
|
- **Custom React component** — fully tailored UX but adds a build
|
||||||
|
toolchain DataTools doesn't have today. Last resort.
|
||||||
|
|
||||||
|
Sketch of the proposed page (under "Transformations" in the sidebar
|
||||||
|
section):
|
||||||
|
|
||||||
|
```
|
||||||
|
🧾 PDF → CSV (Beta)
|
||||||
|
─────────────────────────────────────────────────────────────────────
|
||||||
|
Upload a PDF [ Browse… ]
|
||||||
|
(statement / invoice / form — text-based PDFs work best)
|
||||||
|
|
||||||
|
[ ▸ Preview: October-statement.pdf · 3 pages ]
|
||||||
|
┌────────────────────────────────────────────────┐
|
||||||
|
│ CHASE BANK │
|
||||||
|
│ Statement period Oct 1–31, 2025 │
|
||||||
|
│ ┌─[1: header strip — drawn in red]──────────┐ │
|
||||||
|
│ │ Date Description Amount │ │
|
||||||
|
│ └────────────────────────────────────────────┘ │
|
||||||
|
│ ┌─[2: row template — drawn in green]────────┐ │
|
||||||
|
│ │ 10/03 AMAZON.COM #42… -45.67 │ │
|
||||||
|
│ └────────────────────────────────────────────┘ │
|
||||||
|
│ ⋮ (more transactions) │
|
||||||
|
└────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
Columns: [Date] [Description] [Amount] [+ Add column]
|
||||||
|
|
||||||
|
Apply template to: ( ) Only this page
|
||||||
|
(•) All pages with this layout
|
||||||
|
( ) All pages (force)
|
||||||
|
|
||||||
|
[ Save template as… Chase Visa Oct 2025 ]
|
||||||
|
|
||||||
|
[ Run extraction → CSV ]
|
||||||
|
```
|
||||||
|
|
||||||
|
After "Run extraction": the standard tool-page result layout (preview
|
||||||
|
table, "Saved to ~/Downloads/<name>_extracted.csv", "Open Downloads
|
||||||
|
folder" — matching the other Ready tools).
|
||||||
|
|
||||||
|
The **template save/recall** is what makes this a one-time setup
|
||||||
|
instead of a per-document chore — bookkeepers don't want to re-draw
|
||||||
|
rectangles every month.
|
||||||
|
|
||||||
|
### Effort estimate
|
||||||
|
|
||||||
|
| Phase | Scope | Estimate | Risk |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **A. Backend, native PDFs only** | pdfplumber-based extraction, hard-coded region passed via a JSON config (no GUI) | **1–2 weeks** | Low — straightforward use of pdfplumber. |
|
||||||
|
| **B. Region-picker GUI** | streamlit-drawable-canvas, multi-region drawing, per-region role assignment (date / amount / description) | **2–3 weeks** | Medium — the canvas component has quirks; persisting region state across reruns is non-trivial. |
|
||||||
|
| **C. Multi-page application + template persistence** | Apply one page's template to N pages, save/load templates, layout fingerprint | **1–2 weeks** | Medium — "is the next page the same layout?" is a real perception problem; we'll need a heuristic. |
|
||||||
|
| **D. Scanned-PDF OCR fallback** | Detect image-only pages, run Tesseract, merge OCR text into the extraction path | **2–3 weeks** | High — OCR accuracy is variable; we'd want a quality threshold + a "fail this page noisily" path. Bundling Tesseract with the PyInstaller build is its own packaging headache. |
|
||||||
|
| **E. Bank-statement specifics** | Cross-page row stitching, currency-sign inference, multi-account splits | **1–2 weeks** | Medium — every bank's idea of a "statement" differs. Templates absorb most of the variance. |
|
||||||
|
|
||||||
|
**Realistic total for a polished v1**: 6–10 calendar weeks of focused work
|
||||||
|
(text-PDFs + GUI + templates + statement-specific niceties). Add another
|
||||||
|
2–3 weeks if scanned PDFs are required at launch.
|
||||||
|
|
||||||
|
**Minimum viable extract** (just text PDFs, single-region drawing, no
|
||||||
|
template recall, no OCR): **3–4 weeks**. Worth scoping a beta at that
|
||||||
|
level before committing to the full surface.
|
||||||
|
|
||||||
|
### Difficulty rating
|
||||||
|
|
||||||
|
**Medium-hard.** Not because any single piece is novel — pdfplumber +
|
||||||
|
streamlit-drawable-canvas are well-trodden libraries — but because the
|
||||||
|
*combination* (point-and-click region selection that persists across
|
||||||
|
multiple PDF pages and across documents with similar layouts) is where
|
||||||
|
most of the engineering goes. The "every bank does it slightly
|
||||||
|
differently" reality makes templates a hard requirement rather than a
|
||||||
|
nice-to-have, and templates raise the design effort.
|
||||||
|
|
||||||
|
### Risks / unknowns
|
||||||
|
|
||||||
|
- **Scanned-PDF coverage**: if a meaningful slice of the addressable
|
||||||
|
market sends image-only PDFs (older statements, scanned receipts),
|
||||||
|
shipping text-only extraction limits the audience. Decide via the
|
||||||
|
first 10–20 user requests.
|
||||||
|
- **PyInstaller packaging of Tesseract**: bundling the OCR binary into
|
||||||
|
the desktop build is non-trivial. May force a "Tesseract not found —
|
||||||
|
install it separately" path on first launch, which hurts the
|
||||||
|
"one-click install" story.
|
||||||
|
- **Bank layout drift**: a template captured today can stop working
|
||||||
|
next month if the bank redesigns its statement. Layout-fingerprint
|
||||||
|
detection has to fail loudly rather than silently produce garbage.
|
||||||
|
- **PII surface**: bank statements are some of the most sensitive
|
||||||
|
documents the user might touch. The "runs locally — your data never
|
||||||
|
leaves this computer" guarantee is even more load-bearing here than
|
||||||
|
for CSVs. No telemetry, no cloud OCR services, hard line.
|
||||||
|
|
||||||
|
### Ship criteria
|
||||||
|
|
||||||
|
Before this tool re-enters active development, all of these need to be
|
||||||
|
true:
|
||||||
|
|
||||||
|
- DataTools has shipped to **≥1 paying customer** (the `PLAN.md` §2.1
|
||||||
|
freeze condition).
|
||||||
|
- **At least 3 paying customers OR 5 demo-traffic emails** have
|
||||||
|
explicitly asked for PDF extraction. Below that signal, build
|
||||||
|
something else.
|
||||||
|
- The bookkeeper niche (per `PLAN.md` §2.3) has at least one converted
|
||||||
|
customer — that's the persona who actually needs this tool, and
|
||||||
|
confirming they pay before building a tool aimed squarely at them
|
||||||
|
is the discipline the freeze exists to enforce.
|
||||||
|
|
||||||
|
If those three trip, the **minimum-viable beta (3–4 weeks)**
|
||||||
|
goes first — text PDFs + single-region drawing — so we can see real
|
||||||
|
user behaviour before committing to the full template surface.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## (placeholder for additional future-tool entries)
|
||||||
|
|
||||||
|
Add new entries above this line. Keep the same shape:
|
||||||
|
What / Why / Can we ship now / Approach / GUI / Effort / Risks /
|
||||||
|
Ship criteria. The shape is what makes "is this idea ready" a
|
||||||
|
factual question instead of an opinion.
|
||||||
259
docs/LICENSE-SERVER.md
Normal file
259
docs/LICENSE-SERVER.md
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
# LICENSE-SERVER — online issuance & record-keeping
|
||||||
|
|
||||||
|
**Status:** **deployed (PR 1 + PR 2 code merged)**. Live at
|
||||||
|
`licenses.datatools.unalogix.com`. See `ADMIN.md §"Live deployment"`
|
||||||
|
for day-2 operations, and `ARCHITECTURE.md` for the end-to-end
|
||||||
|
diagram including the desktop and storefronts.
|
||||||
|
|
||||||
|
This doc describes the smallest useful server we could build to
|
||||||
|
replace the manual mint-and-paste workflow, without compromising the
|
||||||
|
"your data never leaves your computer" promise to buyers (see
|
||||||
|
`DECISIONS.md §9b`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **Automate fulfillment.** Gumroad sale → buyer gets a blob in
|
||||||
|
their inbox within seconds. No creator intervention.
|
||||||
|
2. **Authoritative customer list.** A queryable record of who has
|
||||||
|
what tier, when it expires, what they paid. Replaces the JSONL
|
||||||
|
log as the system of record.
|
||||||
|
3. **Self-service renewal & re-delivery.** Buyer enters their email
|
||||||
|
→ gets a fresh blob or a copy of their existing one. Cuts support
|
||||||
|
load.
|
||||||
|
4. **Move the private key off the founder's laptop.** Today the prod
|
||||||
|
private key has to be loaded as an env var to mint anything;
|
||||||
|
that's a security hazard. Server-side, it lives in a KMS and the
|
||||||
|
laptop never touches it.
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **No phone-home from the desktop app.** Activation stays offline.
|
||||||
|
The shipped binary still verifies blobs against the embedded
|
||||||
|
pubkey with no network call. `DECISIONS.md §9b` stands.
|
||||||
|
- **No per-machine activation limits enforced server-side.** v1
|
||||||
|
treats one license = one buyer, used on as many of their machines
|
||||||
|
as they want. Revisit only if abuse appears.
|
||||||
|
- **No telemetry.** The server only knows what the buyer or Gumroad
|
||||||
|
tells it (purchase events, renewal requests). It does not learn
|
||||||
|
anything from desktop installations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐
|
||||||
|
│ Gumroad │
|
||||||
|
└────────┬────────┘
|
||||||
|
│ webhook (sale, refund)
|
||||||
|
▼
|
||||||
|
┌──────────────┐ ┌───────────────┐ ┌──────────────┐
|
||||||
|
│ Buyer email │◄──────│ Mint API │──────►│ licenses │
|
||||||
|
│ (SMTP send) │ │ (Python web) │ │ (Postgres) │
|
||||||
|
└──────────────┘ └───────┬───────┘ └──────────────┘
|
||||||
|
│ sign() via
|
||||||
|
▼
|
||||||
|
┌─────────────────┐
|
||||||
|
│ KMS / HSM │
|
||||||
|
│ (private key) │
|
||||||
|
└─────────────────┘
|
||||||
|
|
||||||
|
┌─────────────────────────────────────────┐
|
||||||
|
│ Renewal / re-delivery portal │
|
||||||
|
│ - buyer enters email │
|
||||||
|
│ - signed magic link │
|
||||||
|
│ - sees current license + "resend" │
|
||||||
|
└─────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### 1. Mint API
|
||||||
|
|
||||||
|
Thin Python web service (FastAPI or Flask — Streamlit isn't appropriate
|
||||||
|
here). Two internal endpoints:
|
||||||
|
|
||||||
|
- `POST /internal/mint` — name, email, tier, years → blob + DB row.
|
||||||
|
Auth: shared HMAC header from internal callers (webhook receiver, renewal portal); the creator CLI uses short-lived signed tokens.
|
||||||
|
- `POST /internal/revoke` — license_key → sets `revoked_at`. Auth: same.
|
||||||
|
|
||||||
|
The mint endpoint is the **only** place that calls `crypto.sign()`.
|
||||||
|
It asks the KMS to perform the signature at request time; the key
|
||||||
|
material never lives in the API process's environment.
|
||||||
|
|
||||||
|
### 2. Webhook receiver
|
||||||
|
|
||||||
|
Public endpoint `POST /webhooks/gumroad`. Verifies Gumroad's
|
||||||
|
signature, maps the payload to a `mint` call, returns 200. Stores
|
||||||
|
the raw payload to a `gumroad_events` table for audit.
|
||||||
|
|
||||||
|
Refunds: webhook → `POST /internal/revoke` for the license looked up by
|
||||||
|
`gumroad_order_id`. The desktop app doesn't currently honor
|
||||||
|
revocations (no online check), but the buyer won't be able to
|
||||||
|
renew a revoked license, and the row remains as evidence if a
|
||||||
|
dispute escalates.
|
||||||
|
|
||||||
|
### 3. Renewal portal
|
||||||
|
|
||||||
|
Single-page form, public. Buyer enters email → server emails a
|
||||||
|
signed magic link → click → page shows their license (tier, expiry,
|
||||||
|
"resend blob" button, "renew" button).
|
||||||
|
|
||||||
|
Renew flow: button → `POST /internal/mint` with the same name/email
|
||||||
|
and a fresh expiry → buyer gets the new blob → pastes into desktop
|
||||||
|
app via existing `license_cli.py renew`. No code change in the
|
||||||
|
desktop app.
|
||||||
|
|
||||||
|
### 4. Database
|
||||||
|
|
||||||
|
Postgres (small — a few thousand rows for the foreseeable future).
|
||||||
|
Single source of truth for the customer list.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Schema
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE licenses (
|
||||||
|
license_key text PRIMARY KEY, -- DT1-{TIER}-xxxx-xxxx
|
||||||
|
name text NOT NULL,
|
||||||
|
email text NOT NULL,
|
||||||
|
tier text NOT NULL, -- lite | core | pro | enterprise
|
||||||
|
issued_at timestamptz NOT NULL,
|
||||||
|
expires_at timestamptz NOT NULL,
|
||||||
|
blob text NOT NULL, -- DTLIC2:...
|
||||||
|
gumroad_order_id text UNIQUE, -- null for manual mints
|
||||||
|
revoked_at timestamptz, -- null = active
|
||||||
|
notes text -- free-form support notes
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_licenses_email ON licenses (lower(email));
|
||||||
|
CREATE INDEX idx_licenses_expires ON licenses (expires_at) WHERE revoked_at IS NULL;
|
||||||
|
CREATE INDEX idx_licenses_gumroad ON licenses (gumroad_order_id);
|
||||||
|
|
||||||
|
CREATE TABLE gumroad_events (
|
||||||
|
id bigserial PRIMARY KEY,
|
||||||
|
received_at timestamptz NOT NULL DEFAULT now(),
|
||||||
|
event_type text NOT NULL, -- sale | refund | dispute | ...
|
||||||
|
order_id text,
|
||||||
|
raw_payload jsonb NOT NULL,
|
||||||
|
processed boolean NOT NULL DEFAULT false,
|
||||||
|
error text -- non-null if processing failed
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
The `licenses` schema is the JSONL log fields plus
|
||||||
|
`gumroad_order_id`, `revoked_at`, `notes`. The migration script from
|
||||||
|
JSONL → Postgres is therefore a flat insert.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security
|
||||||
|
|
||||||
|
- **Private key**: AWS KMS, GCP KMS, or HashiCorp Vault. Mint API
|
||||||
|
has IAM permission to *use* the key (sign operation), not to
|
||||||
|
*export* it. Rotating to a new key still requires a new desktop
|
||||||
|
build (the pubkey is embedded); plan a 90-day overlap window where
|
||||||
|
both keys are accepted.
|
||||||
|
- **Webhook secret**: Gumroad's HMAC signature, verified before
|
||||||
|
touching the body.
|
||||||
|
- **Internal endpoints**: not reachable from the public internet —
|
||||||
|
bind to localhost or a private subnet, fronted by the webhook
|
||||||
|
receiver and the renewal portal.
|
||||||
|
- **PII**: name + email + Gumroad order ID. Standard customer-data
|
||||||
|
hygiene — DB backups encrypted at rest, no PII in application
|
||||||
|
logs, GDPR delete-on-request supported via a `DELETE FROM
|
||||||
|
licenses WHERE lower(email) = lower(?)` (the desktop activation still works
|
||||||
|
until the license expires; the buyer just won't appear in our
|
||||||
|
records anymore).
|
||||||
|
- **Mint API access**: short-lived signed tokens for any creator
|
||||||
|
CLI that talks to it. The CLI is a thin wrapper around the same
|
||||||
|
`POST /internal/mint`; the days of running
|
||||||
|
`scripts/generate_license.py` against the prod private key on a
|
||||||
|
laptop are over once the server exists.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Migration plan
|
||||||
|
|
||||||
|
Three phases beyond the already-completed Phase 0, each independently revertible.
|
||||||
|
|
||||||
|
### Phase 0 (done)
|
||||||
|
|
||||||
|
- Ed25519 signing with prod key on creator's laptop.
|
||||||
|
- Local JSONL issuance log at `~/.datatools-creator/issued.jsonl`.
|
||||||
|
|
||||||
|
### Phase 1 — server stands up, no behavior change
|
||||||
|
|
||||||
|
1. Stand up Postgres + Mint API in a small VPS / Fly.io / Render box.
|
||||||
|
2. Provision a KMS-held keypair; **the public key must match the one
|
||||||
|
already embedded in the shipped binary** — i.e., import the
|
||||||
|
existing prod private key into KMS, do not generate a new one. If
|
||||||
|
the existing key is laptop-only and can't be imported, plan a
|
||||||
|
build-with-new-pubkey + buyer-side rotation cycle (see
|
||||||
|
`ADMIN.md` Recovery).
|
||||||
|
3. Run a one-shot script: read `~/.datatools-creator/issued.jsonl`,
|
||||||
|
`INSERT … ON CONFLICT (license_key) DO NOTHING` each row.
|
||||||
|
4. Add a creator-only CLI command `datatools-admin mint` that calls
|
||||||
|
`POST /internal/mint` instead of running the local script. Local
|
||||||
|
script stays as a fallback.
|
||||||
|
|
||||||
|
At this point: nothing buyer-facing has changed. The creator now has
|
||||||
|
two ways to mint (server or local) and a real DB.
|
||||||
|
|
||||||
|
### Phase 2 — automation
|
||||||
|
|
||||||
|
5. Wire the Gumroad webhook. New buyers get automated fulfillment.
|
||||||
|
6. Manual mints (friends, comps, support replacements) still go
|
||||||
|
through `datatools-admin mint`, which writes to the same DB.
|
||||||
|
7. Old local script is deprecated but kept (read-only) as a break-glass
|
||||||
|
tool if the server is down.
|
||||||
|
|
||||||
|
### Phase 3 — self-service
|
||||||
|
|
||||||
|
8. Ship the renewal portal.
|
||||||
|
9. Replace the "email support because I lost my blob" flow with a self-service form.
|
||||||
|
|
||||||
|
Each phase ships independently. The desktop app sees no change
|
||||||
|
across any of them — that's the whole point.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **Hosting choice.** *Decided: self-hosted* on the existing
|
||||||
|
`46.225.166.142` box alongside the `*.invixiom.com` services.
|
||||||
|
Runbook in `SETUP-LICENSE-SERVER.md`. Operator owns uptime,
|
||||||
|
backups, TLS renewal, and key custody — see that doc's
|
||||||
|
"Operational concerns" section.
|
||||||
|
- **Per-seat or per-device limits?** v1 says no. Revisit if/when
|
||||||
|
abuse is observable.
|
||||||
|
- **Email delivery.** Postmark or SES — both fine. Pick whichever the
|
||||||
|
rest of the stack uses. Avoid Gmail SMTP for transactional mail.
|
||||||
|
- **Audit log retention.** `gumroad_events` rows are unbounded growth
|
||||||
|
but trivially small. Default to forever; partition by year if it
|
||||||
|
ever exceeds a few GB.
|
||||||
|
- **Existing Gumroad customers.** Before any of this lands, every
|
||||||
|
buyer is already in Gumroad's records. A one-shot import from
|
||||||
|
Gumroad's CSV export → `licenses` table would catch anyone whose
|
||||||
|
blob the JSONL log doesn't have (e.g., if the creator's laptop
|
||||||
|
was lost before this design lands).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Code pointers (current state, for the future implementer)
|
||||||
|
|
||||||
|
| File | What it does now | What changes |
|
||||||
|
|------|------------------|--------------|
|
||||||
|
| `scripts/generate_license.py` | Sign locally, append JSONL | Becomes a CLI client of the Mint API |
|
||||||
|
| `src/license/crypto.py` | `sign()` reads `$DATATOOLS_LICENSE_PRIVKEY` | `sign()` calls KMS; the env var stays as a fallback for local dev |
|
||||||
|
| `src/license_cli.py` | Activate / status / renew — already buyer-facing | **No change.** Still verifies offline against embedded pubkey |
|
||||||
|
| `src/license/manager.py` | Verify, persist | **No change.** |
|
||||||
|
|
||||||
|
The desktop app is deliberately decoupled from any of this. The
|
||||||
|
server is a fulfillment + record-keeping layer wrapped around the
|
||||||
|
existing, frozen, offline activation flow.
|
||||||
329
docs/NEXT-STEPS.md
Normal file
329
docs/NEXT-STEPS.md
Normal file
@@ -0,0 +1,329 @@
|
|||||||
|
# Next Steps — from "code complete" to first paying customer
|
||||||
|
|
||||||
|
> Creator-only. The runnable checklist that takes the operator from
|
||||||
|
> the current state (1,729 tests passing, 6 tools shipped, 0 paying
|
||||||
|
> customers) through launch and into the first 90 days.
|
||||||
|
> **Version**: 1.0 · **Adopted**: 2026-05-01
|
||||||
|
|
||||||
|
This document is the **single answer** to "what now?". Every line
|
||||||
|
item has an owner, a time estimate, a blocker, a cost, and the
|
||||||
|
external dependency that makes it un-shippable today. Items are
|
||||||
|
ordered by **must-finish-before-the-next-item** — work top-down.
|
||||||
|
|
||||||
|
Cross-references:
|
||||||
|
- Strategy: `PLAN.md` (the 8 strategic moves + the 90-day sequence)
|
||||||
|
- Demo specs: `DEMO-PLAN.md`
|
||||||
|
- Deployment mechanics: `DEPLOYMENT.md`
|
||||||
|
- Post-launch measurement: `POST-LAUNCH.md`
|
||||||
|
- Locked criteria: `DECISIONS.md` §1
|
||||||
|
|
||||||
|
Status legend:
|
||||||
|
- **🟢** Done — the asset exists in this repo
|
||||||
|
- **🟡** Buildable now — no external dependency needed
|
||||||
|
- **🟠** External dependency — needs an account / signup / payment
|
||||||
|
- **🔴** Manual / requires user input that can't be automated
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 0 · What's already done (skip ahead)
|
||||||
|
|
||||||
|
| ✓ | Item | Where it lives |
|
||||||
|
|---|------|----------------|
|
||||||
|
| 🟢 | 6 of 9 tools shipped (Dedup, Text, Format, Missing, Column-Map, Pipeline) | `src/core/`, `src/cli_*.py`, `src/gui/pages/` |
|
||||||
|
| 🟢 | Automated Workflows (the retention multiplier per `PLAN.md` §2.6) | `src/core/pipeline.py`, `src/cli_pipeline.py`, `src/gui/pages/9_Pipeline_Runner.py` |
|
||||||
|
| 🟢 | 1,729 passing tests · 0 skipped · 0 xfailed | `tests/` |
|
||||||
|
| 🟢 | 3 niche demo datasets + pre-tuned pipeline JSONs | `samples/demo/` |
|
||||||
|
| 🟢 | Streamlit demo app + Cloud entry shim | `streamlit_app.py`, `src/gui/app_demo.py` |
|
||||||
|
| 🟢 | 3 niche landing pages + apex chooser + shared CSS | `landing/` |
|
||||||
|
| 🟢 | Landing-page deploy script (URL-substitution + sitemap + 404 + favicon) | `landing/deploy.py` |
|
||||||
|
| 🟢 | Strategic plan + demo plan + post-launch measurement plan + deployment doc | `docs/PLAN.md`, `DEMO-PLAN.md`, `POST-LAUNCH.md`, `DEPLOYMENT.md` |
|
||||||
|
| 🟢 | PyInstaller bundle scaffold (spec + launcher + Streamlit hook + README) | `build/` |
|
||||||
|
| 🟢 | Customer-facing copy single-source-of-truth (landing + demo + email subjects + Gumroad listing) | `marketing/COPY.md` |
|
||||||
|
| 🟢 | 9 niche-community post drafts (3 posts × 3 niches: bookkeeper, revops, shopify-pet) | `marketing/community-posts/` |
|
||||||
|
| 🟢 | 18 email drafts (Gumroad delivery + 5-touch onboarding × 3 niches) | `marketing/emails/` |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 1 · Stand the funnel up (target: end of week 1, ~6 hours total work)
|
||||||
|
|
||||||
|
The bottleneck right now is **distribution, not feature count**.
|
||||||
|
Everything in this phase is about turning code into a URL the user
|
||||||
|
can hit.
|
||||||
|
|
||||||
|
### 1.1 — 🟠 Push to GitHub (5 min)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | `git init` (if not already), commit, push to a private or public GitHub repo. |
|
||||||
|
| **Why** | Cloud deploy services need a Git source. Streamlit Community Cloud auto-deploys on push to `main`. |
|
||||||
|
| **External dependency** | A GitHub account (free). |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | Nothing. |
|
||||||
|
|
||||||
|
### 1.2 — 🟠 Deploy the demo to Streamlit Community Cloud (15 min)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Follow `DEPLOYMENT.md` Part 1. Result: a public URL like `https://datatools-demo.streamlit.app`. |
|
||||||
|
| **Why** | The landing pages embed this in their iframe. Without it, every "Run pipeline" button on the landing pages 404s. |
|
||||||
|
| **External dependency** | Free Streamlit Community Cloud account, signed in via GitHub. |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | 1.1 (the repo must be on GitHub). |
|
||||||
|
| **Watch out for** | First build takes 2–3 min while Cloud installs deps. Subsequent deploys < 30 s. |
|
||||||
|
|
||||||
|
### 1.3 — 🟠 Buy the apex domain (5 min, ~$15/year)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Register `datatools.app` (or whichever) at any registrar. Point the nameservers at Cloudflare. |
|
||||||
|
| **Why** | The landing-page canonical URLs and CTA buttons refer to this domain. Pages can deploy to a free `*.pages.dev` URL first if you want to defer this. |
|
||||||
|
| **External dependency** | A registrar account; payment method. |
|
||||||
|
| **Cost** | ~$15/year. Within `BUSINESS.md` §9 cost cap. |
|
||||||
|
| **Blocked by** | Nothing — can run in parallel with 1.1 / 1.2. |
|
||||||
|
|
||||||
|
### 1.4 — 🟠 Deploy the landing pages to Cloudflare Pages (15 min)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Follow `DEPLOYMENT.md` Part 2. Run `python3 landing/deploy.py` with the operator's URLs in `deploy.config.json`, then `wrangler pages deploy landing/dist` (or drag-drop). |
|
||||||
|
| **Why** | This is the marketing surface. Three persona URLs go live as soon as it deploys. |
|
||||||
|
| **External dependency** | Free Cloudflare account; Wrangler CLI (optional — drag-drop works too). |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | 1.2 (the demo URL goes into `deploy.config.json`); ideally 1.3 for the custom domain. |
|
||||||
|
| **Watch out for** | The `deploy.config.json` file is gitignored — your real URLs never get committed. |
|
||||||
|
|
||||||
|
### 1.5 — 🟠 Open a Gumroad listing (15 min) **— stub for now**
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Create a Gumroad account, draft a listing with a single screenshot + the landing-page copy, set price to $49. Don't enable purchases yet — leave it as a draft. |
|
||||||
|
| **Why** | The CTA buttons on the landing pages link to `gumroad.com/l/datatools?from=<persona>`. Until the listing exists, those buttons 404. |
|
||||||
|
| **External dependency** | Free Gumroad account; Stripe-connected payout method (defer to Phase 2). |
|
||||||
|
| **Cost** | $0 to draft, ~10% per sale once live. |
|
||||||
|
| **Blocked by** | Nothing — can run in parallel with 1.1–1.4. |
|
||||||
|
| **Watch out for** | The listing URL must be `gumroad.com/l/datatools` to match the landing-page hard-coded CTAs. If you pick a different slug, update `landing/deploy.config.json` → `gumroad_listing` and re-run `deploy.py`. |
|
||||||
|
|
||||||
|
### 1.6 — 🟡 End-to-end smoke verification (10 min)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Run the four `curl` commands from `DEPLOYMENT.md` Part 4. All four landing pages, all three demo personas, sitemap.xml. |
|
||||||
|
| **Why** | First time something can break is the moment a real user hits it. Ten minutes of `curl` saves a week of "why is conversion zero." |
|
||||||
|
| **External dependency** | None. |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | 1.4 + 1.2. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 2 · Make it sellable (target: end of week 2)
|
||||||
|
|
||||||
|
### 2.1 — 🟠 Apple Developer Program enrollment (5 min to start, 1–2 weeks lead)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Per `BUSINESS.md` §10. Required for code-signing the macOS installer. |
|
||||||
|
| **External dependency** | Apple ID + government-issued ID (individual) or D-U-N-S number (org). |
|
||||||
|
| **Cost** | $99/year. |
|
||||||
|
| **Blocked by** | Nothing — start ASAP because of the 1–2 week approval window. Only the signed macOS build (2.3, and therefore 2.5) waits on this; nothing else does. |
|
||||||
|
|
||||||
|
### 2.2 — 🟢 PyInstaller spec + cross-platform build *(scaffold shipped — runs need per-OS hosts)*
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | `build/datatools.spec` + `build/launcher.py` + `build/hooks/hook-streamlit.py` bundle the Streamlit GUI + all 6 tools + samples into one app. Folder-mode (one-dir) by default; Mac `.dmg`, Windows `.exe`, Linux `.tar.gz`. Per-platform recipe in `build/README.md`. |
|
||||||
|
| **Why** | The buyer's deliverable. Without this, there is nothing to attach to the Gumroad listing. |
|
||||||
|
| **External dependency** | `pip install pyinstaller`. None for Linux/Mac builds. Windows builds need a Windows machine or a CI matrix runner. |
|
||||||
|
| **Cost** | $0 (GitHub Actions matrix runners are free for public repos). |
|
||||||
|
| **Blocked by** | Nothing for the spec; 2.1 for the signed Mac build. |
|
||||||
|
| **Watch out for** | Streamlit's bundle size lands around 300–500 MB per `DECISIONS.md` §4c — accepted tradeoff. PyInstaller cross-compilation isn't supported — Mac builds need a Mac, Windows builds need a Windows host. |
|
||||||
|
| **Where it lives** | `build/datatools.spec`, `build/launcher.py`, `build/hooks/hook-streamlit.py`, `build/README.md` |
|
||||||
|
|
||||||
|
### 2.3 — 🟡 macOS sign + notarize (30 min once Apple Dev is approved)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Sign the `.dmg`, submit to Apple's notarization service, staple the ticket. |
|
||||||
|
| **Why** | Without it, Gatekeeper hard-blocks the install with no obvious way out (per `BUSINESS.md` §10). The buyer gives up. |
|
||||||
|
| **External dependency** | Apple Developer Program (2.1). |
|
||||||
|
| **Cost** | $0 incremental over 2.1. |
|
||||||
|
| **Blocked by** | 2.1 + 2.2. |
|
||||||
|
|
||||||
|
### 2.4 — 🟢 Refund policy + license + Gumroad listing copy *(drafted in COPY.md)*
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | A clear refund policy (30-day no-questions) + a software licence text + the Gumroad listing description. |
|
||||||
|
| **Why** | Required by Gumroad's terms; surfaces on the listing page; protects against buyer disputes. |
|
||||||
|
| **External dependency** | None — operator authoring. |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | Nothing. |
|
||||||
|
| **Where it lives** | `marketing/COPY.md` § 5 (Gumroad listing — full title / tagline / description / bullets / refund text / tags). Refund window is also referenced in COPY.md § 0 so it stays consistent across surfaces. |
|
||||||
|
| **Still to author** | A short licence text (one-time perpetual use, no redistribution) — not in COPY.md yet. Recommend Polyform Strict 1.0.0 or a 10-line bespoke text. |
|
||||||
|
|
||||||
|
### 2.5 — 🟠 Activate the Gumroad listing (15 min)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Upload the cross-platform installers from 2.2/2.3, paste the copy from 2.4, set $49 price, enable purchases, configure Stripe payout. |
|
||||||
|
| **Why** | This is the "buy" button finally working. |
|
||||||
|
| **External dependency** | Gumroad + Stripe account; the installers from 2.2/2.3. |
|
||||||
|
| **Cost** | ~10 % per sale. |
|
||||||
|
| **Blocked by** | 2.2, 2.3, 2.4. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 3 · First-traffic ignition (target: end of week 4)
|
||||||
|
|
||||||
|
Per `PLAN.md` §3 and `BUSINESS.md` §7 channel priorities. The strict
|
||||||
|
no-touch constraint of `DECISIONS.md` §1 #8 makes channel choice
|
||||||
|
matter — these are the only ones that fit.
|
||||||
|
|
||||||
|
### 3.1 — 🟢 First niche-community post *(9 drafts ready — pick one and personalize)*
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | One value-first post in one niche-relevant community (e.g. r/Bookkeeping, r/revops, r/shopify; IndieHackers; niche Slacks/Discords). Lead with the demo URL, not the buy URL. |
|
||||||
|
| **Why** | Marketplaces alone don't drive discovery. Communities are the only first-touch channel that works under no-touch. |
|
||||||
|
| **External dependency** | Account in the chosen community; understand its self-promotion rules. |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | 1.4, which itself requires 1.2 (the demo URL must work). |
|
||||||
|
| **Hint** | Pick the niche the operator knows best. Don't post all three drafts in the same community in the same week — see `marketing/community-posts/README.md` for cadence guidance. |
|
||||||
|
| **Where it lives** | `marketing/community-posts/{bookkeeper,revops,shopify-pet}/0{1-story,2-tip,3-soft-offer}.md` — 3 posts × 3 niches = 9 drafts. |
|
||||||
|
|
||||||
|
### 3.2 — 🟡 First long-tail SEO blog post (4–6 hours)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | One 800–1,500-word post on `datatools.app/blog/` (sub-route of Cloudflare Pages or Substack) targeting one niche keyword from `BUSINESS.md` §7. Topic: a real problem you've encountered, the cleanup steps, the demo URL at the end. |
|
||||||
|
| **Why** | Compounding asset — `BUSINESS.md` §2 says SEO pays in 6–18 months, not week 1. Don't mistake it for an early-stage channel. |
|
||||||
|
| **External dependency** | None. |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | Nothing. |
|
||||||
|
|
||||||
|
### 3.3 — 🟡 Cloudflare Web Analytics + event counters (45 min)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Enable Cloudflare Web Analytics on the Pages project (one click). Add a tiny inline `<script>` to each landing page that fires `cta_clicked` when the buy button is hit, before redirecting. Per `POST-LAUNCH.md` §1. |
|
||||||
|
| **Why** | Without this, the post-launch checklist is unrunnable. |
|
||||||
|
| **External dependency** | Cloudflare account (already from 1.4). |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | 1.4. |
|
||||||
|
| **Hint** | The Gumroad webhook captures `?from=<persona>` automatically — no extra wiring. |
|
||||||
|
|
||||||
|
### 3.4 — 🟢 Email autoresponder *(18 drafts ready — paste into provider)*
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Gumroad's built-in delivery email plus a **5-touch** onboarding sequence (Day 1, 3, 7, 14, 30) per niche. Per-niche segmentation via Gumroad's "What do you do?" custom field at checkout. |
|
||||||
|
| **Why** | Increases activation, reduces refund risk, surfaces support questions while volume is small. The Day-1 email in particular drives buyers from "I bought it" to "I ran it" — buyers who don't open within 72h refund at ~3× the rate of buyers who do. |
|
||||||
|
| **External dependency** | Gumroad delivery is built-in. The 5-touch sequence needs an email service that supports tag-based drips (Buttondown is the cheapest fit; ConvertKit if you want an HTML editor; Resend if you'll script it). |
|
||||||
|
| **Cost** | $0–$30/month per `BUSINESS.md` §9. |
|
||||||
|
| **Blocked by** | 2.5. |
|
||||||
|
| **Where it lives** | `marketing/emails/{bookkeeper,revops,shopify-pet}/{00-delivery,01-day1,02-day3,03-day7,04-day14,05-day30}.md` — 6 emails × 3 niches = 18 drafts. Variables (`{{first_name}}`, `{{download_url}}`, `{{sample_file_url}}`, `{{landing_page}}`) are listed in `marketing/emails/README.md`. |
|
||||||
|
| **Sequence policy** | Pause if buyer replies (until you reply); kill on refund request; skip Day 14 + 30 if buyer has already engaged via support. See `marketing/emails/README.md` for full quiet rules. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 4 · First-buyer trigger and review
|
||||||
|
|
||||||
|
Per `PLAN.md` §4 decision triggers and `POST-LAUNCH.md` §4.
|
||||||
|
|
||||||
|
### 4.1 — 🟢 Run the monthly review (30 min, first Monday after launch)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Follow `POST-LAUNCH.md` §2 — pull last-30-days demo events + Gumroad sales + refunds, compute the five numbers, decide ONE change. |
|
||||||
|
| **Why** | Without this discipline, the funnel drifts and the operator changes 5 things at once and learns nothing. |
|
||||||
|
| **External dependency** | None — analytics from 3.3, sales from 2.5. |
|
||||||
|
| **Cost** | $0. |
|
||||||
|
| **Blocked by** | 3.3 + 2.5. |
|
||||||
|
|
||||||
|
### 4.2 — 🟢 First paying customer (target: 90 days)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | The actual first sale. |
|
||||||
|
| **Why** | Per `BUSINESS.md` §6: validates the funnel; not the business. |
|
||||||
|
| **Trigger action** | Continue, no plan change. Next target: reach the first $1k/month by month 6. |
|
||||||
|
|
||||||
|
### 4.3 — 🔴 Zero-paid-in-90-days fallback (only fires if 4.2 doesn't)
|
||||||
|
|
||||||
|
| | |
|
||||||
|
|---|---|
|
||||||
|
| **What** | Per `POST-LAUNCH.md` §4 — audit the funnel, not the features. Run a 1-week outbound experiment to 30 niche contacts as a control (per `BUSINESS.md` §8 the no-touch revisit is allowed below $5k MRR if it produces signal). |
|
||||||
|
| **Why** | Distinguishes "no reach" from "no conversion" — they need different fixes. |
|
||||||
|
| **External dependency** | Operator's time. |
|
||||||
|
| **Cost** | The 10 hr/wk allocation already exists; this displaces other work. |
|
||||||
|
| **Blocked by** | The 90-day calendar trigger from 4.2. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Phase 5 · Steady state — what NOT to build
|
||||||
|
|
||||||
|
Per `PLAN.md` §5 (anti-temptations) and `DECISIONS.md` §8 (re-lock
|
||||||
|
triggers). The trap is treating "more code" as the answer when the
|
||||||
|
data says "more reach" or "more conversion." The six forbidden
|
||||||
|
moves until $5k/mo MRR:
|
||||||
|
|
||||||
|
| | Why locked |
|
||||||
|
|---|---|
|
||||||
|
| ❌ More tools (06–08) | `PLAN.md` §2.1 distribution-gate. Tool 09 was the exception; no others until first paid customer + one external review. |
|
||||||
|
| ❌ Tool #10 PDF → CSV (the most-asked-for adjacency) | Parked in `docs/FUTURE-TOOLS.md` with full design + 3–4 wk MVP / 6–10 wk polished estimate. Ship trigger: paying customer + ≥3 paid or ≥5 demo emails asking for PDF + the bookkeeper niche converting first. None have fired yet. |
|
||||||
|
| ❌ SaaS pivot | `DECISIONS.md` §4 — recurring infra conflicts with the lifestyle constraint. |
|
||||||
|
| ❌ Live chat / sales calls | `DECISIONS.md` §1 #8 — no-touch is locked until $5k/mo. |
|
||||||
|
| ❌ Custom integrations / one-off consulting | Breaks "build once, sell many." |
|
||||||
|
| ❌ Going broad on personas | `PLAN.md` §5 — "all small businesses" converts at 1 %; vertical converts at 5–15 %. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Triage table — what blocks what
|
||||||
|
|
||||||
|
```
|
||||||
|
Phase 1 (week 1) Phase 2 (week 2) Phase 3 (week 4)
|
||||||
|
┌──────────────┐ ┌──────────────┐ ┌──────────────┐
|
||||||
|
│ 1.1 Push GH │──────────┐ │ 2.1 Apple │ ───┐ │ 3.1 Community│
|
||||||
|
│ 1.2 Demo │──┐ ├──▶│ Dev (1-2w) │ │ │ 3.2 SEO post │
|
||||||
|
│ 1.3 Domain │ │ │ │ 2.2 Build │ ───┤ │ 3.3 Analytics│
|
||||||
|
│ 1.4 Pages │◀─┘ │ │ 2.3 Sign │ ───┤ │ 3.4 Emails │
|
||||||
|
│ 1.5 Gumroad │──────────┘ │ 2.4 Copy │ │ └──────────────┘
|
||||||
|
│ 1.6 Verify │ │ 2.5 Activate │ ◀──┘
|
||||||
|
└──────────────┘ └──────────────┘ ↓
|
||||||
|
┌──────────────┐
|
||||||
|
│ 4.1 Monthly │
|
||||||
|
│ 4.2 First $ │
|
||||||
|
│ 4.3 Fallback │
|
||||||
|
└──────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
The longest blocking path is **2.1 Apple Developer Program**
|
||||||
|
(1–2 weeks). Start it on day 1 of week 1 — it unblocks everything in
|
||||||
|
Phase 2 and you can do all of Phase 1 while waiting.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Time estimate — total operator time
|
||||||
|
|
||||||
|
| Phase | Hours | Wall-clock |
|
||||||
|
|---|---|---|
|
||||||
|
| Phase 1 | ~1 hour | end of week 1 (mostly waiting for builds) |
|
||||||
|
| Phase 2 | ~1 day | end of week 2 (gated by Apple Dev approval) |
|
||||||
|
| Phase 3 | ~6 hours | week 3–4 |
|
||||||
|
| Phase 4 | 30 min/month | ongoing |
|
||||||
|
| **Total to launch** | **~12 hours of operator time** | **~14 days wall-clock** |
|
||||||
|
|
||||||
|
Well inside the 10 hr/wk constraint of `DECISIONS.md` §1 #2.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## The thing that decides whether the plan works
|
||||||
|
|
||||||
|
Not the build. Not the deploy. Not even the first sale.
|
||||||
|
|
||||||
|
**The discipline of running the monthly review** in Phase 4 — and the
|
||||||
|
"decide ONE thing per month" rule from `POST-LAUNCH.md` §2 — is what
|
||||||
|
separates "this product exists" from "this product compounds." Every
|
||||||
|
feature added before the funnel is measured is a guess; every change
|
||||||
|
made after the monthly review is informed.
|
||||||
|
|
||||||
|
Don't skip 4.1.
|
||||||
228
docs/PLAN.md
Normal file
228
docs/PLAN.md
Normal file
@@ -0,0 +1,228 @@
|
|||||||
|
# Strategic Plan — DataTools
|
||||||
|
|
||||||
|
> Creator-only. Locks the "what next" in light of the locked criteria
|
||||||
|
> (DECISIONS.md §1) and the v1.6 honest status (BUSINESS.md §13).
|
||||||
|
> **Version**: 1.0 · **Adopted**: 2026-05-01 · **Owner**: Michael
|
||||||
|
|
||||||
|
This document is the active plan, derived from the strategic review of
|
||||||
|
2026-05-01. It compresses the eight strategic moves and a 90-day
|
||||||
|
execution sequence onto one page so the next decision (build vs.
|
||||||
|
ship vs. market) has a single reference.
|
||||||
|
|
||||||
|
It is **not** a re-lock of operating criteria — those still live in
|
||||||
|
DECISIONS.md and have not changed. This plan is downstream of those
|
||||||
|
criteria; if a move below conflicts with DECISIONS.md §1, the criteria
|
||||||
|
win.
|
||||||
|
|
||||||
|
## 1. Frame
|
||||||
|
|
||||||
|
**Locked context** (BUSINESS.md, DECISIONS.md):
|
||||||
|
|
||||||
|
- Niche Python automation tools, $49–79 single / $149 suite.
|
||||||
|
- Cash budget ≤ $1,200/mo recurring · Time ≤ 10 hr/wk · No external funding.
|
||||||
|
- Async + no-touch sales (revisit at $5k/mo MRR).
|
||||||
|
- Marketplace-first distribution (Gumroad / Lemon Squeezy).
|
||||||
|
- Streamlit GUI + CLI dual interface, runs locally.
|
||||||
|
- Lifestyle cashflow goal (no exit needed).
|
||||||
|
|
||||||
|
**Honest current state** (2026-05-01):
|
||||||
|
|
||||||
|
| Asset | State |
|
||||||
|
|---|---|
|
||||||
|
| Tools 1–5 (Find Duplicates, Clean Text, Standardize Formats, Fix Missing Values, Map Columns) | Ready · 1,691 tests passing · 0 xfailed |
|
||||||
|
| Tools 6–9 (Find Unusual Values, Combine Files, Quality Check, Automated Workflows) | Coming Soon |
|
||||||
|
| PyInstaller installer pipeline | Not started |
|
||||||
|
| macOS code signing (Apple Dev Program) | Not started |
|
||||||
|
| Hosted browser demo (Streamlit Cloud) | Not deployed |
|
||||||
|
| Landing page | Not live |
|
||||||
|
| Marketplace listing (Gumroad) | Not listed |
|
||||||
|
| Paying customers | 0 |
|
||||||
|
|
||||||
|
**Diagnosis**: the bottleneck is not feature count — it's distribution.
|
||||||
|
The next $1 of value comes from closing the gap between "code-complete"
|
||||||
|
and "buyer-pulls-out-card", not from tool 6.
|
||||||
|
|
||||||
|
## 2. The eight strategic moves
|
||||||
|
|
||||||
|
Numbered moves. Each is consistent with locked criteria.
|
||||||
|
|
||||||
|
### 2.1 Freeze new-tool development (one exception). Ship what exists.
|
||||||
|
|
||||||
|
Tools 6–8 are blocked behind a **distribution gate**: no work on them
|
||||||
|
until the existing 5 tools have a paying customer + one external review
|
||||||
|
(BUSINESS.md §4 sequence rule, applied recursively inside the bundle).
|
||||||
|
|
||||||
|
**Exception granted 2026-05-01**: Tool 09 Automated Workflows is built
|
||||||
|
*now*. Rationale: the pipeline transforms the bundle from "5 tools you
|
||||||
|
buy" into "an automatable workflow you depend on." That conversion is
|
||||||
|
what produces retention and word-of-mouth — the only marketing channel
|
||||||
|
that scales under the no-network/no-touch constraint.
|
||||||
|
|
||||||
|
**Parked behind the freeze**: post-launch tool ideas are captured in
|
||||||
|
`docs/FUTURE-TOOLS.md` with feasibility, GUI sketch, effort estimate,
|
||||||
|
and ship criteria for each. Currently parked: **#10 PDF → CSV
|
||||||
|
extractor** (bank statements et al.) — gated on a first paying customer +
|
||||||
|
≥3 paying customers or ≥5 demo emails explicitly asking for PDF
|
||||||
|
extraction, with the bookkeeper niche converting at least one customer
|
||||||
|
first. None of those triggers have fired yet.
|
||||||
|
|
||||||
|
### 2.2 The demo *is* the product. Make it embarrassingly good.
|
||||||
|
|
||||||
|
- Three persona-tagged sample datasets, not one generic CSV: Shopify
|
||||||
|
customers / bookkeeper bank export / agency lead list.
|
||||||
|
- Run the *full pipeline* on the sample (Review → Dedup → Text Clean →
|
||||||
|
Format → Missing → Column Map). Free version caps **output rows**,
|
||||||
|
not the experience.
|
||||||
|
- Embed the demo as an **iframe on the landing page** (not "click to
|
||||||
|
open"). Friction kills conversion.
|
||||||
|
- Persistent CTA after demo: *"Run this on your own 50 k-row file →
|
||||||
|
buy for $49 →"* directly above the Gumroad button.
|
||||||
|
|
||||||
|
### 2.3 Niche down. Stop selling "data cleaning."
|
||||||
|
|
||||||
|
One engine, three landing pages:
|
||||||
|
|
||||||
|
| Persona | Landing-page lead | Demo dataset |
|
||||||
|
|---|---|---|
|
||||||
|
| Shopify operator (priority: pet supplies) | "Clean your customer / vendor / subscriber exports" | uc01_shopify_customer_list |
|
||||||
|
| Bookkeeper / freelance accountant | "Reconcile bank exports + vendor lists. Auditable changes." | uc06_bank_export_overlap |
|
||||||
|
| Marketing / RevOps agency | "Dedupe lead lists. Standardize phones across vendors." | uc13_combined_lead_sources |
|
||||||
|
|
||||||
|
Generic copy competes with `pip install pandas`. Vertical copy
|
||||||
|
competes with nothing.
|
||||||
|
|
||||||
|
### 2.3a Top pain points per niche
|
||||||
|
|
||||||
|
The "what does this actually fix?" question. Each pain point below is
|
||||||
|
sourced from operator-domain knowledge of these markets and the
|
||||||
|
buyer-use-case research already captured in `BUSINESS.md §4a`. Pain
|
||||||
|
points are ranked by **frequency × dollar impact** for that persona —
|
||||||
|
high-frequency / high-cost pains lead the landing-page copy and the
|
||||||
|
demo dataset.
|
||||||
|
|
||||||
|
> **Validation gap (honest disclaimer)**: these pains are derived from
|
||||||
|
> operator knowledge of the categories, not from a sample of buyer
|
||||||
|
> interviews. Per `BUSINESS.md §8` (no-touch constraint review at $5k/mo
|
||||||
|
> MRR), validate the top-3 per persona via 5 buyer interviews before the
|
||||||
|
> first $200 of paid acquisition spend. If any pain ranks below the
|
||||||
|
> assumed level, swap it for the next-highest in this list.
|
||||||
|
|
||||||
|
#### Shopify operator (priority: pet supplies)
|
||||||
|
|
||||||
|
| # | Pain | $ / time impact | Tools that fix it |
|
||||||
|
|---|------|-----------------|---|
|
||||||
|
| S1 | **Klaviyo / Mailchimp / Omnisend per-contact billing.** Subscriber list with 10–18 % duplicate rate (case drift, plus signs in Gmail addresses, multiple devices) → recurring overpay forever. | $30–300/mo per percent of dupes on a 50 k list — recurring | Dedup + Format Standardize (email canonicalization) + Pipeline (re-run weekly) |
|
||||||
|
| S2 | **Product feed rejected by Google Merchant Center / Meta Catalog.** Smart quotes in titles, NBSP in SKU, inconsistent attributes; campaign launch delayed 24–72 h while feed gets fixed. | 1–3 days delayed launch × campaign value | Clean Text + Standardize Formats |
|
||||||
|
| S3 | **Multi-channel order consolidation.** Shopify + Etsy + Amazon + Faire + wholesale spreadsheet, each with a different column for "customer email" / "order total" / "ship country". | 4–8 hr / month manually merging | Map Columns + Find Duplicates + Automated Workflows |
|
||||||
|
| S4 | **Subscription identity fragmentation.** Pet-box subscribers cancel and re-sub under a different email; cohort analysis says churn is 20 % when it's actually 12 % — pricing decisions wrong. | Mis-priced LTV → over- or under-paid acquisition | Dedup with `merge=true` survivor |
|
||||||
|
| S5 | **International tax / VAT MOSS compliance.** Country column is `UK` / `U.K.` / `United Kingdom` / `GB` in the same export; VAT report breaks. Phone formats per region break call-center routing. | Compliance penalty risk + ops friction | Standardize Formats (per-row country) + Map Columns |
|
||||||
|
|
||||||
|
#### Bookkeeper / freelance accountant
|
||||||
|
|
||||||
|
| # | Pain | $ / time impact | Tools that fix it |
|
||||||
|
|---|------|-----------------|---|
|
||||||
|
| B1 | **Bank-export month-overlap re-import.** Same transaction posts twice when Jan and Feb exports overlap at the boundary; client's books understate cash by 1–4 %. | 2–4 hr / month / client + reconciliation errors | Dedup with explicit Date+Amount+fuzzy Vendor strategy |
|
||||||
|
| B2 | **QBO / Xero vendor consolidation for 1099 reports.** "Amazon" / "amazon.com" / "AMAZON.COM*4F2X9" become 3 vendors; 1099 reports break, P&L by vendor unusable. | 1–2 hr / 1099 cycle + IRS-paper-trail risk | Format Standardize (name canonicalization) + Dedup |
|
||||||
|
| B3 | **Liability / professional indemnity.** Cannot use AI tools that don't show their work; client audit response window is 24–48 h. | Per-firm liability premium ≈ $500–2,500 / yr | Audit log built into every tool — every change row-logged |
|
||||||
|
| B4 | **Per-license-not-per-client economics.** Most cleanup tools are per-seat / per-client SaaS; bookkeepers managing 10–30 clients hit price walls fast. | $30/mo × N clients vs. $49 once | Desktop license, no per-client constraint |
|
||||||
|
| B5 | **Multi-currency books.** US-domiciled clients with EU customers; comma-decimal amounts (`€1.234,56`) crash standard parsers; parens-negative (`($89.50)`) treated as positive. | 30–60 min per multi-currency client per month | Format Standardize (`currency_decimal=auto`, parens-negative) |
|
||||||
|
|
||||||
|
#### Marketing / RevOps agency
|
||||||
|
|
||||||
|
| # | Pain | $ / time impact | Tools that fix it |
|
||||||
|
|---|------|-----------------|---|
|
||||||
|
| R1 | **HubSpot / Marketo / Iterable per-contact tier pricing.** 10 k contacts → enterprise tier at $4–8 k/mo. Every duplicate is a recurring tax. | $200–800 / month per 1 k duplicate contacts — recurring | Dedup with cross-source merge + Pipeline |
|
||||||
|
| R2 | **Email-deliverability / sender reputation.** Sending to invalid or duplicate addresses tanks reputation; recovery takes weeks. | Catastrophic — entire email programme degraded | Format Standardize (email canonicalization) + Missing (sentinel detection) |
|
||||||
|
| R3 | **GDPR / contact-data privacy.** Uploading lead data to a third-party cleaning SaaS is itself a GDPR concern; legal review blocks adoption. | Compliance risk + 4–8 wk legal-review delay | Local-only desktop app, zero outbound calls |
|
||||||
|
| R4 | **Multi-vendor lead-source unification.** Apollo, ZoomInfo, LinkedIn Sales Nav, manual scrapes — each export has different headers, scoring, country format. | 1–3 days per campaign of manual unification | Map Columns (alias matching) + Standardize Formats (per-row country) + Find Duplicates |
|
||||||
|
| R5 | **Suppression-list management across 5+ platforms.** Each platform has its own format; un-deduped suppression lists let opt-outs slip through, triggering CAN-SPAM / GDPR exposure. | Compliance risk + churn-back cost | Pipeline saved as JSON, re-run on each new suppression batch |
|
||||||
|
|
||||||
|
### 2.4 Operationalize the moat the docs already name.
|
||||||
|
|
||||||
|
Three durable advantages, each promoted from buried feature to
|
||||||
|
landing-page H1:
|
||||||
|
|
||||||
|
- **Quality**: 1 GB international standardization in ~2.5 minutes,
|
||||||
|
locally. Excel can't do this; OpenRefine fights you for an hour.
|
||||||
|
- **Privacy**: "Your data never leaves this computer." Already in the
|
||||||
|
GUI footer — promote to landing-page lead, screenshot the empty
|
||||||
|
network tab.
|
||||||
|
- **Update cadence**: ship a v1.1 patch within 30 days of v1.0 launch.
|
||||||
|
Not features — *evidence* the product is alive. "Added Czech Republic
|
||||||
|
phone format support" beats "no updates in 6 months" every time.
|
||||||
|
|
||||||
|
### 2.5 Surface the audit-trail feature in sales copy.
|
||||||
|
|
||||||
|
Every tool has a structured audit log. Most cleaning tools do not.
|
||||||
|
Bookkeepers and consultants get fired if they can't show what changed
|
||||||
|
to a client. The audit feature is currently invisible on every
|
||||||
|
proposed landing page and should be the **second-largest callout** —
|
||||||
|
right after "runs locally."
|
||||||
|
|
||||||
|
Copy seed: *"Every change auditable. Hand the audit CSV to your client
|
||||||
|
with the cleaned file."*
|
||||||
|
|
||||||
|
### 2.6 Automated Workflows is the retention multiplier.
|
||||||
|
|
||||||
|
A buyer with a saved pipeline isn't a one-off purchaser — they're a
|
||||||
|
recurring user who recommends the product. This is exactly the
|
||||||
|
behavioural lever the no-touch constraint needs (DECISIONS.md §8
|
||||||
|
trigger). Build it now (see §2.1 exception).
|
||||||
|
|
||||||
|
### 2.7 Add a $199 "priority support" tier post-launch.
|
||||||
|
|
||||||
|
Same code, async-email SLA (24 h response). Targets the bookkeeper /
|
||||||
|
consultant persona whose own time is $300/hr. Zero new product work,
|
||||||
|
~3× ARPU on 5–10 % of buyers. Lock the SLA to **async only** so the
|
||||||
|
no-touch constraint isn't violated. Defer until $5 k/mo MRR (the same
|
||||||
|
trigger DECISIONS.md §8 already names).
|
||||||
|
|
||||||
|
### 2.8 Dependency-aware pipeline UX.
|
||||||
|
|
||||||
|
Tools have soft execution-order preferences (Text Clean before Format
|
||||||
|
Standardize, Format before Dedup, Missing before Dedup). Automated
|
||||||
|
Workflows *recommends* the order, *warns* on reversals, and **never
|
||||||
|
forces** — the user owns their workflow. Implementation: see
|
||||||
|
`src/core/pipeline.py` `SOFT_DEPENDENCIES`.
|
||||||
|
|
||||||
|
## 3. 90-day execution sequence
|
||||||
|
|
||||||
|
| Week | Action | Done when |
|
||||||
|
|---|---|---|
|
||||||
|
| 1 | PyInstaller pipeline · Mac/Win unsigned installers · Apple Dev Program enrollment (1–2 wk lead) | `dist/datatools-mac.dmg` and `dist/datatools-win.exe` install on a clean machine |
|
||||||
|
| 2 | Demo deployed to Streamlit Cloud · landing page v1 with embedded demo · 3 persona datasets in the demo | Public URL serves a working pipeline run on a sample dataset in < 30 s |
|
||||||
|
| 3 | Gumroad listing live · share value-first in 3 niche communities (no pitch) · 1 long-tail SEO post for the lead persona | First listing impression captured · post not removed for self-promotion |
|
||||||
|
| 4 | Automated Workflows v1.0 shipped (this week, 2026-05-01 — exception per §2.1) · v1.1 patch announced with Tool 09 + intl improvements | Pipeline saves/loads JSON · 3 demo pipelines preloaded |
|
||||||
|
| 5–8 | Bookkeeper landing page · agency landing page · second tool's promo cycle · priority-support tier added (defer purchase until §2.7 trigger) | Three live landing pages with distinct H1, demo dataset, conversion target |
|
||||||
|
| 9–13 | Tool 06–08 only **if** revenue trajectory supports continued investment · otherwise more market work on the existing 5 + 09 | Decision made on 13 Aug 2026 with revenue data, not feature ambition |
|
||||||
|
|
||||||
|
## 4. Decision triggers (re-evaluation prompts)
|
||||||
|
|
||||||
|
These flip the plan, not the underlying criteria:
|
||||||
|
|
||||||
|
| Trigger | Reaction |
|
||||||
|
|---|---|
|
||||||
|
| First paying customer in week 4–13 | Continue. Plan is working. |
|
||||||
|
| **Zero** paid in 90 days | Audit the funnel. Demo conversion? Niche fit? Price? Don't add features. |
|
||||||
|
| $5 k/mo MRR | DECISIONS.md §8 trigger fires: revisit async + priority-support tier. |
|
||||||
|
| Marketplace policy / shutdown | Switch to own-domain Stripe immediately; landing pages are already self-hosted. |
|
||||||
|
| Streamlit hard direction change | Low-probability re-lock per DECISIONS.md §8. Tk fallback is documented. |
|
||||||
|
|
||||||
|
## 5. Anti-temptations (things the plan refuses)
|
||||||
|
|
||||||
|
- **More tools before more buyers.** Locked. Exception only for Automated Workflows per §2.1.
|
||||||
|
- **SaaS pivot.** Recurring infra conflicts with the lifestyle constraint (DECISIONS.md §4).
|
||||||
|
- **Live chat / sales calls.** Conflicts with no-touch (DECISIONS.md §1 #8).
|
||||||
|
- **Custom integrations / one-off consulting.** $300/hr looks tempting; breaks the "build once, sell many" model that justifies the entire strategy.
|
||||||
|
- **Going broad on personas.** "All small businesses" is a generic landing page that converts at 1 %; "Shopify pet-supply operators with 1k–50k customers" converts at 5–15 % in the right communities.
|
||||||
|
|
||||||
|
## 6. What this plan deliberately leaves open
|
||||||
|
|
||||||
|
- Whether tools 06–08 ever ship. Decided on revenue, not roadmap.
|
||||||
|
- Whether to add a fourth niche landing page. Decided on which of the
|
||||||
|
three is producing.
|
||||||
|
- Whether to invest in own-domain SEO. Compounding 6–18 mo asset; not
|
||||||
|
the early-stage channel. Revisit when marketplace + community
|
||||||
|
produces baseline traffic to optimise.
|
||||||
|
- Whether to add a Notion / Slack support community. If support volume
|
||||||
|
per 100 sales > 10 (BUSINESS.md §12 target), revisit; else leave async-email only.
|
||||||
158
docs/POST-LAUNCH.md
Normal file
158
docs/POST-LAUNCH.md
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
# Post-launch — 90-day measurement plan
|
||||||
|
|
||||||
|
> Creator-only. The other half of `PLAN.md`: PLAN tells you what to
|
||||||
|
> build, this tells you what to measure once it's live and which
|
||||||
|
> numbers trigger which actions.
|
||||||
|
> **Version**: 1.0 · **Adopted**: 2026-05-01 · **Owner**: Michael
|
||||||
|
|
||||||
|
This is a runnable monthly checklist, not analytics theatre. Every
|
||||||
|
metric below has a **threshold** and an **action**. If you're not
|
||||||
|
willing to execute the action when the threshold trips, drop the
|
||||||
|
metric — measuring without responding is busywork.
|
||||||
|
|
||||||
|
## 1. The five numbers that matter
|
||||||
|
|
||||||
|
Every other dashboard, chart, or vanity stat is downstream of these
|
||||||
|
five. The funnel is short on purpose; pre-PMF traffic doesn't have
|
||||||
|
the resolution to support more.
|
||||||
|
|
||||||
|
| # | Metric | How to compute | Threshold | When tripped |
|
||||||
|
|---|--------|----------------|-----------|--------------|
|
||||||
|
| 1 | **Persona engagement** | `demo.run_completed / demo.page_view` per persona | < 30 % for 4 consecutive weeks | Demo isn't running or BEFORE preview isn't compelling. **Action:** check iframe loads; widen BEFORE preview to show pollution clearly; move demo above the fold. |
|
||||||
|
| 2 | **Demo→CTA intent** | `demo.cta_clicked / demo.run_completed` per persona | < 5 % for 4 consecutive weeks | Demo is impressive but the CTA isn't earning trust. **Action:** add network-tab privacy screenshot; soften the price callout; A-B test eyebrow copy on the CTA card. |
|
||||||
|
| 3 | **Purchase rate** | `gumroad.purchase / demo.cta_clicked` per persona | < 30 % for 4 consecutive weeks | Visitors click through but don't pull the card out. **Action:** check Gumroad listing renders cleanly; verify refund-policy copy; check that the screenshot on the listing matches the demo they just ran. |
|
||||||
|
| 4 | **Refund rate** | `gumroad.refunds / gumroad.purchase` rolling 30 days | > 5 % | Buyer expectation mismatch. **Action:** read every refund email; determine if it's a feature gap (build it), a positioning lie (rewrite), or a personal-fit miss (fine, ignore). |
|
||||||
|
| 5 | **Support load** | email tickets / 100 sales rolling 30 days | > 10 | The product isn't self-serve enough at this price. **Action:** find the top 3 questions; add to in-app onboarding + landing-page FAQ + the persona's saved pipeline. |
|
||||||
|
|
||||||
|
These five also map to BUSINESS.md §12 — that doc names the metrics;
|
||||||
|
this doc operationalises them.
|
||||||
|
|
||||||
|
## 2. Monthly review — 30-minute checklist
|
||||||
|
|
||||||
|
Block 30 minutes on the first Monday of every month for the first six
|
||||||
|
months. After month 6, if numbers are stable, drop to 15 minutes
|
||||||
|
quarterly.
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] Pull last 30 days of demo events from Cloudflare Web Analytics
|
||||||
|
[ ] Pull last 30 days of Gumroad sales + refunds export
|
||||||
|
[ ] Compute the five numbers in §1 per persona
|
||||||
|
[ ] Note which thresholds are tripped (if any)
|
||||||
|
[ ] Read every refund email since last review
|
||||||
|
[ ] Read every support email since last review
|
||||||
|
[ ] Decide ONE thing to change this month (only one)
|
||||||
|
[ ] Update CHANGELOG with what was changed and why
|
||||||
|
[ ] Schedule next review
|
||||||
|
```
|
||||||
|
|
||||||
|
The "decide ONE thing" rule is load-bearing. Pre-PMF traffic doesn't
|
||||||
|
have the volume to A/B-test multiple changes in parallel — you'll just
|
||||||
|
confuse yourself about what moved the number.
|
||||||
|
|
||||||
|
## 3. Per-persona scoreboard (template)
|
||||||
|
|
||||||
|
Maintain in a single text file or spreadsheet. The shape that fits in
|
||||||
|
a notebook page is the shape you'll actually update.
|
||||||
|
|
||||||
|
```
|
||||||
|
Month: 2026-06
|
||||||
|
─────────────────────────────────────────────────────────────────
|
||||||
|
Shopify Bookkeeper RevOps Total
|
||||||
|
Page views 420 180 290 890
|
||||||
|
Demo runs 137 59 82 278
|
||||||
|
CTA clicks 9 7 6 22
|
||||||
|
Purchases 3 2 2 7
|
||||||
|
|
||||||
|
Metric 1 (engage) 33% 33% 28% 31%
|
||||||
|
Metric 2 (intent) 7% 12% 7% 8%
|
||||||
|
Metric 3 (purchase) 33% 29% 33% 32%
|
||||||
|
Metric 4 (refund) 0% 0% 0% 0%
|
||||||
|
Metric 5 (support) 3 tickets / 100 sales
|
||||||
|
|
||||||
|
Tripped thresholds: RevOps engagement (28% < 30%); Bookkeeper purchase (29% < 30%)
|
||||||
|
|
||||||
|
This-month change: Move demo embed above the fold on revops
|
||||||
|
page; reduce hero text by 40%.
|
||||||
|
|
||||||
|
Last-month change: Added network-tab screenshot to all 3
|
||||||
|
pages. Result: intent +1.5 percentage
|
||||||
|
points on Shopify, flat elsewhere.
|
||||||
|
```
|
||||||
|
|
||||||
|
## 4. Stage-gate triggers from PLAN.md
|
||||||
|
|
||||||
|
Reproduced here so the gate criteria sit beside the metrics that
|
||||||
|
fire them:
|
||||||
|
|
||||||
|
| Trigger | From | Action |
|
||||||
|
|---|------|--------|
|
||||||
|
| **First paying customer** | PLAN §4 | Continue. Plan is working. |
|
||||||
|
| **Zero paid in 90 days** | PLAN §4 | Audit the funnel. Don't add features. Run a small (1-week) outbound experiment to 30 niche-community contacts as a control, even though it stretches the no-touch constraint, to determine whether the bottleneck is reach or conversion. |
|
||||||
|
| **$5 k/mo MRR** | DECISIONS §8 | Re-evaluate async constraint. Add priority-support tier (PLAN §2.7). |
|
||||||
|
| **$10 k/mo MRR** | DECISIONS §8 | Revisit time-budget allocation. Decide on tools 06–08 vs. additional bundles. |
|
||||||
|
| **Marketplace shutdown** | PLAN §4 / DECISIONS §8 | Switch landing-page CTA to own-domain Stripe Checkout. Pre-built; one-line edit. |
|
||||||
|
| **Streamlit hard direction change** | DECISIONS §8 | Low-probability re-lock. Tk fallback documented. |
|
||||||
|
| **Burnout signal** | DECISIONS §8 | Stop. Triage. The constraint matters more than the revenue ramp. |
|
||||||
|
|
||||||
|
## 5. What we deliberately do NOT measure
|
||||||
|
|
||||||
|
These look productive but predict nothing pre-PMF. Don't add them.
|
||||||
|
|
||||||
|
- **Bounce rate** — single-page sites have artificially high bounce. Useless signal.
|
||||||
|
- **Time on page** — landing pages are *supposed* to be quick reads. Long time on page often means confusion, not engagement.
|
||||||
|
- **Heatmaps / scroll-depth** — no statistical resolution at <500 monthly visitors. Add when you cross 5 k/month.
|
||||||
|
- **Email open rates** — under PLAN §2.7, priority support is the only email channel; opens aren't a buying signal.
|
||||||
|
- **Social mentions** — vanity. The signal that matters is "did they buy" or "did they come back."
|
||||||
|
|
||||||
|
## 6. What we measure once, then trust
|
||||||
|
|
||||||
|
Do these once, then let them run for 6+ months without re-measuring:
|
||||||
|
|
||||||
|
- **Demo correctness** — once per pipeline release, run all 3 demos
|
||||||
|
end-to-end via `tests/test_pipeline.py` and check the output looks
|
||||||
|
reasonable. The CI pipeline already does this; nothing to add.
|
||||||
|
- **Cross-platform install** — once per release, verify the
|
||||||
|
PyInstaller bundle launches on Mac / Windows / Linux. After three
|
||||||
|
green releases, trust the build pipeline; spot-check on major OS
|
||||||
|
updates only.
|
||||||
|
- **Privacy claim integrity** — once at launch, capture the network
|
||||||
|
tab while running the cleaner and host that screenshot at a stable
|
||||||
|
URL. Re-capture only when a new tool or dependency is added.
|
||||||
|
|
||||||
|
## 7. Per-persona attribution
|
||||||
|
|
||||||
|
The buy buttons on every persona landing page carry `?from=<persona>` query
|
||||||
|
parameters. Gumroad propagates that into the order metadata. Use it
|
||||||
|
to attribute purchases:
|
||||||
|
|
||||||
|
| persona key | landing page URL | Gumroad query | Source |
|
||||||
|
|---|---|---|---|
|
||||||
|
| `shopify-pet` | `/shopify-pet/` | `?from=shopify-pet` | Shopify operator |
|
||||||
|
| `bookkeeper` | `/bookkeeper/` | `?from=bookkeeper` | Bookkeeper / freelance accountant |
|
||||||
|
| `revops` | `/revops/` | `?from=revops` | Marketing / RevOps agency |
|
||||||
|
| `apex` | `/` | (no query — use `unknown` bucket) | Generic discovery |
|
||||||
|
|
||||||
|
When `unknown` exceeds 30 % of total, add UTM tagging to community
|
||||||
|
posts and SEO blog backlinks so you can break the bucket apart.
|
||||||
|
|
||||||
|
## 8. The four months that decide whether the plan works
|
||||||
|
|
||||||
|
Reading PLAN.md §3 + this doc together, the rough script:
|
||||||
|
|
||||||
|
| Month | What's running | What we expect to learn |
|
||||||
|
|---|---|---|
|
||||||
|
| **M1** (June) | Installers · demo · 3 landing pages · Gumroad live | Whether the funnel mechanically works. Numbers will be noisy; just look for one purchase. |
|
||||||
|
| **M2** (July) | M1 + community posts in 3 niches + 1 SEO post | Which persona converts. Re-allocate effort to the highest-converting niche. |
|
||||||
|
| **M3** (August) | M2 + landing-page changes from M2 review | Whether intent-rate moved on the change. Decide tools 06–08 go/no-go. |
|
||||||
|
| **M4** (September) | M3 + first repeat-buyer signals | Whether Automated Workflows is producing retention as designed. |
|
||||||
|
|
||||||
|
By end of M4, the data tells you whether the plan is producing
|
||||||
|
$1k–3k/mo (BUSINESS.md §6 6-month target) — extrapolated from the
|
||||||
|
trajectory, not the absolute number.
|
||||||
|
|
||||||
|
## 9. The hardest part of the plan to execute
|
||||||
|
|
||||||
|
Not the metrics. Not the build. **The "decide ONE thing per month"
|
||||||
|
rule** — operators with engineering backgrounds chronically pick
|
||||||
|
three changes per month and conclude nothing because their signal
|
||||||
|
is muddled. This doc says one. It means one.
|
||||||
35
docs/README.es.md
Normal file
35
docs/README.es.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
> 🌐 **Idioma:** Español · [English](README.md)
|
||||||
|
|
||||||
|
# Paquete Maestría en Limpieza de Datos Excel y CSV
|
||||||
|
|
||||||
|
9 herramientas de limpieza de datos en Python, cada una con CLI y GUI en el navegador. Solo local, sin internet. Windows / macOS / Linux.
|
||||||
|
|
||||||
|
## Inicio rápido
|
||||||
|
|
||||||
|
1. Descarga desde tu correo de compra. Dos formatos por sistema operativo — elige uno:
|
||||||
|
- **Instalador** (`.dmg` en macOS, `.exe` en Windows) — crea acceso directo en el escritorio + entrada en el menú Inicio / Launchpad.
|
||||||
|
- **.zip portable** — descomprime y haz doble clic. Sin instalación, sin permisos de administrador, se ejecuta desde cualquier lugar.
|
||||||
|
2. Ábrelo (no necesitas Python; todo viene incluido).
|
||||||
|
3. La app arranca un servidor local y abre tu navegador. Nada sale de tu equipo.
|
||||||
|
|
||||||
|
Paso a paso completo incluyendo SmartScreen / Gatekeeper: [USER-GUIDE.es.md §1](USER-GUIDE.es.md#1-instalaci%C3%B3n).
|
||||||
|
|
||||||
|
## Documentación
|
||||||
|
|
||||||
|
**Para usuarios** (se entrega con el producto, en español):
|
||||||
|
- [USER-GUIDE.es.md](USER-GUIDE.es.md) — instalación + guía por herramienta
|
||||||
|
- [CLI-REFERENCE.es.md](CLI-REFERENCE.es.md) — referencia de cada comando
|
||||||
|
|
||||||
|
**Para usuarios** (se entrega con el producto, en inglés):
|
||||||
|
- [USER-GUIDE.md](USER-GUIDE.md) · [CLI-REFERENCE.md](CLI-REFERENCE.md)
|
||||||
|
|
||||||
|
**Solo uso interno** (no se entrega; solo en inglés):
|
||||||
|
- [BUSINESS.md](BUSINESS.md) — mercado, precios, marketing
|
||||||
|
- [TECHNICAL.md](TECHNICAL.md) — arquitectura, pipeline de build, estándares
|
||||||
|
- [DECISIONS.md](DECISIONS.md) — criterios bloqueados, registro de decisiones
|
||||||
|
- [RECOVERY.md](RECOVERY.md) — guía de reconstrucción completa
|
||||||
|
- [REQUIREMENTS.md](REQUIREMENTS.md) — matriz numerada de soporte
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Versión**: 1.6 · **Actualizado**: 2026-05-13 · **Propietario**: Michael
|
||||||
@@ -1,38 +1,32 @@
|
|||||||
|
> 🌐 **Language:** English · [Español](README.es.md)
|
||||||
|
|
||||||
# Excel & CSV Data Cleaning Mastery Bundle
|
# Excel & CSV Data Cleaning Mastery Bundle
|
||||||
|
|
||||||
**Ready-to-sell Python automation product.**
|
9 Python data-cleaning tools, every one with a CLI and a browser GUI. Local-only, no internet. Windows / macOS / Linux.
|
||||||
9 scripts for data cleaning, deduplication, text hygiene, formatting, merging, validation, and reporting.
|
|
||||||
|
|
||||||
Each script ships with both a GUI (runs in your browser locally, no internet needed) and a CLI.
|
## Quick Start
|
||||||
|
|
||||||
Cross-platform: Windows, macOS, Linux.
|
1. Download from your purchase email. Two flavors per OS — pick one:
|
||||||
|
- **Installer** (`.dmg` on macOS, `.exe` on Windows) — wires up Desktop + Start Menu / Launchpad shortcuts.
|
||||||
|
- **Portable .zip** — unzip and double-click. No install, no admin rights, runs from anywhere.
|
||||||
|
2. Open it (no Python needed; everything is bundled inside).
|
||||||
|
3. The app starts a local server and opens your browser. Nothing leaves your machine.
|
||||||
|
|
||||||
|
Full step-by-step including SmartScreen / Gatekeeper workarounds: [USER-GUIDE.md §1](USER-GUIDE.md#1-install).
|
||||||
|
|
||||||
|
## Docs
|
||||||
|
|
||||||
|
**Buyer-facing** (ships with the product):
|
||||||
|
- **English**: [USER-GUIDE.md](USER-GUIDE.md) · [CLI-REFERENCE.md](CLI-REFERENCE.md)
|
||||||
|
- **Español**: [USER-GUIDE.es.md](USER-GUIDE.es.md) · [CLI-REFERENCE.es.md](CLI-REFERENCE.es.md)
|
||||||
|
|
||||||
|
**Creator-only** (do not ship):
|
||||||
|
- [BUSINESS.md](BUSINESS.md) — market, pricing, marketing
|
||||||
|
- [TECHNICAL.md](TECHNICAL.md) — architecture, build pipeline, standards
|
||||||
|
- [DECISIONS.md](DECISIONS.md) — locked criteria, decision log
|
||||||
|
- [RECOVERY.md](RECOVERY.md) — full rebuild guide
|
||||||
|
- [REQUIREMENTS.md](REQUIREMENTS.md) — numbered support matrix
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Quick Start (for buyers)
|
**Version**: 1.6 · **Updated**: 2026-05-01 · **Owner**: Michael
|
||||||
|
|
||||||
1. Download the installer for your operating system.
|
|
||||||
2. Run the installer. No Python knowledge required.
|
|
||||||
3. Launch via the desktop shortcut "Launch Bundle" (or the app icon on macOS, or the AppImage on Linux).
|
|
||||||
4. Your default browser opens to a local page where the data tool runs. Your data never leaves your computer.
|
|
||||||
|
|
||||||
Full instructions: see [USER-GUIDE.md](USER-GUIDE.md).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Documentation Index
|
|
||||||
|
|
||||||
### Ships with the product (buyer-facing)
|
|
||||||
- [USER-GUIDE.md](USER-GUIDE.md) - Installation, script reference, usage examples for both GUI and CLI.
|
|
||||||
|
|
||||||
### Creator-only (do not ship to buyers)
|
|
||||||
- [BUSINESS.md](BUSINESS.md) - Business case, market analysis, pricing, marketing strategy (including the hosted browser demo as a conversion lever).
|
|
||||||
- [TECHNICAL.md](TECHNICAL.md) - Architecture (dual CLI + Streamlit GUI), build pipeline, dev standards.
|
|
||||||
- [DECISIONS.md](DECISIONS.md) - Locked criteria, scoring rubric, decisions log, rationale for product choices including the GUI framework decision.
|
|
||||||
- [RECOVERY.md](RECOVERY.md) - How to rebuild the entire project from scratch if lost.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
**Version**: 1.6
|
|
||||||
**Last updated**: April 28, 2026
|
|
||||||
**Owner**: Michael
|
|
||||||
|
|||||||
251
docs/RECOVERY.md
251
docs/RECOVERY.md
@@ -1,180 +1,147 @@
|
|||||||
# RECOVERY.md - Full Project Recovery Guide
|
# Recovery
|
||||||
|
|
||||||
> **Creator-only document. Do not ship to buyers.**
|
> Creator-only. Full project rebuild guide.
|
||||||
|
> **Version**: 1.6 · **Updated**: 2026-05-01
|
||||||
|
|
||||||
**Version**: 1.6
|
If the project is ever lost, this doc + the source ZIP are enough to rebuild it 100%.
|
||||||
**Last updated**: April 28, 2026
|
|
||||||
|
|
||||||
If the project is ever lost, this guide plus the source ZIP is enough to rebuild it 100%.
|
## 1. Project layout
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. What's in the Project
|
|
||||||
|
|
||||||
```
|
```
|
||||||
project-root/
|
project-root/
|
||||||
├── README.md
|
├── README.md
|
||||||
├── BUSINESS.md # Creator only
|
├── docs/
|
||||||
├── TECHNICAL.md # Creator only
|
│ ├── BUSINESS.md # creator-only
|
||||||
├── DECISIONS.md # Creator only - locked criteria, rationale, GUI framework decision
|
│ ├── TECHNICAL.md # creator-only
|
||||||
├── USER-GUIDE.md # Ships to buyers
|
│ ├── DECISIONS.md # creator-only — locked criteria + decision log
|
||||||
├── RECOVERY.md # Creator only (this file)
|
│ ├── DEVELOPER.md # creator-only
|
||||||
│
|
│ ├── RECOVERY.md # creator-only (this file)
|
||||||
├── scripts/ # The 9 .py source files (CLI entry points)
|
│ ├── REQUIREMENTS.md
|
||||||
│ ├── 01_deduplicator.py # Working
|
│ ├── USER-GUIDE.md # ships to buyers
|
||||||
│ ├── 02_text_cleaner.py
|
│ └── CLI-REFERENCE.md
|
||||||
│ ├── 03_format_standardizer.py
|
|
||||||
│ ├── 04_missing_value_handler.py
|
|
||||||
│ ├── 05_column_mapper_enforcer.py
|
|
||||||
│ ├── 06_outlier_detector.py
|
|
||||||
│ ├── 07_multi_file_merger.py
|
|
||||||
│ ├── 08_validator_reporter.py
|
|
||||||
│ └── 09_master_orchestrator.py
|
|
||||||
│
|
|
||||||
├── src/
|
├── src/
|
||||||
│ ├── core/ # Shared business logic - both CLI and GUI call into this
|
│ ├── core/ # shared logic — both CLI + GUI call into this
|
||||||
│ ├── cli.py # Typer CLI front-end
|
│ ├── cli.py # Find Duplicates CLI
|
||||||
│ └── gui/ # Streamlit GUI front-end
|
│ ├── cli_text_clean.py # Clean Text CLI
|
||||||
│ ├── app.py # Streamlit entry point
|
│ ├── cli_analyze.py # Analyzer CLI
|
||||||
│ ├── pages/ # One Streamlit page per script in the bundle
|
│ └── gui/
|
||||||
│ └── components.py # Shared widgets
|
│ ├── app.py # Streamlit entry
|
||||||
│
|
│ ├── pages/ # one page per tool
|
||||||
├── samples/
|
│ └── components/ # shared widgets
|
||||||
│ ├── messy_sales.csv
|
├── samples/ # messy_sales.csv, bank_export.xlsx
|
||||||
│ └── bank_export.xlsx
|
├── test-cases/ # corpora: text-cleaner, encodings, format-cleaner
|
||||||
│
|
├── tests/ # pytest
|
||||||
├── demo/
|
├── demo/streamlit_app.py # constrained Streamlit Community Cloud version
|
||||||
│ └── streamlit_app.py # Constrained version for Streamlit Community Cloud
|
|
||||||
│
|
|
||||||
├── build/
|
├── build/
|
||||||
│ ├── pyinstaller.spec # Cross-platform build spec (handles GUI launcher + CLI binaries)
|
│ ├── pyinstaller.spec # cross-platform build spec
|
||||||
│ ├── launcher.py # Starts local Streamlit server, opens default browser
|
│ ├── launcher.py # starts Streamlit, opens browser
|
||||||
│ ├── windows/
|
│ ├── windows/installer.iss
|
||||||
│ │ └── installer.iss # Inno Setup wrapper
|
│ ├── macos/{entitlements.plist, dmg_settings.py}
|
||||||
│ ├── macos/
|
│ └── linux/AppImage/
|
||||||
│ │ ├── entitlements.plist
|
├── ci/build.yml # GitHub Actions matrix build
|
||||||
│ │ └── dmg_settings.py
|
|
||||||
│ └── linux/
|
|
||||||
│ └── AppImage/ # AppImage build assets
|
|
||||||
│
|
|
||||||
├── ci/
|
|
||||||
│ └── build.yml # GitHub Actions cross-platform build
|
|
||||||
│
|
|
||||||
├── tests/
|
|
||||||
│
|
|
||||||
└── requirements.txt
|
└── requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
## 2. Rebuild steps
|
||||||
|
|
||||||
## 2. Rebuild Steps
|
|
||||||
|
|
||||||
### From a complete ZIP backup
|
### From a complete ZIP backup
|
||||||
1. Unzip into a clean directory.
|
1. Unzip into a clean directory.
|
||||||
2. Push to a GitHub repository.
|
2. Push to GitHub.
|
||||||
3. The CI pipeline (`ci/build.yml`) builds Windows, macOS, and Linux artifacts on tagged releases.
|
3. Tag a release → CI builds Windows / macOS / Linux artifacts.
|
||||||
4. Connect the repo to Streamlit Community Cloud and point it at `demo/streamlit_app.py` to redeploy the hosted demo.
|
4. Connect repo to Streamlit Community Cloud → demo deploys.
|
||||||
5. For local builds: see Section 3.
|
5. Local builds: see §3.
|
||||||
6. Done.
|
|
||||||
|
|
||||||
### From documentation only (worst case)
|
### From documentation only (worst case)
|
||||||
1. Read `DECISIONS.md` to understand *why* the project is what it is. Section 4c locks the GUI framework as Streamlit; Section 4b locks the UX standards. These are non-negotiable.
|
1. Read **DECISIONS.md** — understand *why* the project is what it is. §4c locks Streamlit; §4b locks UX standards. **Non-negotiable.**
|
||||||
2. Read `TECHNICAL.md` Sections 2-3 for the build pipeline architecture, including the Streamlit launcher pattern in Section 3.4.
|
2. Read **TECHNICAL.md** §1-3 (architecture + build pipeline + Streamlit launcher pattern in §3.4).
|
||||||
3. Read `BUSINESS.md` for product strategy, which bundles to build, and the hosted demo as a marketing asset.
|
3. Read **BUSINESS.md** for product strategy + hosted demo as marketing asset.
|
||||||
4. Recreate scripts using the spec in `USER-GUIDE.md` Section 2 (script table), `TECHNICAL.md` Section 7 (per-bundle technical notes), `TECHNICAL.md` Section 9 (boundary between scripts 04 and 06 - do not relitigate this), and `TECHNICAL.md` Section 10 (per-script functional requirements; Section 10.1 is the v1 launch target for the deduplicator).
|
4. Recreate scripts using:
|
||||||
5. Set up the cross-platform build pipeline (Section 3 below).
|
- USER-GUIDE.md §2 (script table)
|
||||||
6. Recreate installer configs per `TECHNICAL.md` Section 3.
|
- TECHNICAL.md §10 (04/06 boundary — do not relitigate)
|
||||||
7. Build the constrained `demo/streamlit_app.py` for hosted deployment. Constraints: row limit, watermark, sample data only or strict file-size cap.
|
- TECHNICAL.md §11 (per-script functional specs; §11.1-11.3 are the v1 launch targets for Ready tools).
|
||||||
|
5. Set up cross-platform build pipeline (§3 below).
|
||||||
|
6. Recreate installer configs per TECHNICAL.md §3.5-3.7.
|
||||||
|
7. Build constrained `demo/streamlit_app.py` (row limit, watermark, sample data).
|
||||||
|
|
||||||
---
|
## 3. Local build setup
|
||||||
|
|
||||||
## 3. Local Build Setup (per platform)
|
### Common
|
||||||
|
```bash
|
||||||
### All platforms (common)
|
pip install -r requirements.txt pyinstaller
|
||||||
- Install Python 3.11+.
|
streamlit run src/gui/app.py # verify GUI
|
||||||
- `pip install -r requirements.txt pyinstaller`
|
python -m src.cli --help # verify CLI
|
||||||
- Verify Streamlit app runs locally: `streamlit run src/gui/app.py`
|
```
|
||||||
- Verify CLI runs locally: `python -m src.cli --help`
|
|
||||||
|
|
||||||
### Windows
|
### Windows
|
||||||
- Install Inno Setup: https://jrsoftware.org/isinfo.php
|
- Install Inno Setup: https://jrsoftware.org/isinfo.php
|
||||||
- Build: `pyinstaller build/pyinstaller.spec`
|
- `pyinstaller build/pyinstaller.spec`
|
||||||
- Wrap in installer: open `build/windows/installer.iss` in Inno Setup, compile.
|
- Open `build/windows/installer.iss` in Inno Setup, compile.
|
||||||
|
|
||||||
### macOS
|
### macOS
|
||||||
- Install Xcode command line tools: `xcode-select --install`
|
1. `xcode-select --install`
|
||||||
- Enroll in Apple Developer Program ($99/yr). Allow 1-2 weeks first time.
|
2. Enroll in Apple Developer Program ($99/yr — 1-2 wk first time).
|
||||||
- Generate Developer ID Application certificate, install in Keychain.
|
3. Generate Developer ID cert, install in Keychain.
|
||||||
- Generate app-specific password for `notarytool`.
|
4. Generate app-specific password for `notarytool`.
|
||||||
- Build: `pyinstaller build/pyinstaller.spec`
|
5. `pyinstaller build/pyinstaller.spec`
|
||||||
- Sign: `codesign --deep --force --options runtime --sign "Developer ID Application: [Name]" dist/BundleName.app`
|
6. `codesign --deep --force --options runtime --sign "Developer ID Application: [Name]" dist/App.app`
|
||||||
- Package as DMG.
|
7. Package as DMG.
|
||||||
- Notarize: `xcrun notarytool submit BundleName.dmg --wait`
|
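8. `xcrun notarytool submit *.dmg --keychain-profile <profile> --wait` (profile created once via `xcrun notarytool store-credentials` using the app-specific password from step 4)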
|
||||||
- Staple: `xcrun stapler staple BundleName.dmg`
|
9. `xcrun stapler staple *.dmg`
|
||||||
|
|
||||||
### Linux
|
### Linux
|
||||||
- Install AppImage tooling: download `appimagetool` from https://appimage.github.io
|
- Download `appimagetool` from https://appimage.github.io
|
||||||
- Build: `pyinstaller build/pyinstaller.spec`
|
- `pyinstaller build/pyinstaller.spec`
|
||||||
- Wrap as AppImage using `appimagetool` per the assets in `build/linux/AppImage/`.
|
- Wrap as AppImage via assets in `build/linux/AppImage/`.
|
||||||
|
|
||||||
### Streamlit + PyInstaller specific notes
|
### Streamlit + PyInstaller notes
|
||||||
- A custom PyInstaller hook (`hook-streamlit.py`) is required to bundle Streamlit's data files correctly.
|
- Custom `hook-streamlit.py` required.
|
||||||
- Hidden imports must include `streamlit`, `altair`, `pyarrow` (and their submodules where PyInstaller fails to detect them).
|
- Hidden imports: `streamlit`, `altair`, `pyarrow` (and submodules where auto-detection fails).
|
||||||
- The launcher script (`build/launcher.py`) is the actual PyInstaller entry point, not the Streamlit script directly.
|
- The PyInstaller entry point is `build/launcher.py`, **not** the Streamlit script directly.
|
||||||
- Budget 1-3 days the first time getting the Streamlit-PyInstaller spec right; it's reusable across all subsequent bundles.
|
- Budget 1-3 days first time. Reusable across all bundles.
|
||||||
|
|
||||||
### CI build (recommended)
|
### CI build (recommended)
|
||||||
- Push the repo to GitHub.
|
```bash
|
||||||
- Tag a release: `git tag v1.0.0 && git push --tags`
|
git tag v1.0.0 && git push --tags
|
||||||
- GitHub Actions runs the matrix build, produces all three artifacts.
|
# GitHub Actions runs the matrix → 3 platform artifacts on Releases page.
|
||||||
- Manual step: download artifacts from the Releases page, upload to Gumroad / Lemon Squeezy.
|
# Manual: download → upload to Gumroad / Lemon Squeezy.
|
||||||
|
```
|
||||||
|
|
||||||
### Hosted demo deployment (separate from desktop build)
|
### Hosted demo deployment
|
||||||
- Connect GitHub repo to Streamlit Community Cloud (one-time, free).
|
- Connect GitHub repo to Streamlit Community Cloud (one-time, free).
|
||||||
- Configure the deployment to point at `demo/streamlit_app.py`.
|
- Configure deployment → `demo/streamlit_app.py`.
|
||||||
- The demo updates automatically on git push to the configured branch.
|
- Auto-updates on push to configured branch.
|
||||||
- Custom domain optional via CNAME (verify Streamlit Community Cloud current policy at recovery time).
|
- Custom domain optional via CNAME.
|
||||||
|
|
||||||
---
|
## 4. External dependencies
|
||||||
|
|
||||||
## 4. External Dependencies (re-acquire if lost)
|
|
||||||
|
|
||||||
| Item | Source | Cost |
|
| Item | Source | Cost |
|
||||||
|---|---|---|
|
|------|--------|------|
|
||||||
| Python | https://python.org/downloads | Free |
|
| Python | python.org/downloads | Free |
|
||||||
| PyInstaller | `pip install pyinstaller` | Free |
|
| PyInstaller, Streamlit, Python libs | `pip install -r requirements.txt pyinstaller` | Free |
|
||||||
| Streamlit | `pip install streamlit` | Free |
|
| Inno Setup (Windows) | jrsoftware.org/isinfo.php | Free |
|
||||||
| Inno Setup (Windows) | https://jrsoftware.org/isinfo.php | Free |
|
| Apple Developer Program (macOS) | developer.apple.com | $99/yr |
|
||||||
| Apple Developer Program (macOS signing) | https://developer.apple.com | $99/yr |
|
| Xcode CLT (macOS) | `xcode-select --install` | Free |
|
||||||
| Xcode command line tools (macOS) | `xcode-select --install` | Free |
|
| appimagetool (Linux) | appimage.github.io | Free |
|
||||||
| appimagetool (Linux) | https://appimage.github.io | Free |
|
| GitHub Actions (CI) | github.com | Free tier covers all 3 OS runners |
|
||||||
| GitHub Actions (CI) | github.com | Free tier covers all three OS runners |
|
| Streamlit Community Cloud | streamlit.io/cloud | Free |
|
||||||
| Streamlit Community Cloud (demo hosting) | streamlit.io/cloud | Free |
|
|
||||||
| Python libraries | See `requirements.txt`, `pip install -r requirements.txt` | Free |
|
|
||||||
|
|
||||||
---
|
## 5. Backup recommendation
|
||||||
|
|
||||||
## 5. Backup Recommendation
|
- **Primary**: GitHub repository (private). Source of truth.
|
||||||
|
- **Secondary**: ZIP of full project tree on cloud storage (Drive / Dropbox / S3).
|
||||||
|
- **Apple Developer credentials**: cert + app-specific password in a password manager. Re-issuable, not catastrophic.
|
||||||
|
- **Streamlit Community Cloud**: stored as GitHub OAuth link in Streamlit UI. Re-authorize from new account if lost.
|
||||||
|
- Back up after every meaningful change.
|
||||||
|
- **Always include RECOVERY.md + DECISIONS.md** — irreplaceable context.
|
||||||
|
|
||||||
- **Primary backup**: GitHub repository (private). Source is the source of truth.
|
## 6. Recovery priorities (under time pressure)
|
||||||
- **Secondary backup**: ZIP of the full project tree on cloud storage (Google Drive / Dropbox / S3).
|
|
||||||
- **Apple Developer credentials**: store certificate + app-specific password in a password manager. Losing these requires regenerating, not catastrophic.
|
|
||||||
- **Streamlit Community Cloud connection**: stored in Streamlit's UI as a GitHub OAuth link. Re-authorize from a new Streamlit account if lost.
|
|
||||||
- Back up after every meaningful code or doc change.
|
|
||||||
- Include this `RECOVERY.md` and `DECISIONS.md` in every backup. They contain the irreplaceable context.
|
|
||||||
|
|
||||||
---
|
1. **`src/core/` + scripts** — without these there is no product.
|
||||||
|
2. **DECISIONS.md** — without this you'll re-litigate every settled call.
|
||||||
## 6. Recovery Priorities (if rebuilding under time pressure)
|
3. **TECHNICAL.md** §10 (04/06 boundary) + §11 (per-script specs). Without these you'll rebuild dedup with weaker fuzzy matching than the v1 spec demands and lose to free Excel.
|
||||||
|
4. **`src/gui/`** — primary buyer surface; without it the product reverts to CLI-only and the buyer persona will ask for refunds.
|
||||||
If you only have time to rebuild part of the project, this is the order:
|
5. **PyInstaller spec + launcher + per-OS configs** — recreating the Streamlit-PyInstaller integration is 1-3 days.
|
||||||
|
6. **Apple Developer Program enrollment** — 1-2 wk lead. Start first if Mac matters.
|
||||||
1. **Source: `src/core/` and `scripts/`**. Without these there is no product.
|
7. **Hosted demo** — important marketing asset, not blocking for desktop sales.
|
||||||
2. **DECISIONS.md**. Without this you will re-litigate every settled decision (especially GUI framework, dual interface, UX standards) and probably get it wrong differently.
|
8. Doc files (USER-GUIDE, BUSINESS, README) — recoverable from memory + this guide.
|
||||||
3. **TECHNICAL.md**, especially Sections 9 (04/06 boundary) and 10 (per-script functional requirements). Without these you will rebuild the deduplicator with weaker fuzzy matching than the v1 launch spec demands and ship something that loses to free Excel.
|
9. CI config — nice to have, not blocking.
|
||||||
4. **Streamlit GUI source (`src/gui/`)**. The primary buyer surface; without it the product reverts to CLI-only and the buyer persona will refund.
|
|
||||||
5. **PyInstaller spec + launcher + per-OS build configs** (`build/`). Reproducing the Streamlit-PyInstaller integration from scratch is 1-3 days of work.
|
|
||||||
6. **Apple Developer Program enrollment**. 1-2 week lead time. Start this first if Mac distribution matters.
|
|
||||||
7. **Hosted demo (`demo/streamlit_app.py`)**. Important marketing asset but not blocking for desktop sales.
|
|
||||||
8. Documentation files (USER-GUIDE, BUSINESS, README). Recoverable from memory + this guide.
|
|
||||||
9. CI config (`ci/build.yml`). Nice to have, not blocking.
|
|
||||||
|
|||||||
@@ -1,146 +1,262 @@
|
|||||||
# REQUIREMENTS.md
|
# Requirements
|
||||||
|
|
||||||
Numbered, categorized requirements list — short form. The companion to USER-GUIDE.md and TECHNICAL.md; updated with every shipped capability.
|
Numbered support matrix. Updated with every shipped capability.
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 1. File handling
|
## 1. File handling
|
||||||
|
1.1 Size: ≤ 1.5 GB target (larger works, slower).
|
||||||
1.1 File size: ≤ 1 GB (target; bigger files work but the gate's full-DataFrame Apply pass scales linearly).
|
1.2 Read: CSV, TSV, XLSX, XLS.
|
||||||
1.2 Input formats: CSV, TSV, XLSX, XLS.
|
1.3 Write: CSV, TSV.
|
||||||
1.3 Output formats: CSV, TSV.
|
1.4 Excel: multi-sheet picker.
|
||||||
1.4 Excel: multi-sheet workbook picker.
|
1.5 Empty file: blocked with `empty_input` error finding.
|
||||||
1.5 Empty file: detected, blocks gate with `empty_input` error finding.
|
|
||||||
|
|
||||||
## 2. Input encodings (auto-detected)
|
## 2. Input encodings (auto-detected)
|
||||||
|
2.1 Unicode: UTF-8, UTF-8-BOM, UTF-16 LE/BE BOM, UTF-16 LE no-BOM.
|
||||||
2.1 Unicode: UTF-8, UTF-8 with BOM, UTF-16 LE/BE with BOM, UTF-16 LE without BOM (best-effort).
|
|
||||||
2.2 Western: cp1252, ISO-8859-1, ISO-8859-15, Mac Roman.
|
2.2 Western: cp1252, ISO-8859-1, ISO-8859-15, Mac Roman.
|
||||||
2.3 Eastern European: cp1250, ISO-8859-2.
|
2.3 Eastern European: cp1250, ISO-8859-2.
|
||||||
2.4 Cyrillic: cp1251, KOI8-R.
|
2.4 Cyrillic: cp1251, KOI8-R.
|
||||||
2.5 CJK: Shift_JIS / cp932, GB18030, Big5, EUC-KR / cp949.
|
2.5 CJK: Shift_JIS / cp932, GB18030, Big5, EUC-KR / cp949.
|
||||||
2.6 ASCII: detected as UTF-8 (byte-equivalent).
|
2.6 ASCII → detected as UTF-8.
|
||||||
2.7 User override: any Python codec name typed in the Review page.
|
2.7 User override: any Python codec name.
|
||||||
2.8 BOM: stripped on read, never written.
|
2.8 BOM: stripped on read, never written.
|
||||||
2.9 Decode failure: surfaced as `encoding_decode_failed` (error severity).
|
2.9 Decode failure → `encoding_decode_failed` (error).
|
||||||
2.10 Replacement char (U+FFFD) in output: surfaced as `encoding_uncertain` (error).
|
2.10 U+FFFD in output → `encoding_uncertain` (error).
|
||||||
|
|
||||||
## 3. Output encodings
|
## 3. Output encodings
|
||||||
|
3.1 UTF-8 (default), UTF-8-BOM (Excel-friendly).
|
||||||
3.1 UTF-8 (default).
|
3.2 cp1252, ISO-8859-1/15, cp1250, ISO-8859-2, cp1251.
|
||||||
3.2 UTF-8 with BOM (Excel-friendly).
|
3.3 Shift_JIS, GB18030, Big5, EUC-KR, UTF-16 LE.
|
||||||
3.3 cp1252, ISO-8859-1, ISO-8859-15, cp1250, ISO-8859-2, cp1251.
|
3.4 Lossy fallback: `?` + warning when codec can't represent a char.
|
||||||
3.4 Shift_JIS, GB18030, Big5, EUC-KR, UTF-16 LE.
|
|
||||||
3.5 Lossy fallback: `?` replacement + warning shown when chosen codec can't represent a character.
|
|
||||||
|
|
||||||
## 4. Delimiters
|
## 4. Delimiters
|
||||||
|
4.1 Input auto-detect: `,`, `\t`, `;`, `|`.
|
||||||
4.1 Auto-detect (input): `,`, `\t`, `;`, `|`.
|
|
||||||
4.2 Output: `,` (default), `\t`, `;`, `|`.
|
4.2 Output: `,` (default), `\t`, `;`, `|`.
|
||||||
4.3 File extension: `.tsv` for tab, `.csv` otherwise.
|
4.3 Extension: `.tsv` for tab, `.csv` otherwise.
|
||||||
|
|
||||||
## 5. Line endings
|
## 5. Line endings
|
||||||
|
5.1 Read: LF / CRLF / bare CR — all normalized to LF.
|
||||||
5.1 Input: LF, CRLF, bare CR (all normalized to LF on read).
|
|
||||||
5.2 Embedded in quoted cells: also normalized to LF.
|
5.2 Embedded in quoted cells: also normalized to LF.
|
||||||
5.3 Output: LF (default), CRLF, CR.
|
5.3 Write: LF (default), CRLF, CR.
|
||||||
5.4 Mixed line endings: surfaced as `mixed_line_endings` finding.
|
5.4 Mixed → `mixed_line_endings` finding.
|
||||||
|
|
||||||
## 6. Analyzer detectors
|
## 6. Analyzer detectors
|
||||||
|
|
||||||
6.1 File-level (audit log of read-time fixes): `csv_bom_stripped`, `csv_nul_stripped`, `csv_smart_quotes_folded`, `csv_line_endings_normalized`, `csv_transcoded_to_utf8`, `csv_unquoted_delimiters_repaired`, `csv_unrepairable_rows`.
|
**File-level** (read-time fixes, audit-logged):
|
||||||
6.2 Cell-level: `smart_punctuation_in_data`, `nbsp_or_unicode_whitespace`, `zero_width_or_invisible`, `dirty_column_headers`, `whitespace_padding`, `null_like_sentinels`, `suspected_mojibake`, `mixed_case_email_column`, `near_duplicate_rows`, `leading_zero_ids`.
|
- `csv_bom_stripped`, `csv_nul_stripped`, `csv_smart_quotes_folded`, `csv_line_endings_normalized`, `csv_transcoded_to_utf8`, `csv_unquoted_delimiters_repaired`, `csv_unrepairable_rows`.
|
||||||
6.3 Encoding integrity: `encoding_uncertain`, `encoding_decode_failed`, `empty_input`.
|
|
||||||
6.4 Sample size (default): 1,000 rows; configurable.
|
**Cell-level**:
|
||||||
|
- `smart_punctuation_in_data`, `nbsp_or_unicode_whitespace`, `zero_width_or_invisible`, `dirty_column_headers`, `whitespace_padding`, `null_like_sentinels`, `suspected_mojibake`, `mixed_case_email_column`, `inconsistent_date_format`, `near_duplicate_rows`, `leading_zero_ids`.
|
||||||
|
|
||||||
|
**Encoding integrity**: `encoding_uncertain`, `encoding_decode_failed`, `encoding_lying_bom`, `empty_input`.
|
||||||
|
|
||||||
|
Sample size: 1,000 rows (configurable).
|
||||||
|
|
||||||
## 7. Finding fields
|
## 7. Finding fields
|
||||||
|
`id`, `severity` (info/warn/error), `confidence` (high/medium/low), `fix_action`, `pre_applied`, `tool`, `count`, `description`, `column`, `samples` (≤5).
|
||||||
7.1 `id` — stable identifier.
|
|
||||||
7.2 `severity` — info / warn / error (error blocks gate).
|
|
||||||
7.3 `confidence` — high / medium / low (auto-fixability).
|
|
||||||
7.4 `fix_action` — id of the algorithm in `src/core/fixes.py`.
|
|
||||||
7.5 `pre_applied` — true if fixed during read pass.
|
|
||||||
7.6 `tool` — owning tool id (or empty).
|
|
||||||
7.7 `count`, `description`, `column`, `samples` (≤5).
|
|
||||||
|
|
||||||
## 8. Confidence tiers
|
## 8. Confidence tiers
|
||||||
|
- **high** — round-trip safe, one-click auto-fix.
|
||||||
|
- **medium** — preview before applying.
|
||||||
|
- **low** — opt-in only, can corrupt if wrong.
|
||||||
|
- **error** (a `severity` value rather than a confidence level; listed here because it gates the workflow) — must resolve or waive before tool pages unlock.
|
||||||
|
|
||||||
8.1 **high** — round-trip safe; one-click auto-fix.
|
## 9. Decision actions
|
||||||
8.2 **medium** — preview before applying.
|
- `auto` — apply registered fix.
|
||||||
8.3 **low** — opt-in only; can corrupt data if wrong.
|
- `skip` — waive (audit-logged).
|
||||||
8.4 **error** — must resolve or waive before tool pages unlock.
|
- `modified` — apply with custom payload.
|
||||||
|
|
||||||
## 9. Decision actions per finding
|
## 10. Performance (1.5 GB input)
|
||||||
|
- Initial scan (sample): < 2 s · peak RSS ~110 MB.
|
||||||
|
- Full-file `repair_bytes`: 30–40 s (UTF-8); non-UTF-8 fold path now
|
||||||
|
uses ``str.count`` instead of a Python char-by-char zip walk —
|
||||||
|
formerly ~100 s on a 1 GB cp1252 file with smart quotes, now <1 s.
|
||||||
|
- Full-DataFrame analyze: ~4 min (~25 µs/cell). Near-duplicate detector
|
||||||
|
no longer allocates a full-frame copy — peak RSS during the
|
||||||
|
near-duplicate pass drops to roughly the size of the string columns
|
||||||
|
alone (~50% memory cut on text-heavy 1 GB inputs).
|
||||||
|
- Full-DataFrame `auto_fix`: ~5 min (~30 µs/cell).
|
||||||
|
- Output write: ~10 s.
|
||||||
|
- Recommended RAM: 3–4× input size for the full-Apply path.
|
||||||
|
- **Standardize Formats** (`standardize_dataframe`): ~2.7M rows/sec on
|
||||||
|
cache-warm repetition-heavy columns (synthetic 1M-row in-memory
|
||||||
|
benchmark, 2 typed columns); the fused single-pass loop replaced a
|
||||||
|
3-pass ``.tolist()`` cycle, so per-call overhead is now dominated by
|
||||||
|
the underlying parsers (phonenumbers, dateutil) rather than Python
|
||||||
|
list materialisation. A 1.5 GB CSV with mixed phone+currency+address
|
||||||
|
columns finishes in ~1.5–6 minutes depending on column count.
|
||||||
|
`StandardizeOptions.parallel_columns` (default 1, serial) lands the
|
||||||
|
thread-pool scaffolding; on CPython 3.12 with the GIL it's
|
||||||
|
roughly neutral, but the API is ready for the free-threaded
|
||||||
|
(PEP 703) Python 3.13+ build where it will help.
|
||||||
|
- **Clean Text** (`clean_dataframe`): ~1M rows/sec on
|
||||||
|
repetition-heavy columns (per-call string cache: the pipeline runs
|
||||||
|
once per *unique* cell value, not once per row).
|
||||||
|
- **Fix Missing Values** (`handle_missing`): lazy-copy — when sentinel
|
||||||
|
standardization runs but finds nothing, AND no drops AND no fills
|
||||||
|
apply, the input frame is returned as-is. On a clean 1 GB file this
|
||||||
|
saves the 1 GB allocation that the unconditional upfront copy used
|
||||||
|
to take.
|
||||||
|
- **Map Columns** (`map_columns`): rename + drop both already
|
||||||
|
return fresh frames; the explicit upfront `df.copy()` is now
|
||||||
|
removed and downstream mutating steps (schema-add, coerce) copy on
|
||||||
|
demand via `_ensure_owned()`. Rename-only and identity-mapping
|
||||||
|
paths run with zero explicit copies.
|
||||||
|
- **Find Duplicates**:
|
||||||
|
- **Exact-only strategies** (every column uses `Algorithm.EXACT` at
|
||||||
|
threshold 100 — covers strong-key dedup like email/phone, the
|
||||||
|
fallback drop-duplicates path, and explicit "match on this exact
|
||||||
|
column" calls) now run in **O(n)** via groupby. Measured: 10k
|
||||||
|
rows on an email-exact strategy → 73 ms (was ~30 minutes via the
|
||||||
|
old O(n²) pair compare).
|
||||||
|
- **Fuzzy strategies** still pair-compare. Opt in to **prefix
|
||||||
|
blocking** via `deduplicate(..., blocking_columns=['name'],
|
||||||
|
blocking_prefix_len=1)` to partition pairs by a cheap key.
|
||||||
|
Measured: 5k rows fuzzy-name dedup → 25.6s with blocking vs.
|
||||||
|
179s without (7× faster). Trade-off: cross-block matches are
|
||||||
|
missed; lower `blocking_prefix_len` widens blocks.
|
||||||
|
- Normalisation pass remains LRU-cached per call so repeat values
|
||||||
|
(the common dedup workload) skip re-parsing.
|
||||||
|
|
||||||
9.1 `auto` — apply the registered fix.
|
## 11. Tools
|
||||||
9.2 `skip` — waive (no change, audit-logged).
|
1. Find Duplicates — Ready
|
||||||
9.3 `modified` — apply with custom payload (e.g. user-edited null sentinels).
|
2. Clean Text — Ready
|
||||||
|
3. Standardize Formats — Ready
|
||||||
|
4. Fix Missing Values — Ready
|
||||||
|
5. Map Columns — Ready
|
||||||
|
6. Find Unusual Values — Coming Soon
|
||||||
|
7. Combine Files — Coming Soon
|
||||||
|
8. Quality Check — Coming Soon
|
||||||
|
9. Automated Workflows — Ready
|
||||||
|
|
||||||
## 10. Performance (1 GB input)
|
**Future / not in v1.** Tool ideas captured for after-launch consideration
|
||||||
|
live in `docs/FUTURE-TOOLS.md` — entries there are gated by the new-tool
|
||||||
|
freeze in `PLAN.md` §2.1 and don't ship without a paying-customer +
|
||||||
|
repeated-demand signal. Currently parked there:
|
||||||
|
|
||||||
10.1 Initial scan (`analyze` sample-mode): < 2 s.
|
- **#10. PDF → CSV extractor** (bank statements + similar). No PDF
|
||||||
10.2 Peak RSS during initial scan: ~110 MB.
|
dependency exists in the repo today; this tool would need pdfplumber,
|
||||||
10.3 Full-file `repair_bytes`: ~30–40 s (when triggered).
|
streamlit-drawable-canvas, and a templates store. Estimated 3–4 weeks
|
||||||
10.4 Full-DataFrame analyze: ~4 min (~25 µs/cell).
|
for a text-only MVP, 6–10 weeks for the polished version with
|
||||||
10.5 Full-DataFrame `auto_fix`: ~5 min (~30 µs/cell).
|
multi-page template recall.
|
||||||
10.6 Output write: ~10 s for 1 GB UTF-8 CSV.
|
|
||||||
10.7 RAM headroom recommended: 4× input file size for the full-Apply path.
|
|
||||||
|
|
||||||
## 11. Tools shipped
|
### 11.a Recommended pipeline order (soft, not enforced)
|
||||||
|
|
||||||
11.1 Deduplicator — Ready.
|
Automated Workflows ships with a `SOFT_DEPENDENCIES` table; the
|
||||||
11.2 Text Cleaner — Ready.
|
following ordering is the default and the basis of the warning
|
||||||
11.3 Format Standardizer — Coming Soon.
|
surface. Re-ordering is allowed; the runner emits a warning string
|
||||||
11.4 Missing Value Handler — Coming Soon.
|
and proceeds.
|
||||||
11.5 Column Mapper — Coming Soon.
|
|
||||||
11.6 Outlier Detector — Coming Soon.
|
| # | Tool | Why this slot |
|
||||||
11.7 Multi-File Merger — Coming Soon.
|
|---|------|---------------|
|
||||||
11.8 Validator & Reporter — Coming Soon.
|
| 1 | column_map (optional, for header alignment) | Multi-vendor unification — rename early so downstream tools see canonical headers |
|
||||||
11.9 Pipeline Runner — Coming Soon.
|
| 2 | text_clean | NBSP / smart quotes / zero-width pollution silently breaks downstream parsers |
|
||||||
|
| 3 | format_standardize | Phones / dates / currencies → canonical form before missing detection and dedup |
|
||||||
|
| 4 | missing | Sentinel detection, imputation, drop strategies — needs canonical types |
|
||||||
|
| 5 | column_map (optional, for schema enforcement) | Project to target schema, coerce, drop extras AFTER cleaning |
|
||||||
|
| 6 | dedup | Fuzzy matching is most accurate on canonicalised, sentinel-laundered data |
|
||||||
|
|
||||||
## 12. Gate (Review & Normalize)
|
## 12. Gate (Review & Normalize)
|
||||||
|
- Gates every tool page.
|
||||||
12.1 Gates every tool page; tool pages refuse to load until passed.
|
- Auto-fix button: applies all `confidence=high` findings in one click.
|
||||||
12.2 Auto-fix button applies all `confidence=high` findings in one click.
|
- Per-finding controls: Auto / Skip / Customize.
|
||||||
12.3 Per-finding controls: Auto-fix / Skip / Customize.
|
- Live before/after preview (≤5 sample rows).
|
||||||
12.4 Live before/after preview per finding (≤5 sample rows).
|
- Audit log per fix (id, decision, cells changed).
|
||||||
12.5 Audit log: every fix tagged with finding id, decision, cells changed.
|
- Encoding-override picker (16 codepages + custom).
|
||||||
12.6 Encoding override picker (16 codepages + custom).
|
- Advanced output expander: encoding + delimiter + line terminator.
|
||||||
12.7 Advanced output options expander: encoding + delimiter + line terminator.
|
- Result keyed by upload SHA-256; survives reload, invalidated on re-upload.
|
||||||
12.8 Result keyed by upload SHA-256; survives page reloads, invalidated on re-upload.
|
|
||||||
|
|
||||||
## 13. Interfaces
|
## 13. Interfaces
|
||||||
|
- **GUI**: Streamlit, browser-based, local, no internet. Sidebar language picker (English, Español).
|
||||||
13.1 GUI: Streamlit, runs locally, browser-based, no internet required.
|
- **CLI**: `python -m src.cli` (dedup) · `src.cli_text_clean` · `src.cli_format` · `src.cli_missing` · `src.cli_column_map` · `src.cli_pipeline` · `src.cli_analyze`. (CLI output is English-only.)
|
||||||
13.2 CLI: Typer apps — `python -m src.cli`, `src.cli_text_clean`, `src.cli_analyze`.
|
- **Python API**: `from src.core import …` (analyze, repair_bytes, clean_dataframe, deduplicate, standardize_dataframe, …).
|
||||||
13.3 Python API: `from src.core import …` (analyze, repair_bytes, clean_dataframe, deduplicate, etc.).
|
- **JSON output**: `--json` on `cli_analyze`.
|
||||||
13.4 JSON output: `--json` flag on `cli_analyze`; full Finding schema.
|
- **Language packs**: `from src.i18n import t, LANGUAGES`. Add `<code>.json` to `src/i18n/packs/` + entry in `LANGUAGES` to add a language.
|
||||||
|
|
||||||
## 14. Platforms
|
## 14. Platforms
|
||||||
|
- Python ≥ 3.10.
|
||||||
14.1 Python: ≥ 3.10.
|
- OS: Linux, macOS, Windows.
|
||||||
14.2 OS: Linux, macOS, Windows.
|
- Browser: any modern browser.
|
||||||
14.3 Display: any modern browser (Streamlit GUI).
|
- Network: not required at runtime.
|
||||||
14.4 Network: not required at runtime.
|
|
||||||
|
|
||||||
## 15. Dependencies
|
## 15. Dependencies
|
||||||
|
- **Core**: pandas, openpyxl, charset-normalizer, typer, loguru.
|
||||||
15.1 Core: pandas, openpyxl, charset-normalizer, typer, loguru.
|
- **Dedup**: rapidfuzz, phonenumbers.
|
||||||
15.2 Dedup: rapidfuzz, phonenumbers.
|
- **GUI**: streamlit.
|
||||||
15.3 GUI: streamlit.
|
- **Optional**: ftfy (mojibake repair).
|
||||||
15.4 Optional: ftfy (mojibake repair, `repair_mojibake` fix).
|
- **Dev**: pytest, tox.
|
||||||
15.5 Dev: pytest, tox.
|
|
||||||
|
|
||||||
## 16. Test coverage
|
## 16. Test coverage
|
||||||
|
- 2,033 tests passing, 0 skipped, 0 xfailed.
|
||||||
16.1 Unit + integration: 765 tests passing.
|
- 1,868 core + CLI tests (run with `pytest -m 'not gui'` for a quick loop).
|
||||||
16.2 Documented gaps: 17 xfail (charset-normalizer label drift on byte-equivalent codepages, byte-level smart-quote fold expectation).
|
Includes 49 license-layer unit tests (Ed25519 sign/verify, dev-key
|
||||||
16.3 Fixture corpora: 21 text-cleaner fixtures, 31 encoding fixtures, 9 reference UTF-8 files.
|
derivation, production-safe tripwire, schema), 25 license-CLI
|
||||||
16.4 CI surface: `python run_tests.py [--tool …] [--fixtures] [--coverage]`.
|
tests, and 17 Lite-tier feature-map + guard tests.
|
||||||
|
- 165 GUI tests under `tests/gui/` driving Streamlit pages via `AppTest`
|
||||||
|
(smoke + EN/ES localization, chrome, gate, workflows, dedup review,
|
||||||
|
advanced panels, error paths, findings panel, activation +
|
||||||
|
license gate, Lite-tier per-page lock behaviour). Marked `gui`.
|
||||||
|
- Includes 15 perf-shape regression tests.
|
||||||
|
- Fixture corpora: text-cleaner (21), encodings (31), reference UTF-8 (9), format-cleaner (199 buyer cases + 20-row international stress fixture), missing-handler (3 use cases + 16 edge cases), column-mapper (3 use cases + 5 edge cases).
|
||||||
|
- Run: `python run_tests.py [--tool …] [--fixtures] [--coverage]`.
|
||||||
|
|
||||||
## 17. Privacy / data handling
|
## 17. Privacy / data handling
|
||||||
|
- All processing local; no network calls in the data path.
|
||||||
|
- No telemetry.
|
||||||
|
- Original input never modified.
|
||||||
|
- Audit logs: `logs/` next to each run (timestamped).
|
||||||
|
|
||||||
17.1 All processing local; no network calls in the data path.
|
## 17a. Licensing
|
||||||
17.2 No telemetry, no usage analytics shipped.
|
- **Storage**: ``~/.datatools/license.json`` (or
|
||||||
17.3 Original input file never modified — outputs go to a separate path.
|
``$DATATOOLS_LICENSE_PATH`` override). Signed with Ed25519
|
||||||
17.4 Audit logs written to `logs/` next to each run (timestamped).
|
(asymmetric).
|
||||||
|
- **Crypto**: Ed25519. The seller holds the private key; every
|
||||||
|
shipped binary embeds only the public key. A motivated reverse
|
||||||
|
engineer who pulls everything out of the binary still can't sign
|
||||||
|
new licenses. Keys are 32 bytes raw, exposed as hex via
|
||||||
|
``DATATOOLS_LICENSE_PRIVKEY`` (seller-side) and
|
||||||
|
``DATATOOLS_LICENSE_PUBKEY`` (build-time bake-in).
|
||||||
|
- **Activation**: buyer pastes a base64-encoded license blob
|
||||||
|
(``DTLIC1:...``) on first launch; app verifies the signature
|
||||||
|
offline + matches the buyer-entered name/email to the embedded
|
||||||
|
values.
|
||||||
|
- **No free trial**: every license requires a paid blob from the
|
||||||
|
seller. The user-facing trial flow (button + ``license_cli trial``
|
||||||
|
subcommand) was removed in v1.6 to keep paid-tier economics clean.
|
||||||
|
- **Lifetime**: every license is 1 year by default. Renewal applies a
|
||||||
|
fresh blob without losing the embedded buyer identity. Tier may
|
||||||
|
change during renewal (Lite → Core upgrade path).
|
||||||
|
- **Tiers**:
|
||||||
|
- ``lite`` — Find Duplicates + Clean Text + Standardize Formats.
|
||||||
|
Buyer pays once, gets the three universally-useful tools.
|
||||||
|
- ``core`` — every Ready tool (all 9 in v1.6).
|
||||||
|
- ``pro``, ``enterprise`` — scaffolded for future SKUs; currently
|
||||||
|
mirror Core. Add per-SKU restrictions by editing
|
||||||
|
``FEATURES_BY_TIER`` in ``src/license/features.py``.
|
||||||
|
- ``trial`` — kept in the enum for backwards compat with any
|
||||||
|
field-tested trial licenses but no longer issuable.
|
||||||
|
- **Feature flags**: every tool has a stable feature id matching its
|
||||||
|
``tool_id`` in ``src.gui.tools_registry``. Adding a future per-
|
||||||
|
tool SKU is a one-line change to ``FEATURES_BY_TIER`` — no consumer
|
||||||
|
code edits.
|
||||||
|
- **Per-tool gating**: each tool page (GUI) and tool CLI calls
|
||||||
|
``require_feature(FeatureFlag.<TOOL>)`` at entry. GUI shows an
|
||||||
|
upgrade prompt + button to the Activate page; CLI prints a
|
||||||
|
message naming the locked feature and exits with code 2.
|
||||||
|
- **Lock badge**: the home grid shows a red 🔒 Locked pill on tool
|
||||||
|
cards the current tier doesn't unlock.
|
||||||
|
- **Dev bypass**: ``DATATOOLS_DEV_MODE=1`` skips every check (used by
|
||||||
|
the test suite and during development). **Refused in shipped
|
||||||
|
builds** by the production-safe tripwire.
|
||||||
|
- **Production-safe tripwire**: ``assert_production_safe()`` runs at
|
||||||
|
startup in every frozen build. Refuses to boot when ``DEV_MODE``
|
||||||
|
is set or the verification key is still the embedded dev key
|
||||||
|
(i.e., the build pipeline forgot to override
|
||||||
|
``DATATOOLS_LICENSE_PUBKEY``). No-op in source / pytest runs.
|
||||||
|
- **No internet**: signature verification is fully offline. The
|
||||||
|
shipped binary embeds only the public key; the private key never
|
||||||
|
leaves the seller. See ``docs/DECISIONS.md`` for the threat-model
|
||||||
|
discussion.
|
||||||
|
|
||||||
|
## 18. Error handling
|
||||||
|
- Structured hierarchy: `DataToolsError` → `InputValidationError`, `ConfigError`, `FileFormatError`, `FileAccessError`.
|
||||||
|
- Subclasses extend stdlib `ValueError` / `OSError` so existing handlers still catch them.
|
||||||
|
- Every error carries: message, file path, column, operation, suggestion, underlying cause.
|
||||||
|
|||||||
593
docs/SETUP-LICENSE-SERVER.md
Normal file
593
docs/SETUP-LICENSE-SERVER.md
Normal file
@@ -0,0 +1,593 @@
|
|||||||
|
# SETUP — Self-hosted license server runbook
|
||||||
|
|
||||||
|
End-to-end build instructions for `licenses.datatools.unalogix.com` on
|
||||||
|
the existing invixiom box (Ubuntu 24.04, public IP `46.225.166.142`).
|
||||||
|
|
||||||
|
Audience: creator/operator. Read top to bottom on first install; use as
|
||||||
|
a reference thereafter.
|
||||||
|
|
||||||
|
Companions:
|
||||||
|
- `LICENSE-SERVER.md` — the architecture / design rationale
|
||||||
|
- `ADMIN.md` — day-2 ops (minting comps, looking at the issuance log)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Multi-tenancy: where this lands among existing services
|
||||||
|
|
||||||
|
This box already hosts the `*.invixiom.com` family (kasm, files, lifeos,
|
||||||
|
code, gitea) via one shared nginx + one shared Let's Encrypt cert.
|
||||||
|
DataTools is intentionally separated from that stack at every layer:
|
||||||
|
|
||||||
|
| Layer | Existing | New |
|
||||||
|
|---|---|---|
|
||||||
|
| **DNS zone** | `invixiom.com` | `unalogix.com` (different registered domain) |
|
||||||
|
| **nginx file** | `/etc/nginx/sites-available/invixiom` | `/etc/nginx/sites-available/unalogix` |
|
||||||
|
| **nginx symlink** | `sites-enabled/invixiom` | `sites-enabled/unalogix` |
|
||||||
|
| **TLS cert** | `letsencrypt/live/kasm.invixiom.com[-0001]` | `letsencrypt/live/datatools.unalogix.com` |
|
||||||
|
| **Backend port** | 8000, 8002, 8003, 8080, 8081, 8443 | **8090** (mint API), **5433** (Postgres, localhost-only) |
|
||||||
|
| **Docker compose project** | per-service (kasm, lifeos, gitea) | `datatools-license` |
|
||||||
|
| **Docker volume** | per service | `datatools_pg_data` |
|
||||||
|
| **Filesystem root** | various | `/srv/datatools-license/` |
|
||||||
|
| **System user** | various | `datatools-api` (UID auto-assigned, no shell) |
|
||||||
|
|
||||||
|
Nothing in the invixiom stack is read, modified, or referenced by the
|
||||||
|
datatools stack. Restart, upgrade, or remove either without affecting
|
||||||
|
the other.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Pre-flight checklist (off-box, before any commands run)
|
||||||
|
|
||||||
|
These have to be done by the operator outside this box. The build
|
||||||
|
won't proceed without them.
|
||||||
|
|
||||||
|
### 1a. DNS records
|
||||||
|
|
||||||
|
In your `unalogix.com` registrar / DNS panel, add:
|
||||||
|
|
||||||
|
```
|
||||||
|
A datatools.unalogix.com 46.225.166.142
|
||||||
|
A licenses.datatools.unalogix.com 46.225.166.142
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify before continuing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dig +short datatools.unalogix.com
|
||||||
|
dig +short licenses.datatools.unalogix.com
|
||||||
|
# Both should print: 46.225.166.142
|
||||||
|
```
|
||||||
|
|
||||||
|
DNS propagation can take 1–60 minutes. Let's Encrypt won't issue
|
||||||
|
certs until DNS resolves correctly.
|
||||||
|
|
||||||
|
### 1b. Postmark account (transactional email)
|
||||||
|
|
||||||
|
1. Sign up at https://postmarkapp.com (free 100 emails/mo, $15/mo for
|
||||||
|
the volume range we'll be in).
|
||||||
|
2. Verify the `unalogix.com` domain (DNS TXT/CNAME records — Postmark
|
||||||
|
will tell you exactly what to add).
|
||||||
|
3. Create a Server, copy the **Server API Token**. Stash it; we'll put
|
||||||
|
it in `secrets/postmark_token` (see §3), not in `.env`.
|
||||||
|
4. Configure the sender address: `licenses@datatools.unalogix.com`.
|
||||||
|
|
||||||
|
If you prefer SES, Mailgun, Resend, etc. — fine, just swap the
|
||||||
|
adapter (see §6). Postmark is the recommended default.
|
||||||
|
|
||||||
|
### 1c. Cloudflare in front (recommended)
|
||||||
|
|
||||||
|
Move `unalogix.com` DNS hosting to Cloudflare and enable proxy ("orange
|
||||||
|
cloud") on both subdomains. Gets you free DDoS protection, WAF, and rate
|
||||||
|
limiting. **Origin TLS still goes through Let's Encrypt on this box**;
|
||||||
|
Cloudflare adds a second TLS hop in front. Cert renewal still works
|
||||||
|
because we use HTTP-01 challenge on the origin, which Cloudflare
|
||||||
|
proxies transparently.
|
||||||
|
|
||||||
|
If you skip this, the public webhook endpoint is directly hammerable.
|
||||||
|
Not catastrophic at low scale, but the free protection is worth taking.
|
||||||
|
|
||||||
|
### 1d. Gumroad webhook secret
|
||||||
|
|
||||||
|
In Gumroad's seller dashboard → Settings → Advanced → "Ping URL":
|
||||||
|
|
||||||
|
```
|
||||||
|
URL: https://licenses.datatools.unalogix.com/webhooks/gumroad
|
||||||
|
Secret: <generate random hex (openssl rand -hex 32); save it as secrets/gumroad_secret, see §3>
|
||||||
|
```
|
||||||
|
|
||||||
|
Don't enter this until §10 ("PR 2 cutover") — the endpoint won't exist
|
||||||
|
yet during the Mint API build.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. One-time host setup
|
||||||
|
|
||||||
|
Run as `root` (or via `sudo`).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Update apt cache and pull in the bits the rest of the doc needs.
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y \
|
||||||
|
docker-compose-plugin \
|
||||||
|
certbot \
|
||||||
|
python3-certbot-nginx \
|
||||||
|
postgresql-client-16 # for psql to reach the containerized DB
|
||||||
|
|
||||||
|
# Sanity check: docker + compose v2 are already installed via Docker CE.
|
||||||
|
docker --version
|
||||||
|
docker compose version
|
||||||
|
|
||||||
|
# Create the system user the app process will run as (no shell, no home).
|
||||||
|
adduser --system --group --no-create-home --shell /usr/sbin/nologin datatools-api
|
||||||
|
|
||||||
|
# Filesystem layout under /srv (separate from /opt to make the
|
||||||
|
# multi-tenant boundary obvious on disk).
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license/app
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license/secrets
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license/backups
|
||||||
|
```
|
||||||
|
|
||||||
|
The `secrets/` dir is mode 750 owned by `datatools-api`. The private
|
||||||
|
signing key and Postmark token live there as mode-400 files — never
|
||||||
|
in environment-variable-via-systemd-EnvironmentFile, never in the
|
||||||
|
docker-compose file, never anywhere `root` doesn't need to look.
|
||||||
|
|
||||||
|
> **Gotcha — secret file ownership UID.** Docker compose's
|
||||||
|
> `uid:`/`gid:`/`mode:` long-form on `secrets:` is silently ignored
|
||||||
|
> for **file-based** secrets (it's a swarm-mode-only feature). The
|
||||||
|
> file inside the container appears with whatever ownership it has
|
||||||
|
> on the host, and the API runs as UID 10001 (the `app` user from
|
||||||
|
> the Dockerfile). So chown the actual files to **10001** (a numeric
|
||||||
|
> UID that doesn't exist on the host — that's fine, chown accepts
|
||||||
|
> it) and rely on the parent dir's mode 750 + ownership for host-side
|
||||||
|
> access control. See §3 below for the corrected `chown` step.
|
||||||
|
|
||||||
|
### Firewall recommendation (separate decision)
|
||||||
|
|
||||||
|
The box currently runs without UFW. Enabling it now would affect all
|
||||||
|
existing services. Two options:
|
||||||
|
|
||||||
|
- **(A) Don't enable UFW.** Leave the cloud provider's network firewall
|
||||||
|
as the perimeter. This is the current state.
|
||||||
|
- **(B) Enable UFW with `allow 22, 80, 443` only.** Forces every Docker
|
||||||
|
service to bind to `127.0.0.1` (some currently bind `0.0.0.0`). Will
|
||||||
|
break any direct-port access until those binds are updated.
|
||||||
|
|
||||||
|
Default for this runbook: **(A)**. Revisit independently of the
|
||||||
|
DataTools rollout. The DataTools containers always bind to `127.0.0.1`
|
||||||
|
regardless.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Database (Postgres in Docker)
|
||||||
|
|
||||||
|
Postgres lives inside the datatools compose project — separate from
|
||||||
|
every other service on the box, separate volume, separate port,
|
||||||
|
localhost-only binding.
|
||||||
|
|
||||||
|
`/srv/datatools-license/compose.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
container_name: datatools-postgres
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: datatools_licenses
|
||||||
|
POSTGRES_USER: datatools_api
|
||||||
|
POSTGRES_PASSWORD_FILE: /run/secrets/pg_password
|
||||||
|
secrets:
|
||||||
|
- pg_password
|
||||||
|
volumes:
|
||||||
|
- datatools_pg_data:/var/lib/postgresql/data
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:5433:5432" # localhost-only, non-default port
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U datatools_api -d datatools_licenses"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
api:
|
||||||
|
build:
|
||||||
|
context: ./app
|
||||||
|
dockerfile: server/Dockerfile
|
||||||
|
image: datatools-license-api:latest
|
||||||
|
container_name: datatools-api
|
||||||
|
restart: unless-stopped
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+psycopg://datatools_api@postgres:5432/datatools_licenses
|
||||||
|
PG_PASSWORD_FILE: /run/secrets/pg_password
|
||||||
|
DATATOOLS_ADMIN_TOKEN_FILE: /run/secrets/admin_token
|
||||||
|
# PR 2 — uncomment when Postmark + Gumroad are provisioned.
|
||||||
|
# POSTMARK_TOKEN_FILE: /run/secrets/postmark_token
|
||||||
|
# GUMROAD_WEBHOOK_SECRET_FILE: /run/secrets/gumroad_secret
|
||||||
|
# Production keypair (replaces in-tree dev key): set
|
||||||
|
# DATATOOLS_LICENSE_PRIVKEY_FILE: /run/secrets/license_privkey
|
||||||
|
# and DATATOOLS_LICENSE_PUBKEY: <hex> before shipping v1.0.
|
||||||
|
secrets:
|
||||||
|
- pg_password
|
||||||
|
- admin_token
|
||||||
|
# PR 2:
|
||||||
|
# - postmark_token
|
||||||
|
# - gumroad_secret
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:8090:8000" # localhost-only; nginx is the only path in
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
secrets:
|
||||||
|
pg_password: { file: ./secrets/pg_password }
|
||||||
|
admin_token: { file: ./secrets/admin_token }
|
||||||
|
# PR 2:
|
||||||
|
# postmark_token: { file: ./secrets/postmark_token }
|
||||||
|
# gumroad_secret: { file: ./secrets/gumroad_secret }
|
||||||
|
# Production keypair rotation adds:
|
||||||
|
# license_privkey: { file: ./secrets/license_privkey }
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
datatools_pg_data:
|
||||||
|
name: datatools_pg_data
|
||||||
|
```
|
||||||
|
|
||||||
|
Populate the secrets (each file should contain the value with no
|
||||||
|
trailing newline; `openssl rand` appends one, so confirm the reader strips it). For PR 1, only `pg_password` and `admin_token`
|
||||||
|
are required; the rest land in PR 2 / production key rotation.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
|
||||||
|
# Random 64-char hex (32 random bytes) DB password
|
||||||
|
openssl rand -hex 32 > secrets/pg_password
|
||||||
|
|
||||||
|
# Random admin Bearer token (CLI auth). Save this — you'll need it
|
||||||
|
# on your laptop to talk to /internal/* via the SSH tunnel.
|
||||||
|
openssl rand -hex 32 > secrets/admin_token
|
||||||
|
|
||||||
|
# --- PR 2 secrets ---
|
||||||
|
# echo -n "<postmark-server-token>" > secrets/postmark_token # from postmarkapp.com
|
||||||
|
# openssl rand -hex 32 > secrets/gumroad_secret # paste into Gumroad's Ping URL: ?secret=<this>
|
||||||
|
#
|
||||||
|
# --- production-key follow-up (defer until v1.0 cutover) ---
|
||||||
|
# echo -n "<ed25519-private-hex>" > secrets/license_privkey
|
||||||
|
|
||||||
|
# Lock everything down. The numeric 10001 matches the in-container
|
||||||
|
# `app` user (Dockerfile-defined), letting the API read the file
|
||||||
|
# while keeping host-side access gated by the parent dir's mode 750.
|
||||||
|
chmod 400 secrets/*
|
||||||
|
chown 10001:10001 secrets/*
|
||||||
|
```
|
||||||
|
|
||||||
|
The corresponding **public** key for `DATATOOLS_LICENSE_PUBKEY` goes
|
||||||
|
in `/srv/datatools-license/.env` (it's not secret — it's already in
|
||||||
|
every shipped binary):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "DATATOOLS_LICENSE_PUBKEY=<hex-pubkey>" > /srv/datatools-license/.env
|
||||||
|
chmod 640 /srv/datatools-license/.env
|
||||||
|
chown datatools-api:datatools-api /srv/datatools-license/.env
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. App image build
|
||||||
|
|
||||||
|
The Mint API source lives in this repo under `server/` (new directory
|
||||||
|
introduced by PR 1). Build the Docker image:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license/app
|
||||||
|
git clone https://git.invixiom.com/giteadmin/datatools-dev.git .
|
||||||
|
docker build -t datatools-license-api:latest -f server/Dockerfile server/
|
||||||
|
```
|
||||||
|
|
||||||
|
Schema bootstrap (one-time, after first `docker compose up`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec api alembic upgrade head
|
||||||
|
```
|
||||||
|
|
||||||
|
Smoke test:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s http://127.0.0.1:8090/health
|
||||||
|
# expects: {"status":"ok","db":"ok"}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. nginx config
|
||||||
|
|
||||||
|
> **Gotcha — nginx version syntax.** Ubuntu 24.04 ships nginx 1.24,
|
||||||
|
> which uses the legacy `listen 443 ssl http2;` form. The standalone
|
||||||
|
> `http2 on;` directive arrived in nginx 1.25 and will error on 1.24
|
||||||
|
> with `unknown directive "http2"`. The config below uses the 1.24
|
||||||
|
> form.
|
||||||
|
>
|
||||||
|
> **Bring-up sequence.** This config references a TLS cert at
|
||||||
|
> `/etc/letsencrypt/live/datatools.unalogix.com/`, which doesn't
|
||||||
|
> exist on a fresh install — nginx would refuse to start. The
|
||||||
|
> working sequence is: (a) install a temporary HTTP-only config
|
||||||
|
> that serves `.well-known/acme-challenge/` and returns 503 for
|
||||||
|
> everything else, (b) `nginx -s reload`, (c) run `certbot
|
||||||
|
> certonly --webroot`, (d) replace with the HTTPS config below,
|
||||||
|
> (e) `nginx -s reload` again. See §6.
|
||||||
|
|
||||||
|
`/etc/nginx/sites-available/unalogix` — **new file**, do not merge
|
||||||
|
into `invixiom`:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
# Plain-HTTP listener: redirects both hostnames to HTTPS. The marketing / product site (datatools.unalogix.com) is the next block — static for now.
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name datatools.unalogix.com licenses.datatools.unalogix.com;
|
||||||
|
return 301 https://$host$request_uri;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 443 ssl http2; # nginx 1.24 syntax (Ubuntu 24.04)
|
||||||
|
server_name datatools.unalogix.com;
|
||||||
|
|
||||||
|
ssl_certificate /etc/letsencrypt/live/datatools.unalogix.com/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/datatools.unalogix.com/privkey.pem;
|
||||||
|
|
||||||
|
root /srv/datatools-license/site; # static landing page; create later
|
||||||
|
index index.html;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
try_files $uri $uri/ =404;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# License operations subdomain.
|
||||||
|
server {
|
||||||
|
listen 443 ssl http2; # nginx 1.24 syntax (Ubuntu 24.04)
|
||||||
|
server_name licenses.datatools.unalogix.com;
|
||||||
|
|
||||||
|
ssl_certificate /etc/letsencrypt/live/datatools.unalogix.com/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/datatools.unalogix.com/privkey.pem;
|
||||||
|
|
||||||
|
# Block /internal/* from the public side as defense-in-depth.
|
||||||
|
# (The app also enforces this server-side; this is layered.)
|
||||||
|
location /internal/ {
|
||||||
|
return 404;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://127.0.0.1:8090;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# Gumroad webhook payloads are tiny but tighten anyway.
|
||||||
|
client_max_body_size 1m;
|
||||||
|
|
||||||
|
# Basic rate limiting: 30 req/min/IP on /webhooks/* and /portal/*.
|
||||||
|
# Tune in nginx.conf with a `limit_req_zone` directive.
|
||||||
|
# limit_req zone=licenses burst=10 nodelay;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Enable + reload:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ln -s /etc/nginx/sites-available/unalogix /etc/nginx/sites-enabled/unalogix
|
||||||
|
nginx -t # validate
|
||||||
|
systemctl reload nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. TLS cert
|
||||||
|
|
||||||
|
Use the webroot http-01 challenge (the nginx plugin works too; this is
|
||||||
|
slightly more explicit):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
certbot certonly \
|
||||||
|
--webroot -w /var/www/html \
|
||||||
|
-d datatools.unalogix.com \
|
||||||
|
-d licenses.datatools.unalogix.com \
|
||||||
|
--agree-tos \
|
||||||
|
--email michael.dombaugh@gmail.com \
|
||||||
|
--non-interactive
|
||||||
|
```
|
||||||
|
|
||||||
|
Cert lands at `/etc/letsencrypt/live/datatools.unalogix.com/`.
|
||||||
|
Auto-renewal is already configured by the certbot package (systemd
|
||||||
|
timer `certbot.timer`). Confirm:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
systemctl list-timers certbot.timer
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Bring it up
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
docker compose up -d
|
||||||
|
docker compose ps # both services should be 'running (healthy)'
|
||||||
|
docker compose logs -f api
|
||||||
|
```
|
||||||
|
|
||||||
|
Public smoke test:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s https://licenses.datatools.unalogix.com/health
|
||||||
|
# expects: {"status":"ok","db":"ok"}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Verification — end-to-end internal mint
|
||||||
|
|
||||||
|
From your laptop (NOT the server), open an SSH tunnel for the internal
|
||||||
|
endpoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh -L 8090:127.0.0.1:8090 michael@46.225.166.142 -N
|
||||||
|
# Leave running; in another terminal:
|
||||||
|
|
||||||
|
curl -X POST http://127.0.0.1:8090/internal/mint \
|
||||||
|
-H "Authorization: Bearer $DATATOOLS_ADMIN_TOKEN" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"name":"Test Buyer",
|
||||||
|
"email":"test@example.com",
|
||||||
|
"tier":"core",
|
||||||
|
"years":1,
|
||||||
|
"source":"manual",
|
||||||
|
"notes":"smoke test"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: 200 + a `DTLIC2:...` blob + a row inserted in the `licenses`
|
||||||
|
table. Confirm with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses \
|
||||||
|
-c "SELECT license_key, email, tier, source FROM licenses;"
|
||||||
|
```
|
||||||
|
|
||||||
|
Then **delete the test row** before going further:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses \
|
||||||
|
-c "DELETE FROM licenses WHERE email = 'test@example.com';"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Operational concerns
|
||||||
|
|
||||||
|
### Backups (Postgres → off-site)
|
||||||
|
|
||||||
|
`/etc/cron.daily/datatools-license-backup`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
set -euo pipefail
|
||||||
|
TS=$(date -u +%Y%m%dT%H%M%SZ)
|
||||||
|
OUT=/srv/datatools-license/backups/db-${TS}.sql.gz
|
||||||
|
docker compose -f /srv/datatools-license/compose.yml exec -T postgres \
|
||||||
|
pg_dump -U datatools_api datatools_licenses | gzip > "$OUT"
|
||||||
|
chmod 600 "$OUT"
|
||||||
|
# Off-site copy — pick one:
|
||||||
|
# rclone copy "$OUT" remote:datatools-license-backups/
|
||||||
|
# aws s3 cp "$OUT" s3://datatools-backups/db/ --sse AES256
|
||||||
|
find /srv/datatools-license/backups -name 'db-*.sql.gz' -mtime +30 -delete
|
||||||
|
```
|
||||||
|
|
||||||
|
Pick an off-site target. Without one, a disk failure loses every
|
||||||
|
customer record. Test the restore at least once on a staging copy.
|
||||||
|
|
||||||
|
### Monitoring
|
||||||
|
|
||||||
|
External uptime probe (free):
|
||||||
|
1. UptimeRobot account → add monitor for `https://licenses.datatools.unalogix.com/health`.
|
||||||
|
2. 5-minute interval, alert to email/SMS.
|
||||||
|
|
||||||
|
Container health is already handled by `restart: unless-stopped` +
|
||||||
|
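healthcheck (note: the restart policy acts when a container exits; a container that is merely reported unhealthy is not restarted by Docker on its own). To see recent failures: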
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose ps # last health-check status
|
||||||
|
docker compose logs api --tail 200
|
||||||
|
journalctl -u docker --since '1 hour ago' | grep datatools
|
||||||
|
```
|
||||||
|
|
||||||
|
### Log rotation
|
||||||
|
|
||||||
|
Docker handles container logs; cap their size in
|
||||||
|
`/etc/docker/daemon.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"log-driver": "json-file",
|
||||||
|
"log-opts": {
|
||||||
|
"max-size": "10m",
|
||||||
|
"max-file": "3"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then `systemctl restart docker` (this restarts all containers — schedule
|
||||||
|
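during a quiet window). The new limits apply only to containers created after the change, so recreate the stack afterwards with `docker compose up -d --force-recreate`.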
|
||||||
|
|
||||||
|
### Key rotation (future)
|
||||||
|
|
||||||
|
If the private signing key is ever compromised:
|
||||||
|
|
||||||
|
1. Generate a new keypair (`scripts/generate_keypair.py`).
|
||||||
|
2. Build and ship a desktop release with the new pubkey embedded.
|
||||||
|
3. Update `/srv/datatools-license/secrets/license_privkey` and
|
||||||
|
`/srv/datatools-license/.env`'s pubkey.
|
||||||
|
4. `docker compose restart api`.
|
||||||
|
5. Re-issue every active license (script that queries the DB, calls
|
||||||
|
`/internal/mint`, emails buyers). Old blobs will fail verification
|
||||||
|
in the new desktop build.
|
||||||
|
|
||||||
|
Plan a 90-day overlap window where the desktop verifies against
|
||||||
|
*both* keys before retiring the old pubkey. (Verification logic
|
||||||
|
change to the desktop app — not in scope for PR 1.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. PR cutover sequence
|
||||||
|
|
||||||
|
This runbook covers the box-level scaffolding. Application code lands
|
||||||
|
in three independently shippable PRs:
|
||||||
|
|
||||||
|
| PR | Adds | Ship gate | Webhook live? |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **1** | Source-agnostic Mint API + Postgres + `datatools-admin mint` CLI | Operator can mint a comp license through the server | No |
|
||||||
|
| **2** | Gumroad adapter + webhook receiver + email send | Real Gumroad sale auto-mints + emails buyer | **Yes** (enable in Gumroad dashboard at this PR's deploy) |
|
||||||
|
| **3** | Renewal / re-delivery portal | Buyer self-services renewals and lost-blob re-delivery | (unchanged) |
|
||||||
|
|
||||||
|
§1d (Gumroad webhook URL) is **filled in during PR 2's deploy**, not
|
||||||
|
before. Until then the endpoint returns 404.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Rollback
|
||||||
|
|
||||||
|
Each component is independently reversible.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop and remove containers (DB volume persists)
|
||||||
|
docker compose -f /srv/datatools-license/compose.yml down
|
||||||
|
|
||||||
|
# Full teardown including DB (DESTRUCTIVE — backup first)
|
||||||
|
docker compose -f /srv/datatools-license/compose.yml down -v
|
||||||
|
|
||||||
|
# Remove nginx site
|
||||||
|
rm /etc/nginx/sites-enabled/unalogix
|
||||||
|
nginx -t && systemctl reload nginx
|
||||||
|
|
||||||
|
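# Delete TLS cert and its renewal config (`certbot delete` does not revoke; run `certbot revoke` first if the key may be exposed)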
|
||||||
|
certbot delete --cert-name datatools.unalogix.com
|
||||||
|
|
||||||
|
# Remove filesystem
|
||||||
|
rm -rf /srv/datatools-license # NOTE: includes secrets dir; backup first
|
||||||
|
|
||||||
|
# Remove system user
|
||||||
|
deluser datatools-api
|
||||||
|
delgroup datatools-api
|
||||||
|
```
|
||||||
|
|
||||||
|
DNS records can stay or be removed — they're not on this host.
|
||||||
@@ -1,570 +1,399 @@
|
|||||||
# TECHNICAL.md - Technical Design, Build Pipeline, Standards
|
# Technical
|
||||||
|
|
||||||
> **Creator-only document. Do not ship to buyers.**
|
> Creator-only. Do not ship to buyers.
|
||||||
|
> **Version**: 1.6 · **Updated**: 2026-05-01
|
||||||
|
|
||||||
**Version**: 1.6
|
For the end-to-end picture (desktop app + license server + storefronts
|
||||||
**Last updated**: April 28, 2026
|
+ email), see `ARCHITECTURE.md`. This doc focuses on desktop internals.
|
||||||
|
|
||||||
---
|
## 1. Architecture
|
||||||
|
|
||||||
## 1. Architecture Overview
|
- **Dual interface**: CLI + GUI, both wrapping the same `src/core/` library.
|
||||||
|
- **GUI**: Streamlit, runs as local web server, opens in default browser. No internet.
|
||||||
|
- **Runtime**: Python 3.10+ (bundled into installer; buyer never sees Python).
|
||||||
|
- **Cross-platform**: Windows, macOS, Linux from day one. PyInstaller per OS.
|
||||||
|
- **Core/UI rule**: business logic in `core/` only. CLI + GUI are thin front-ends.
|
||||||
|
|
||||||
- Standalone tools with **dual interface**: CLI and GUI, both wrapping the same core library.
|
**Locks**:
|
||||||
- GUI framework: **Streamlit**. Runs as a local web server, opens in the buyer's default browser. No internet used.
|
- v1.2 — dual interface required (non-technical buyers won't use CLI).
|
||||||
- Python 3.11+ runtime (bundled into the installer; the buyer never installs Python).
|
- v1.3 — Streamlit chosen over CustomTkinter (inactive), plain Tk (UX gap), and Flet/PySide6/NiceGUI (each fails one dimension). See DECISIONS.md §4c.
|
||||||
- Modular code, one concern per script. Core logic is library code; CLI and GUI are thin front-ends.
|
|
||||||
- Cross-platform from day one: Windows, macOS, Linux.
|
|
||||||
- PyInstaller produces standalone executables per OS. Buyer never sees Python, pip, venvs, or PATH.
|
|
||||||
- No internet required at runtime.
|
|
||||||
|
|
||||||
**Why dual interface (locked v1.2)**: The primary buyer persona is non-technical and will not use a CLI. The GUI is therefore the primary surface and is required at v1, not deferred. The CLI is retained for power users, automation, scheduled jobs, and future scripted workflows. Both share a single core; neither has features the other lacks (except interactive review, which only makes sense in GUI).
|
## 2. Repo layout
|
||||||
|
|
||||||
**Why Streamlit (locked v1.3)**: Fastest build velocity, lowest maintenance burden per added feature, hosted browser demo deployable as a marketing asset, future SaaS optionality. Selected over CustomTkinter (maintenance inactive since Jan 2024), plain Tkinter (UX gap at this price tier), Flet (ecosystem too young), PySide6 (overkill), and NiceGUI (smaller community). Full rationale in DECISIONS.md Section 4c.
|
|
||||||
|
|
||||||
This is a major change from the original Inno-Setup-only, CLI-only design. Rationale chain:
|
|
||||||
1. Requiring a buyer to install Python before using the product is the largest source of install friction (solved by PyInstaller in v1.1).
|
|
||||||
2. Requiring a non-technical buyer to use a CLI is the second-largest source of refund risk (solved by dual interface in v1.2).
|
|
||||||
3. Betting the GUI on an unmaintained library is the largest hidden technical risk (solved by Streamlit choice in v1.3).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 2. Standard Bundle Structure (source repo)
|
|
||||||
|
|
||||||
Every bundle follows this layout in source. Core logic is shared, CLI and GUI are thin front-ends.
|
|
||||||
|
|
||||||
```
|
```
|
||||||
bundle-name/
|
src/
|
||||||
├── src/
|
core/ # Shared logic. No UI code.
|
||||||
│ ├── __init__.py
|
analyze.py # Detectors + Finding schema
|
||||||
│ ├── core/ # Shared business logic. No UI code here.
|
config.py # DeduplicationConfig (JSON profiles)
|
||||||
│ │ ├── __init__.py
|
dedup.py # Match strategies, union-find, survivor selection
|
||||||
│ │ ├── dedup.py # (example) the actual algorithm
|
errors.py # Structured error hierarchy + format_for_user
|
||||||
│ │ └── io.py # File I/O, encoding/delimiter detection, etc.
|
fixes.py # Fix registry (one per fix_action)
|
||||||
│ ├── cli.py # Command-line interface (Typer). Thin wrapper over core.
|
format_standardize.py # Per-cell standardizers + DataFrame pipeline
|
||||||
│ └── gui/ # Streamlit front-end. Thin wrapper over core.
|
io.py # read_file / write_file / repair_bytes
|
||||||
│ ├── __init__.py
|
normalize.py # CSV-normalization gate
|
||||||
│ ├── app.py # Main Streamlit entry point (st.set_page_config, layout)
|
normalizers.py # Per-column normalizers for dedup matching
|
||||||
│ ├── pages/ # Streamlit multi-page app (one page per script in the bundle)
|
text_clean.py # clean_dataframe + smart_title_case
|
||||||
│ │ ├── 1_Deduplicator.py
|
_constants.py # Shared USPS abbrevs + state names
|
||||||
│ │ ├── 2_Text_Cleaner.py
|
cli.py # Find Duplicates CLI (Typer)
|
||||||
│ │ ├── 3_Format_Standardizer.py
|
cli_text_clean.py # Clean Text CLI
|
||||||
│ │ └── ...
|
cli_analyze.py # Analyzer CLI (--json)
|
||||||
│ └── components.py # Reusable Streamlit widgets and helpers
|
gui/
|
||||||
├── data_examples/ # Sample input files
|
app.py # Streamlit entry point
|
||||||
├── tests/ # Unit tests (pytest). Tests target core, not UI.
|
pages/ # One page per tool
|
||||||
├── build/
|
components/ # shared, dedup_review, findings, gate, _legacy
|
||||||
│ ├── pyinstaller.spec # PyInstaller build spec (handles both CLI + GUI entry points)
|
i18n/ # GUI language packs (JSON-backed, in-house lookup)
|
||||||
│ ├── launcher.py # Small launcher script: starts Streamlit server, opens browser
|
__init__.py # t() · current_language() · render_language_selector()
|
||||||
│ ├── windows/
|
packs/ # en.json, es.json, … (one file per language)
|
||||||
│ │ └── installer.iss # Inno Setup wrapper for Windows .exe installer
|
build/ # PyInstaller spec, launcher, OS-specific configs
|
||||||
│ ├── macos/
|
demo/ # Constrained Streamlit Community Cloud version
|
||||||
│ │ ├── entitlements.plist
|
tests/ # pytest; targets core/, not UI
|
||||||
│ │ └── dmg_settings.py # dmg-creation config
|
test-cases/ # Fixture corpora (text-cleaner, encodings, format-cleaner)
|
||||||
│ └── linux/
|
|
||||||
│ └── AppImage/ # AppImage build assets
|
|
||||||
├── demo/ # Stripped-down version for hosted browser demo
|
|
||||||
│ └── streamlit_app.py # Entry point for Streamlit Community Cloud deployment
|
|
||||||
├── requirements.txt
|
|
||||||
├── README_bundle.md # User-facing guide (covers both CLI and GUI usage)
|
|
||||||
├── LICENSE
|
|
||||||
└── ci/
|
|
||||||
└── build.yml # GitHub Actions cross-platform build
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Core/UI separation rule**: A new feature is implemented in `core/` first, with tests. CLI and GUI both call into core. If a feature exists only in one front-end (e.g., interactive review only in GUI), the underlying capability still lives in core; only the presentation differs.
|
**Demo subfolder**: row-limited, watermarked, file-size-capped Streamlit app for public deployment. Same core, different front-end constraints.
|
||||||
|
|
||||||
**Demo subfolder rule**: The `demo/` folder contains a constrained Streamlit app for public deployment to Streamlit Community Cloud. Constraints: row limit (e.g., 100 rows max output), no file save, watermark on output, sample dataset only or strict file-size cap. Same core library, different front-end constraints.
|
## 3. Build pipeline
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 3. Cross-Platform Build Pipeline
|
|
||||||
|
|
||||||
### 3.1 Tooling
|
### 3.1 Tooling
|
||||||
|
|
||||||
| Concern | Tool |
|
| Concern | Tool |
|
||||||
|---|---|
|
|---------|------|
|
||||||
| Bundling Python + scripts into a standalone binary | PyInstaller |
|
| Bundling | PyInstaller |
|
||||||
| GUI framework | **Streamlit** |
|
| GUI | Streamlit |
|
||||||
| Browser launch from launcher | Python `webbrowser` module (stdlib) |
|
| CLI | Typer |
|
||||||
| CLI framework | Typer |
|
| Browser launch | stdlib `webbrowser` |
|
||||||
| Windows installer wrapper | Inno Setup (free) |
|
| Win installer | Inno Setup (free) |
|
||||||
| macOS bundle format | `.app` packaged in `.dmg` |
|
| macOS sign+notarize | `codesign` + `notarytool` |
|
||||||
| macOS code signing & notarization | `codesign` + `notarytool` (built into Xcode command line tools) |
|
| Linux | AppImage (primary) + tarball fallback |
|
||||||
| Linux distribution format | AppImage (primary) + plain tarball (fallback) |
|
| CI | GitHub Actions matrix |
|
||||||
| CI / automated builds | GitHub Actions (free tier handles all three OS runners) |
|
| Demo host | Streamlit Community Cloud (free) |
|
||||||
| Hosted demo | Streamlit Community Cloud (free) or $5/mo VPS |
|
|
||||||
|
|
||||||
### 3.2 Build Outputs (what the buyer downloads)
|
### 3.2 Build outputs
|
||||||
|
| OS | File | Buyer experience |
|
||||||
|
|----|------|------------------|
|
||||||
|
| Win | `*-Setup-1.0.exe` | Wizard → desktop shortcut "Launch Bundle" → browser opens. CLI on PATH. |
|
||||||
|
| macOS | `*-1.0.dmg` | Drag to Applications. Signed + notarized. |
|
||||||
|
| Linux | `*-1.0.AppImage` | `chmod +x`, double-click. |
|
||||||
|
|
||||||
| Platform | Output file | Buyer experience |
|
### 3.3 PyInstaller
|
||||||
|---|---|---|
|
|
||||||
| Windows | `BundleName-Setup-1.0.exe` | Double-click installer, click through wizard. Desktop shortcut "Launch Bundle" runs `launcher.py`, which starts the local Streamlit server and opens default browser to `http://localhost:8501`. CLI executables also installed and on PATH. |
|
|
||||||
| macOS | `BundleName-1.0.dmg` | Double-click DMG, drag app to Applications. Signed and notarized. Launching the app runs the launcher, which starts the local server and opens the browser. CLI binaries shipped in the app bundle. |
|
|
||||||
| Linux | `BundleName-1.0.AppImage` | Mark executable, double-click. AppImage runs the launcher, opens browser. Tarball fallback also includes CLI binaries. |
|
|
||||||
|
|
||||||
The **default buyer experience on every platform is**: double-click, browser opens, work done. The CLI is present, documented, and on PATH for users who want it.
|
- `--onefile` for Linux, `--onedir` for Win/macOS (faster startup, easier signing).
|
||||||
|
- Two entry points: GUI launcher + CLI binaries.
|
||||||
|
- Streamlit hooks needed: `streamlit`, `altair`, `pyarrow` data dirs.
|
||||||
|
- Custom `hook-streamlit.py` per documented pattern.
|
||||||
|
- Budget: 1-3 days first time. Reusable after.
|
||||||
|
|
||||||
**Browser-launch UX mitigation** (per DECISIONS.md Section 4c tradeoff): The launcher script displays a brief "Starting your data tool..." console message before opening the browser. The Streamlit app's first page includes a one-line note: *"This tool runs locally in your browser and does not use the internet."* Install email reinforces the same message.
|
### 3.4 Streamlit launcher
|
||||||
|
|
||||||
### 3.3 PyInstaller Configuration
|
1. Find free port (don't hardcode 8501).
|
||||||
|
2. Set env: `STREAMLIT_SERVER_HEADLESS=true`, `STREAMLIT_BROWSER_GATHER_USAGE_STATS=false`, `STREAMLIT_SERVER_PORT={port}`.
|
||||||
|
3. Start Streamlit programmatically in a thread.
|
||||||
|
4. Poll port until ready.
|
||||||
|
5. Open browser to `http://localhost:{port}`.
|
||||||
|
6. Keep launcher alive while server runs.
|
||||||
|
|
||||||
Single `.spec` file per bundle, parameterized for OS. Key settings:
|
Optional v1.1: wrap with `pywebview` to eliminate browser-launch UX. Defer until support tickets show meaningful confusion.
|
||||||
|
|
||||||
- `--onefile` for Linux (single AppImage), `--onedir` for Windows and macOS (faster startup, easier signing).
|
### 3.5 macOS pipeline
|
||||||
- All dependencies bundled. No internet required at runtime.
|
|
||||||
- Hidden imports declared explicitly for pandas/openpyxl/Streamlit edge cases (PyInstaller's auto-detection misses some).
|
|
||||||
- Icon files per platform (`.ico` for Windows, `.icns` for macOS, `.png` for Linux).
|
|
||||||
- **Two entry points per bundle**: the GUI launcher (default, what the desktop shortcut runs) and the CLI binaries.
|
|
||||||
- **Streamlit-specific PyInstaller hooks**: include the `streamlit` data directory, the `altair` data directory (Streamlit dependency), and the `pyarrow` C extensions. Add a custom hook file (`hook-streamlit.py`) per the documented pattern. Budget 1-3 days the first time getting the spec right; reuse across all subsequent bundles.
|
|
||||||
|
|
||||||
### 3.4 Streamlit Launcher Pattern
|
|
||||||
|
|
||||||
The launcher script handles starting the local Streamlit server in a way that survives PyInstaller bundling. Conceptual outline:
|
|
||||||
|
|
||||||
1. Find a free local port (avoid hardcoding 8501 in case of conflict).
|
|
||||||
2. Set Streamlit environment variables: `STREAMLIT_SERVER_HEADLESS=true`, `STREAMLIT_BROWSER_GATHER_USAGE_STATS=false`, `STREAMLIT_SERVER_PORT={port}`.
|
|
||||||
3. Start Streamlit programmatically (via `streamlit.web.cli.main_run` or `bootstrap.run`) in a background thread.
|
|
||||||
4. Wait for the port to accept connections (poll with timeout).
|
|
||||||
5. Open the buyer's default browser to `http://localhost:{port}` via `webbrowser.open()`.
|
|
||||||
6. Keep the launcher process alive while the server runs. Detect server shutdown and exit cleanly.
|
|
||||||
|
|
||||||
Optional v1.1 enhancement: replace step 5 with a `pywebview` window that wraps the local server. Eliminates the "default browser opens" UX surprise. Adds a dependency and some packaging complexity. Defer until support tickets show the browser-launch is causing meaningful confusion.
|
|
||||||
|
|
||||||
### 3.5 macOS Signing & Notarization Pipeline
|
|
||||||
|
|
||||||
Required setup (one-time):
|
|
||||||
1. Enroll in Apple Developer Program ($99/yr - see BUSINESS.md Section 10).
|
|
||||||
2. Generate Developer ID Application certificate via Apple Developer portal.
|
|
||||||
3. Install certificate in macOS keychain on the build machine (or store as encrypted GitHub Actions secret for CI).
|
|
||||||
4. Generate an app-specific password for `notarytool`.
|
|
||||||
|
|
||||||
Build-time flow (automated):
|
|
||||||
1. PyInstaller produces unsigned `.app`.
|
1. PyInstaller produces unsigned `.app`.
|
||||||
2. `codesign --deep --force --options runtime --sign "Developer ID Application: [Your Name]" BundleName.app`
|
2. `codesign --deep --force --options runtime --sign "Developer ID Application: ..." App.app`.
|
||||||
3. Package into `.dmg`.
|
3. Package as `.dmg`.
|
||||||
4. Submit `.dmg` to Apple notary service: `xcrun notarytool submit BundleName.dmg --wait`.
|
4. `xcrun notarytool submit *.dmg --wait`.
|
||||||
5. Staple the notarization ticket: `xcrun stapler staple BundleName.dmg`.
|
5. `xcrun stapler staple *.dmg`.
|
||||||
6. Output is the final, distributable `.dmg`.
|
|
||||||
|
|
||||||
Buyers on macOS see no Gatekeeper warnings. Clean install.
|
Setup: Apple Developer Program ($99/yr), Developer ID cert in Keychain, app-specific password.
|
||||||
|
|
||||||
### 3.6 Windows Pipeline
|
### 3.6-3.7 Win + Linux
|
||||||
|
|
||||||
1. PyInstaller produces `BundleName/` folder with launcher `BundleName.exe` (which opens the GUI in browser) plus CLI binaries plus dependencies.
|
- **Win**: PyInstaller `--onedir` → Inno Setup wraps → installer adds Start Menu, desktop shortcut, PATH entries. Optional code-signing cert ($200-400/yr) if SmartScreen friction shows up in support tickets.
|
||||||
2. Inno Setup script wraps the folder into `BundleName-Setup-1.0.exe`.
|
- **Linux**: PyInstaller → `appimagetool` wraps. `.tar.gz` fallback for distros where AppImage fails.
|
||||||
3. Installer creates Start Menu entry, desktop shortcut (launches GUI), optional Add/Remove Programs entry, and adds CLI binaries to PATH.
|
|
||||||
4. Optional Windows code signing certificate (~$200-400/yr from a CA) eliminates SmartScreen warnings. **Not required at launch**; revisit if SmartScreen friction shows up in support tickets.
|
|
||||||
|
|
||||||
### 3.7 Linux Pipeline
|
### 3.8 CI matrix
|
||||||
|
|
||||||
1. PyInstaller produces single-file binaries per entry point.
|
|
||||||
2. Wrap in AppImage using `appimagetool` (free, well-documented). AppImage runs the launcher as the default target.
|
|
||||||
3. Provide a plain `.tar.gz` fallback for users on distributions where AppImage fails. Tarball includes both GUI launcher and CLI binaries plus a `run.sh`.
|
|
||||||
4. No signing required on Linux; users execute `chmod +x` then double-click or run.
|
|
||||||
|
|
||||||
### 3.8 CI Build Matrix
|
|
||||||
|
|
||||||
GitHub Actions builds all three platforms on tagged release:
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
# Conceptual, full file lives in ci/build.yml
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
os: [windows-latest, macos-latest, ubuntu-latest]
|
os: [windows-latest, macos-latest, ubuntu-latest]
|
||||||
```
|
```
|
||||||
|
|
||||||
Result: one git tag triggers three platform builds. Artifacts upload to GitHub Releases. Manual step: copy artifacts to Gumroad / Lemon Squeezy product page.
|
Tag a release → 3 platform artifacts upload to GitHub Releases. Manual: copy to Gumroad / Lemon Squeezy.
|
||||||
|
|
||||||
### 3.9 Hosted Demo Deployment
|
### 3.9 Hosted demo
|
||||||
|
|
||||||
Separate from the desktop build pipeline. The `demo/streamlit_app.py` entry point is deployed to Streamlit Community Cloud:
|
`demo/streamlit_app.py` → Streamlit Community Cloud. Configure deployment in Streamlit UI. Custom domain via CNAME (verify policy at deploy time). Fall back to $5/mo VPS if rate limits / branding constraints hit.
|
||||||
|
|
||||||
1. Connect the GitHub repo to Streamlit Community Cloud (one-time).
|
### 3.10 Bundled Tesseract (PDF Extractor OCR)
|
||||||
2. Configure the app to deploy from the `demo/` folder, main branch.
|
|
||||||
3. Set deployment-time environment variables (e.g., row limits, watermark flag).
|
|
||||||
4. App is publicly accessible at a `*.streamlit.app` URL. Link from Gumroad landing page.
|
|
||||||
5. Optional: custom domain via CNAME (free with Streamlit Community Cloud as of last check; verify before locking).
|
|
||||||
|
|
||||||
If Streamlit Community Cloud is ever unsuitable (rate limits, bandwidth, branding requirements), fall back to a $5/mo VPS running the demo via Docker. Same `demo/streamlit_app.py`, different host.
|
Frozen builds ship Tesseract 5.5 + `eng.traineddata` inside the PyInstaller bundle so scanned PDFs work without a separate install. Per-platform binary URLs pinned in `build/make_release.py`; tessdata vendored at `build/vendor/tessdata/eng.traineddata`. License attribution in `LICENSE_TESSERACT.txt` at the repo root.
|
||||||
|
|
||||||
---
|
**Discovery order at runtime** (see `docs/DEVELOPER.md` for the full Path layout):
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||||
|
2. Bundled path under `sys._MEIPASS / "tesseract" /` (frozen bundles only).
|
||||||
|
3. `tesseract` on `PATH` (source / pip developer environments).
|
||||||
|
4. Windows well-known locations.
|
||||||
|
|
||||||
## 4. Libraries
|
## 4. Libraries
|
||||||
|
|
||||||
| Purpose | Library |
|
| Purpose | Library |
|
||||||
|---|---|
|
|---------|---------|
|
||||||
| GUI framework | Streamlit |
|
| GUI | streamlit |
|
||||||
| CLI framework | Typer |
|
| CLI | typer |
|
||||||
| Data manipulation | pandas, openpyxl, numpy |
|
| Data | pandas, openpyxl, numpy |
|
||||||
| Fuzzy string matching | rapidfuzz |
|
| Fuzzy match | rapidfuzz |
|
||||||
| File encoding detection | charset-normalizer |
|
| Phone parsing | phonenumbers |
|
||||||
|
| Encoding detect | charset-normalizer |
|
||||||
| Logging | loguru |
|
| Logging | loguru |
|
||||||
| Progress bars | tqdm (CLI), `st.progress` (GUI) |
|
| Mojibake (optional) | ftfy |
|
||||||
| Validation | pydantic (optional) |
|
| Reports | reportlab |
|
||||||
| Reports | ReportLab (PDF), pandas styling (Excel) |
|
|
||||||
| Optional native window wrap | pywebview (deferred to v1.1 if needed) |
|
|
||||||
|
|
||||||
`requirements.txt` (current bundle, v1.3):
|
## 5. Coding standards
|
||||||
|
|
||||||
|
### 5.1 Code
|
||||||
|
- PEP 8 + type hints on public functions.
|
||||||
|
- Docstrings on every module + public function.
|
||||||
|
- `pathlib.Path` for paths, never string concat.
|
||||||
|
- All I/O explicitly UTF-8-aware.
|
||||||
|
- No platform-specific shell calls.
|
||||||
|
- pytest for `core/`, not UI.
|
||||||
|
- Errors raise via `src.core.errors` hierarchy (Section 7).
|
||||||
|
|
||||||
|
### 5.2 GUI UX (load-bearing per DECISIONS.md §4b)
|
||||||
|
- **Works out of the box** — drop file → useful result with zero config.
|
||||||
|
- **Sensible defaults visible everywhere**.
|
||||||
|
- **Progressive disclosure** — basic = file uploader + run button + results; rest in `st.expander`.
|
||||||
|
- **Plain-English labels**; technical detail in `help=` tooltip.
|
||||||
|
- **Dry-run / preview by default**.
|
||||||
|
- **Identical core to CLI**.
|
||||||
|
- **Local-first messaging** — "runs locally in your browser, no internet" line on every page.
|
||||||
|
|
||||||
|
### 5.3 Functional scope (load-bearing per DECISIONS.md §4a)
|
||||||
|
- Each script ships **complete coverage of the workflow it names**, including features Excel does for free.
|
||||||
|
- Boundary = the named workflow. Dedup includes normalization + survivor + audit; not format conversion or charting.
|
||||||
|
|
||||||
|
## 6. System requirements
|
||||||
|
|
||||||
|
**Buyer runtime**: Win 10/11 64-bit · macOS 11+ · Linux glibc 2020+ · modern browser · ~400-500 MB disk · no internet.
|
||||||
|
|
||||||
|
**Developer**: Python 3.10+ · PyInstaller · Inno Setup (Win) · Xcode CLT (macOS) · Apple Developer Program $99/yr · Git + GitHub.
|
||||||
|
|
||||||
|
## 7. Error handling (`src/core/errors.py`)
|
||||||
|
|
||||||
|
Structured hierarchy for friendly messages + maintainable trace context:
|
||||||
|
|
||||||
```
|
```
|
||||||
streamlit>=1.30
|
DataToolsError # base; carries path/column/operation/suggestion/cause
|
||||||
pandas
|
InputValidationError(ValueError) # bad arg / wrong type
|
||||||
openpyxl
|
ConfigError(ValueError) # bad config / options
|
||||||
numpy
|
FileFormatError(ValueError) # file isn't what we expected
|
||||||
typer
|
FileAccessError(OSError) # I/O failure (perms, disk, missing)
|
||||||
rapidfuzz
|
|
||||||
charset-normalizer
|
|
||||||
loguru
|
|
||||||
tqdm
|
|
||||||
reportlab
|
|
||||||
pyarrow # Streamlit dependency, declare explicitly for PyInstaller clarity
|
|
||||||
altair # Streamlit dependency, declare explicitly for PyInstaller clarity
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
**Subclassing rule**: every subclass extends a stdlib base (`ValueError` or `OSError`) so existing `except OSError` / `except ValueError` handlers still catch them.
|
||||||
|
|
||||||
## 5. Coding Standards
|
**Helpers**:
|
||||||
|
- `ensure_dataframe(value, function=...)` — uniform DataFrame guard at every public entry.
|
||||||
|
- `ensure_choice(value, name=, choices=)` — uniform enum/literal guard.
|
||||||
|
- `wrap_file_read(path, op, exc)` / `wrap_file_write(...)` — tag OSError with file path + Windows-aware permission tip.
|
||||||
|
- `format_for_user(exc, context=)` — single string for `st.error()` / CLI stderr.
|
||||||
|
|
||||||
### 5.1 Code Standards
|
GUI / CLI handlers use `format_for_user()` so the user always sees: file path, operation, underlying error class, recovery suggestion.
|
||||||
|
|
||||||
- PEP 8 + type hints on all public functions.
|
## 8. Per-bundle status
|
||||||
- Docstrings on every module and public function.
|
|
||||||
- `--help` output (CLI) that a non-technical user can act on.
|
|
||||||
- Graceful error handling with human-readable messages, not stack traces. Errors must name the problem AND the likely fix where possible (e.g., "Column 'email' not found. Available columns: name, phone. Did you mean 'phone'?").
|
|
||||||
- All file paths handled via `pathlib.Path`, never string concatenation. Cross-platform correctness depends on this.
|
|
||||||
- All file I/O explicitly UTF-8-aware: detect encoding on input (charset-normalizer), write UTF-8 on output. Windows defaults to cp1252 otherwise.
|
|
||||||
- No platform-specific shell calls. If absolutely needed, branch on `sys.platform`.
|
|
||||||
- Unit tests for core logic (pytest). Tests target `core/`, not UI front-ends. Tests run on all three OS runners in CI.
|
|
||||||
- Semantic versioning per bundle.
|
|
||||||
- **Core/UI separation**: never put business logic in `cli.py` or `gui/`. If a CLI command and a GUI button do "the same thing," they call the same function in `core/`.
|
|
||||||
|
|
||||||
### 5.2 UX Standards (GUI / Streamlit) - load-bearing per DECISIONS.md Section 4b
|
| Bundle | Status |
|
||||||
|
|--------|--------|
|
||||||
|
| Data Cleaning Mastery | 3/9 tools Ready (Find Duplicates, Clean Text, Standardize Formats); 6 stubs |
|
||||||
|
| Automated Business Reporting | Not started |
|
||||||
|
| Ecommerce Data Pipeline | Not started |
|
||||||
|
| Small Business Finance | Not started |
|
||||||
|
| Marketing Public Data Aggregation | Not started |
|
||||||
|
| AI Ecommerce Aggregation (Shopify Pet) | Not started |
|
||||||
|
|
||||||
- **Works out of the box**: dropping a file into the Streamlit `st.file_uploader` must produce a useful result with zero configuration.
|
## 9. Open decisions
|
||||||
- **Sensible defaults visible everywhere**: every `st.selectbox`, `st.slider`, `st.checkbox` has a default, the default is shown, the user is not forced through a config screen on first run.
|
|
||||||
- **Progressive disclosure**: basic view shows file uploader + go button + results. Advanced options live in `st.expander("Advanced options")` panes.
|
|
||||||
- **Plain-English labels**: no technical jargon in primary UI. `help=` parameter on widgets carries technical detail for users who want it.
|
|
||||||
- **Dry-run / preview by default**: user sees what would change before any file is written. Original input is never modified.
|
|
||||||
- **Single-page completion**: basic task completes on a single Streamlit page. Multi-page apps are for separate scripts in the bundle, not for multi-step wizards within one script.
|
|
||||||
- **Identical core to CLI**: any capability available in CLI is available in GUI, and vice versa. The only legitimate divergence is interactive review (GUI-natural) and scripted/scheduled execution (CLI-natural).
|
|
||||||
- **Local-first messaging**: every GUI page includes the line *"This tool runs locally in your browser and does not use the internet"* in a small, persistent location (e.g., footer or sidebar).
|
|
||||||
|
|
||||||
### 5.3 Functional Scope Standard - load-bearing per DECISIONS.md Section 4a
|
- **pywebview wrap** — defer until support tickets show browser-launch confusion.
|
||||||
|
- **Win code signing** — defer until SmartScreen drives volume. Cost ~$200-400/yr.
|
||||||
|
- **Auto-update mechanism** — none at launch. Email-delivered updates. Revisit at 100+ buyers/bundle.
|
||||||
|
- **Demo hosting migration** — Streamlit Community Cloud → $5/mo VPS if rate/brand limits hit.
|
||||||
|
- **Code obfuscation** — none; license text + bundle complexity sufficient at $49-79.
|
||||||
|
- **Telemetry** — none. Consider opt-in privacy-respecting only post-launch.
|
||||||
|
|
||||||
- Each script ships with **complete coverage of the workflow it names**, including features available free elsewhere (e.g., exact-match dedup).
|
## 10. Script boundaries — 04 (Missing Values) vs 06 (Outliers)
|
||||||
- Scope boundary is the workflow, not "things adjacent to the workflow." A deduplicator includes normalization, survivor selection, audit. It does not include format conversion or charting; those belong elsewhere in the bundle.
|
|
||||||
|
|
||||||
---
|
Deliberately separate. The original spec conflated the two and was wrong.
|
||||||
|
|
||||||
## 6. System Requirements
|
| Script | Owns |
|
||||||
|
|--------|------|
|
||||||
|
| 04 Fix Missing Values | "What's not there." Disguised nulls (`N/A`, `-`, sentinel codes), missingness patterns, imputation, drop-by-threshold. |
|
||||||
|
| 06 Find Unusual Values | "What shouldn't be there." z-score / IQR / modified-z, multivariate (Isolation Forest, Mahalanobis), domain rules, winsorization. |
|
||||||
|
|
||||||
**For buyers (runtime)**:
|
**Run order**: 04 before 06. Outlier stats on data with `NaN` / sentinels are mathematically poisoned (means dragged, IQR widens, false negatives).
|
||||||
- Windows: Windows 10 or 11, 64-bit.
|
|
||||||
- macOS: macOS 11 (Big Sur) or later, Apple Silicon or Intel.
|
|
||||||
- Linux: any glibc-based distribution from 2020 onward (Ubuntu 20.04+, Fedora 33+, etc.).
|
|
||||||
- A modern default browser (Chrome, Edge, Firefox, Safari from the last 3 years). Used to display the local GUI; no internet required.
|
|
||||||
- ~400-500 MB free disk space (Streamlit packaging is larger than alternatives; this is an accepted tradeoff per DECISIONS.md Section 4c).
|
|
||||||
- No internet required after install. No Python install required ever.
|
|
||||||
|
|
||||||
**For developers (you)**:
|
**Pipeline order** (Automated Workflows enforces): 02 → 03 → 04 → 05 → 06 → 07 → 08. 01 is order-flexible.
|
||||||
- Python 3.11+.
|
|
||||||
- PyInstaller, Inno Setup (Windows builds), Xcode command line tools (macOS builds).
|
|
||||||
- Apple Developer Program membership ($99/yr) for macOS distribution.
|
|
||||||
- Git + GitHub account (for CI builds and Streamlit Community Cloud deployment of demos).
|
|
||||||
|
|
||||||
---
|
**Contested cases**:
|
||||||
|
- Whitespace-only cell — 02 trims to empty; 04 then flags empty as null.
|
||||||
|
- `-999` sentinel — 04 converts to `NaN` first; 06 then computes stats.
|
||||||
|
- Suspicious-but-plausible (age 110) — 06 territory.
|
||||||
|
|
||||||
## 7. Per-Bundle Technical Notes
|
## 10b. GUI internationalization (i18n)
|
||||||
|
|
||||||
| Bundle | Status | Tech notes |
|
The GUI uses an in-house, JSON-backed translation layer at `src/i18n/`. **No** `gettext` / `babel` / `.po` pipeline — the surface is small enough that a 100-line module + per-language JSON file is a better fit than a build-time toolchain.
|
||||||
|---|---|---|
|
|
||||||
| Data Cleaning Mastery | Lead, 1/9 scripts complete (CLI only; needs Streamlit GUI port) | Cleaning, dedup, text hygiene, standardization, missing-value handling, outlier detection, type coercion, reporting. Scripts 04 (missing values) and 06 (outliers) are deliberately separate concerns; 04 runs first to neutralize sentinel codes before 06 computes statistics (see Section 9). Script 02 (text cleaner) runs first in the pipeline to normalize whitespace and special characters before any other operation. v1.3 spec: Streamlit GUI required at launch, with hosted demo deployed to Streamlit Community Cloud. |
|
|
||||||
| Automated Business Reporting | Not started | Aggregation -> styled PDF/Excel output |
|
|
||||||
| Ecommerce Data Pipeline | Not started | Extract -> clean -> export workflow |
|
|
||||||
| Small Business Finance | Not started | Bookkeeping summaries, simple reconciliation |
|
|
||||||
| Marketing Public Data Aggregation | Not started | Public API + scraping with respect for robots.txt and ToS |
|
|
||||||
| AI Ecommerce Aggregation (Shopify Pet) | Not started | Optional LLM enhancement, requires API key from buyer |
|
|
||||||
|
|
||||||
---
|
**Resolution model**: `t(key, lang=None, **fmt)` walks a dotted key (`home.title`, `tools.01_deduplicator.name`) through a nested dict. Fallback chain: requested lang → English (canonical) → the literal key. Missing format placeholders return the raw template rather than raising so a translation file cannot crash the UI.
|
||||||
|
|
||||||
## 8. Open Technical Decisions
|
**Active language** is stored in `st.session_state["ui_lang"]`. Reading it outside a Streamlit run (tests, scripts) silently falls back to English, keeping the module importable without Streamlit context.
|
||||||
|
|
||||||
GUI framework choice is now **closed** (Streamlit, locked v1.3 - see DECISIONS.md Section 4c).
|
**Picker placement**: `hide_streamlit_chrome()` calls `render_language_selector()` on every page that hides Streamlit's default chrome — i.e., the entire app. One mount point, every page picks it up.
|
||||||
|
|
||||||
Remaining open items:
|
**Pack parity** is a tested invariant: `tests/test_lang_packs.py::TestPackParity` fails CI when `en.json` and another pack diverge in either direction. This catches translation drift at PR time rather than from buyer reports.
|
||||||
|
|
||||||
- **pywebview wrap of Streamlit launcher**: Optional v1.1 enhancement to eliminate the "browser opens" UX surprise. Defer until support tickets show meaningful buyer confusion. Cost: extra dependency, more PyInstaller complexity. Benefit: native-window UX.
|
**Farewell overlay**: the shutdown screen's JS payload interpolates pack strings into an `innerHTML` inside a JS single-quoted string. `_js_html_safe()` in `components/_legacy.py` escapes both the JS string terminator (`'`) and HTML special chars (`< > &`). The test `TestFarewellEscape` pins this; never bypass it.
|
||||||
- **Windows code signing**: Currently unsigned. Revisit if SmartScreen warnings drive support volume. Cost: ~$200-400/yr.
|
|
||||||
- **Auto-update mechanism**: None at launch. Email-delivered version updates. Revisit at 100+ paying customers per bundle.
|
|
||||||
- **Demo deployment hosting**: Streamlit Community Cloud at launch (free). Migrate to $5/mo VPS if rate limits, bandwidth, or branding constraints become an issue.
|
|
||||||
- **Code obfuscation**: Currently relying on license text + PyInstaller bundling. Decompilation is possible but unlikely for $49-79 products. Accept the risk.
|
|
||||||
- **Telemetry**: None. Consider privacy-respecting opt-in usage telemetry post-launch to inform roadmap, but only if explicit and disclosed.
|
|
||||||
|
|
||||||
---
|
**Why not gettext**: zero compiled artifacts in the PyInstaller bundle, no build step before tests run, no `.po`/`.mo` round-trip for translators (anyone can edit JSON), and the same lookup works in unit tests without process state. Locked in because the surface won't grow large enough to need the alternative, and the alternative breaks the "drop a file, run pytest, ship" loop.
|
||||||
|
|
||||||
## 9. Script Boundaries: 04 (Missing Values) vs 06 (Outliers)
|
## 10c. GUI chrome — sidebar nav indicator swap
|
||||||
|
|
||||||
The two scripts are deliberately separate. Original spec ("missing-value handler also does basic outlier flagging") was wrong: it conflated two different statistical operations and would have produced overlapping CLI flags, confused buyers, and brittle code.
|
Streamlit's `st.Page`-driven sidebar renders section headers with a Material Symbols ligature (`expand_more` / `expand_less`). The header element is not a button and carries no `aria-expanded`, so a pure-CSS swap can't follow open/closed state. We replace the glyph with plain typographic `+` / `−` (U+2212) via JS:
|
||||||
|
|
||||||
### 9.1 Boundary
|
- **CSS** (`components/_legacy.py`, `_HIDE_CHROME_CSS`) drops the Material Symbols font on `[data-testid="stIconMaterial"]` inside `[data-testid="stNavSectionHeader"]` so the rewritten character renders as normal text rather than re-resolving as an icon name.
|
||||||
|
- **JS** (`_SWAP_NAV_SECTION_INDICATOR_JS`) walks each section header, reads the icon's text node, and rewrites `expand_more` → `+` / `expand_less` → `−`. A MutationObserver re-runs the swap when Streamlit re-renders the sidebar (RAF-throttled so a burst of mutations is one swap).
|
||||||
|
|
||||||
**`04_missing_value_handler.py` owns "what's not there"**:
|
The script ships through the same component-iframe bundle as the brand injector and upload-button rename inside `hide_streamlit_chrome()` — one iframe per page, three DOM mutations.
|
||||||
- Detect disguised nulls: `NaN`, empty string, `"N/A"`, `"-"`, `"unknown"`, whitespace-only, sentinel codes (`-999`, `9999`, etc.).
|
|
||||||
- Missingness pattern analysis (which columns co-miss).
|
|
||||||
- Imputation strategies: mean, median, mode, forward-fill, KNN (optional), regression (optional).
|
|
||||||
- Required-field enforcement (drop rows missing a required column).
|
|
||||||
- Drop rows or columns by missingness threshold.
|
|
||||||
|
|
||||||
**`06_outlier_detector.py` owns "what shouldn't be there"**:
|
## 11. Per-script functional specs
|
||||||
- Univariate statistical detection: z-score, IQR, modified z-score (MAD-based).
|
|
||||||
- Multivariate detection: Isolation Forest, Mahalanobis distance.
|
|
||||||
- Domain-rule violations (age > 120, negative quantity, future dates in historical data).
|
|
||||||
- Winsorization / capping as optional remediation.
|
|
||||||
- Distribution shape diagnostics.
|
|
||||||
|
|
||||||
### 9.2 Run Order
|
Specs live in this section as scripts enter active build. Each follows the Tier 1/2/3 structure with explicit strategic framing (what's the market gap given some of this is free elsewhere).
|
||||||
|
|
||||||
04 must run before 06. Reason: outlier statistics computed on data still containing `NaN` or sentinel codes are mathematically poisoned. Means and standard deviations get dragged, IQR widens, false negatives explode.
|
### 11.1 `01_deduplicator.py` — Smart duplicate removal
|
||||||
|
|
||||||
The Master Orchestrator (script 09) enforces this order. CLI users running scripts manually get a warning printed by 06 if it detects unhandled sentinel patterns in the input.
|
**Status**: Ready. Tier 1 mostly built. Streamlit GUI port complete.
|
||||||
|
|
||||||
Pipeline-wide order enforced by the orchestrator: `02_text_cleaner` → `03_format_standardizer` → `04_missing_value_handler` → `05_column_mapper_enforcer` → `06_outlier_detector` → `07_multi_file_merger` → `08_validator_reporter`. Script `01_deduplicator` is order-flexible; it normalizes whitespace and case internally for matching purposes regardless of upstream cleaning, so it can run before or after `02_text_cleaner` depending on the buyer's workflow.
|
**Market gap**: fuzzy match quality of OpenRefine, with the zero-learning UX of Excel, sold once for under $100, runs locally.
|
||||||
|
|
||||||
### 9.3 Contested Cases
|
**Tier 1**:
|
||||||
|
- **Input**: auto-detect encoding (UTF-8, UTF-8-BOM, Latin-1, cp1252) · delimiter · header row · CSV/TSV/XLSX/XLS · multi-sheet picker · streaming for files > RAM.
|
||||||
|
- **Matching**: exact + 3 fuzzy algos (Levenshtein / Jaro-Winkler / token-set) · per-column normalizers (5 types) · configurable threshold per strategy · multi-strategy OR.
|
||||||
|
- **Survivor**: keep first / last / most-complete / most-recent · merge mode (fill blanks from losers).
|
||||||
|
- **Trust**: dry-run preview by default · interactive review for gray-zone matches · confidence score per match · match-group export.
|
||||||
|
- **Audit**: timestamped log · removed-rows separate file · input never modified · idempotent.
|
||||||
|
- **Config**: save/load JSON profiles · sensible auto-detect defaults.
|
||||||
|
- **UX**: human `--help` · progress bar > 10k rows · errors name row + column + value + suggestion.
|
||||||
|
|
||||||
**Use cases that prove 04 and 06 are distinct concerns** (not just naming differences):
|
**Tier 2**: numeric/date tolerance · phonetic match (Soundex, Metaphone) · blocking/indexing · watch-folder.
|
||||||
|
|
||||||
- *Bank export with blank fee columns*: pure 04 job. The fees aren't outliers, they're missing. Imputation or drop-by-threshold is the right tool.
|
**Tier 3**: ML scoring · cross-file dedup · cron · Shopify/Klaviyo API direct.
|
||||||
- *Sales data with one $1M order in a $50-average column*: pure 06 job. Nothing is missing; one row is statistically extreme. Z-score or IQR catches it.
|
|
||||||
- *Survey data where `999` means "refused to answer"*: needs both, in order. 04 converts `999` to `NaN` per `--sentinels`, then 06 computes statistics on the cleaned column.
|
|
||||||
|
|
||||||
Sentinel values like `-999` are *both* disguised missing *and* statistical outliers. Resolution: 04 owns sentinel detection and converts them to `NaN` (or imputes per user choice) before 06 sees the data. Sentinel patterns are configurable in 04 via `--sentinels` flag.
|
### 11.2 `02_text_cleaner.py` — Character-level hygiene
|
||||||
|
|
||||||
Suspicious-but-plausible values (e.g., age = 110): 06's territory. Not missing; just rare.
|
**Status**: Ready. Tier 1 built.
|
||||||
|
|
||||||
Whitespace-only cells (e.g., `" "`) are a contested case between 02 (text cleaner) and 04 (missing value handler). Resolution: 02 trims first, leaving an empty string. 04 then detects empty strings as disguised nulls per its existing logic. This means 02 must run before 04 in any pipeline that uses both. The orchestrator enforces this; CLI users get a warning if 04 detects whitespace-only cells suggesting 02 was skipped.
|
**Market gap**: one-click correctness for the dirty-CSV failure modes that cause silent VLOOKUP misses.
|
||||||
|
|
||||||
### 9.4 Shared Plumbing
|
**Boundary**:
|
||||||
|
- 02 — whitespace, Unicode normalize, smart-char fold, BOM, line endings, zero-width, control chars, case ops. Writes to disk.
|
||||||
|
- 03 — dates, currencies, names, phones, addresses (display formatting).
|
||||||
|
- 04 — disguised nulls.
|
||||||
|
- 01 — `normalize_string` is *match-time* only, distinct from 02's *write-time* policy.
|
||||||
|
|
||||||
Both scripts emit:
|
**Tier 1 ops** (each toggleable; defaults shown for `excel-hygiene`):
|
||||||
- A flagged-row report with column, row index, original value, action taken.
|
1. Trim leading/trailing whitespace — ON
|
||||||
- A timestamped log file in `logs/`.
|
2. Collapse internal whitespace runs — ON
|
||||||
- An optional cleaned output file.
|
3. NFC normalize — ON
|
||||||
|
4. NFKC compatibility fold — OFF (lossy, opt-in via `paranoid` preset)
|
||||||
|
5. Smart-char fold (curly quotes, em/en-dash, NBSP, ellipsis) — ON
|
||||||
|
6. Zero-width / invisible char strip — ON
|
||||||
|
7. BOM strip — ON
|
||||||
|
8. Control-char strip (preserve `\t\n\r`) — ON
|
||||||
|
9. Line-ending normalize (CRLF/CR → LF inside cells) — ON
|
||||||
|
10. Case conversion (UPPER / lower / Title / Sentence) — OFF, per-column
|
||||||
|
|
||||||
Report and log formats are identical between the two scripts. Implemented via shared helpers in `src/core/` to avoid drift.
|
**Scope**: per-column selection · skip-list · operates on string-typed columns only.
|
||||||
|
|
||||||
---
|
**Trust**: dry-run by default · per-cell change log (capped 1000, `--full-changelog` removes cap) · 3 output files mirroring dedup · idempotent.
|
||||||
|
|
||||||
## 10. Per-Script Functional Requirements
|
**Config**: 3 presets (`minimal` / `excel-hygiene` (default) / `paranoid`) · save/load JSON.
|
||||||
|
|
||||||
This section captures the full functional spec for each script, beyond the one-line description in USER-GUIDE.md Section 2. Specs answer "what does v1 need to ship to be best-of-class for the target buyer." Promoted from chat-history-only into docs in v1.6 to prevent silent drift.
|
### 11.3 `03_format_standardizer.py` — Per-domain canonical forms
|
||||||
|
|
||||||
**Note on script status**: a script labeled "Working" in the bundle status table means it has CLI execution and basic correctness, NOT that it implements every Tier 1 item below. Tier 1 is the v1 launch target; the current code may implement a subset.
|
**Status**: Ready. Full Tier 1 + most Tier 2 built. 199-row buyer corpus passing.
|
||||||
|
|
||||||
### 10.1 `01_deduplicator.py` - Smart duplicate removal
|
**Market gap**: unify dates / phones / emails / addresses / names / currencies / booleans across messy ETL inputs without buyer writing code.
|
||||||
|
|
||||||
**Current implementation status**: `01_deduplicator.py` exists and works for exact match plus basic fuzzy with configurable subset columns and timestamped logs (the description in USER-GUIDE.md Section 2 reflects current state). It does NOT yet implement most Tier 1 items below. Tier 1 is the v1 launch target, not current state. The Streamlit GUI port is the natural moment to close this gap.
|
**Domains**:
|
||||||
|
| Domain | Default canonical | Notable handling |
|
||||||
|
|--------|-------------------|------------------|
|
||||||
|
| Date | ISO 8601 (`YYYY-MM-DD`) | MDY/DMY, Excel serial, Unix timestamp (s + ms), longform months, year-month, quarter, ISO week date (`2024-W03-1`), ISO ordinal (`2024-015`), RFC 2822, CJK separators (`2024年01月15日`), fullwidth digits, named-TZ resolution (EST/PST/JST/…), `two_digit_year_cutoff` |
|
||||||
|
| Phone | E.164 + `;ext=N` | libphonenumber, 001 international prefix, error sentinels for placeholders / multi-number / contamination |
|
||||||
|
| Email | lowercase + trim | display-name extraction, mailto/angle-bracket strip, smart-quote unwrap, BIDI/RTL override strip (security), optional `--gmail-canonical` |
|
||||||
|
| Address | USPS-canonical (`expand=False`) or expanded (`expand=True`) | state/province-name → code for US/CA/AU/DE, UK postcode detection, multi-line collapse, PO Box normalize, state-code preservation regardless of input case |
|
||||||
|
| Name | smart Title Case | Mc/Mac/O'/D' inner caps, Arabic `al-`/`el-` lowercase, particle lowercasing (von/van/de/da/bin/ibn/ben), East Asian honorific suffixes (`-san`/`-sama`/`-ssi`), comma reversal (skippable via `family_first`), period stripping for titles/suffixes/initials, PhD/MD/Mag/Habil acronyms |
|
||||||
|
| Currency | bare number (dot decimal) | auto-detect EU vs US separators, space-thousands, Swiss apostrophe, accounting parens, optional ISO code preservation |
|
||||||
|
| Boolean | `True`/`False` (configurable) | accepts `yes`/`no`/`y`/`n`/`1`/`0`/`on`/`off` |
|
||||||
|
|
||||||
**Strategic framing**: Excel's built-in Remove Duplicates handles exact match for free. Pandas `drop_duplicates()` handles it for free in code. A $49-$79 dedup tool that ships "exact + basic fuzzy" loses to Excel for free or to OpenRefine for free. The fuzzy matching has to be the product, not a checkbox. The market gap this script targets is "fuzzy match quality of OpenRefine, with the zero-learning-curve UX of Excel, sold once for under $100, runs locally" (see BUSINESS.md Section 4a).
|
**International coverage** (added v1.7):
|
||||||
|
- **Date locales**: English (default) plus opt-in French / German / Spanish / Portuguese / Italian / Dutch / Russian month + weekday recognition.
|
||||||
|
- **Currency symbols**: $, €, £, ¥, ₹, ₩, ₽, ₪, ₺, ¢ + ฿(THB), ₫(VND), ₮(MNT), ₴(UAH), ₦(NGN), ₱(PHP), ₲(PYG), ﷼(SAR), ₨(PKR), ₵(GHS).
|
||||||
|
- **ISO 4217 codes**: 23 baseline (USD, EUR, …) plus ~30 emerging-market additions (SAR, AED, ARS, EGP, IDR, MYR, PHP, THB, VND, NGN, GHS, KES, HUF, CZK, RON, UAH, …).
|
||||||
|
- **Address jurisdictions**: US, Canada (13 provinces/territories), Australia (8 states/territories), Germany (16 Bundesländer), UK (postcode shape).
|
||||||
|
- **Address PO Box**: English, German (`Postfach`), French (`Boîte postale`), Spanish (`Apartado`), Italian (`Casella postale`), Portuguese (`Caixa postal`).
|
||||||
|
|
||||||
#### Tier 1: Must-ship for v1 to be best-of-class
|
**Per-domain `error_policy`**: `"passthrough"` (default) keeps the original; `"sentinel"` emits `<error: <reason>>` for cases like Feb 30, double @, percentages mistaken for currency, etc.
|
||||||
|
|
||||||
**Input handling**
|
**Pipeline**: `standardize_dataframe(df, options)` runs per-column with `column_types: dict[str, FieldType]`. Returns `StandardizeResult` with `cells_changed`, `cells_unparseable`, change audit. Warns when > 10% of typed cells fail to parse.
|
||||||
1. Auto-detect file encoding (UTF-8, UTF-8-BOM, Latin-1, Windows-1252). Failure to handle this correctly is the #1 reason CSV tools crash on real-world business data.
|
|
||||||
2. Auto-detect delimiter (comma, tab, semicolon, pipe).
|
|
||||||
3. Read CSV, TSV, XLSX, XLS. For XLSX, support multi-sheet workbooks (let user pick or process each).
|
|
||||||
4. Handle files larger than RAM via chunked / streaming processing. A 500MB customer export should not crash the tool.
|
|
||||||
5. Header row detection with manual override.
|
|
||||||
|
|
||||||
**Matching**
|
**Presets**: `us-default`, `european`, `uk`, `iso-strict`, `legacy-us`. Custom abbreviations via `extra_abbreviations`. Per-column culture flags: `name_family_first` (East Asian), `address_state_to_code` (any of 4 supported jurisdictions), `date_month_locales` (list of 8 supported codes).
|
||||||
6. Exact match with configurable subset columns.
|
|
||||||
7. Fuzzy match algorithms: Levenshtein, Jaro-Winkler, token-set ratio (rapidfuzz library). Multiple algorithms, not one. Different data types match better with different algorithms.
|
|
||||||
8. Per-column normalization before comparison:
|
|
||||||
- Email: lowercase, strip whitespace, strip Gmail dots, strip `+tag` suffixes.
|
|
||||||
- Phone: strip formatting and country codes, normalize to E.164.
|
|
||||||
- Name: strip titles (Mr/Ms/Dr), strip suffixes (Jr/III), collapse whitespace, optional case-fold.
|
|
||||||
- Address: USPS-style abbreviation normalization (St/Street, Ave/Avenue, Apt/#).
|
|
||||||
- Generic string: trim, collapse internal whitespace, optional case-fold.
|
|
||||||
9. Configurable similarity threshold (e.g., 85%, 90%, 95%) per match strategy.
|
|
||||||
10. Multi-strategy matching with OR logic: "match if email is exact OR (name fuzzy >90% AND phone exact)." This is what real-world dedup actually requires. Single-strategy match handles maybe 40% of cases.
|
|
||||||
|
|
||||||
**Survivor selection (which row to keep when duplicates are found)**
|
### 11.4 Upload-time analyzer (`src/core/analyze.py`)
|
||||||
11. Configurable survivor rules: keep first, keep last, keep most-complete (fewest empty cells), keep most-recent (date column), keep manually-selected.
|
|
||||||
12. Merge mode: instead of deleting losers, fill missing fields in survivor from losers. Common real ask: combine partial records into one complete record.
|
|
||||||
|
|
||||||
**Trust and review**
|
Read-only advisory pass on every upload. Emits `Finding` objects:
|
||||||
13. Dry-run / preview mode by default. Output shows what *would* be merged before any file is written. Non-negotiable for trust. Aligns with Section 5.2 visible-safety standard.
|
|
||||||
14. Interactive review mode for uncertain matches. For matches in the gray zone (e.g., 75-90% similarity), prompt user yes/no/skip with side-by-side diff. This is what justifies a paid product over free Excel. GUI-natural; CLI gets a reduced-form prompt loop.
|
|
||||||
15. Confidence score on every fuzzy match in the output.
|
|
||||||
16. Match group export: separate file showing every group of matched rows so user can audit.
|
|
||||||
|
|
||||||
**Audit and safety**
|
| Field | Meaning |
|
||||||
17. Full timestamped log of every action: which rows matched, on which strategy, with what score, which row survived, which fields were merged.
|
|-------|---------|
|
||||||
18. Removed-duplicates exported to a separate file (never silently destroyed).
|
| `id` | Stable identifier (never localized) |
|
||||||
19. Original input file is never modified. Output is always a new file.
|
| `severity` | `info` / `warn` / `error` (only `error` blocks the normalization gate) |
|
||||||
20. Idempotency: running the tool twice on the same input with the same config produces the same output.
|
| `confidence` | `high` (round-trip safe) / `medium` (preview) / `low` (heuristic) |
|
||||||
|
| `fix_action` | id of algorithm in `fixes.py` (empty for informational-only) |
|
||||||
|
| `pre_applied` | `true` if fix already ran during read pass |
|
||||||
|
| `tool` | owning tool id (or empty for file-level) |
|
||||||
|
| `count` | cells / rows affected |
|
||||||
|
| `description` | one-sentence human summary |
|
||||||
|
| `column` | column name (None for file-level) |
|
||||||
|
| `samples` | up to 5 `(row, col, value)` examples |
|
||||||
|
|
||||||
**Configuration**
|
Entry point: `analyze(source, *, sample_rows=1000, repair_result=None, encoding_override=None)`. `encoding_override` skips charset detection — the hook that lets the Review page recover from misdetections.
|
||||||
21. Save / load configuration profiles. A user who deduplicates a Shopify customer export weekly should configure once, not every run.
|
|
||||||
22. Sensible defaults that work on a typical messy CSV with zero configuration. The first run must produce a useful result with no flags. Per DECISIONS.md Section 4b UX standards.
|
|
||||||
|
|
||||||
**UX**
|
### 11.5 CSV-normalization gate (`src/core/normalize.py`, `fixes.py`)
|
||||||
23. `--help` (CLI) written for non-technical users with concrete examples, not a flag list.
|
|
||||||
24. Progress bar for files over ~10K rows.
|
|
||||||
25. Error messages name the row number, column, and value that caused the problem. No raw stack traces. Per Section 5.1.
|
|
||||||
26. Sample data (`samples/messy_sales.csv`) must demonstrate fuzzy match scenarios, not just exact dupes. Otherwise the demo doesn't sell.
|
|
||||||
|
|
||||||
#### Tier 2: Worth-considering for v1.1
|
Two paths:
|
||||||
|
1. **Auto-fix** — `auto_fix(df, findings)` applies every `confidence="high"` finding whose `fix_action` is registered.
|
||||||
|
2. **Per-finding decisions** — `apply_decisions(df, findings, decisions)` accepts `Decision(finding_id, action, payload)` with action `"auto"|"skip"|"modified"`.
|
||||||
|
|
||||||
27. Numeric tolerance for matching (prices within $0.01 considered same).
|
Returns `NormalizationResult` with `cleaned_df`, `cleaned_bytes` (UTF-8 CSV), `applied`, `skipped_findings`, `pending_findings`, `blocking_findings`.
|
||||||
28. Date tolerance for matching (dates within N days considered same).
|
|
||||||
29. Phonetic matching (Soundex, Metaphone) for name fields with common misspellings.
|
|
||||||
30. Blocking / indexing for performance on large files (compare only rows that share a first letter or first three characters of a key field). Without this, fuzzy match on 100K rows is O(n²) and unusable. Move to Tier 1 if early buyers report performance complaints.
|
|
||||||
31. Watch-folder mode: auto-process any file dropped into a folder.
|
|
||||||
32. Geolocation-aware address matching (optional, requires bundled USPS data or third-party API).
|
|
||||||
|
|
||||||
#### Tier 3: Optional / later
|
`is_normalized(findings, result)` re-runs `analyze()` against cleaned bytes; returns False if any high-confidence detector still fires (the strict contract tool pages depend on).
|
||||||
|
|
||||||
33. Machine-learning-based match scoring (Dedupe.io territory; high complexity, marginal gain for this price point).
|
**Fix registry**: `@register("fix_id")` decorates `(df, payload) → (new_df, n_cells_changed)`. New fix = one entry in `analyze.py` `FIX_*` constants + one detector emitting that `fix_action` + one registered function. No other call sites change.
|
||||||
34. Multi-table joins for cross-file dedup.
|
|
||||||
35. Schedule / cron integration.
|
|
||||||
36. Direct Shopify / Klaviyo / Mailchimp API integration to dedupe in place. This would be a real differentiator for the Shopify niche specifically and is probably the right v2 direction if early sales validate the niche.
|
|
||||||
|
|
||||||
### 10.2 `02_text_cleaner.py` - Character-level hygiene
|
### 11.6 Review page (`src/gui/pages/0_Review.py`)
|
||||||
|
|
||||||
**Current implementation status**: Stub only. `src/gui/pages/2_Text_Cleaner.py` is a placeholder UI with disabled controls. No `src/core/text_clean.py`, no CLI, no tests. Tier 1 below is the v1 launch target; nothing in this section is built yet.
|
1. Detected encoding + override picker (16 codepages + custom).
|
||||||
|
2. One expandable card per finding (sorted by severity then confidence) with: decision radio (Auto/Skip/Customize), live before/after preview built by running the registered fix on `Finding.samples`, payload editor for fixes that take user input.
|
||||||
|
3. Apply persists `NormalizationResult` keyed by upload SHA-256; tool pages refuse to load until hash matches.
|
||||||
|
4. `⚙️ Advanced output options` expander: per-download encoding + delimiter + line terminator. `_build_output_bytes()` returns `(bytes, error_message)`; lossy fallbacks emit a warning the page surfaces.
|
||||||
|
|
||||||
**Strategic framing**: Excel and the OS provide effectively nothing here. Find/Replace fixes one character at a time. Power Query's "Clean" strips control chars but ignores BOMs, smart quotes, NBSPs, and zero-width chars. OpenRefine has the operations buried under "Common transforms" where the buyer never finds them. Pandas users run `df.applymap(str.strip)` and miss everything else.
|
Gates the entire tool sidebar via `require_normalization_gate()` in `src/gui/components/_legacy.py`.
|
||||||
|
|
||||||
The market gap this script fills: **one-click correctness for the dirty-CSV failure modes that cause "why won't this VLOOKUP match?"** Trailing spaces, NBSP-in-place-of-space, smart quotes pasted from Word, mojibake, BOMs from Excel's "Save As CSV UTF-8". The buyer doesn't know they need this script until it fixes a problem they have spent two hours on. Demo value is high: the before/after diff sells itself.
|
### 11.7 Pre-parse repair (`src/core/io.py::repair_bytes`)
|
||||||
|
|
||||||
**Boundary clarification** (cross-references Section 9):
|
Byte-level pre-parse pass. **Order is meaningful**:
|
||||||
- 02 owns whitespace, Unicode normalization, smart-character folding, BOM strip, line-ending normalization, zero-width strip, control-char strip, case ops. Writes cleaned values back to disk.
|
|
||||||
- 03 (format standardizer) owns dates, currencies, names, phones, addresses.
|
|
||||||
- 04 (missing values) owns disguised nulls (`N/A`, `-`, `unknown`, sentinel codes). Whitespace-only cells: 02 trims first to empty string; 04 then detects empty as null (per Section 9.3).
|
|
||||||
- 01 (deduplicator) has its own `normalize_string` helper for *match-time* case-folding. That is a match-time policy and stays distinct from 02's *write-time* policy. The two will not be merged; 02 may use lower-level helpers but does not aggressively case-fold cleaned output by default.
|
|
||||||
|
|
||||||
#### Tier 1: Must-ship for v1 to be best-of-class
|
1. **Wide-encoding transcode** (UTF-16/32 → UTF-8) — must run first or NUL strip below shreds UTF-16.
|
||||||
|
|
||||||
**Operations** (each independently toggleable; defaults given for the `excel-hygiene` preset)
|
|
||||||
|
|
||||||
1. Whitespace trim - leading/trailing on every cell. Default ON.
|
|
||||||
2. Internal whitespace collapse - multi-space and tabs-in-cells to single space. Default ON.
|
|
||||||
3. Unicode NFC normalization - combining-character forms folded to canonical (e.g., `e + U+0301` to single `é`). Default ON.
|
|
||||||
4. Unicode NFKC normalization - compat fold (`①` to `1`, `ﬁ` to `fi`). Default OFF, lossy, opt-in only. Not part of any preset other than `paranoid`.
|
|
||||||
5. Smart-character folding - curly quotes to ASCII, em/en-dash to hyphen, ellipsis `…` to `...`, NBSP `U+00A0` to space. Default ON.
|
|
||||||
6. Zero-width / invisible character strip - `U+200B`, `U+200C`, `U+200D`, `U+2060`, mid-string `U+FEFF`. Default ON.
|
|
||||||
7. BOM strip - `U+FEFF` at the start of the first cell of the first column (covers the case where the I/O layer didn't catch it). Default ON.
|
|
||||||
8. Control character strip - `U+0000`-`U+001F` and `U+007F`, *except* preserve `\t`, `\n`, `\r`. Default ON.
|
|
||||||
9. Line-ending normalization - within multi-line cells, `\r\n` and bare `\r` to `\n`. Default ON.
|
|
||||||
10. Case conversion - UPPER / lower / Title / Sentence. Default OFF, per-column. Title case is "smart": preserves all-caps tokens (`USA`, `NASA`) and lowercases mid-string particles (`of`, `and`, `the`).
|
|
||||||
|
|
||||||
**Scope control**
|
|
||||||
|
|
||||||
11. Per-column selection - by default operate on string-typed columns only; numeric / datetime columns pass through untouched. User can pick columns explicitly via `--columns`.
|
|
||||||
12. Skip-list - exclude specific columns via `--skip` even if they match the string-dtype filter (e.g., free-text notes columns).
|
|
||||||
|
|
||||||
**Trust and audit**
|
|
||||||
|
|
||||||
13. Dry-run preview by default. Output shows N cells that would change in column X. `--apply` writes. Non-negotiable for trust. Same standard as the deduplicator.
|
|
||||||
14. Per-cell change log: `{input}_changes.csv` with (row, column, old, new, ops_applied). Capped to first N rows by default to avoid 50MB audit files; `--full-changelog` removes the cap.
|
|
||||||
15. Three output files on `--apply`: `{input}_cleaned.csv`, `{input}_changes.csv`, `logs/text_clean_{ts}.log`. Mirrors the deduplicator output shape.
|
|
||||||
16. Original input file is never modified.
|
|
||||||
17. Idempotency: `clean(clean(x)) == clean(x)` for every individual op and every preset. Asserted as a property test.
|
|
||||||
|
|
||||||
**Configuration**
|
|
||||||
|
|
||||||
18. Presets: `--preset excel-hygiene` (everything safe ON, NFKC OFF, case OFF), `--preset minimal` (only trim + collapse), `--preset paranoid` (everything including NFKC). Buyers should not have to learn 9 flags. Default preset when no flag given: `excel-hygiene`.
|
|
||||||
19. Save / load JSON config. Same shape and reuse pattern as `DeduplicationConfig`.
|
|
||||||
|
|
||||||
**UX**
|
|
||||||
|
|
||||||
20. `--help` written for non-technical users with concrete examples, not a flag dump. Per DECISIONS.md Section 4b.
|
|
||||||
21. Progress bar for files over ~10K rows.
|
|
||||||
22. Error messages name the row, column, and value that caused the problem. No raw stack traces.
|
|
||||||
23. Sample data (`samples/messy_text.csv`) demonstrates: smart quotes from Excel, NBSP-vs-space, BOM, mixed line endings, zero-width chars. The before/after diff is the demo.
|
|
||||||
|
|
||||||
#### Tier 2: Worth-considering for v1.1
|
|
||||||
|
|
||||||
24. Custom regex find/replace - power-user escape hatch, per-column.
|
|
||||||
25. Diacritic strip (`José` to `Jose`). Lossy; opt-in only.
|
|
||||||
26. Mojibake auto-repair - detect `Ã©` to `é` patterns (UTF-8 read as Latin-1 then re-encoded) and fix. Standard tool: `ftfy`. Promote to Tier 1 if early buyers report this.
|
|
||||||
27. Punctuation normalization - all Unicode dash/quote/space variants folded; runs of punctuation collapsed.
|
|
||||||
28. Profile detector - scan a file and recommend which ops to enable based on what's actually present. Lowers config friction further.
|
|
||||||
|
|
||||||
#### Tier 3: Optional / later
|
|
||||||
|
|
||||||
29. Locale-aware case conversion (Turkish dotted/dotless `i`, German `ß`).
|
|
||||||
30. Custom character-class strip rules (regex-class).
|
|
||||||
31. Streaming / chunked write for very large files (defer until a buyer reports it).
|
|
||||||
|
|
||||||
#### Open decisions captured at spec time
|
|
||||||
|
|
||||||
- Smart-character folding default ON in `excel-hygiene` accepted as the right tradeoff: highest-impact use case, dry-run preview makes the change visible before commit.
|
|
||||||
- NFKC stays Tier 1 but OFF by default and excluded from `excel-hygiene`. Lossy by design.
|
|
||||||
- CLI surface: separate `src/cli_text_clean.py` module, matching the "one CLI binary per script on PATH" pattern in Section 3.2. Not a subcommand on the existing dedup Typer app.
|
|
||||||
- `ftfy` dependency deferred to Tier 2 (~5MB). Revisit if mojibake reports come in.
|
|
||||||
|
|
||||||
### 10.2.1 Upload-time analyzer (`src/core/analyze.py`)
|
|
||||||
|
|
||||||
The analyzer is a read-only, advisory pass that runs on every uploaded file before any tool page sees it. It produces a list of `Finding` objects, each carrying:
|
|
||||||
|
|
||||||
| Field | Type | Meaning |
|
|
||||||
|---|---|---|
|
|
||||||
| `id` | str | Stable identifier (`smart_punctuation_in_data`, `mixed_line_endings`, …). Never localized. |
|
|
||||||
| `severity` | `info` / `warn` / `error` | UX urgency. `error` is the only level that blocks the gate. |
|
|
||||||
| `confidence` | `high` / `medium` / `low` | Auto-fixability. **High** is round-trip safe, **medium** has known false-positive shapes, **low** is heuristic and opt-in. |
|
|
||||||
| `fix_action` | str | Stable id naming the algorithm in `src/core/fixes.py` that resolves this finding. Empty for informational-only findings. |
|
|
||||||
| `pre_applied` | bool | True when the fix already ran during the read pass (BOM strip, NUL strip, byte-level smart-quote fold). The gate treats these as already-resolved. |
|
|
||||||
| `tool` | str | Tool id that owns this concern (`02_text_cleaner`, `04_missing_handler`). Empty for file-level findings. |
|
|
||||||
| `count` | int | Cells / rows affected. |
|
|
||||||
| `description` | str | One-sentence human summary (banners, tooltips). |
|
|
||||||
| `column` | str / None | Column name when scoped to one column. |
|
|
||||||
| `samples` | list[(row, col, value)] | Up to 5 examples for the GUI to render. |
|
|
||||||
|
|
||||||
`analyze(source, *, sample_rows=1000, repair_result=None, encoding_override=None)` is the public entry point. `source` is a DataFrame or a path; `encoding_override` skips charset detection and uses the user's chosen codepage instead — this is the hook that lets the Review page recover from misdetections (cp1252-vs-cp1250 ambiguity, KOI8-R surfacing as Shift_JIS).
|
|
||||||
|
|
||||||
### 10.2.2 CSV-normalization gate (`src/core/normalize.py`, `src/core/fixes.py`)
|
|
||||||
|
|
||||||
A file enters tool pages only after passing the gate. The gate has two paths:
|
|
||||||
|
|
||||||
1. **Auto-fix** — `auto_fix(df, findings)` applies every `confidence="high"` finding whose `fix_action` is registered in `fixes.py`.
|
|
||||||
2. **Per-finding decisions** — `apply_decisions(df, findings, decisions)` accepts an explicit list of `Decision(finding_id, action, payload)` where action is `"auto" | "skip" | "modified"`.
|
|
||||||
|
|
||||||
Output is a `NormalizationResult` with:
|
|
||||||
|
|
||||||
- `cleaned_df` — the DataFrame after every applied fix.
|
|
||||||
- `cleaned_bytes` — UTF-8 CSV serialization for the download.
|
|
||||||
- `applied`, `skipped_findings`, `pending_findings`, `blocking_findings` — audit log + gate status.
|
|
||||||
|
|
||||||
`is_normalized(findings, result)` re-runs `analyze()` against the cleaned bytes and returns False if any high-confidence detector still fires — that's the strict contract tool pages depend on.
|
|
||||||
|
|
||||||
`fixes.py` is a registry: `@register("fix_id")` decorates a `(df, payload) -> (new_df, n_cells_changed)` function. Adding a new fix means appending one entry to `analyze.py`'s `FIX_*` constants, one detector that emits a Finding with that `fix_action`, and one registered function in `fixes.py`. No other call sites change.
|
|
||||||
|
|
||||||
### 10.2.3 Review page (`src/gui/pages/0_Review.py`)
|
|
||||||
|
|
||||||
Streamlit page that orchestrates the gate visually. Gates the entire tool sidebar via `require_normalization_gate()` in `src/gui/components.py`, which every tool page calls right after `hide_streamlit_chrome()`.
|
|
||||||
|
|
||||||
The page:
|
|
||||||
|
|
||||||
1. Surfaces the detected encoding plus an override picker (16 common codepages + custom-text fallback).
|
|
||||||
2. Renders one expandable card per finding, sorted by severity then confidence, with a decision radio (Auto / Skip / Customize), a live before/after preview built by running the registered fix on each `Finding.samples` value, and a payload editor for fixes that take user input (e.g. custom null-sentinel list for `replace_null_sentinels`).
|
|
||||||
3. Apply button persists a `NormalizationResult` keyed by upload SHA-256; tool pages refuse to load until the hash matches.
|
|
||||||
4. After apply, an `⚙️ Advanced output options` expander offers per-download encoding, delimiter, and line-terminator selection. The helper `_build_output_bytes(df, *, encoding, delimiter, line_terminator)` returns `(bytes, error_message)` — when the chosen encoding can't represent a character, it falls back to `errors="replace"` and returns a warning that the page surfaces.
|
|
||||||
|
|
||||||
### 10.2.4 Pre-parse repair (`src/core/io.py::repair_bytes`)
|
|
||||||
|
|
||||||
Byte-level pre-parse pass. Order is meaningful and each step is independently toggleable:
|
|
||||||
|
|
||||||
1. **Wide-encoding transcode** — UTF-16/UTF-32 → UTF-8. Has to run first because the byte-level NUL strip below would shred UTF-16 data (UTF-16 ASCII chars carry NUL as half of every 16-bit unit). Records `transcode_to_utf8` audit action; the analyzer surfaces it as a `csv_transcoded_to_utf8` info finding.
|
|
||||||
2. **UTF-8 BOM strip** (file start only).
|
2. **UTF-8 BOM strip** (file start only).
|
||||||
3. **NUL strip** — only meaningful after step 1, so that any NUL found indicates genuine corruption (truncated C strings, half-binary exports) rather than an encoding artifact.
|
3. **NUL strip** — only meaningful after step 1, so it flags genuine corruption.
|
||||||
4. **Line-ending normalize** — CRLF and bare CR → LF. Bare CR confuses the C parser; the text-cleaner contract also calls for LF inside multi-line cells.
|
4. **Line-ending normalize** — CRLF + bare CR → LF.
|
||||||
5. **Byte-level smart-quote fold** — curly / guillemet / double-prime → ASCII `"`. Only structural double-quote-equivalents; single curly quotes are deferred to the cell-level cleaner.
|
5. **Byte-level smart-quote fold** — curly / guillemet / double-prime → ASCII `"` (only structural double-quote-equivalents; single curlies deferred to cell-level).
|
||||||
6. **Per-row delimiter repair** — when one row has +1 field and the merge candidate is currency-shaped (`$1,500.00` etc.), merge and quote.
|
6. **Per-row delimiter repair** — when a row has +1 field and merge candidate is currency-shaped (`$1,500.00`), merge + quote.
|
||||||
|
|
||||||
`detect_encoding()` tries strict UTF-8 first and returns `"utf-8"` if the bytes decode cleanly. This was added because charset-normalizer fingerprints small files dominated by short non-ASCII sequences (e.g. zero-width characters such as U+200B) as `mac_latin2` — but if the bytes are valid UTF-8, that's the right answer regardless of label.
|
`detect_encoding()` tries strict UTF-8 first — charset-normalizer mislabels short-non-ASCII files as `mac_latin2`, but valid UTF-8 bytes mean UTF-8 regardless of label.
|
||||||
|
|
||||||
### 10.3 - 10.9 (Future)
|
|
||||||
|
|
||||||
Functional specs for scripts 03 through 09 to be added when each script enters active build. The deduplicator (10.1) and text cleaner (10.2) specs are the template; specs for other scripts should follow the same Tier 1 / Tier 2 / Tier 3 structure with explicit strategic framing (what's the market gap this script fills, given that some of its functionality is available free elsewhere).
|
|
||||||
|
|||||||
217
docs/USER-GUIDE.es.md
Normal file
217
docs/USER-GUIDE.es.md
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
> 🌐 **Idioma:** Español · [English](USER-GUIDE.md)
|
||||||
|
|
||||||
|
# Guía del usuario
|
||||||
|
|
||||||
|
**Versión**: 1.6 · **Actualizado**: 2026-05-13
|
||||||
|
|
||||||
|
## 0. Primer arranque — activación
|
||||||
|
|
||||||
|
DataTools debe activarse antes de desbloquear cualquier herramienta. En el primer arranque verás la pantalla **Activar**.
|
||||||
|
|
||||||
|
Introduce tu nombre completo y correo, pega el código de licencia del correo de compra (empieza con `DTLIC1:`) y pulsa **Activar**. La renovación funciona igual: pega el código de renovación y pulsa **Aplicar renovación**.
|
||||||
|
|
||||||
|
**Niveles**:
|
||||||
|
|
||||||
|
| Nivel | Herramientas |
|
||||||
|
|---|---|
|
||||||
|
| **Lite** | Buscar duplicados · Limpiar texto · Estandarizar formatos |
|
||||||
|
| **Core** | Las 9 herramientas |
|
||||||
|
|
||||||
|
Un usuario Lite que abra una herramienta exclusiva de Core verá un mensaje "Actualiza tu licencia". La página de inicio también muestra una marca 🔒 Bloqueado en las tarjetas de las herramientas que tu nivel no incluye. Para actualizar, pega un código Core en la página Activar.
|
||||||
|
|
||||||
|
Cada licencia dura 1 año. La barra lateral muestra tu nivel y los días restantes en todo momento; aparece un aviso de renovación 30 días antes de la caducidad. El archivo de licencia vive en `~/.datatools/license.json` (Windows: `C:\Users\<tú>\.datatools\license.json`).
|
||||||
|
|
||||||
|
Para usar la misma licencia en otro equipo: desactiva éste (página Activar → **Desactivar este dispositivo**) y vuelve a pegar tu código en el nuevo.
|
||||||
|
|
||||||
|
## 1. Instalación
|
||||||
|
|
||||||
|
No necesitas tener Python ni permisos de administrador — el paquete trae su propio intérprete y todas las dependencias. Dos formatos por sistema operativo, elige el que tu política de TI permita:
|
||||||
|
|
||||||
|
- **Instalador** — crea automáticamente acceso directo en el escritorio + entrada en el menú Inicio / Launchpad. Recomendado para la mayoría.
|
||||||
|
- **.zip portable** — descomprime y haz doble clic. No toca el registro, se ejecuta desde cualquier lugar (escritorio, USB, recurso de red). Úsalo si no puedes ejecutar instaladores, quieres una instalación de una sola carpeta que puedas copiar entre equipos, o estás evaluando antes de instalar.
|
||||||
|
|
||||||
|
Ambos formatos son idénticos por dentro: mismo Python, mismas dependencias, mismo comportamiento de arranque.
|
||||||
|
|
||||||
|
### 1.1 Windows
|
||||||
|
|
||||||
|
**Opción A — Instalador (`DataTools-<ver>-win-setup.exe`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-win-setup.exe` desde tu correo de licencia o GitHub Releases.
|
||||||
|
2. Doble clic en el instalador. La primera vez, Windows SmartScreen mostrará **"Windows protegió tu PC"** — pulsa **Más información** → **Ejecutar de todas formas**. (Este aviso solo aparece una vez por compilación hasta que tengamos un certificado EV de firma de código.)
|
||||||
|
3. Acepta la ruta de instalación por usuario (`%LOCALAPPDATA%\Programs\DataTools` por defecto — no pide UAC). Marca **Crear acceso directo en el escritorio** si lo quieres (activado por defecto).
|
||||||
|
4. Pulsa **Instalar** y luego **Finalizar**. El instalador te ofrece lanzar DataTools al terminar.
|
||||||
|
5. A partir de ahora ejecútalo desde: **Menú Inicio → DataTools**, el **acceso directo del escritorio**, o escribiendo `DataTools` en Ejecutar (Win+R) / cmd.
|
||||||
|
|
||||||
|
Para anclarlo a la barra de tareas, lanza la app una vez, clic derecho en su icono de la barra de tareas, y **Anclar a la barra de tareas**. Windows requiere este paso manual — ningún instalador puede anclar por programa.
|
||||||
|
|
||||||
|
**Opción B — Portable (`DataTools-<ver>-win-portable.zip`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-win-portable.zip`.
|
||||||
|
2. Clic derecho en el .zip → **Extraer todo…** → elige una carpeta (p. ej. `C:\Tools\DataTools`).
|
||||||
|
3. Abre la carpeta `DataTools\` extraída, doble clic en `DataTools.exe`. El aviso de SmartScreen aparece solo la primera vez.
|
||||||
|
4. Para crear tu propio acceso directo en el escritorio: clic derecho en `DataTools.exe` → **Enviar a → Escritorio (crear acceso directo)**.
|
||||||
|
|
||||||
|
**Desinstalar** (solo instalador): Configuración → Aplicaciones → DataTools → Desinstalar. Portable: borra la carpeta.
|
||||||
|
|
||||||
|
### 1.2 macOS
|
||||||
|
|
||||||
|
**Opción A — DMG instalador (`DataTools-<ver>-mac.dmg`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-mac.dmg`.
|
||||||
|
2. Doble clic en el .dmg. Se abre una ventana de Finder con el icono **DataTools** y un alias **Aplicaciones**.
|
||||||
|
3. Arrastra **DataTools** sobre **Aplicaciones**. Espera a que termine la copia y expulsa el DMG.
|
||||||
|
4. En compilaciones sin firma, el primer arranque muestra **"No se puede abrir 'DataTools' porque no se puede verificar al desarrollador"**. Solución: clic derecho en DataTools en /Aplicaciones → **Abrir** → confirma **Abrir** en el diálogo. macOS recuerda la elección — los siguientes arranques no muestran nada.
|
||||||
|
5. Ejecútalo desde **Launchpad**, **Spotlight** (`⌘ Espacio` → escribe "DataTools"), o **Aplicaciones** en Finder.
|
||||||
|
|
||||||
|
Para mantener DataTools en el Dock: lanza la app, clic derecho en su icono del Dock → **Opciones → Mantener en el Dock**. macOS no permite que los instaladores fijen al Dock automáticamente.
|
||||||
|
|
||||||
|
**Opción B — Portable (`DataTools-<ver>-mac-portable.zip`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-mac-portable.zip`. Safari descomprime al descargar por defecto; en Finder verás `DataTools.app` directamente.
|
||||||
|
2. Mueve `DataTools.app` a **Aplicaciones** si quieres que aparezca en Launchpad — o déjalo en el escritorio, un USB o un recurso de red. La .app portable se ejecuta desde cualquier sitio.
|
||||||
|
3. Doble clic en `DataTools.app`. Clic derecho → **Abrir** la primera vez (misma rutina que con el DMG).
|
||||||
|
|
||||||
|
**Desinstalar**: arrastra `DataTools.app` a la Papelera. Tus archivos de datos siguen donde estén — la app no instala nada más.
|
||||||
|
|
||||||
|
### 1.3 Linux
|
||||||
|
|
||||||
|
`DataTools-<ver>-linux-x86_64.AppImage` ya es portable — no hay .zip aparte.
|
||||||
|
|
||||||
|
1. Descarga el .AppImage.
|
||||||
|
2. `chmod +x DataTools-*.AppImage`.
|
||||||
|
3. Doble clic, o ejecútalo desde la terminal.
|
||||||
|
|
||||||
|
Si tu distro no incluye FUSE 2: `sudo apt install libfuse2` (Debian/Ubuntu) o equivalente.
|
||||||
|
|
||||||
|
### 1.4 Qué pasa al arrancar por primera vez
|
||||||
|
|
||||||
|
El lanzador (llamado `DataTools.exe` / `DataTools.app` / `DataTools.AppImage`) hace tres cosas, en orden:
|
||||||
|
|
||||||
|
1. Elige un puerto TCP libre en `127.0.0.1` — normalmente el 8501; si está ocupado prueba 8502, 8503, …
|
||||||
|
2. Arranca un servidor Streamlit local en ese puerto. El servidor solo está enlazado a localhost, nunca a tu red.
|
||||||
|
3. Abre tu navegador predeterminado en `http://127.0.0.1:<puerto>/`. Si el navegador no se abre en 5 segundos, pega esa URL manualmente.
|
||||||
|
|
||||||
|
La ventana del lanzador queda abierta en segundo plano. Cerrarla detiene el servidor — la pestaña del navegador dirá "no se puede acceder a este sitio" la próxima vez.
|
||||||
|
|
||||||
|
### 1.5 Cómo funciona la GUI
|
||||||
|
|
||||||
|
- Se ejecuta localmente en tu equipo. **Sin internet, sin subidas.**
|
||||||
|
- El navegador es solo la capa de visualización. Cerrarlo NO detiene la app — cierra la ventana del lanzador (o sal de la .app de macOS desde el Dock) para terminar del todo.
|
||||||
|
- ¿Prefieres la terminal? Cada herramienta incluye también una CLI — ver Sección 3.
|
||||||
|
|
||||||
|
### 1.6 Requisitos del sistema
|
||||||
|
|
||||||
|
- Windows 10/11 (64 bits), macOS 11+, Linux moderno (2020+).
|
||||||
|
- Navegador moderno (Chrome, Edge, Firefox, Safari, últimos 3 años).
|
||||||
|
- ~500 MB de espacio libre en disco (el paquete ocupa ~300 MB; el resto es espacio de trabajo para CSV grandes).
|
||||||
|
|
||||||
|
**OCR para PDFs escaneados viene incluido** — Tesseract 5.5 y el modelo en inglés `eng.traineddata` vienen dentro de cada instalador / portable / AppImage. La ruta de extracción de PDFs escaneados del Extractor de PDF funciona sin configuración adicional; no hace falta instalar nada por separado. (Quien ejecute desde un checkout con `pip install -r requirements.txt` sigue necesitando Tesseract del sistema en el `PATH` — ver [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract) (solo en inglés).)
|
||||||
|
|
||||||
|
Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés).
|
||||||
|
|
||||||
|
## 2. Qué incluye
|
||||||
|
|
||||||
|
| # | Herramienta | Propósito | Estado |
|
||||||
|
|---|------|---------|--------|
|
||||||
|
| 01 | Buscar duplicados | Coincidencia exacta + difusa, 5 normalizadores, auditoría | Listo |
|
||||||
|
| 02 | Limpiar texto | Espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo |
|
||||||
|
| 03 | Estandarizar formatos | Fechas / teléfonos / correos / direcciones / nombres / monedas / booleanos | Listo |
|
||||||
|
| 04 | Corregir valores faltantes | Nulos disfrazados, imputación, descarte por umbral | Próximamente |
|
||||||
|
| 05 | Mapear columnas | Renombrar + aplicar esquema | Próximamente |
|
||||||
|
| 06 | Detectar valores atípicos | z-score, IQR, multivariante | Próximamente |
|
||||||
|
| 07 | Combinar archivos | Combina varios archivos | Próximamente |
|
||||||
|
| 08 | Verificación de calidad | Reglas + informe PDF/Excel | Próximamente |
|
||||||
|
| 09 | Flujos automatizados | Lanzador multi-herramienta de un clic | Próximamente |
|
||||||
|
|
||||||
|
**Datos de muestra** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`.
|
||||||
|
|
||||||
|
## 3. Uso
|
||||||
|
|
||||||
|
### 3.1 GUI (recomendada)
|
||||||
|
|
||||||
|
1. Inicia el paquete.
|
||||||
|
2. Selecciona una herramienta en la barra lateral.
|
||||||
|
3. Suelta tu archivo (o elige una muestra).
|
||||||
|
4. Los valores por defecto están preconfigurados — pulsa **Ejecutar** para previsualizar.
|
||||||
|
5. Pulsa **Guardar salida** para escribir el archivo limpio.
|
||||||
|
|
||||||
|
Las opciones avanzadas se encuentran en paneles desplegables. El archivo original nunca se modifica.
|
||||||
|
|
||||||
|
**Ayuda en la herramienta**: cada página tiene un botón **Help** a la derecha del título. Al pulsarlo se abre una ventana emergente con una guía compacta (Cuándo usarla · Pasos · Ejemplos · Consejo). Úsala como recordatorio a media tarea — la ventana se cierra al hacer clic fuera y tus datos no se ven afectados.
|
||||||
|
|
||||||
|
**Navegación lateral**: la barra lateral agrupa las herramientas en secciones (Análisis, Limpiadores de datos, Transformaciones, Automatizaciones). Cada cabecera muestra `+` cuando está plegada y `−` cuando está desplegada — pulsa la cabecera para alternar.
|
||||||
|
|
||||||
|
### 3.2 CLI
|
||||||
|
|
||||||
|
```bash
|
||||||
|
deduplicator customers.csv [--apply]
|
||||||
|
text-cleaner messy.csv [--apply]
|
||||||
|
format-standardize feed.csv [--apply]
|
||||||
|
```
|
||||||
|
|
||||||
|
Ayuda: `deduplicator --help`. Referencia completa: [CLI-REFERENCE.es.md](CLI-REFERENCE.es.md).
|
||||||
|
|
||||||
|
### 3.3 Orden de ejecución (cuando uses las herramientas manualmente)
|
||||||
|
|
||||||
|
Si no usas Flujos automatizados, sigue este orden:
|
||||||
|
|
||||||
|
1. **02 Limpiar texto** primero — normaliza espacios y caracteres especiales.
|
||||||
|
2. **03 Estandarizar formatos** — fechas, teléfonos, etc. necesitan texto limpio.
|
||||||
|
3. **04 Corregir valores faltantes** — códigos centinela se ocultan como números.
|
||||||
|
4. **05 Mapear columnas** — esquema antes que estadísticas de atípicos.
|
||||||
|
5. **06 Detectar valores atípicos** — necesita datos numéricos limpios. Calcular estadísticas con `NaN` o `-999` envenena los resultados.
|
||||||
|
6. **07 Combinar archivos**, **08 Verificación de calidad** según sea necesario.
|
||||||
|
7. **01 Buscar duplicados** es flexible en cuanto al orden (normaliza internamente para la coincidencia).
|
||||||
|
|
||||||
|
Flujos automatizados aplica este orden automáticamente.
|
||||||
|
|
||||||
|
### 3.4 Idioma
|
||||||
|
|
||||||
|
La barra lateral tiene un selector **Language / Idioma**. Se incluyen dos paquetes hoy:
|
||||||
|
|
||||||
|
- **English** (por defecto)
|
||||||
|
- **Español**
|
||||||
|
|
||||||
|
Elige el idioma una vez — la opción persiste durante la sesión y el selector es visible desde cualquier página. Cambia cuando quieras; la página se vuelve a renderizar en su sitio sin perder datos.
|
||||||
|
|
||||||
|
**Cobertura** (v1.6): página de inicio, tarjetas de herramientas, panel de carga y análisis, lista de hallazgos, indicador de la verificación de normalización CSV, selector lateral y pantalla de cierre. Los cuerpos de cada página de herramienta (etiquetas de opciones avanzadas, indicaciones del mapeador de columnas, etiquetas de revisión de duplicados) están planificados para paquetes futuros — actualmente se muestran en inglés en ambos modos. Si una cadena que esperabas ver traducida no cambia, se trata de una clave de paquete pendiente, no de un fallo del selector; escribe a soporte adjuntando una captura.
|
||||||
|
|
||||||
|
## 4. Verificación de Revisar y Normalizar
|
||||||
|
|
||||||
|
Cada archivo subido se analiza antes de que cualquier herramienta lo toque.
|
||||||
|
|
||||||
|
**Niveles de confianza**:
|
||||||
|
- **Alta** — seguras de ida y vuelta. El botón "Corregir automáticamente lo de alta confianza" las aplica todas con un clic.
|
||||||
|
- **Media** — normalmente correctas, con falsos positivos ocasionales. Previsualiza primero.
|
||||||
|
- **Baja** — heurística. Desactivada por defecto; opt-in por hallazgo.
|
||||||
|
- **Error** — bloquea la verificación (archivo vacío, U+FFFD, filas no reparables).
|
||||||
|
|
||||||
|
**Sustitución de codificación**: cuando el detector reporta `encoding_uncertain` o detectas mojibake (`é`) o caracteres `<60>`, elige el codepage correcto en la parte superior de la página (cp1252 para Excel occidental, KOI8-R para ruso antiguo, Big5 para chino tradicional, …) → **Re-analizar**.
|
||||||
|
|
||||||
|
**Salida avanzada**: un desplegable `⚙️` en la descarga te permite ajustar la codificación, el delimitador y el terminador de línea. El nombre del archivo descargado se ajusta automáticamente (`.tsv` para tabulador, `.csv` en los demás casos).
|
||||||
|
|
||||||
|
## 5. Salida
|
||||||
|
|
||||||
|
Cada ejecución escribe:
|
||||||
|
- **Archivo limpio** junto al original (o donde indiques).
|
||||||
|
- **Archivo de auditoría** (cambios celda por celda en herramientas de texto/formato, grupos de coincidencia en deduplicación).
|
||||||
|
- **Registro con marca de tiempo** en `logs/`.
|
||||||
|
|
||||||
|
El archivo original nunca se modifica.
|
||||||
|
|
||||||
|
## 6. Solución de problemas
|
||||||
|
|
||||||
|
- **La GUI no se abre / el navegador no se inicia** — espera 10-15 s; visita manualmente `http://127.0.0.1:8501` (o el puerto que muestre la ventana del lanzador). Error de puerto ocupado → cierra otras instancias. El lanzador recorre los puertos 8501–8550 buscando uno libre, así que una instancia colgada puede desplazar la URL.
|
||||||
|
- **¿Por qué se abre el navegador?** — patrón de aplicación web local (igual que Jupyter o RStudio). Nada sale de tu equipo.
|
||||||
|
- **Windows SmartScreen** — pulsa "Más información" → "Ejecutar de todas formas". Una sola vez por compilación hasta que tengamos un certificado EV.
|
||||||
|
- **macOS "La aplicación está dañada" / "no se puede verificar al desarrollador"** — clic derecho en la app → **Abrir** → confirma. Si el mensaje persiste, el archivo se corrompió en tránsito — vuelve a descargarlo. Último recurso: `xattr -cr /Applications/DataTools.app` limpia el atributo de cuarentena.
|
||||||
|
- **macOS — el .zip portable extraído no abre** — Safari descomprime al descargar; si ves una carpeta `__MACOSX/` o archivos `._DataTools.app` usaste otro descompresor. Vuelve a extraer con la Utilidad de Archivo integrada (clic derecho en el .zip → **Abrir con → Utilidad de Archivo**) para preservar los metadatos de la .app.
|
||||||
|
- **Windows — el antivirus pone en cuarentena `DataTools.exe` del portable** — tu antivirus no reconoce el paquete. Añade la carpeta extraída a la lista blanca. El instalador .exe activa menos antivirus porque es un envoltorio Inno Setup conocido.
|
||||||
|
- **El AppImage de Linux no se ejecuta** — `chmod +x archivo.AppImage`. Si falta FUSE → `sudo apt install libfuse2`.
|
||||||
|
- **Lento con archivos grandes** — por encima de ~100k filas tarda más; la barra de progreso lo indica. Para millones de filas → usa la CLI directamente.
|
||||||
|
- **¿Dónde guarda la app mi licencia / configuración?** — `~/.datatools/` en macOS y Linux, `C:\Users\<tú>\.datatools\` en Windows. Tus archivos de entrada y salida siguen donde los dejes; la app nunca los copia a otro sitio.
|
||||||
|
- **Necesito ayuda** — escribe al correo que aparece en tu recibo de compra.
|
||||||
|
|
||||||
|
## 7. Licencia
|
||||||
|
|
||||||
|
Usuario único. Consulta `LICENSE.txt`.
|
||||||
@@ -1,208 +1,217 @@
|
|||||||
# USER-GUIDE.md - Excel & CSV Data Cleaning Mastery Bundle
|
> 🌐 **Language:** English · [Español](USER-GUIDE.es.md)
|
||||||
|
|
||||||
**Version**: 1.6
|
# User Guide
|
||||||
**Last updated**: April 28, 2026
|
|
||||||
|
|
||||||
Thank you for purchasing the Data Cleaning Mastery Bundle. This guide covers installation and every script included.
|
**Version**: 1.6 · **Updated**: 2026-05-01
|
||||||
|
|
||||||
---
|
## 0. First launch — activation
|
||||||
|
|
||||||
## 1. Installation
|
DataTools must be activated before any tools unlock. On first launch you'll see the **Activate** screen.
|
||||||
|
|
||||||
The bundle is fully self-contained. **You do not need to install Python.**
|
Enter your full name + email, paste the license blob from your purchase email (starts with `DTLIC1:`), and click **Activate**. Renewal works the same way — paste the renewal blob, click **Apply renewal**.
|
||||||
|
|
||||||
### Windows
|
**Tiers**:
|
||||||
|
|
||||||
1. Download `BundleName-Setup-1.0.exe` from your purchase email.
|
| Tier | Tools |
|
||||||
2. Double-click the installer.
|
|---|---|
|
||||||
3. Follow the wizard. The installer creates a desktop shortcut named "Launch Bundle" and an entry in Start Menu.
|
| **Lite** | Find Duplicates · Clean Text · Standardize Formats |
|
||||||
4. Launch via the desktop shortcut. Your default browser will open to a local page (typically `http://localhost:8501`) where the data tool runs.
|
| **Core** | All 9 tools |
|
||||||
|
|
||||||
### macOS
|
A Lite user opening a Core-only tool sees an "Upgrade your license" prompt. The home page also shows a 🔒 Locked badge on tool cards your tier doesn't unlock. To upgrade, paste a Core blob on the Activate page.
|
||||||
|
|
||||||
1. Download `BundleName-1.0.dmg` from your purchase email.
|
Every license lasts 1 year. The sidebar shows your tier and days remaining at all times; a renewal warning appears 30 days before expiry. The license file lives at `~/.datatools/license.json` (Windows: `C:\Users\<you>\.datatools\license.json`).
|
||||||
2. Double-click the `.dmg` to mount it.
|
|
||||||
3. Drag the Bundle app into the Applications folder.
|
|
||||||
4. Launch from Applications, Spotlight, or Launchpad. Your default browser will open to a local page where the data tool runs.
|
|
||||||
|
|
||||||
The app is signed and notarized by Apple, so it opens cleanly with no security warnings.
|
To use the same license on a different machine: deactivate this one (Activate page → **Deactivate this device**) and re-paste your blob on the new machine.
|
||||||
|
|
||||||
### Linux
|
## 1. Install
|
||||||
|
|
||||||
1. Download `BundleName-1.0.AppImage` from your purchase email.
|
You don't need Python and you don't need admin rights — the bundle ships its own interpreter and every dependency. Two flavors per OS, pick whichever your IT policy allows:
|
||||||
2. Make it executable: `chmod +x BundleName-1.0.AppImage`
|
|
||||||
3. Double-click to run, or execute from a terminal. Your default browser will open to a local page where the data tool runs.
|
|
||||||
|
|
||||||
If AppImage doesn't work on your distribution, a `.tar.gz` fallback is available in your purchase email. Extract it and run `./run.sh` from the extracted folder.
|
- **Installer** — wires up Desktop shortcut + Start Menu / Launchpad entry automatically. Recommended for most users.
|
||||||
|
- **Portable .zip** — unzip and double-click. No registry writes, runs from anywhere (Desktop, USB stick, network share). Use this if you can't run installers, want a single-folder install you can copy between machines, or are evaluating before committing to install.
|
||||||
|
|
||||||
### How the GUI works (important to know)
|
Both flavors are byte-identical inside: same Python, same dependencies, same launch behavior.
|
||||||
|
|
||||||
This tool runs in your browser **locally on your computer**. When you launch it, a small program starts a local server on your machine and opens your default browser to view it. This is normal and expected.
|
### 1.1 Windows
|
||||||
|
|
||||||
- **No internet is required.** Your data never leaves your computer.
|
**Option A — Installer (`DataTools-<ver>-win-setup.exe`)**
|
||||||
- **Your data is not uploaded anywhere.** All processing happens on your machine.
|
|
||||||
- The browser is just the display surface. Closing the browser closes the GUI; the underlying program also stops.
|
|
||||||
|
|
||||||
If you prefer the command line, every script also ships as a CLI tool. See Section 3.
|
1. Download `DataTools-<ver>-win-setup.exe` from your release email or GitHub Releases.
|
||||||
|
2. Double-click the installer. On the first run Windows SmartScreen will say **"Windows protected your PC"** — click **More info** → **Run anyway**. (This warning only appears once per build until we have an EV code-signing cert.)
|
||||||
|
3. Accept the per-user install location (`%LOCALAPPDATA%\Programs\DataTools` by default — no admin prompt). Check **Create a desktop shortcut** if you want one (on by default).
|
||||||
|
4. Click **Install**, then **Finish**. The installer offers to launch DataTools immediately.
|
||||||
|
5. From now on launch from: **Start Menu → DataTools**, the **Desktop shortcut**, or just type `DataTools` into Windows Run (Win+R) / cmd.
|
||||||
|
|
||||||
### Requirements
|
To pin to the taskbar, launch the app once, right-click its icon in the taskbar, then **Pin to taskbar**. Windows requires this manual step — no installer is allowed to pin programmatically.
|
||||||
|
|
||||||
- Windows: Windows 10 or 11 (64-bit).
|
**Option B — Portable (`DataTools-<ver>-win-portable.zip`)**
|
||||||
- macOS: macOS 11 Big Sur or later (Apple Silicon or Intel).
|
|
||||||
- Linux: any modern 64-bit distribution from 2020 onward.
|
|
||||||
- A modern default browser (Chrome, Edge, Firefox, or Safari from the last 3 years).
|
|
||||||
- ~400-500 MB free disk space.
|
|
||||||
- Internet connection: not required.
|
|
||||||
|
|
||||||
For the full short-form numbered list of what's supported (file sizes, code pages, delimiters, performance targets, detector list, etc.), see [REQUIREMENTS.md](REQUIREMENTS.md).
|
1. Download `DataTools-<ver>-win-portable.zip`.
|
||||||
|
2. Right-click the .zip → **Extract All…** → pick a folder (e.g. `C:\Tools\DataTools`).
|
||||||
|
3. Open the extracted `DataTools\` folder, double-click `DataTools.exe`. SmartScreen warning fires the first time only.
|
||||||
|
4. To create your own desktop shortcut later: right-click `DataTools.exe` → **Send to → Desktop (create shortcut)**.
|
||||||
|
|
||||||
---
|
**Uninstall** (installer only): Settings → Apps → DataTools → Uninstall. Portable: delete the folder.
|
||||||
|
|
||||||
## 2. What's Included
|
### 1.2 macOS
|
||||||
|
|
||||||
**Scripts (in the `scripts/` folder)**:
|
**Option A — Installer DMG (`DataTools-<ver>-mac.dmg`)**
|
||||||
|
|
||||||
| # | Script | Purpose | Status |
|
1. Download `DataTools-<ver>-mac.dmg`.
|
||||||
|---|---|---|---|
|
2. Double-click the .dmg. A Finder window opens showing the **DataTools** icon and an **Applications** alias.
|
||||||
| 01 | `01_deduplicator.py` | Smart duplicate removal: exact match + basic fuzzy, configurable subset columns, full logs | Working |
|
3. Drag **DataTools** onto **Applications**. Wait for the copy to finish, then eject the DMG.
|
||||||
| 02 | `02_text_cleaner.py` | Character-level hygiene: trim leading/trailing whitespace, collapse internal multi-spaces, strip non-printable characters, Unicode normalization (smart quotes, em-dashes, accents), remove zero-width characters, BOM handling, line-ending normalization, case operations | Working |
|
4. On unsigned builds the first launch shows **"DataTools" cannot be opened because the developer cannot be verified**. Fix: right-click DataTools in /Applications → **Open** → confirm **Open** in the dialog. macOS remembers this choice — subsequent launches are clean.
|
||||||
| 03 | `03_format_standardizer.py` | Standardize dates, currencies, names, phone numbers, addresses | Skeleton |
|
5. Launch from **Launchpad**, **Spotlight** (`⌘ Space` → type "DataTools"), or **Applications** in Finder.
|
||||||
| 04 | `04_missing_value_handler.py` | Detect and handle missing values: disguised nulls (`N/A`, `-`, blanks, sentinel codes), imputation (mean/median/mode/forward-fill), required-field enforcement, drop-by-threshold | Skeleton |
|
|
||||||
| 05 | `05_column_mapper_enforcer.py` | Rename columns, enforce a target schema | Skeleton |
|
|
||||||
| 06 | `06_outlier_detector.py` | Detect and flag statistical outliers (z-score, IQR, modified z-score), multivariate detection, domain-rule violations, optional winsorization | Skeleton |
|
|
||||||
| 07 | `07_multi_file_merger.py` | Merge multiple CSV or Excel files into one | Skeleton |
|
|
||||||
| 08 | `08_validator_reporter.py` | Validate data against rules, output PDF or Excel report | Skeleton |
|
|
||||||
| 09 | `09_master_orchestrator.py` | One-click launcher menu, calls any other script | Skeleton |
|
|
||||||
|
|
||||||
**Sample data (in the `samples/` folder)**:
|
To keep DataTools in the Dock: launch the app, right-click its Dock icon → **Options → Keep in Dock**. macOS doesn't allow installers to pin to the Dock automatically.
|
||||||
- `messy_sales.csv` - intentionally dirty sales data for testing.
|
|
||||||
- `bank_export.xlsx` - sample bank export for testing missing-value handling and outlier detection.
|
|
||||||
|
|
||||||
---
|
**Option B — Portable (`DataTools-<ver>-mac-portable.zip`)**
|
||||||
|
|
||||||
|
1. Download `DataTools-<ver>-mac-portable.zip`. Safari auto-unzips on download; in Finder you'll see `DataTools.app` directly.
|
||||||
|
2. Move `DataTools.app` to **Applications** if you want it discoverable via Launchpad — or keep it on your Desktop, a USB stick, or a network share. The portable .app runs from anywhere.
|
||||||
|
3. Open `DataTools.app`. The first time, right-click it → **Open** instead of double-clicking (same unsigned-build workaround as the DMG); after that, a normal double-click works.
|
||||||
|
|
||||||
|
**Uninstall**: drag `DataTools.app` to the Trash. Your data files stay where you put them — nothing else is installed.
|
||||||
|
|
||||||
|
### 1.3 Linux
|
||||||
|
|
||||||
|
`DataTools-<ver>-linux-x86_64.AppImage` is already portable — no separate zip needed.
|
||||||
|
|
||||||
|
1. Download the .AppImage.
|
||||||
|
2. `chmod +x DataTools-*.AppImage`.
|
||||||
|
3. Double-click, or run it from a terminal.
|
||||||
|
|
||||||
|
If your distro doesn't ship FUSE 2: `sudo apt install libfuse2` (Debian/Ubuntu) or equivalent.
|
||||||
|
|
||||||
|
### 1.4 What happens on first launch
|
||||||
|
|
||||||
|
The launcher (called `DataTools.exe` / `DataTools.app` / `DataTools.AppImage`) does three things, in order:
|
||||||
|
|
||||||
|
1. Picks a free TCP port on `127.0.0.1` — usually 8501, falls back through 8502, 8503, … if another app is using 8501.
|
||||||
|
2. Starts a local Streamlit server on that port. The server is **bound to localhost only**, never to your LAN.
|
||||||
|
3. Opens your default browser at `http://127.0.0.1:<port>/`. If the browser doesn't open within 5 seconds, paste that URL into your browser manually.
|
||||||
|
|
||||||
|
The launcher window stays open in the background. Closing it stops the server — the browser tab will say "this site can't be reached" the next time you click it.
|
||||||
|
|
||||||
|
### 1.5 How the GUI works
|
||||||
|
|
||||||
|
- Runs locally on your machine. **No internet, no upload.**
|
||||||
|
- The browser is just the display surface. Closing it does NOT stop the app — close the launcher window (or quit the macOS .app from the Dock) to fully exit.
|
||||||
|
- Prefer the terminal? Every tool ships with a CLI too (Section 3).
|
||||||
|
|
||||||
|
### 1.6 System requirements
|
||||||
|
|
||||||
|
- Windows 10/11 (64-bit), macOS 11+, modern Linux (2020+).
|
||||||
|
- Modern browser (Chrome, Edge, Firefox, Safari, last 3 years).
|
||||||
|
- ~500 MB free disk space (the bundle itself is ~300 MB; the rest is working scratch space for large CSVs).
|
||||||
|
|
||||||
|
**OCR for scanned PDFs is bundled** — Tesseract 5.5 + the English `eng.traineddata` model ship inside every installer / portable / AppImage. The PDF Extractor's scanned-statement path works out of the box; no separate install required. (Developers running from a `pip install -r requirements.txt` checkout still need system Tesseract on `PATH` — see [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract).)
|
||||||
|
|
||||||
|
Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
||||||
|
|
||||||
|
## 2. What's included
|
||||||
|
|
||||||
|
| # | Tool | Purpose | Status |
|
||||||
|
|---|------|---------|--------|
|
||||||
|
| 01 | Find Duplicates | Exact + fuzzy match, 5 normalizers, audit | Ready |
|
||||||
|
| 02 | Clean Text | Whitespace, smart chars, BOM, line endings, case ops | Ready |
|
||||||
|
| 03 | Standardize Formats | Dates / phones / emails / addresses / names / currencies / booleans | Ready |
|
||||||
|
| 04 | Fix Missing Values | Disguised nulls, imputation, drop-by-threshold | Coming Soon |
|
||||||
|
| 05 | Map Columns | Rename + enforce schema | Coming Soon |
|
||||||
|
| 06 | Find Unusual Values | z-score, IQR, multivariate | Coming Soon |
|
||||||
|
| 07 | Combine Files | Combine multiple files | Coming Soon |
|
||||||
|
| 08 | Quality Check | Rules + PDF/Excel report | Coming Soon |
|
||||||
|
| 09 | Automated Workflows | One-click multi-tool launcher | Coming Soon |
|
||||||
|
|
||||||
|
**Sample data** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`.
|
||||||
|
|
||||||
## 3. Usage
|
## 3. Usage
|
||||||
|
|
||||||
You have two ways to use the bundle: the GUI (recommended for most users) or the CLI (for power users and automation).
|
### 3.1 GUI (recommended)
|
||||||
|
|
||||||
### 3.1 GUI usage (recommended)
|
1. Launch the bundle.
|
||||||
|
2. Pick a tool from the sidebar.
|
||||||
|
3. Drop your file (or select a sample).
|
||||||
|
4. Defaults are pre-filled — click **Run** to preview.
|
||||||
|
5. Click **Save Output** to write the cleaned file.
|
||||||
|
|
||||||
1. Launch the bundle via the desktop shortcut, app icon, or AppImage.
|
Advanced options are tucked in expander panes. The original file is never modified.
|
||||||
2. Your browser opens to the bundle's home page.
|
|
||||||
3. Select the script you want to use from the sidebar (Deduplicator, Format Standardizer, etc.).
|
|
||||||
4. Drop your file into the file uploader, or select from the included samples.
|
|
||||||
5. Sensible defaults are pre-filled. Click "Run" to see a preview of what the script will do.
|
|
||||||
6. Review the preview. If it looks right, click "Save Output" to write the cleaned file.
|
|
||||||
|
|
||||||
The GUI is designed to work out of the box with zero configuration. Advanced options are tucked into expandable "Advanced" panes for users who want them.
|
**In-tool Help**: every tool page has a **Help** button right of the title. Click it to open a popover with a compact how-to (When to use · Steps · Examples · Tip). Use it as a refresher mid-task — the popover closes when you click outside, your inputs are untouched.
|
||||||
|
|
||||||
### 3.2 CLI usage
|
**Sidebar nav**: the sidebar groups tools into sections (Analysis, Data Cleaners, Transformations, Automations). Each section header shows `+` when collapsed and `−` when expanded — click the header to toggle.
|
||||||
|
|
||||||
All scripts are also CLI tools with `--help` output.
|
### 3.2 CLI
|
||||||
|
|
||||||
**Basic usage** (from a terminal):
|
```bash
|
||||||
|
deduplicator customers.csv [--apply]
|
||||||
Windows (the bundle adds CLI tools to your PATH):
|
text-cleaner messy.csv [--apply]
|
||||||
```
|
format-standardize feed.csv [--apply]
|
||||||
deduplicator samples\messy_sales.csv
|
|
||||||
```
|
```
|
||||||
|
|
||||||
macOS / Linux:
|
Get help: `deduplicator --help`. Full reference: [CLI-REFERENCE.md](CLI-REFERENCE.md).
|
||||||
```
|
|
||||||
deduplicator samples/messy_sales.csv
|
|
||||||
```
|
|
||||||
|
|
||||||
**With options**:
|
### 3.3 Run order (when running tools manually)
|
||||||
|
|
||||||
```
|
If you skip Automated Workflows, follow this order:
|
||||||
deduplicator samples/messy_sales.csv --output cleaned.csv --subset email,phone
|
|
||||||
```
|
|
||||||
|
|
||||||
**Get help on any script**:
|
1. **02 Clean Text** first — normalizes whitespace + special chars.
|
||||||
|
2. **03 Standardize Formats** — dates, phones, etc. need cleaned text.
|
||||||
|
3. **04 Fix Missing Values** — sentinel codes hide as numbers.
|
||||||
|
4. **05 Map Columns** — schema before outlier stats.
|
||||||
|
5. **06 Find Unusual Values** — needs clean numerics. Stats on data with `NaN` or `-999` are mathematically poisoned.
|
||||||
|
6. **07 Combine Files**, **08 Quality Check** as needed.
|
||||||
|
7. **01 Find Duplicates** is order-flexible (normalizes internally for matching).
|
||||||
|
|
||||||
```
|
Automated Workflows enforces this automatically.
|
||||||
deduplicator --help
|
|
||||||
```
|
|
||||||
|
|
||||||
**Recommended run order**: If you are running scripts individually, run `02_text_cleaner` first to normalize whitespace and special characters, then `04_missing_value_handler` *before* `06_outlier_detector`. Outlier detection on data still containing blanks or sentinel codes (like `-999`) produces unreliable results because missing-value placeholders distort the statistics (means get dragged, IQR widens, false negatives explode). The Master Orchestrator (script 09) runs them in the correct order automatically.
|
### 3.4 Language
|
||||||
|
|
||||||
---
|
The sidebar has a **Language / Idioma** picker. Two packs ship today:
|
||||||
|
|
||||||
## 3.3 Review & Normalize gate
|
- **English** (default)
|
||||||
|
- **Español**
|
||||||
|
|
||||||
Before any tool page accepts a file, the file passes through a **CSV-normalization gate**. The gate scans every uploaded file, surfaces every data-quality issue our analyzer can detect, and lets you choose how to handle each one before downstream tools see the data.
|
Pick a language once — the choice persists for the session and the picker is visible from every page. Switch any time; the page re-renders in place with no data loss.
|
||||||
|
|
||||||
### How it works
|
**Coverage** (v1.6): home page, tool cards, the upload + analysis panel, the findings list, the Review & Normalize gate prompt, the sidebar picker, and the shutdown screen. Per-tool page bodies (advanced-option labels, column-mapper prompts, dedup review labels) are tracked for future packs — they currently render in English in both modes. If a string you'd expect to switch doesn't, that's a missing pack key, not a bug in the picker; email support with a screenshot.
|
||||||
|
|
||||||
1. Upload a file on the home page. The analyzer scans it and counts findings by confidence tier.
|
## 4. Review & Normalize gate
|
||||||
2. Click any tool. If the file hasn't been normalized yet, you're redirected to the **Review & Normalize** page.
|
|
||||||
3. The page shows every finding grouped by severity and confidence, with a per-finding decision control.
|
|
||||||
|
|
||||||
### Confidence tiers
|
Every uploaded file is scanned before any tool sees it.
|
||||||
|
|
||||||
- **High** — round-trip-safe algorithmic fix (BOM strip, whitespace trim, NBSP / zero-width strip, smart-quote fold, line-ending normalize, header cleanup). One-click "Auto-fix high-confidence" applies them all.
|
**Confidence tiers**:
|
||||||
- **Medium** — right call in the common case but with known false-positive shapes. Examples: lowercasing the email column, replacing null-like sentinels (`N/A`, `-`, `nan`), repairing unquoted-currency rows. Preview the change before applying.
|
- **High** — round-trip safe. One-click "Auto-fix high-confidence" applies them all.
|
||||||
- **Low** — heuristic fixes that can corrupt data when wrong. Mojibake repair (`cafÃ©` → `café`), mixed-encoding detection. Off by default; you opt in per finding.
|
- **Medium** — usually right, occasional false positives. Preview first.
|
||||||
- **Error** — blocking. Empty file, unrepairable rows, U+FFFD replacement characters. Cannot enter the tool pages until resolved or explicitly waived.
|
- **Low** — heuristic. Off by default; opt in per finding.
|
||||||
|
- **Error** — blocks the gate (empty file, U+FFFD, unrepairable rows).
|
||||||
|
|
||||||
### Encoding override
|
**Encoding override**: when the picker reports `encoding_uncertain` or you spot mojibake (`é`) or `<60>` chars, choose the right codepage at the top of the page (cp1252 for Western Excel, KOI8-R for older Russian, Big5 for traditional Chinese, …) → **Re-analyze**.
|
||||||
|
|
||||||
When the analyzer reports `encoding_uncertain` or you spot mojibake (`é`) or `<EFBFBD>` characters in the findings list, use the **File encoding** picker at the top of the Review page. Pick the right code page (cp1252 for Western Excel exports, KOI8-R for older Russian data, Big5 for traditional Chinese, etc.) or type a custom one, then click **Re-analyze**. Findings refresh against the corrected decode.
|
**Advanced output**: an `⚙️` expander on the download lets you tune encoding, delimiter, and line terminator. The download filename auto-adjusts (`.tsv` for tab, `.csv` otherwise).
|
||||||
|
|
||||||
The picker is hidden for `.xlsx` files since Excel stores text as Unicode internally.
|
## 5. Output
|
||||||
|
|
||||||
### Advanced output options
|
Every run writes:
|
||||||
|
- **Cleaned file** next to the input (or wherever you specify).
|
||||||
|
- **Audit file** (per-cell changes for text/format tools, match groups for dedup).
|
||||||
|
- **Timestamped log** in `logs/`.
|
||||||
|
|
||||||
After you apply your decisions, an `⚙️ Advanced output options` expander appears on the download. Three dropdowns let you tune the output file format:
|
Original input is never modified.
|
||||||
|
|
||||||
- **Encoding (code page)** — UTF-8 (default), UTF-8 with BOM (Excel-friendly), Windows-1252, Latin-1, Latin-9, cp1250, ISO-8859-2, cp1251, Shift_JIS, GB18030, Big5, EUC-KR, UTF-16 LE.
|
## 6. Troubleshooting
|
||||||
- **Delimiter** — comma (default), tab, semicolon, pipe.
|
|
||||||
- **Line terminator** — LF (default), CRLF (Windows), CR.
|
|
||||||
|
|
||||||
The download filename auto-adjusts the extension (`.tsv` for tab, otherwise `.csv`). When the chosen encoding can't represent a character (Cyrillic content into cp1252, Asian script into Latin-1), the page shows a warning naming the offending character and falls back to `?` replacement so the download still works.
|
- **GUI won't launch / browser doesn't open** — wait 10-15 s; manually visit `http://127.0.0.1:8501` (or whichever port the launcher window prints). Port-in-use error → close other instances. The launcher walks ports 8501–8550 looking for a free one, so a stale instance can shift the URL.
|
||||||
|
- **Why does my browser open?** — local web app pattern (same as Jupyter, RStudio). Nothing leaves your machine.
|
||||||
|
- **Windows SmartScreen** — click "More info" → "Run anyway". The warning appears once per build until we have an EV code-signing certificate.
|
||||||
|
- **macOS "App is damaged" / "developer cannot be verified"** — right-click the app → **Open** → confirm. If the message persists, the file was likely corrupted in transit — re-download. As a last resort: `xattr -cr /Applications/DataTools.app` clears the quarantine attribute.
|
||||||
|
- **macOS portable .zip — extracted but won't open** — Safari unzips on download by default; if you see a `__MACOSX/` folder or `._DataTools.app` file you used a different unarchiver. Re-extract with the built-in Archive Utility (right-click the .zip → **Open With → Archive Utility**) so the .app's metadata is preserved.
|
||||||
|
- **Windows portable .zip — antivirus quarantines DataTools.exe** — your AV doesn't recognize the bundle. Allowlist the extracted folder. The installer .exe trips fewer AV products because it's a known Inno Setup wrapper.
|
||||||
|
- **Linux AppImage won't run** — `chmod +x file.AppImage`. Missing FUSE → `sudo apt install libfuse2`.
|
||||||
|
- **Slow on large files** — files over ~100k rows take longer to process; the progress bar shows status. For multi-million-row files, use the CLI directly.
|
||||||
|
- **Where does the app store my license / settings?** — `~/.datatools/` on macOS + Linux, `C:\Users\<you>\.datatools\` on Windows. Your input/output files stay where you put them; the app never copies them anywhere else.
|
||||||
|
- **Need help** — email the address on your purchase receipt.
|
||||||
|
|
||||||
---
|
## 7. License
|
||||||
|
|
||||||
## 4. Output
|
Single-user. See `LICENSE.txt`.
|
||||||
|
|
||||||
Every script writes:
|
|
||||||
- A cleaned output file next to the input (or wherever you specify).
|
|
||||||
- A timestamped log file in the `logs/` folder showing what changed and why.
|
|
||||||
|
|
||||||
Reports from `validator_reporter` go to the `reports/` folder as PDF or Excel.
|
|
||||||
|
|
||||||
The GUI also displays the output preview in-browser before any file is written. The original input file is never modified.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 5. Troubleshooting
|
|
||||||
|
|
||||||
**The GUI won't launch / browser doesn't open**:
|
|
||||||
1. Wait 10-15 seconds after double-clicking. The local server takes a moment to start the first time.
|
|
||||||
2. If the browser doesn't open automatically, manually visit `http://localhost:8501` in your browser.
|
|
||||||
3. If you see a "port in use" error, another program is using port 8501. Close other instances of the bundle and try again.
|
|
||||||
|
|
||||||
**"Why is my browser opening?" / "Why does this need internet?"**:
|
|
||||||
This tool runs as a local web app. The browser is just the display; nothing is uploaded, nothing leaves your computer. No internet connection is used after install. This is the same approach used by many modern data tools (Jupyter notebooks, RStudio, etc.).
|
|
||||||
|
|
||||||
**Windows: "Windows protected your PC" SmartScreen warning**:
|
|
||||||
Click "More info" then "Run anyway." This is a standard warning for software without an extended-validation Windows code signing certificate.
|
|
||||||
|
|
||||||
**macOS: "App is damaged and cannot be opened"**:
|
|
||||||
This usually indicates the download was corrupted. Re-download from the link in your purchase email.
|
|
||||||
|
|
||||||
**Linux: AppImage will not run**:
|
|
||||||
Make sure it is executable: `chmod +x BundleName-1.0.AppImage`. If it still fails, your distribution may be missing FUSE; install with `sudo apt install libfuse2` (Debian/Ubuntu) or use the `.tar.gz` fallback.
|
|
||||||
|
|
||||||
**Script throws an error about a file**:
|
|
||||||
Check the log file in the `logs/` folder. The log explains exactly what went wrong and which row of input data triggered it.
|
|
||||||
|
|
||||||
**The GUI feels slow on a large file**:
|
|
||||||
Files over ~100,000 rows take longer to process. The GUI shows a progress bar. If you have very large files (millions of rows) consider using the CLI directly, which is faster for batch jobs.
|
|
||||||
|
|
||||||
**Need help**: Email the address on your purchase receipt.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 6. License
|
|
||||||
|
|
||||||
Single-user license. Do not redistribute. See `LICENSE.txt` in the install folder.
|
|
||||||
|
|||||||
142
landing/README.md
Normal file
142
landing/README.md
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
# Landing pages
|
||||||
|
|
||||||
|
Three persona-tagged landing pages per `docs/PLAN.md` §2.3 and
|
||||||
|
`docs/DEMO-PLAN.md` §3 / §7. Static HTML, zero build step, ship to
|
||||||
|
Cloudflare Pages.
|
||||||
|
|
||||||
|
## Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
landing/
|
||||||
|
├── _shared/styles.css shared CSS (system fonts, no externals)
|
||||||
|
├── shopify-pet/index.html Shopify operator (priority: pet supplies)
|
||||||
|
├── bookkeeper/index.html bookkeeper / freelance accountant
|
||||||
|
├── revops/index.html marketing / RevOps agency
|
||||||
|
└── README.md this file
|
||||||
|
```
|
||||||
|
|
||||||
|
Each page:
|
||||||
|
|
||||||
|
- Inherits `landing/_shared/styles.css`
|
||||||
|
- Overrides the `--accent` colour variable in an inline `<style>` block
|
||||||
|
so each persona has its own visual identity (Shopify = mint green,
|
||||||
|
Bookkeeper = steel blue, RevOps = vivid violet)
|
||||||
|
- Has a sticky buy bar with the Gumroad CTA tagged with `?from=<persona>`
|
||||||
|
- Embeds the live demo (Streamlit) via `<iframe>` with a sandbox attribute
|
||||||
|
- Carries persona-specific H1, sub-copy, use cases, FAQ, and a
|
||||||
|
ready-to-paste `terminal` block showing the CLI in action
|
||||||
|
- Includes Open Graph + Schema.org `SoftwareApplication` JSON-LD for
|
||||||
|
link-share previews and SEO
|
||||||
|
|
||||||
|
## Pre-deploy URL substitutions — automated
|
||||||
|
|
||||||
|
The HTML carries placeholder URLs (the literal strings
|
||||||
|
`https://demo.datatools.app`, `https://datatools.app`,
|
||||||
|
`https://gumroad.com/l/datatools`, `mailto:hello@datatools.app`)
|
||||||
|
that **must** be replaced before deployment. A small Python script
|
||||||
|
does this for you — no global search-and-replace needed.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1) Copy the template and fill in your real URLs:
|
||||||
|
cp landing/deploy.config.example.json landing/deploy.config.json
|
||||||
|
edit landing/deploy.config.json
|
||||||
|
|
||||||
|
# 2) Build the deploy-ready bundle:
|
||||||
|
python3 landing/deploy.py
|
||||||
|
# → produces landing/dist/ with substitutions applied,
|
||||||
|
# plus robots.txt, sitemap.xml, 404.html, favicon.svg
|
||||||
|
```
|
||||||
|
|
||||||
|
`landing/deploy.config.json` is gitignored so your real URLs never
|
||||||
|
hit the repo. Re-run `landing/deploy.py` whenever you change a URL or
|
||||||
|
edit any HTML source.
|
||||||
|
|
||||||
|
## Cloudflare Pages deployment
|
||||||
|
|
||||||
|
The simplest path — one Pages project pointed at `landing/dist/`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Option A: drag-and-drop the directory in the Cloudflare dashboard
|
||||||
|
# Pages → Create project → Direct Upload → drag landing/dist/
|
||||||
|
|
||||||
|
# Option B: Wrangler CLI (one command, scriptable)
|
||||||
|
wrangler pages deploy landing/dist
|
||||||
|
```
|
||||||
|
|
||||||
|
Configure the custom apex domain (`datatools.app`) in the Cloudflare
|
||||||
|
Pages project settings; sub-paths `/shopify-pet/`, `/bookkeeper/`,
|
||||||
|
`/revops/` are served automatically because the directory layout
|
||||||
|
mirrors them. Cache rule defaults are fine (HTML 1 day, CSS 7 days).
|
||||||
|
|
||||||
|
If you want **separate Pages projects** per persona for independent
|
||||||
|
A/B testing, point three projects at the same `landing/dist/` and
|
||||||
|
configure each with its own sub-domain (`shopify.datatools.app`, etc.)
|
||||||
|
and a Pages rule that rewrites the root to that persona's
|
||||||
|
sub-directory.
|
||||||
|
|
||||||
|
## Telemetry wiring (per DEMO-PLAN §8)
|
||||||
|
|
||||||
|
The plan calls for event-only counters, no PII, no Google Analytics.
|
||||||
|
|
||||||
|
For each page, on Cloudflare Pages, attach a Worker (or use Cloudflare
|
||||||
|
Web Analytics — it's privacy-friendly out of the box and zero config).
|
||||||
|
Track:
|
||||||
|
|
||||||
|
- `page_view` per persona (auto from CF Web Analytics)
|
||||||
|
- `cta_clicked` — add a small inline `<script>` that fires a fetch to
|
||||||
|
`/api/event?event=cta_clicked&persona=<persona>` when the buy button
|
||||||
|
is clicked, then continues the navigation to Gumroad.
|
||||||
|
- `demo.run_completed` and `demo.cta_clicked` are owned by the demo
|
||||||
|
app, not the landing page.
|
||||||
|
|
||||||
|
Conversion (per DEMO-PLAN §8):
|
||||||
|
|
||||||
|
```
|
||||||
|
demo_engagement = demo.run_completed / page_view (target ≥ 30%)
|
||||||
|
purchase_intent = demo.cta_clicked / demo.run_completed (target ≥ 5%)
|
||||||
|
purchase_rate = gumroad.purchase / demo.cta_clicked (target ≥ 30%)
|
||||||
|
```
|
||||||
|
|
||||||
|
The Gumroad webhook captures `?from=<persona>` so we can attribute
|
||||||
|
purchases back to the landing page that produced them.
|
||||||
|
|
||||||
|
## Maintenance triggers (per DEMO-PLAN §9)
|
||||||
|
|
||||||
|
Refresh the page when:
|
||||||
|
|
||||||
|
| Trigger | Action |
|
||||||
|
|---|---|
|
||||||
|
| `cta_clicked / run_completed < 5%` for 4 weeks | The demo is working but the buyer isn't trusting the CTA. Add a screenshot of the network tab showing zero outbound calls. Soften the price callout. |
|
||||||
|
| `page_view → run_completed < 30%` for 4 weeks | The demo iframe isn't loading or visitors aren't engaging. Check the iframe URL. Move the demo above the fold if it's currently below. |
|
||||||
|
| New tool ships (06–09) | Add it to the persona's saved pipeline only if it fits — don't bloat the demo with every tool. |
|
||||||
|
| Pricing change | Update `<meta>` schema, the buybar `.price-tag`, the pricing card, and the FAQ. Search-and-replace `$49` across the file. |
|
||||||
|
| New persona added (4th, 5th) | Copy `shopify-pet/index.html`, replace persona-specific copy, add to the `footer` cross-link block on the existing pages. |
|
||||||
|
|
||||||
|
## Why static HTML
|
||||||
|
|
||||||
|
Per `DECISIONS.md §5` and `BUSINESS.md §7`, the landing-page channel
|
||||||
|
must be:
|
||||||
|
|
||||||
|
- **Async-friendly** — Cloudflare Pages serves these with no operator
|
||||||
|
involvement
|
||||||
|
- **Cheap** — Cloudflare Pages free tier is sufficient until well past
|
||||||
|
the $5k/mo MRR re-lock trigger (`DECISIONS.md §8`)
|
||||||
|
- **Privacy-respecting** — no third-party tracker means no cookie
|
||||||
|
banner, which means no friction added to the conversion funnel
|
||||||
|
- **Zero ongoing maintenance** — no framework, no build, no upgrades.
|
||||||
|
The CSS uses system fonts; no Google Fonts; no CDN dependency that
|
||||||
|
could break the page when their TLS certificate rolls.
|
||||||
|
|
||||||
|
## Anti-temptations (per DEMO-PLAN §11 + plan §5)
|
||||||
|
|
||||||
|
These pages deliberately exclude:
|
||||||
|
|
||||||
|
- **No live chat widget.** Locked by no-touch.
|
||||||
|
- **No "schedule a demo with us" CTA.** Same.
|
||||||
|
- **No email capture before the demo.** Friction kills conversion.
|
||||||
|
- **No Google Analytics / Meta Pixel.** Privacy story is a moat, not
|
||||||
|
a checkbox to ignore.
|
||||||
|
- **No SaaS-style "free trial / no credit card."** This is a one-time
|
||||||
|
download, not a subscription.
|
||||||
|
- **No A/B-testing framework yet.** Pre-PMF traffic doesn't reach
|
||||||
|
statistical significance — ship the single-arm copy, iterate monthly.
|
||||||
234
landing/_shared/styles.css
Normal file
234
landing/_shared/styles.css
Normal file
@@ -0,0 +1,234 @@
|
|||||||
|
/* DataTools landing-page styles — single shared sheet for all niches.
|
||||||
|
*
|
||||||
|
* Design constraints:
|
||||||
|
* • No external font / CSS dependencies (works on Cloudflare Pages
|
||||||
|
* with zero build step, no privacy banner needed).
|
||||||
|
* • Desktop-first; a single max-width: 720px media query reflows the layout for mobile.
|
||||||
|
* • Dark, focused, content-first. Buyer reads this on a laptop
|
||||||
|
* between Shopify exports — keep it readable and skimmable.
|
||||||
|
* • Persona pages all share this sheet — niche differences live in
|
||||||
|
* copy + accent-color variables overridden in each page's <style>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
:root {
|
||||||
|
--bg: #0f1115;
|
||||||
|
--surface: #161922;
|
||||||
|
--surface-2: #1d212b;
|
||||||
|
--text: #e8eaed;
|
||||||
|
--text-mute: #9aa3b2;
|
||||||
|
--text-soft: #c8ced8;
|
||||||
|
--rule: #252a36;
|
||||||
|
--accent: #6ee7b7; /* Shopify pet default — overridden per persona */
|
||||||
|
--accent-ink: #052e1a;
|
||||||
|
--warn: #fbbf24;
|
||||||
|
--max: 1080px;
|
||||||
|
--radius: 12px;
|
||||||
|
--shadow: 0 1px 3px rgba(0,0,0,0.3), 0 8px 24px rgba(0,0,0,0.2);
|
||||||
|
--mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, monospace;
|
||||||
|
--sans: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto,
|
||||||
|
"Helvetica Neue", Arial, sans-serif;
|
||||||
|
}
|
||||||
|
|
||||||
|
* { box-sizing: border-box; }
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
margin: 0; padding: 0;
|
||||||
|
background: var(--bg);
|
||||||
|
color: var(--text);
|
||||||
|
font-family: var(--sans);
|
||||||
|
font-size: 16px;
|
||||||
|
line-height: 1.55;
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
}
|
||||||
|
|
||||||
|
a { color: var(--accent); text-decoration: none; }
|
||||||
|
a:hover { text-decoration: underline; }
|
||||||
|
|
||||||
|
/* ----- Sticky buy bar ----- */
|
||||||
|
/* Sticky purchase bar pinned to the top of the viewport. */
.buybar {
  position: sticky; top: 0; z-index: 50;
  /* Same RGB as --bg (#0f1115) at 92 % opacity so content shows faintly through. */
  background: rgba(15,17,21,0.92);
  /* Safari before 18 only honours the prefixed property; keep it first so the
     standard declaration wins wherever both are supported. */
  -webkit-backdrop-filter: blur(8px);
  backdrop-filter: blur(8px);
  border-bottom: 1px solid var(--rule);
  padding: 10px 20px;
}
|
||||||
|
.buybar-inner {
|
||||||
|
max-width: var(--max); margin: 0 auto;
|
||||||
|
display: flex; align-items: center; justify-content: space-between;
|
||||||
|
gap: 16px;
|
||||||
|
}
|
||||||
|
.buybar .brand { font-weight: 600; letter-spacing: -0.01em; }
|
||||||
|
.buybar .brand-mark { color: var(--accent); margin-right: 6px; }
|
||||||
|
.buybar .price-tag { color: var(--text-mute); font-size: 14px; margin-right: 12px; }
|
||||||
|
|
||||||
|
/* ----- Buttons ----- */
|
||||||
|
.btn {
|
||||||
|
display: inline-block;
|
||||||
|
background: var(--accent); color: var(--accent-ink);
|
||||||
|
font-weight: 600; font-size: 15px;
|
||||||
|
padding: 11px 18px; border-radius: 8px;
|
||||||
|
border: 0; cursor: pointer;
|
||||||
|
transition: transform 0.05s ease, box-shadow 0.15s ease;
|
||||||
|
}
|
||||||
|
.btn:hover { transform: translateY(-1px); text-decoration: none; box-shadow: var(--shadow); }
|
||||||
|
.btn-large {
|
||||||
|
padding: 14px 24px; font-size: 17px;
|
||||||
|
}
|
||||||
|
.btn-ghost {
|
||||||
|
background: transparent; color: var(--text-soft);
|
||||||
|
border: 1px solid var(--rule);
|
||||||
|
}
|
||||||
|
.btn-ghost:hover { background: var(--surface); }
|
||||||
|
|
||||||
|
/* ----- Layout ----- */
|
||||||
|
section {
|
||||||
|
padding: 60px 20px;
|
||||||
|
border-bottom: 1px solid var(--rule);
|
||||||
|
}
|
||||||
|
section:last-of-type { border-bottom: 0; }
|
||||||
|
.container { max-width: var(--max); margin: 0 auto; }
|
||||||
|
|
||||||
|
h1, h2, h3 { line-height: 1.2; letter-spacing: -0.02em; margin-top: 0; }
|
||||||
|
h1 { font-size: 44px; margin-bottom: 18px; }
|
||||||
|
h2 { font-size: 30px; margin-bottom: 16px; }
|
||||||
|
h3 { font-size: 19px; margin-bottom: 8px; }
|
||||||
|
p { margin: 0 0 14px 0; color: var(--text-soft); }
|
||||||
|
.muted { color: var(--text-mute); }
|
||||||
|
.eyebrow { color: var(--accent); font-size: 13px; font-weight: 600;
|
||||||
|
text-transform: uppercase; letter-spacing: 0.08em; margin-bottom: 10px; }
|
||||||
|
|
||||||
|
ul.bullets { padding-left: 20px; margin: 0 0 14px 0; }
|
||||||
|
ul.bullets li { margin-bottom: 8px; color: var(--text-soft); }
|
||||||
|
|
||||||
|
/* ----- Hero ----- */
|
||||||
|
.hero {
|
||||||
|
padding: 80px 20px 60px;
|
||||||
|
background: radial-gradient(ellipse at top, var(--surface), var(--bg) 60%);
|
||||||
|
}
|
||||||
|
.hero h1 strong { color: var(--accent); font-weight: 700; }
|
||||||
|
.hero .lead {
|
||||||
|
font-size: 19px; color: var(--text-soft); max-width: 720px;
|
||||||
|
margin-bottom: 28px;
|
||||||
|
}
|
||||||
|
.hero .cta-row { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; }
|
||||||
|
.hero .price-note { color: var(--text-mute); font-size: 14px; }
|
||||||
|
|
||||||
|
/* ----- Demo embed ----- */
|
||||||
|
.demo-frame {
|
||||||
|
background: var(--surface);
|
||||||
|
border: 1px solid var(--rule);
|
||||||
|
border-radius: var(--radius);
|
||||||
|
overflow: hidden;
|
||||||
|
box-shadow: var(--shadow);
|
||||||
|
}
|
||||||
|
.demo-frame iframe {
|
||||||
|
width: 100%; height: 720px; border: 0; display: block;
|
||||||
|
background: var(--surface-2);
|
||||||
|
}
|
||||||
|
.demo-caption {
|
||||||
|
font-size: 14px; color: var(--text-mute);
|
||||||
|
padding: 10px 16px; border-top: 1px solid var(--rule);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ----- Cards / grids ----- */
|
||||||
|
.grid {
|
||||||
|
display: grid; gap: 18px;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
|
||||||
|
}
|
||||||
|
.card {
|
||||||
|
background: var(--surface);
|
||||||
|
border: 1px solid var(--rule);
|
||||||
|
border-radius: var(--radius);
|
||||||
|
padding: 22px;
|
||||||
|
}
|
||||||
|
.card h3 { color: var(--text); }
|
||||||
|
.card p:last-child { margin-bottom: 0; }
|
||||||
|
.card .icon {
|
||||||
|
display: inline-block; font-size: 22px; margin-bottom: 8px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ----- Stats row ----- */
|
||||||
|
.stats { display: flex; gap: 28px; flex-wrap: wrap; margin: 18px 0 0; }
|
||||||
|
.stats .stat .num {
|
||||||
|
font-family: var(--mono); font-size: 26px; font-weight: 600;
|
||||||
|
color: var(--accent);
|
||||||
|
}
|
||||||
|
.stats .stat .label { font-size: 13px; color: var(--text-mute); }
|
||||||
|
|
||||||
|
/* ----- Privacy / audit callout panels ----- */
|
||||||
|
.callout {
|
||||||
|
background: var(--surface);
|
||||||
|
border-left: 3px solid var(--accent);
|
||||||
|
border-radius: 0 var(--radius) var(--radius) 0;
|
||||||
|
padding: 18px 22px;
|
||||||
|
margin: 18px 0;
|
||||||
|
}
|
||||||
|
.callout strong { color: var(--text); }
|
||||||
|
|
||||||
|
/* ----- Code-ish blocks ----- */
|
||||||
|
.terminal {
|
||||||
|
font-family: var(--mono); font-size: 14px;
|
||||||
|
background: #0a0c10;
|
||||||
|
color: #d8dfe8;
|
||||||
|
border: 1px solid var(--rule);
|
||||||
|
border-radius: var(--radius);
|
||||||
|
padding: 16px 18px;
|
||||||
|
overflow-x: auto;
|
||||||
|
white-space: pre;
|
||||||
|
line-height: 1.45;
|
||||||
|
}
|
||||||
|
.terminal .prompt { color: var(--text-mute); }
|
||||||
|
.terminal .ok { color: var(--accent); }
|
||||||
|
.terminal .warn { color: var(--warn); }
|
||||||
|
|
||||||
|
/* ----- Pricing ----- */
|
||||||
|
.pricing {
|
||||||
|
display: grid; gap: 18px;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(260px, 1fr));
|
||||||
|
}
|
||||||
|
.pricing .card .price {
|
||||||
|
font-size: 38px; font-weight: 700; letter-spacing: -0.02em;
|
||||||
|
color: var(--text);
|
||||||
|
}
|
||||||
|
.pricing .card .price-suffix { font-size: 14px; color: var(--text-mute); margin-left: 4px; }
|
||||||
|
.pricing .card.featured { border-color: var(--accent); }
|
||||||
|
.pricing .card .row { display: flex; align-items: baseline; gap: 4px; margin-bottom: 12px; }
|
||||||
|
.pricing .card ul { padding-left: 18px; margin: 12px 0 18px; }
|
||||||
|
.pricing .card li { color: var(--text-soft); margin-bottom: 6px; }
|
||||||
|
|
||||||
|
/* ----- FAQ ----- */
|
||||||
|
details.faq {
|
||||||
|
border-bottom: 1px solid var(--rule);
|
||||||
|
padding: 14px 0;
|
||||||
|
}
|
||||||
|
/* FAQ question row: text on the left, custom +/− indicator (see ::after) on the right. */
details.faq summary {
  font-weight: 600; color: var(--text);
  cursor: pointer; list-style: none;
  display: flex; align-items: center; justify-content: space-between;
}
/* WebKit ignores list-style on <summary>; hide its native disclosure triangle
   explicitly so it does not render next to the custom indicator. */
details.faq summary::-webkit-details-marker { display: none; }
|
||||||
|
details.faq summary::after {
|
||||||
|
content: "+"; color: var(--accent); font-size: 22px;
|
||||||
|
margin-left: 14px;
|
||||||
|
}
|
||||||
|
details.faq[open] summary::after { content: "−"; }
|
||||||
|
details.faq p { margin-top: 10px; }
|
||||||
|
|
||||||
|
/* ----- Footer ----- */
|
||||||
|
footer {
|
||||||
|
padding: 40px 20px 60px;
|
||||||
|
font-size: 14px;
|
||||||
|
color: var(--text-mute);
|
||||||
|
}
|
||||||
|
footer .container { display: flex; gap: 28px; flex-wrap: wrap; justify-content: space-between; }
|
||||||
|
footer a { color: var(--text-soft); }
|
||||||
|
footer p { color: var(--text-mute); }
|
||||||
|
|
||||||
|
/* ----- Responsive ----- */
|
||||||
|
@media (max-width: 720px) {
|
||||||
|
h1 { font-size: 32px; }
|
||||||
|
h2 { font-size: 24px; }
|
||||||
|
section { padding: 40px 18px; }
|
||||||
|
.hero { padding: 56px 18px 40px; }
|
||||||
|
.demo-frame iframe { height: 560px; }
|
||||||
|
.buybar-inner .price-tag { display: none; }
|
||||||
|
}
|
||||||
354
landing/bookkeeper/index.html
Normal file
354
landing/bookkeeper/index.html
Normal file
@@ -0,0 +1,354 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DataTools for Bookkeepers — Reconcile Bank Exports With An Audit Trail · $49</title>
|
||||||
|
<meta name="description" content="Reconcile messy bank exports. Catch duplicate transactions QuickBooks imported twice. Standardize dates, amounts, and vendor casing — locally. Every change auditable. $49 one-time." />
|
||||||
|
<meta name="keywords" content="reconcile bank export csv, quickbooks duplicate transactions, vendor list cleanup, bookkeeper csv tool, bank export deduplicator, bookkeeper audit trail" />
|
||||||
|
<link rel="canonical" href="https://datatools.app/bookkeeper/" />
|
||||||
|
<link rel="stylesheet" href="../_shared/styles.css" />
|
||||||
|
|
||||||
|
<!-- Persona accent: Bookkeeper → calm steel-blue -->
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--accent: #7dd3fc;
|
||||||
|
--accent-ink: #042c43;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<!-- Open Graph -->
|
||||||
|
<meta property="og:title" content="DataTools for Bookkeepers — Reconcile Bank Exports With An Audit Trail" />
|
||||||
|
<meta property="og:description" content="Catch duplicate transactions. Standardize dates and amounts. Hand your client an audit trail. $49 one-time." />
|
||||||
|
<meta property="og:type" content="product" />
|
||||||
|
<meta property="og:url" content="https://datatools.app/bookkeeper/" />
|
||||||
|
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{
|
||||||
|
"@context": "https://schema.org",
|
||||||
|
"@type": "SoftwareApplication",
|
||||||
|
"name": "DataTools for Bookkeepers",
|
||||||
|
"operatingSystem": "Windows, macOS, Linux",
|
||||||
|
"applicationCategory": "BusinessApplication",
|
||||||
|
"offers": {
|
||||||
|
"@type": "Offer",
|
||||||
|
"price": "49",
|
||||||
|
"priceCurrency": "USD"
|
||||||
|
},
|
||||||
|
"description": "Reconcile bank exports, dedupe vendor lists, and produce a hand-off-ready audit trail. Six-tool data-cleaning bundle for bookkeepers and freelance accountants.",
|
||||||
|
"softwareVersion": "1.0"
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<div class="buybar">
|
||||||
|
<div class="buybar-inner">
|
||||||
|
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for Bookkeepers</span></div>
|
||||||
|
<div>
|
||||||
|
<span class="price-tag">$49 — one-time, no subscription</span>
|
||||||
|
<a class="btn" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools →</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section class="hero">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For bookkeepers · freelance accountants · small-firm partners</div>
|
||||||
|
<h1>Reconcile messy bank exports.<br /><strong>Hand your client an audit trail.</strong></h1>
|
||||||
|
<p class="lead">
|
||||||
|
The Jan and Feb exports overlap and you've got the same transaction
|
||||||
|
booked twice. Vendor names are <em>"Amazon"</em>, <em>"amazon.com"</em>,
|
||||||
|
and <em>"AMAZON.COM*4F2X9"</em> in three different rows. Dates are a
|
||||||
|
smoosh of <code>01/15/2025</code>, <code>2025-01-15</code>, and
|
||||||
|
<code>Jan 18 2025</code>. DataTools fixes all of it in one pass —
|
||||||
|
and produces a row-by-row CSV showing every change so your client
|
||||||
|
can verify your work.
|
||||||
|
</p>
|
||||||
|
<div class="cta-row">
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools — $49 →</a>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
||||||
|
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
||||||
|
</div>
|
||||||
|
<div class="stats">
|
||||||
|
<div class="stat"><div class="num">6</div><div class="label">tools, one bundle</div></div>
|
||||||
|
<div class="stat"><div class="num">100 %</div><div class="label">auditable changes</div></div>
|
||||||
|
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pain points ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If you've spent a Saturday on this, you already know</div>
|
||||||
|
<h2>Six pains DataTools fixes in one pass</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📅</span>
|
||||||
|
<h3>Jan and Feb bank exports overlap — the same transaction posts twice</h3>
|
||||||
|
<p>QuickBooks (or any reconciler) silently double-counts the month-boundary rows. Your client's books understate cash by 1–4 % and nobody notices until tax season.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 2–4 hours per month per client + reconciliation errors that can compound.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📒</span>
|
||||||
|
<h3>1099 reports break because vendors are spelled three ways</h3>
|
||||||
|
<p>"Amazon", "amazon.com", "AMAZON.COM*4F2X9" become three separate vendors in QBO. You ship three 1099s instead of one — and the 1099-NEC threshold breaks both ways.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 1–2 hours per 1099 cycle + IRS-paper-trail risk.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🛡️</span>
|
||||||
|
<h3>"Show me what you changed" — your liability hangs on the answer</h3>
|
||||||
|
<p>Cloud cleaners that "just clean your data" don't give you a row-level audit log. Your professional indemnity insurance hates that. Your client's auditor hates that. You hate explaining it.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> per-firm liability premium + 24–48 hr audit-response window stress.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">👥</span>
|
||||||
|
<h3>Per-client SaaS pricing destroys your margins at 10+ clients</h3>
|
||||||
|
<p>$30/mo per client × 20 clients = $600/mo, every month, for tooling. DataTools is a one-time desktop license you use on every client's books for the same $49. Forever.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> the difference between a $30/mo/client subscription and $49 once.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🌍</span>
|
||||||
|
<h3>Multi-currency books break standard parsers</h3>
|
||||||
|
<p>Your client has EU customers. Their amounts come in as <code>€1.234,56</code> (comma decimal). Standard import tools see "1.234" as the whole-dollar amount and drop the rest. Parens-negative <code>($89.50)</code> gets read as positive.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 30–60 min per multi-currency client per month + occasional silent errors.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔒</span>
|
||||||
|
<h3>Your client's books are too sensitive for a cloud cleaner</h3>
|
||||||
|
<p>One "vendor breach" email to your clients ends the relationship. DataTools is desktop-only. No upload, no SaaS account, no third party seeing a single transaction. Verifiable in your browser's network tab.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> nothing — and that's exactly the point.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section id="demo">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Live demo · runs in your browser</div>
|
||||||
|
<h2>Try it on a sample bank export with a known overlap</h2>
|
||||||
|
<p>
|
||||||
|
The demo below loads a 25-row export combining January and February
|
||||||
|
activity, with the month-boundary rows duplicated across exports —
|
||||||
|
the exact scenario where QuickBooks (or any reconciler) silently
|
||||||
|
double-counts transactions. Click <strong>Run pipeline</strong> and
|
||||||
|
watch the dedup catch every overlap, dates land in ISO format, and
|
||||||
|
the parens-negative amounts (<code>($89.50)</code>) become proper
|
||||||
|
negative numbers.
|
||||||
|
</p>
|
||||||
|
<div class="demo-frame">
|
||||||
|
<iframe
|
||||||
|
src="https://demo.datatools.app/?p=bookkeeper"
|
||||||
|
loading="lazy"
|
||||||
|
title="DataTools live demo — Bookkeeper"
|
||||||
|
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
||||||
|
<div class="demo-caption">
|
||||||
|
Demo runs on free hosting. Capped at 100 input rows · output
|
||||||
|
watermarked. The paid product has no caps and runs entirely offline.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Built for the bookkeeper's actual day</div>
|
||||||
|
<h2>Four workflows the rest of the industry tax-codes around</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🏦</span>
|
||||||
|
<h3>Bank export reconciliation</h3>
|
||||||
|
<p>Two months of activity overlap at the boundary. The same transaction posts twice — once in each export — with different formatting. DataTools dedups on Date + Amount + fuzzy Vendor and catches all of them.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📒</span>
|
||||||
|
<h3>Vendor list consolidation</h3>
|
||||||
|
<p>QuickBooks has <code>amazon.com</code>. Your spreadsheet has <code>Amazon</code>. The bank statement has <code>AMAZON.COM*4F2X9</code>. Standardize the casing, fuzzy-match across sources, hand the client one clean vendor list.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">👥</span>
|
||||||
|
<h3>Customer master cleanup pre-migration</h3>
|
||||||
|
<p>Before moving from one accounting system to another, the customer master needs to be deduped, standardized, and audited. One tool, one pipeline, one CSV in / clean CSV out.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🧾</span>
|
||||||
|
<h3>Expense report dedup</h3>
|
||||||
|
<p>Same receipt scanned twice. Same Uber ride entered manually and then imported from the corporate card. Catch them once — and produce the audit log that proves the duplicate <em>was</em> a duplicate.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">The feature your liability insurance cares about</div>
|
||||||
|
<h2>Every change auditable. Period.</h2>
|
||||||
|
<p>
|
||||||
|
Every cell DataTools modifies is logged with the original value, the
|
||||||
|
new value, and which rule fired. When your client asks why a
|
||||||
|
transaction got merged or a date got reformatted, you don't say
|
||||||
|
"the AI did it." You hand them the CSV.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Why this matters specifically to bookkeepers:</strong> your
|
||||||
|
professional liability hangs on traceability. Cloud cleaners that
|
||||||
|
"just clean your data" without a row-level audit are unsafe at any
|
||||||
|
price. DataTools writes the audit by default, downloadable as a
|
||||||
|
separate CSV alongside the cleaned file.
|
||||||
|
</div>
|
||||||
|
<div class="terminal"><span class="prompt">$</span> head -5 client_jan2025_changes.csv
|
||||||
|
row,column,field_type,old,new
|
||||||
|
0,"Date ",date,"01/15/2025","2025-01-15"
|
||||||
|
0,Description,name," AMAZON.COM*4F2X9 PURCHASE","Amazon.com*4F2X9 Purchase"
|
||||||
|
0,Amount,currency,"-$129.99","-129.99"
|
||||||
|
1,"Date ",date,"Jan 18 2025","2025-01-18"
|
||||||
|
<span class="prompt">$</span> # one row of audit per cell change. handed to the client. signed off.</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">The thing every cloud reconciler can't say</div>
|
||||||
|
<h2>Your client's books never leave your computer.</h2>
|
||||||
|
<p>
|
||||||
|
Your clients trust you with their books. That trust is one
|
||||||
|
"we noticed our data appeared in a vendor breach" email away from
|
||||||
|
gone. DataTools is a desktop app — no upload, no SaaS, no
|
||||||
|
subscription, no third party seeing a single transaction.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Confirm it yourself.</strong> Open your browser's network
|
||||||
|
tab when DataTools is running. Click around. Run the pipeline.
|
||||||
|
Zero outbound requests. Ever.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If your clients run multi-currency books</div>
|
||||||
|
<h2>$ £ € ¥ R$ kr zł — handled.</h2>
|
||||||
|
<p>
|
||||||
|
Standardize <code>$1,234.56</code>, <code>1.234,56 €</code> (EU
|
||||||
|
decimal), <code>($89.50)</code> (parens-negative),
|
||||||
|
<code>R$ 250,00</code>, <code>kr 1.250,50</code>, and the rest of
|
||||||
|
the long tail. Output is canonical numeric (your import tool's
|
||||||
|
favorite shape) with optional ISO 4217 prefix
|
||||||
|
(<code>USD 1234.56</code>) when you need to preserve the
|
||||||
|
currency.
|
||||||
|
</p>
|
||||||
|
<ul class="bullets">
|
||||||
|
<li><strong>Auto-detect</strong> EU comma decimal so your French and German clients' books reconcile without per-locale config.</li>
|
||||||
|
<li><strong>Parens-negative</strong> handled — accounting convention, not just a math style.</li>
|
||||||
|
<li><strong>Multi-character prefixes</strong> like <code>R$</code> (Brazilian Real) and <code>kr</code> (Nordic) detected before the single-symbol regex so they don't get bucketed as USD.</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">In the bundle</div>
|
||||||
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match (Jaro-Winkler), explicit strategies for Date+Amount+Vendor, survivor rules.</p></div>
|
||||||
|
<div class="card"><h3>2 · Clean Text</h3><p>Header whitespace, smart quotes from copy-paste, em-dash sentinels.</p></div>
|
||||||
|
<div class="card"><h3>3 · Standardize Formats</h3><p>ISO dates, numeric amounts (parens-negative), vendor casing, multi-currency.</p></div>
|
||||||
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Disguised-null detection: <code>—</code>, <code>N/A</code>, <code>(blank)</code>, <code>?</code>.</p></div>
|
||||||
|
<div class="card"><h3>5 · Map Columns</h3><p>Project to your accounting tool's required schema, coerce types, drop extras.</p></div>
|
||||||
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Save the cleanup. Run it on next month's export with one command. Same audit, automated.</p></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Pricing — pay once, own it</div>
|
||||||
|
<h2>$49. No subscription. No per-client license.</h2>
|
||||||
|
<div class="pricing">
|
||||||
|
<div class="card featured">
|
||||||
|
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>DataTools for Bookkeepers</h3>
|
||||||
|
<ul>
|
||||||
|
<li>All 6 tools, full pipeline</li>
|
||||||
|
<li>Mac · Windows · Linux installers</li>
|
||||||
|
<li>Code-signed (no Gatekeeper warnings)</li>
|
||||||
|
<li>Free updates for the v1.x line</li>
|
||||||
|
<li>Bonus: ready-made bank-reconcile and vendor-cleanup pipelines</li>
|
||||||
|
<li><strong>Use on any number of clients</strong> — no seat limits</li>
|
||||||
|
</ul>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Buy on Gumroad →</a>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="row"><div class="price">$199</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>+ Priority email support</h3>
|
||||||
|
<p class="muted">Available post-launch. 24-hour async response on edge cases. Same product. Targeted at bookkeepers whose own time is > $200/hr.</p>
|
||||||
|
<!-- Placeholder link: no href, so it is neither focusable nor clickable (href="#" would jump to the top of the page); role="link" keeps aria-disabled meaningful to assistive tech. -->
<a class="btn btn-ghost btn-large" role="link" aria-disabled="true">Coming soon</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<h2>Questions</h2>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Does this replace QuickBooks / Xero?</summary>
|
||||||
|
<p>No — DataTools cleans the data <em>before</em> it goes into your accounting system, or after you export it for analysis. It sits alongside QB/Xero, not in place of them. Think of it as the import-clean-up step that should have shipped with the bank export feature in the first place.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Can I use it on multiple clients without paying again?</summary>
|
||||||
|
<p>Yes. The license is per-bookkeeper, not per-client. Run it on every client's books for the same $49.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What's the audit log look like in court?</summary>
|
||||||
|
<p>It's a CSV with five columns per change: <code>row, column, field_type, old, new</code>. Plus a JSON pipeline file describing exactly which rules ran in which order. Together they reproduce the cleanup deterministically — your client (or their auditor) can verify it on their machine.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How does it handle Excel-only weirdness like serial dates?</summary>
|
||||||
|
<p>Excel serial dates (the number 45306 = 2024-01-15) are detected and converted automatically. So are Unix timestamps in seconds and milliseconds, RFC 2822 dates from email exports, partial-precision dates (<code>2024-01</code>, <code>2024-Q1</code>), and locale-specific month names in English/French/German.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What about my clients' privacy?</summary>
|
||||||
|
<p>Your clients' books never leave your computer. The cleaner is a desktop app with zero network code in the data path. You can verify this in your browser's network tab.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What's your refund policy?</summary>
|
||||||
|
<p>Try the live demo above on the sample dataset before you buy. If DataTools doesn't fit your workflow within 14 days, email for a refund — no questions asked.</p>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container" style="text-align: center;">
|
||||||
|
<h2>Stop reconciling bank exports by hand.</h2>
|
||||||
|
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Catches the duplicate transactions QuickBooks imported twice, standardizes dates and amounts and vendor casing, and hands you a row-level audit log to share with your client.</p>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=bookkeeper" rel="noopener">Get DataTools — $49 →</a>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer>
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
||||||
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p>
|
||||||
|
<a href="../shopify-pet/">For Shopify operators</a> ·
|
||||||
|
<a href="../revops/">For RevOps agencies</a><br />
|
||||||
|
<a href="https://gumroad.com/l/datatools?from=bookkeeper">Buy on Gumroad</a> ·
|
||||||
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
22
landing/deploy.config.example.json
Normal file
22
landing/deploy.config.example.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
{
|
||||||
|
"_comment": [
|
||||||
|
"Deployment substitution config. Copy to deploy.config.json and",
|
||||||
|
"fill in the real URLs before running deploy.py.",
|
||||||
|
"deploy.config.json is gitignored (never commit your real URLs)."
|
||||||
|
],
|
||||||
|
|
||||||
|
"site_origin": "https://datatools.app",
|
||||||
|
|
||||||
|
"demo_base_url": "https://datatools-demo.streamlit.app",
|
||||||
|
"gumroad_listing": "https://gumroad.com/l/datatools",
|
||||||
|
"support_email": "hello@datatools.app",
|
||||||
|
|
||||||
|
"personas": ["shopify-pet", "bookkeeper", "revops"],
|
||||||
|
|
||||||
|
"_substitutions_made": [
|
||||||
|
"{{site_origin}}/ → site_origin/",
|
||||||
|
"{{demo_base_url}}/?p=<persona> → live demo iframe per persona",
|
||||||
|
"{{gumroad_url}}?from=<persona> → Gumroad CTA on every page",
|
||||||
|
"{{support_email}} → mailto: link"
|
||||||
|
]
|
||||||
|
}
|
||||||
235
landing/deploy.py
Normal file
235
landing/deploy.py
Normal file
@@ -0,0 +1,235 @@
|
|||||||
|
"""Build a deploy-ready ``landing/dist/`` from the source HTML.
|
||||||
|
|
||||||
|
Run from the repo root after copying ``landing/deploy.config.example.json``
|
||||||
|
to ``landing/deploy.config.json`` and filling in the real URLs:
|
||||||
|
|
||||||
|
python3 landing/deploy.py
|
||||||
|
|
||||||
|
Output:
|
||||||
|
landing/dist/index.html
|
||||||
|
landing/dist/shopify-pet/index.html
|
||||||
|
landing/dist/bookkeeper/index.html
|
||||||
|
landing/dist/revops/index.html
|
||||||
|
landing/dist/_shared/styles.css
|
||||||
|
landing/dist/robots.txt
|
||||||
|
landing/dist/sitemap.xml
|
||||||
|
landing/dist/404.html
|
||||||
|
landing/dist/favicon.svg
|
||||||
|
|
||||||
|
Upload ``landing/dist/`` to Cloudflare Pages (drag-and-drop in the
|
||||||
|
dashboard, or ``wrangler pages deploy landing/dist``).
|
||||||
|
|
||||||
|
Why this script exists:
|
||||||
|
The source HTML carries placeholder URLs (``{{demo_base_url}}``,
|
||||||
|
``{{gumroad_url}}``, ``{{support_email}}``, ``{{site_origin}}``)
|
||||||
|
so the operator's actual demo / Gumroad / domain URLs aren't
|
||||||
|
committed to the repo. This script reads the operator's config
|
||||||
|
and produces a ready-to-upload bundle.
|
||||||
|
|
||||||
|
It also stamps a sitemap.xml + robots.txt + 404.html and copies
|
||||||
|
the shared CSS so the output directory is fully self-contained.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from datetime import date
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Directory layout. Everything is anchored on this file's own location so
# the script works regardless of the current working directory.
LANDING = Path(__file__).resolve().parent
REPO = LANDING.parent
DIST = LANDING.joinpath("dist")

# Operator-supplied config and the committed template it is copied from.
CONFIG_PATH = LANDING.joinpath("deploy.config.json")
EXAMPLE_PATH = LANDING.joinpath("deploy.config.example.json")


# Files to substitute and copy. Order matters only for readability:
# the apex page first, then one ``index.html`` per persona directory.
HTML_PAGES = [LANDING / "index.html"] + [
    LANDING / persona_dir / "index.html"
    for persona_dir in ("shopify-pet", "bookkeeper", "revops")
]
SHARED = LANDING.joinpath("_shared", "styles.css")
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config() -> dict:
    """Load and validate the operator's ``deploy.config.json``.

    Returns:
        The parsed config mapping. ``site_origin``, ``demo_base_url``,
        ``gumroad_listing`` and ``support_email`` are guaranteed to be
        present and truthy.

    Exits the process with status 2 (after writing an explanation to
    stderr) when the file is absent, is not valid JSON, is not a JSON
    object, or lacks any required field.
    """
    if not CONFIG_PATH.exists():
        sys.stderr.write(
            f"\nERROR: {CONFIG_PATH.name} not found.\n"
            f" cp {EXAMPLE_PATH.name} {CONFIG_PATH.name}\n"
            f" edit {CONFIG_PATH.name} with your real URLs\n"
            f" re-run: python3 landing/deploy.py\n\n"
        )
        sys.exit(2)

    # Read as UTF-8 explicitly: JSON is UTF-8 by spec, and the locale
    # default encoding differs between platforms.
    try:
        cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        # Report a hand-edited config with a syntax slip the same way as
        # the other config problems instead of dumping a traceback.
        sys.stderr.write(
            f"\nERROR: {CONFIG_PATH.name} is not valid JSON: {exc}\n"
            f" See {EXAMPLE_PATH.name} for the full template.\n\n"
        )
        sys.exit(2)

    if not isinstance(cfg, dict):
        # A top-level list/string/number would crash on ``cfg.get`` below.
        sys.stderr.write(
            f"\nERROR: {CONFIG_PATH.name} must contain a JSON object.\n"
            f" See {EXAMPLE_PATH.name} for the full template.\n\n"
        )
        sys.exit(2)

    required = ("site_origin", "demo_base_url", "gumroad_listing", "support_email")
    # Empty strings / nulls count as missing, not just absent keys.
    missing = [k for k in required if not cfg.get(k)]
    if missing:
        sys.stderr.write(
            f"\nERROR: {CONFIG_PATH.name} is missing required fields: {missing}\n"
            f" See {EXAMPLE_PATH.name} for the full template.\n\n"
        )
        sys.exit(2)
    return cfg
|
||||||
|
|
||||||
|
|
||||||
|
def _substitute(text: str, cfg: dict) -> str:
|
||||||
|
"""Replace placeholders + the demo / Gumroad URL patterns the source HTML uses today."""
|
||||||
|
site_origin = cfg["site_origin"].rstrip("/")
|
||||||
|
demo_base = cfg["demo_base_url"].rstrip("/")
|
||||||
|
gumroad_base = cfg["gumroad_listing"]
|
||||||
|
support_email = cfg["support_email"]
|
||||||
|
|
||||||
|
# Direct placeholder tokens (clean approach — used by future copy).
|
||||||
|
text = text.replace("{{site_origin}}", site_origin)
|
||||||
|
text = text.replace("{{demo_base_url}}", demo_base)
|
||||||
|
text = text.replace("{{gumroad_url}}", gumroad_base)
|
||||||
|
text = text.replace("{{support_email}}", support_email)
|
||||||
|
|
||||||
|
# Backwards-compatible patterns: the source HTML in this repo carries
|
||||||
|
# literal ``https://datatools.app`` and ``https://demo.datatools.app``
|
||||||
|
# so this script swaps those too. Once new pages adopt the
|
||||||
|
# ``{{placeholder}}`` style above, this layer can be retired.
|
||||||
|
text = re.sub(
|
||||||
|
r"https://demo\.datatools\.app",
|
||||||
|
demo_base,
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# Replace ``https://datatools.app/...`` for canonical / OG URLs but
|
||||||
|
# do NOT swap ``https://datatools.app`` when it is followed by an
|
||||||
|
# at-sign as part of an email address (no such case today; defensive).
|
||||||
|
text = re.sub(
|
||||||
|
r"https://datatools\.app",
|
||||||
|
site_origin,
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# Gumroad URL family — preserve the ``?from=<persona>`` query.
|
||||||
|
text = re.sub(
|
||||||
|
r"https://gumroad\.com/l/datatools",
|
||||||
|
gumroad_base.rstrip("/").replace("/l/datatools", "/l/datatools"),
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
# Support email shows up only as ``mailto:hello@datatools.app``.
|
||||||
|
text = text.replace("mailto:hello@datatools.app", f"mailto:{support_email}")
|
||||||
|
text = text.replace("hello@datatools.app", support_email)
|
||||||
|
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
def _stamp_sitemap(cfg: dict) -> str:
|
||||||
|
site = cfg["site_origin"].rstrip("/")
|
||||||
|
today = date.today().isoformat()
|
||||||
|
urls = [site + "/"] + [
|
||||||
|
f"{site}/{p}/" for p in cfg.get("personas", ["shopify-pet", "bookkeeper", "revops"])
|
||||||
|
]
|
||||||
|
items = "\n".join(
|
||||||
|
f" <url><loc>{u}</loc><lastmod>{today}</lastmod></url>"
|
||||||
|
for u in urls
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||||
|
'<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
|
||||||
|
f"{items}\n"
|
||||||
|
"</urlset>\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _robots_txt(cfg: dict) -> str:
|
||||||
|
return (
|
||||||
|
"# Allow everything; we want every persona page indexable.\n"
|
||||||
|
"User-agent: *\n"
|
||||||
|
"Allow: /\n"
|
||||||
|
f"Sitemap: {cfg['site_origin'].rstrip('/')}/sitemap.xml\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _favicon_svg() -> str:
|
||||||
|
"""Tiny self-contained SVG favicon — broom emoji-style mark."""
|
||||||
|
return (
|
||||||
|
'<?xml version="1.0" encoding="UTF-8"?>\n'
|
||||||
|
'<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 64 64">\n'
|
||||||
|
' <rect width="64" height="64" rx="14" fill="#0f1115"/>\n'
|
||||||
|
' <circle cx="32" cy="32" r="9" fill="#6ee7b7"/>\n'
|
||||||
|
"</svg>\n"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_404_html(cfg: dict) -> str:
|
||||||
|
"""Cloudflare Pages serves 404.html when a path doesn't match."""
|
||||||
|
site_origin = cfg["site_origin"].rstrip("/")
|
||||||
|
return f"""<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>Not found · DataTools</title>
|
||||||
|
<link rel="stylesheet" href="/_shared/styles.css" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<section class="hero" style="text-align: center;">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">404</div>
|
||||||
|
<h1>That page isn't here.</h1>
|
||||||
|
<p class="lead" style="margin: 0 auto 28px;">Pick a workflow below to land somewhere useful.</p>
|
||||||
|
<p>
|
||||||
|
<a class="btn" href="{site_origin}/shopify-pet/">For Shopify</a>
|
||||||
|
|
||||||
|
<a class="btn" href="{site_origin}/bookkeeper/">For bookkeepers</a>
|
||||||
|
|
||||||
|
<a class="btn" href="{site_origin}/revops/">For RevOps</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Build ``landing/dist/`` from the source pages and the operator config.

    Steps: load the config, recreate ``DIST`` from scratch, copy the
    shared stylesheet, write each HTML page with substitutions applied,
    stamp robots.txt / sitemap.xml / 404.html / favicon.svg, then print
    next-step instructions.

    Returns:
        ``0`` on success. Config problems exit with status 2 inside
        ``_load_config``.
    """
    cfg = _load_config()

    # Always rebuild from a clean directory so stale files never ship.
    if DIST.exists():
        shutil.rmtree(DIST)
    DIST.mkdir(parents=True)

    # Shared CSS (same path the source HTML expects: ``../_shared/styles.css``)
    (DIST / "_shared").mkdir()
    shutil.copy(SHARED, DIST / "_shared" / "styles.css")

    # Per-page substitutions. The pages contain emoji and other non-ASCII
    # text, so read and write as UTF-8 explicitly rather than relying on
    # the platform's locale default encoding.
    page_count = 0
    for src in HTML_PAGES:
        rel = src.relative_to(LANDING)
        dest = DIST / rel
        dest.parent.mkdir(parents=True, exist_ok=True)
        dest.write_text(
            _substitute(src.read_text(encoding="utf-8"), cfg),
            encoding="utf-8",
        )
        page_count += 1

    # Stamped supporting files
    (DIST / "robots.txt").write_text(_robots_txt(cfg), encoding="utf-8")
    (DIST / "sitemap.xml").write_text(_stamp_sitemap(cfg), encoding="utf-8")
    (DIST / "404.html").write_text(_build_404_html(cfg), encoding="utf-8")
    (DIST / "favicon.svg").write_text(_favicon_svg(), encoding="utf-8")

    # Final report
    print(f"\n✓ Built {page_count} HTML pages + sitemap + robots + 404 + favicon")
    print(f" Output: {DIST.relative_to(REPO)}/")
    print()
    print("Next steps:")
    print(" 1) wrangler pages deploy landing/dist # if you use Wrangler")
    print(" OR drag-and-drop landing/dist/ in the Cloudflare Pages dashboard")
    print(" 2) Configure custom domain on Cloudflare Pages → "
          f"{cfg['site_origin']}")
    print(" 3) Verify: open the deployed apex URL, click each persona "
          "card, click each demo iframe, click each buy button → Gumroad listing")
    print()
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||||
236
landing/index.html
Normal file
236
landing/index.html
Normal file
@@ -0,0 +1,236 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DataTools — Local CSV / Excel Cleaning for Shopify, Bookkeepers, and RevOps</title>
|
||||||
|
<meta name="description" content="One desktop tool. Three workflows. Clean Shopify customer exports, reconcile messy bank statements, or dedupe lead lists across HubSpot and LinkedIn — all locally. $49 one-time." />
|
||||||
|
<link rel="canonical" href="https://datatools.app/" />
|
||||||
|
<link rel="stylesheet" href="_shared/styles.css" />
|
||||||
|
|
||||||
|
<meta property="og:title" content="DataTools — Local CSV / Excel Cleaning" />
|
||||||
|
<meta property="og:description" content="One desktop tool, three niche workflows. Runs entirely offline. $49 one-time." />
|
||||||
|
<meta property="og:type" content="website" />
|
||||||
|
<meta property="og:url" content="https://datatools.app/" />
|
||||||
|
|
||||||
|
<style>
|
||||||
|
/* Apex-page–only tweaks: persona cards are slightly bigger and use
|
||||||
|
per-card accent borders so the visitor visually identifies which
|
||||||
|
card matches their work in <2 seconds. */
|
||||||
|
.persona-grid {
|
||||||
|
display: grid; gap: 24px;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
|
||||||
|
margin-top: 28px;
|
||||||
|
}
|
||||||
|
.persona-card {
|
||||||
|
background: var(--surface);
|
||||||
|
border: 1px solid var(--rule);
|
||||||
|
border-radius: var(--radius);
|
||||||
|
padding: 28px;
|
||||||
|
display: flex; flex-direction: column;
|
||||||
|
transition: transform 0.08s ease, border-color 0.15s ease, box-shadow 0.2s ease;
|
||||||
|
text-decoration: none;
|
||||||
|
color: inherit;
|
||||||
|
}
|
||||||
|
.persona-card:hover {
|
||||||
|
transform: translateY(-2px);
|
||||||
|
border-color: var(--card-accent, var(--accent));
|
||||||
|
box-shadow: var(--shadow);
|
||||||
|
text-decoration: none;
|
||||||
|
}
|
||||||
|
.persona-card.shopify { --card-accent: #6ee7b7; }
|
||||||
|
.persona-card.bookkeeper{ --card-accent: #7dd3fc; }
|
||||||
|
.persona-card.revops { --card-accent: #c4b5fd; }
|
||||||
|
.persona-card .pill {
|
||||||
|
display: inline-block;
|
||||||
|
background: rgba(255,255,255,0.04);
|
||||||
|
color: var(--card-accent, var(--accent));
|
||||||
|
border: 1px solid var(--card-accent, var(--accent));
|
||||||
|
padding: 4px 10px; border-radius: 999px;
|
||||||
|
font-size: 12px; font-weight: 600;
|
||||||
|
letter-spacing: 0.04em;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
align-self: flex-start;
|
||||||
|
}
|
||||||
|
.persona-card h3 {
|
||||||
|
color: var(--text);
|
||||||
|
font-size: 22px;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
}
|
||||||
|
.persona-card p {
|
||||||
|
color: var(--text-soft);
|
||||||
|
flex: 1;
|
||||||
|
margin-bottom: 16px;
|
||||||
|
}
|
||||||
|
.persona-card .pain {
|
||||||
|
font-size: 14px; color: var(--text-mute);
|
||||||
|
margin: 8px 0 18px;
|
||||||
|
}
|
||||||
|
.persona-card .pain li { margin-bottom: 4px; }
|
||||||
|
.persona-card .open {
|
||||||
|
color: var(--card-accent, var(--accent));
|
||||||
|
font-weight: 600;
|
||||||
|
font-size: 15px;
|
||||||
|
}
|
||||||
|
.persona-card .open::after {
|
||||||
|
content: " →";
|
||||||
|
transition: margin-left 0.15s ease;
|
||||||
|
}
|
||||||
|
.persona-card:hover .open::after { margin-left: 4px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<!-- Sticky brand bar (no buy CTA on the apex — visitor hasn't picked a niche yet) -->
|
||||||
|
<div class="buybar">
|
||||||
|
<div class="buybar-inner">
|
||||||
|
<div class="brand"><span class="brand-mark">●</span> DataTools</div>
|
||||||
|
<div>
|
||||||
|
<span class="price-tag">Pick your workflow ↓</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section class="hero">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For Shopify operators · bookkeepers · marketing & RevOps agencies</div>
|
||||||
|
<h1>Local CSV / Excel cleaning.<br /><strong>One tool. Three workflows.</strong></h1>
|
||||||
|
<p class="lead">
|
||||||
|
DataTools is a desktop app that fixes the data-cleaning headaches
|
||||||
|
every small business hits — duplicates Excel can't catch,
|
||||||
|
international phones it can't parse, dates and currencies in three
|
||||||
|
different formats per export. One $49 download. Works on Mac,
|
||||||
|
Windows, and Linux. <strong>Your data never leaves your
|
||||||
|
computer.</strong>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<div class="persona-grid">
|
||||||
|
<a class="persona-card shopify" href="shopify-pet/">
|
||||||
|
<span class="pill">🛍️ Shopify operator</span>
|
||||||
|
<h3>Customer / vendor / subscriber export cleanup</h3>
|
||||||
|
<p>
|
||||||
|
Klaviyo-import-ready customer lists in 30 seconds. Catches
|
||||||
|
cross-device duplicates, standardizes international phones
|
||||||
|
and addresses, fixes the disguised nulls that break product
|
||||||
|
feeds.
|
||||||
|
</p>
|
||||||
|
<ul class="pain">
|
||||||
|
<li>· Fix Klaviyo per-contact billing on phantom dupes</li>
|
||||||
|
<li>· Repair feeds rejected by Google Merchant / Meta</li>
|
||||||
|
<li>· Unify orders from Shopify + Etsy + Amazon + Faire</li>
|
||||||
|
<li>· Resolve VAT-MOSS country-name drift</li>
|
||||||
|
</ul>
|
||||||
|
<span class="open">Open the Shopify demo & pricing</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<a class="persona-card bookkeeper" href="bookkeeper/">
|
||||||
|
<span class="pill">📒 Bookkeeper / accountant</span>
|
||||||
|
<h3>Bank-export reconciliation with audit trail</h3>
|
||||||
|
<p>
|
||||||
|
Catches the duplicate transaction QuickBooks imported twice
|
||||||
|
when Jan and Feb exports overlap. Standardizes dates,
|
||||||
|
amounts, and vendor casing. Hands you a row-level audit log
|
||||||
|
to share with the client.
|
||||||
|
</p>
|
||||||
|
<ul class="pain">
|
||||||
|
<li>· Catch month-overlap re-import dupes</li>
|
||||||
|
<li>· Consolidate vendors for clean 1099 reports</li>
|
||||||
|
<li>· Produce hand-off-ready audit trail</li>
|
||||||
|
<li>· Multi-currency books (EUR / GBP / BRL)</li>
|
||||||
|
</ul>
|
||||||
|
<span class="open">Open the bookkeeper demo & pricing</span>
|
||||||
|
</a>
|
||||||
|
|
||||||
|
<a class="persona-card revops" href="revops/">
|
||||||
|
<span class="pill">🪢 Marketing / RevOps</span>
|
||||||
|
<h3>Lead-list dedup across HubSpot, LinkedIn, scrapes</h3>
|
||||||
|
<p>
|
||||||
|
One canonical lead per real person — across HubSpot,
|
||||||
|
LinkedIn, Apollo, ZoomInfo, and manual scrapes.
|
||||||
|
International phones (50+ country codes), per-row country
|
||||||
|
column, fuzzy match with merge.
|
||||||
|
</p>
|
||||||
|
<ul class="pain">
|
||||||
|
<li>· Stop paying HubSpot tier price for cross-source dupes</li>
|
||||||
|
<li>· Protect sender reputation from invalid emails</li>
|
||||||
|
<li>· Skip the 4–8 wk GDPR review on cloud cleaners</li>
|
||||||
|
<li>· Suppression-list sync across 5+ platforms</li>
|
||||||
|
</ul>
|
||||||
|
<span class="open">Open the RevOps demo & pricing</span>
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">What's the same across all three</div>
|
||||||
|
<h2>One engine. Same six tools. Same $49.</h2>
|
||||||
|
<p>
|
||||||
|
The persona pages above are positioning, not different products.
|
||||||
|
Whichever you buy, you get the full bundle: Find Duplicates, Clean
|
||||||
|
Text, Standardize Formats, Fix Missing Values, Map Columns,
|
||||||
|
and Automated Workflows — pre-tuned with a saved pipeline
|
||||||
|
that matches your workflow.
|
||||||
|
</p>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔒</span>
|
||||||
|
<h3>Local-first</h3>
|
||||||
|
<p>Desktop app. No cloud upload, no SaaS account, no subscription. Verify zero outbound calls in your browser's network tab.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📋</span>
|
||||||
|
<h3>Auditable</h3>
|
||||||
|
<p>Every cell change is logged with the original value, the new value, and which rule fired. Hand the audit CSV to your client.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🌍</span>
|
||||||
|
<h3>International</h3>
|
||||||
|
<p>50+ country codes, per-row country awareness, EU comma decimals, parens-negative amounts, locale-aware month names.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">⚙️</span>
|
||||||
|
<h3>Repeatable</h3>
|
||||||
|
<p>Save your cleanup as a JSON pipeline. Re-run on next week's export with one CLI command. Same cleanup, zero re-config.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📦</span>
|
||||||
|
<h3>Cross-platform</h3>
|
||||||
|
<p>Mac · Windows · Linux installers. Code-signed for macOS Gatekeeper. Free updates for the v1.x line.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">💰</span>
|
||||||
|
<h3>$49 one-time</h3>
|
||||||
|
<p>No subscription. No per-client license. No row caps. No AI black-box.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container" style="text-align: center;">
|
||||||
|
<h2>Pick your workflow above to try the live demo.</h2>
|
||||||
|
<p class="muted">Or read the docs first — every tool has a CLI, every pipeline is JSON, every change is audited.</p>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer>
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
||||||
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p>
|
||||||
|
<a href="shopify-pet/">For Shopify operators</a> ·
|
||||||
|
<a href="bookkeeper/">For bookkeepers</a> ·
|
||||||
|
<a href="revops/">For RevOps agencies</a><br />
|
||||||
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
352
landing/revops/index.html
Normal file
352
landing/revops/index.html
Normal file
@@ -0,0 +1,352 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DataTools for RevOps — Dedupe Lead Lists Across HubSpot, LinkedIn, and Manual Scrapes · $49</title>
|
||||||
|
<meta name="description" content="One tool to dedupe lead lists across HubSpot, LinkedIn, and manual scrapes. International phones (50+ country codes), per-row country normalization, fuzzy match across vendors, fully offline. $49 one-time." />
|
||||||
|
<meta name="keywords" content="dedupe lead list, hubspot deduplicate, linkedin lead cleanup, marketing data cleaning, revops csv tool, multi-vendor lead unification, international phone normalization" />
|
||||||
|
<link rel="canonical" href="https://datatools.app/revops/" />
|
||||||
|
<link rel="stylesheet" href="../_shared/styles.css" />
|
||||||
|
|
||||||
|
<!-- Persona accent: RevOps → vivid violet -->
|
||||||
|
<style>
|
||||||
|
:root {
|
||||||
|
--accent: #c4b5fd;
|
||||||
|
--accent-ink: #2e1065;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
|
||||||
|
<meta property="og:title" content="DataTools for RevOps — Dedupe Lead Lists Across HubSpot, LinkedIn, and Manual Scrapes" />
|
||||||
|
<meta property="og:description" content="International phones, country normalization, fuzzy dedup with merge — one tool, no upload. $49 one-time." />
|
||||||
|
<meta property="og:type" content="product" />
|
||||||
|
<meta property="og:url" content="https://datatools.app/revops/" />
|
||||||
|
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{
|
||||||
|
"@context": "https://schema.org",
|
||||||
|
"@type": "SoftwareApplication",
|
||||||
|
"name": "DataTools for RevOps",
|
||||||
|
"operatingSystem": "Windows, macOS, Linux",
|
||||||
|
"applicationCategory": "BusinessApplication",
|
||||||
|
"offers": {
|
||||||
|
"@type": "Offer",
|
||||||
|
"price": "49",
|
||||||
|
"priceCurrency": "USD"
|
||||||
|
},
|
||||||
|
"description": "Dedupe and unify lead lists across CRM, scraping, and manual sources. International phone normalization, per-row country, fuzzy match with merge. Six-tool data-cleaning bundle for RevOps and marketing agencies.",
|
||||||
|
"softwareVersion": "1.0"
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<div class="buybar">
|
||||||
|
<div class="buybar-inner">
|
||||||
|
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for RevOps</span></div>
|
||||||
|
<div>
|
||||||
|
<span class="price-tag">$49 — one-time, no subscription</span>
|
||||||
|
<a class="btn" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Get DataTools →</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<section class="hero">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For RevOps · marketing ops · agency lead-gen · audience-builders</div>
|
||||||
|
<h1>Dedupe lead lists across HubSpot, LinkedIn,<br /><strong>and manual scrapes — locally.</strong></h1>
|
||||||
|
<p class="lead">
|
||||||
|
The same prospect shows up as <code>alice@acme.com</code> in HubSpot,
|
||||||
|
<code>Alice.Johnson@acme.com</code> in LinkedIn Sales Navigator, and
|
||||||
|
<code>alice@acme.com</code> again from your VA's manual scrape. Their
|
||||||
|
phone is <code>(415) 555-1234</code> in one source and
|
||||||
|
<code>4155551234</code> in another. DataTools fuzzy-matches across
|
||||||
|
sources, normalizes phones to E.164 with per-row country awareness,
|
||||||
|
and produces one canonical lead per real person — without uploading
|
||||||
|
a single contact to a third-party tool.
|
||||||
|
</p>
|
||||||
|
<div class="cta-row">
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Get DataTools — $49 →</a>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
||||||
|
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
||||||
|
</div>
|
||||||
|
<div class="stats">
|
||||||
|
<div class="stat"><div class="num">50+</div><div class="label">country codes</div></div>
|
||||||
|
<div class="stat"><div class="num">3</div><div class="label">CRM sources unified</div></div>
|
||||||
|
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pain points ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If your last campaign launch was held up by data hygiene</div>
|
||||||
|
<h2>Six pains DataTools fixes before you import to HubSpot</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">💸</span>
|
||||||
|
<h3>HubSpot / Marketo / Iterable bills you for every duplicate contact</h3>
|
||||||
|
<p>10 k contacts → enterprise tier at $4–8 k/mo. 18 % cross-source duplicate rate from Apollo + ZoomInfo + LinkedIn means you're at 8.2 k unique people but paying for 10 k. Every month. Forever.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> $200–$800 per 1 k duplicate contacts — recurring, every month.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🚫</span>
|
||||||
|
<h3>Sender reputation tanks when you mail to invalid or duplicate addresses</h3>
|
||||||
|
<p>One bad sending session — to addresses your team scraped or imported without hygiene — and your domain reputation takes weeks to recover. Your good campaigns sit in spam folders during the recovery.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> catastrophic — entire email programme degraded for 2–6 weeks.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">⚖️</span>
|
||||||
|
<h3>GDPR makes uploading to a cloud cleaner a legal-review marathon</h3>
|
||||||
|
<p>Every cloud-based lead-cleaner needs you to upload your prospect list. Your legal team needs 4–8 weeks to bless that. DataTools is desktop-only — no upload, no DPA, no review, no delay.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 4–8 weeks of legal-review delay per tool, every time.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🪢</span>
|
||||||
|
<h3>Apollo + ZoomInfo + LinkedIn + manual scrapes all use different schemas</h3>
|
||||||
|
<p>Each export has its own column names, scoring scale, country format. Unifying them by hand for one campaign costs 1–3 days. Doing it for every campaign is unsustainable.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 1–3 days per campaign of manual unification + judgement calls that drift across team members.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🛡️</span>
|
||||||
|
<h3>Suppression lists across 5+ marketing platforms get out of sync</h3>
|
||||||
|
<p>Each platform has its own suppression format. Out-of-sync lists let opted-out contacts slip through, triggering CAN-SPAM / GDPR exposure and the kind of "we got a complaint" email no one wants.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> compliance risk + churn-back cost + stakeholder trust.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📞</span>
|
||||||
|
<h3>International dialer fails because phone formats vary</h3>
|
||||||
|
<p>A calling list spanning 15 countries with mixed formats means the dialer rejects 8–15 % of numbers, and your reps spend the day on "number invalid" tones instead of conversations.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> rep productivity × failure rate × team size.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section id="demo">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Live demo · runs in your browser</div>
|
||||||
|
<h2>Try it on a real-looking 3-vendor lead list</h2>
|
||||||
|
<p>
|
||||||
|
The demo below loads a 25-row lead worksheet combining HubSpot,
|
||||||
|
LinkedIn Sales Navigator, and manual scraping — with the same prospect
|
||||||
|
appearing in two or three sources, country names spelled three
|
||||||
|
different ways (<code>USA</code>, <code>US</code>, <code>United
|
||||||
|
States</code>), and 13 different international phone formats. Click
|
||||||
|
<strong>Run pipeline</strong> and watch the 5-step pipeline (text
|
||||||
|
clean → format → missing → column map → dedup) collapse 25 rows to 19
|
||||||
|
with a single canonical record per prospect.
|
||||||
|
</p>
|
||||||
|
<div class="demo-frame">
|
||||||
|
<iframe
|
||||||
|
src="https://demo.datatools.app/?p=revops"
|
||||||
|
loading="lazy"
|
||||||
|
title="DataTools live demo — RevOps"
|
||||||
|
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
||||||
|
<div class="demo-caption">
|
||||||
|
Demo runs on free hosting. Capped at 100 input rows · output
|
||||||
|
watermarked. The paid product has no caps and runs entirely offline.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Built for the agency RevOps day</div>
|
||||||
|
<h2>Three workflows you do every campaign</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🪢</span>
|
||||||
|
<h3>Email-list dedup across lead sources</h3>
|
||||||
|
<p>HubSpot exports + LinkedIn Sales Navigator + the VA's spreadsheet, all merged. Fuzzy match across email + phone + name catches the cross-source duplicates that broke your last campaign send.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🌍</span>
|
||||||
|
<h3>Multi-platform audience reconciliation</h3>
|
||||||
|
<p>Build one canonical audience from Meta, Google Ads, LinkedIn, and your CRM. Each platform exports a different shape; Map Columns aligns them all, dedup merges the survivors with their most-complete fields.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🛡️</span>
|
||||||
|
<h3>Suppression-list management</h3>
|
||||||
|
<p>Suppression lists need to dedupe across email + phone + first-party identifiers. Add a row, dedupe, ship the canonical CSV to every platform — without uploading the suppression list to any of them.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If your campaigns target outside the US — almost everyone's do</div>
|
||||||
|
<h2>50+ country codes. Per-row country awareness.</h2>
|
||||||
|
<p>
|
||||||
|
Your HubSpot list has <code>(415) 555-1234</code>. Your scraped
|
||||||
|
list from the same prospect has <code>+1 415 555 1234</code>. Your
|
||||||
|
Italian prospect entered <code>+39 06 6982</code>. Your Brazilian
|
||||||
|
lead has <code>11 3071 0000</code>. Each comes from a row tagged
|
||||||
|
with its country — DataTools reads that column per row and parses
|
||||||
|
every phone correctly to E.164.
|
||||||
|
</p>
|
||||||
|
<ul class="bullets">
|
||||||
|
<li><strong>Per-row country column</strong> drives the parser — no global default that misclassifies UK numbers as malformed US numbers.</li>
|
||||||
|
<li><strong>Country-name normalization</strong>: <code>USA</code> / <code>US</code> / <code>United States</code> all resolve to the same ISO-2 code.</li>
|
||||||
|
<li><strong>50+ country support</strong> via Google's libphonenumber, including KR, CN, IN, MX, BR, IL, TR, PL, DK, SE.</li>
|
||||||
|
<li><strong>Schema enforcement</strong> via Map Columns: project to your CRM's required shape, coerce score columns to integers, reorder fields to match the import contract.</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For platforms that charge per contact</div>
|
||||||
|
<h2>Every duplicate you don't catch costs you for the life of the contract.</h2>
|
||||||
|
<p>
|
||||||
|
HubSpot prices on contacts. Klaviyo prices on contacts. Marketo,
|
||||||
|
Iterable, ActiveCampaign — all priced on contacts. Every duplicate
|
||||||
|
you don't catch is a recurring tax on your campaign. DataTools
|
||||||
|
catches them once, before import, with a fuzzy matcher that's
|
||||||
|
tuned to the cross-source noise you actually see.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Real numbers from the demo:</strong> 25 input rows from
|
||||||
|
three sources collapse to 19 — that's 6 duplicates the cross-source
|
||||||
|
noise was hiding. On a 50,000-row campaign list, that ratio
|
||||||
|
typically saves 12,000+ contacts a month, every month.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">The thing every cloud cleaner can't say</div>
|
||||||
|
<h2>Your prospects' contact info never leaves your computer.</h2>
|
||||||
|
<p>
|
||||||
|
Cloud lead-cleaning tools require you to upload your audience.
|
||||||
|
That audience is your single most valuable agency asset — and once
|
||||||
|
it's on someone else's server, your client's privacy story is
|
||||||
|
no longer in your hands. DataTools is a desktop app. There is no
|
||||||
|
upload step.
|
||||||
|
</p>
|
||||||
|
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline campaign_q1.csv --pipeline revops_pipeline.json --apply
|
||||||
|
Reading campaign_q1.csv...
|
||||||
|
53,802 rows, 14 columns
|
||||||
|
Executing pipeline:
|
||||||
|
<span class="ok">✓</span> text_clean (160 ms) {cells_changed: 8,205}
|
||||||
|
<span class="ok">✓</span> format_standardize (1.4 s) {cells_changed: 41,889 — 50 country codes}
|
||||||
|
<span class="ok">✓</span> missing (140 ms) {sentinels_standardized: 6,710}
|
||||||
|
<span class="ok">✓</span> column_map (220 ms) {columns_renamed: 4, columns_added: 1}
|
||||||
|
<span class="ok">✓</span> dedup (4.8 s) {duplicates_removed: 12,344, merged: 12,344}
|
||||||
|
|
||||||
|
Initial rows: 53,802 → Final rows: 41,458
|
||||||
|
Total elapsed: 6.7 s
|
||||||
|
<span class="prompt">$</span> # 12,344 fewer contacts to pay for — for $49.</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">In the bundle</div>
|
||||||
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match across email + phone + name + company; merge survivors with most-complete fields.</p></div>
|
||||||
|
<div class="card"><h3>2 · Clean Text</h3><p>Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.</p></div>
|
||||||
|
<div class="card"><h3>3 · Standardize Formats</h3><p>E.164 phones with per-row country, canonical emails, name casing, ISO dates.</p></div>
|
||||||
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Detect <code>TBD</code>, <code>(unknown)</code>, <code>—</code> across vendor exports.</p></div>
|
||||||
|
<div class="card"><h3>5 · Map Columns</h3><p>Project to your CRM's required schema, coerce score to integer, reorder for import.</p></div>
|
||||||
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Save the cleanup as JSON. Drop next campaign's combined export on it. Same dedup, automated.</p></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Pricing — pay once, own it</div>
|
||||||
|
<h2>$49. No subscription. No per-campaign fee.</h2>
|
||||||
|
<div class="pricing">
|
||||||
|
<div class="card featured">
|
||||||
|
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>DataTools for RevOps</h3>
|
||||||
|
<ul>
|
||||||
|
<li>All 6 tools, full pipeline</li>
|
||||||
|
<li>Mac · Windows · Linux installers</li>
|
||||||
|
<li>Code-signed (no Gatekeeper warnings)</li>
|
||||||
|
<li>Free updates for the v1.x line</li>
|
||||||
|
<li>Bonus: 3-source unification pipeline preset</li>
|
||||||
|
<li><strong>Use on any number of clients</strong> — no seat limits</li>
|
||||||
|
</ul>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Buy on Gumroad →</a>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="row"><div class="price">$149</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>Full DataTools Suite</h3>
|
||||||
|
<p class="muted">Available when 3+ bundles ship. Includes everything in the RevOps pack plus the Shopify and Bookkeeper bundles. Save $48.</p>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#" aria-disabled="true">Coming when ready</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<h2>Questions</h2>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Does this replace HubSpot's deduplication?</summary>
|
||||||
|
<p>No — it cleans data <em>before</em> import to HubSpot (or LinkedIn, Marketo, Klaviyo, etc.). HubSpot's dedup runs on already-imported contacts; DataTools catches duplicates that haven't yet cost you a contract slot.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Does it handle international phones correctly?</summary>
|
||||||
|
<p>Yes — via Google's libphonenumber, with 50+ country codes. The killer feature is per-row country: point it at a column (any column with values like <code>US</code>, <code>USA</code>, <code>United States</code>, <code>+1</code>, <code>JP</code>, <code>Japan</code>) and DataTools parses each row in its own region. No more UK numbers bucketed as malformed US.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Can I use it on multiple clients without paying again?</summary>
|
||||||
|
<p>Yes. The licence is per-operator, not per-client. Run it on every agency client's lead list for the same $49.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How does fuzzy match work across columns?</summary>
|
||||||
|
<p>Out of the box, the dedup engine builds default strategies based on column names — typically email + phone with exact match, name with Jaro-Winkler at 85%. You can override via JSON: pick which columns to match on, which algorithm, and what threshold. Strategies survive in the saved pipeline so next campaign uses the same rules.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What does the audit trail look like?</summary>
|
||||||
|
<p>A row-by-row CSV: every modified cell with its original value, new value, and which rule fired. A separate JSON file describes the pipeline that produced it. Together they reproduce the cleanup deterministically — your client can verify it on their machine.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What's your refund policy?</summary>
|
||||||
|
<p>Try the live demo above on the sample dataset before you buy. If DataTools doesn't fit your workflow within 14 days, email for a refund — no questions asked.</p>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<section>
|
||||||
|
<div class="container" style="text-align: center;">
|
||||||
|
<h2>Stop paying twice for the same contact.</h2>
|
||||||
|
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Catches the cross-source duplicates HubSpot and LinkedIn can't see, normalizes phones for 50+ countries, and saves a pipeline you can re-run on next campaign's combined list.</p>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=revops" rel="noopener">Get DataTools — $49 →</a>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<footer>
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
||||||
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p>
|
||||||
|
<a href="../shopify-pet/">For Shopify operators</a> ·
|
||||||
|
<a href="../bookkeeper/">For bookkeepers</a><br />
|
||||||
|
<a href="https://gumroad.com/l/datatools?from=revops">Buy on Gumroad</a> ·
|
||||||
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
381
landing/shopify-pet/index.html
Normal file
381
landing/shopify-pet/index.html
Normal file
@@ -0,0 +1,381 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>DataTools for Shopify — Clean Customer & Product Exports Locally · $49</title>
|
||||||
|
<meta name="description" content="Clean Shopify customer, product, and subscriber exports — locally. Klaviyo-import-ready in 30 seconds. Catches duplicates Excel misses. Your data never leaves your computer. $49 one-time." />
|
||||||
|
<meta name="keywords" content="shopify customer cleanup, shopify csv cleaner, shopify product feed cleaner, klaviyo deduplicate, shopify customer dedup tool, shopify pet supplies" />
|
||||||
|
<link rel="canonical" href="https://datatools.app/shopify/" />
|
||||||
|
<link rel="stylesheet" href="../_shared/styles.css" />
|
||||||
|
|
||||||
|
<!-- Persona accent: Shopify pet → mint green (default in shared sheet) -->
|
||||||
|
|
||||||
|
<!-- Open Graph -->
|
||||||
|
<meta property="og:title" content="DataTools for Shopify — Clean Customer & Product Exports Locally" />
|
||||||
|
<meta property="og:description" content="Klaviyo-import-ready in 30 seconds. Local. No upload. $49 one-time." />
|
||||||
|
<meta property="og:type" content="product" />
|
||||||
|
<meta property="og:url" content="https://datatools.app/shopify/" />
|
||||||
|
|
||||||
|
<!-- Schema.org Product -->
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{
|
||||||
|
"@context": "https://schema.org",
|
||||||
|
"@type": "SoftwareApplication",
|
||||||
|
"name": "DataTools for Shopify",
|
||||||
|
"operatingSystem": "Windows, macOS, Linux",
|
||||||
|
"applicationCategory": "BusinessApplication",
|
||||||
|
"offers": {
|
||||||
|
"@type": "Offer",
|
||||||
|
"price": "49",
|
||||||
|
"priceCurrency": "USD"
|
||||||
|
},
|
||||||
|
"description": "Clean Shopify customer, product, and subscriber CSV exports locally. Six-tool data-cleaning bundle: dedupe, text-clean, format-standardize, missing-value handle, column-map, pipeline.",
|
||||||
|
"softwareVersion": "1.0"
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<!-- ============= Sticky buy bar ============= -->
|
||||||
|
<div class="buybar">
|
||||||
|
<div class="buybar-inner">
|
||||||
|
<div class="brand"><span class="brand-mark">●</span> DataTools <span class="muted">/ for Shopify</span></div>
|
||||||
|
<div>
|
||||||
|
<span class="price-tag">$49 — one-time, no subscription</span>
|
||||||
|
<a class="btn" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Get DataTools →</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ============= Hero ============= -->
|
||||||
|
<section class="hero">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For Shopify operators · pet supplies · subscription stores · DTC</div>
|
||||||
|
<h1>Klaviyo-import-ready customer lists.<br /><strong>In 30 seconds. Locally.</strong></h1>
|
||||||
|
<p class="lead">
|
||||||
|
Your Shopify customer export is a mess of formatting drift, disguised
|
||||||
|
duplicates, and inconsistent phone numbers. DataTools fixes all of it
|
||||||
|
in one pass — fuzzy-dedupes the same customer Klaviyo would charge
|
||||||
|
you for twice, standardizes phones across your international
|
||||||
|
subscribers, and hands you a cleaned CSV. <strong>Your data never
|
||||||
|
leaves your computer.</strong>
|
||||||
|
</p>
|
||||||
|
<div class="cta-row">
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Get DataTools — $49 →</a>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#demo">Try the live demo ↓</a>
|
||||||
|
<span class="price-note">One-time payment · cross-platform · runs offline</span>
|
||||||
|
</div>
|
||||||
|
<div class="stats">
|
||||||
|
<div class="stat"><div class="num">6</div><div class="label">tools, one bundle</div></div>
|
||||||
|
<div class="stat"><div class="num">1 GB</div><div class="label">customer file in 2.5 min</div></div>
|
||||||
|
<div class="stat"><div class="num">0</div><div class="label">cloud uploads ever</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pain points ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If any of these sound like your Tuesday</div>
|
||||||
|
<h2>Six pains DataTools fixes in one pass</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">💸</span>
|
||||||
|
<h3>Klaviyo / Mailchimp / Omnisend bills you for every duplicate</h3>
|
||||||
|
<p>Same customer signs up three times — once with a typo, once with a plus-tag, once on mobile. Your subscriber list has a 10–18 % duplicate rate and you're paying for every one of them, every month, forever.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> $30–$300/mo per percent of dupes on a 50 k-list — recurring.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📵</span>
|
||||||
|
<h3>Your product feed got rejected by Google Merchant Center</h3>
|
||||||
|
<p>Smart quotes from a copy-paste in product titles. NBSP in SKU. Inconsistent attribute casing. Feed bounces, the launch sits for 24–72 hours while you try to find the bad row in a 12,000-line CSV.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 1–3 days of delayed campaign × the campaign value.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🪢</span>
|
||||||
|
<h3>Orders from Shopify + Etsy + Amazon + Faire don't speak the same language</h3>
|
||||||
|
<p>Each platform's export uses different column names for "customer email" / "ship country" / "order total." Merging takes hours of manual rename and copy-paste before the analysis can even begin.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> 4–8 hours per month manually merging exports.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔁</span>
|
||||||
|
<h3>Subscription churn looks higher than it is</h3>
|
||||||
|
<p>Pet-box subscribers cancel, then re-sub three months later under a different email or device. Your cohort report says churn is 20 % when it's actually 12 % — and you're over-paying for acquisition because LTV is mis-calculated.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> wrong CAC ceiling for the next year of paid ads.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🌍</span>
|
||||||
|
<h3>VAT MOSS / EU tax breaks because country is spelled three ways</h3>
|
||||||
|
<p>Your UK customers are tagged <code>UK</code>, <code>U.K.</code>, and <code>United Kingdom</code> — all in one export. The VAT report aggregates them as three different markets. Compliance friction every quarter.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> compliance risk + repeated manual normalization.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔒</span>
|
||||||
|
<h3>Cloud cleaners want you to upload your customer list</h3>
|
||||||
|
<p>Your customer list is your single most valuable business asset. Uploading it to a SaaS to clean it is the privacy story you do not want. DataTools is desktop-only — your list never leaves your computer.</p>
|
||||||
|
<p class="muted"><strong>What it costs:</strong> nothing — and that's the point.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Live demo ============= -->
|
||||||
|
<section id="demo">
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Live demo · runs in your browser</div>
|
||||||
|
<h2>Try it on a real-looking Shopify customer export</h2>
|
||||||
|
<p>
|
||||||
|
The demo below loads a sample 15-row Shopify customer file with
|
||||||
|
pollution we've seen in actual stores: smart quotes from copy-paste,
|
||||||
|
duplicates with email-case drift, international phones from the UK,
|
||||||
|
Spain, Germany, Australia, and Japan, and the usual mess of
|
||||||
|
<code>N/A</code> / <code>(blank)</code> / <code>?</code> sentinels.
|
||||||
|
Click <strong>Run pipeline</strong> and watch every column get
|
||||||
|
cleaned in under a second.
|
||||||
|
</p>
|
||||||
|
<div class="demo-frame">
|
||||||
|
<iframe
|
||||||
|
src="https://demo.datatools.app/?p=shopify-pet"
|
||||||
|
loading="lazy"
|
||||||
|
title="DataTools live demo — Shopify pet supplies"
|
||||||
|
sandbox="allow-scripts allow-same-origin allow-downloads allow-forms"></iframe>
|
||||||
|
<div class="demo-caption">
|
||||||
|
Demo runs on free hosting (Streamlit Community Cloud). Capped at
|
||||||
|
100 input rows · output watermarked with one trailing row. The
|
||||||
|
paid product has no caps and runs entirely offline.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Built for Shopify ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Built for the Shopify operator</div>
|
||||||
|
<h2>Six workflows you do every week</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🧹</span>
|
||||||
|
<h3>Customer-list cleanup</h3>
|
||||||
|
<p>Catches the same customer who shows up as <code>john@gmail.com</code>, <code>John@Gmail.com</code>, and <code>j.ohn@gmail.com</code>. Fuzzy match merges the spellings, exact match catches the obvious ones.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📦</span>
|
||||||
|
<h3>Product catalogue dedup</h3>
|
||||||
|
<p>SKU whitespace, near-identical product names, copy-paste smart quotes in titles — gone. Audit log shows every change.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🛒</span>
|
||||||
|
<h3>Abandoned-cart hygiene</h3>
|
||||||
|
<p>Before re-engagement: dedupe across email + phone, drop sentinels-as-missing, format dates so your sequence triggers fire correctly.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">📥</span>
|
||||||
|
<h3>Subscriber-list import to Klaviyo</h3>
|
||||||
|
<p>Klaviyo charges per contact. Every duplicate you don't catch costs you for the life of the subscription. Catch them once, pay once.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">🔗</span>
|
||||||
|
<h3>Multi-channel order consolidation</h3>
|
||||||
|
<p>Orders from Shopify + Etsy + a wholesale spreadsheet, each with a different column for "customer email." Map Columns aligns them; dedup merges across channels.</p>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<span class="icon">⚙️</span>
|
||||||
|
<h3>Repeatable pipeline</h3>
|
||||||
|
<p>Save the cleanup as a JSON file. Drop next week's export on it. Same cleanup, zero re-configuration. Automatable via the CLI.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Privacy moat ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">The thing every cloud cleaner can't say</div>
|
||||||
|
<h2>Your customer list never leaves your computer.</h2>
|
||||||
|
<p>
|
||||||
|
DataTools is a desktop app. There's no upload step, no SaaS account,
|
||||||
|
no subscription, no "trust our security policy." The first thing you
|
||||||
|
can do after install is open your browser's network tab, run the
|
||||||
|
cleaner on your real customer file, and verify zero outbound
|
||||||
|
requests.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Why it matters for Shopify:</strong> your customer list is
|
||||||
|
your single most valuable business asset. Cloud cleaners require
|
||||||
|
you to upload it. We don't.
|
||||||
|
</div>
|
||||||
|
<div class="terminal"><span class="prompt">$</span> python -m src.cli_pipeline customers.csv --apply
|
||||||
|
Reading customers.csv...
|
||||||
|
47,832 rows, 14 columns
|
||||||
|
Executing pipeline:
|
||||||
|
<span class="ok">✓</span> text_clean (140 ms) {cells_changed: 12,408}
|
||||||
|
<span class="ok">✓</span> format_standardize (810 ms) {cells_changed: 31,202}
|
||||||
|
<span class="ok">✓</span> missing (95 ms) {sentinels_standardized: 8,129}
|
||||||
|
<span class="ok">✓</span> dedup (3.1 s) {duplicates_removed: 2,347}
|
||||||
|
|
||||||
|
Initial rows: 47,832 → Final rows: 45,485
|
||||||
|
Total elapsed: 4.2 s
|
||||||
|
<span class="prompt">$</span> # zero network calls. zero. promise.</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Audit moat ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">For when your client asks "what changed?"</div>
|
||||||
|
<h2>Every change auditable. Every cell logged.</h2>
|
||||||
|
<p>
|
||||||
|
Every modification is recorded with the original value, the new
|
||||||
|
value, and which rule fired. Hand the audit CSV to your accountant,
|
||||||
|
your marketing manager, or your boss along with the cleaned file.
|
||||||
|
No <em>"I trust the AI"</em> hand-waving — they see exactly what
|
||||||
|
happened.
|
||||||
|
</p>
|
||||||
|
<div class="callout">
|
||||||
|
<strong>Real example:</strong> the demo above standardized 27
|
||||||
|
cells across 15 customers. The audit log lists each one — row,
|
||||||
|
column, before, after, which standardizer fired. The dedup audit
|
||||||
|
lists every duplicate group with the survivor and its losers.
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= International ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">If you sell internationally — most pet brands do</div>
|
||||||
|
<h2>Phones, addresses, and currencies from anywhere on Earth.</h2>
|
||||||
|
<p>
|
||||||
|
Your subscriber from London entered her phone as <code>020 7946
|
||||||
|
0958</code>. Your Tokyo customer entered <code>03-3210-7000</code>.
|
||||||
|
Your German wholesale buyer wrote <code>€2.410,75</code>. Excel
|
||||||
|
thinks all of them are mistakes. DataTools knows what country each
|
||||||
|
row is from (per-row country column) and parses every one correctly
|
||||||
|
to E.164 phones, ISO dates, and numeric amounts.
|
||||||
|
</p>
|
||||||
|
<ul class="bullets">
|
||||||
|
<li><strong>50+ country codes</strong> via Google's libphonenumber.</li>
|
||||||
|
<li><strong>Currency auto-detect</strong> for $ / £ / € / ¥ / R$ / kr / zł — including the EU comma-decimal that breaks Excel.</li>
|
||||||
|
<li><strong>Address shape detection</strong> for US, UK, Canada, Germany, Australia.</li>
|
||||||
|
<li><strong>Locale-aware month names</strong> in English, French, German.</li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= What you get ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">In the bundle</div>
|
||||||
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
|
<div class="grid">
|
||||||
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, interactive review.</p></div>
|
||||||
|
<div class="card"><h3>2 · Clean Text</h3><p>Whitespace, smart chars, NBSP, BOM, line endings, case ops.</p></div>
|
||||||
|
<div class="card"><h3>3 · Standardize Formats</h3><p>Dates, phones, emails, addresses, names, currencies, booleans.</p></div>
|
||||||
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Disguised-null detection, profile, mean/median/mode/ffill, drop strategies.</p></div>
|
||||||
|
<div class="card"><h3>5 · Map Columns</h3><p>Fuzzy auto-rename, target schema, type coercion, required-field defaults.</p></div>
|
||||||
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Chain tools in recommended order, save/load JSON, automate weekly cleanups.</p></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Pricing ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<div class="eyebrow">Pricing — pay once, own it</div>
|
||||||
|
<h2>$49. No subscription. No ceiling on rows or files.</h2>
|
||||||
|
<div class="pricing">
|
||||||
|
<div class="card featured">
|
||||||
|
<div class="row"><div class="price">$49</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>DataTools for Shopify</h3>
|
||||||
|
<ul>
|
||||||
|
<li>All 6 tools, full pipeline</li>
|
||||||
|
<li>Mac · Windows · Linux installers</li>
|
||||||
|
<li>Code-signed (no Gatekeeper warnings)</li>
|
||||||
|
<li>Free updates for the v1.x line</li>
|
||||||
|
<li>Bonus: 3 ready-made Shopify pipelines</li>
|
||||||
|
</ul>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Buy on Gumroad →</a>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<div class="row"><div class="price">$149</div><div class="price-suffix">one-time</div></div>
|
||||||
|
<h3>Full DataTools Suite</h3>
|
||||||
|
<p class="muted">Available when 3+ bundles ship. Includes everything in the Shopify pack plus the Bookkeeper and RevOps bundles. Save $48.</p>
|
||||||
|
<a class="btn btn-ghost btn-large" href="#" aria-disabled="true">Coming when ready</a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= FAQ ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container">
|
||||||
|
<h2>Questions</h2>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Does this work with Shopify Plus?</summary>
|
||||||
|
<p>Yes — the input is just CSV / Excel from any source. Your Shopify Plus exports work the same as the standard plan, the same as a Shopify-to-CSV pipeline you've stitched together yourself. The cleaner doesn't care.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How does this compare to Excel's "Remove Duplicates"?</summary>
|
||||||
|
<p>Excel does <em>exact</em> deduplication. <code>John@Gmail.com</code> and <code>john@gmail.com</code> are different customers to Excel. DataTools fuzzy-matches across case, whitespace, formatting, and even close-but-not-identical strings. The demo above merges 4 customer pairs Excel would leave duplicated.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>How big a file can it handle?</summary>
|
||||||
|
<p>1 GB CSV with international phones + addresses processes in about 2.5 minutes on a typical workstation. Streaming mode keeps memory bounded regardless of input size — we tested it on 26 million rows.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Do I need to know Python to use it?</summary>
|
||||||
|
<p>No. The GUI is a browser interface that opens automatically when you double-click the app. It loads your file, you click Run, you download the cleaned file. The CLI is there for power users who want to script weekly cleanups.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What about my privacy?</summary>
|
||||||
|
<p>Your customer list never leaves your computer. There is no cloud component, no telemetry, no "anonymous usage stats." When the app is running you can confirm zero outbound network requests in your browser's developer tools.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>What's your refund policy?</summary>
|
||||||
|
<p>Try the live demo above on the sample dataset before you buy. If you still find DataTools doesn't fit your workflow within 14 days, email for a refund — no questions asked.</p>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="faq">
|
||||||
|
<summary>Will there be updates?</summary>
|
||||||
|
<p>Yes. The v1.x line is included free for everyone who buys DataTools today. We ship a patch every 30 days adding country support, edge-case fixes, and small features.</p>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Final CTA ============= -->
|
||||||
|
<section>
|
||||||
|
<div class="container" style="text-align: center;">
|
||||||
|
<h2>Stop deduplicating customers by hand.</h2>
|
||||||
|
<p class="lead" style="margin: 0 auto 28px;">One $49 download. Mac, Windows, or Linux. Runs offline. Catches the duplicates Excel misses, standardizes the phones from your international customers, and saves a pipeline you can re-run on next week's export.</p>
|
||||||
|
<a class="btn btn-large" href="https://gumroad.com/l/datatools?from=shopify-pet" rel="noopener">Get DataTools — $49 →</a>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
|
||||||
|
<!-- ============= Footer ============= -->
|
||||||
|
<footer>
|
||||||
|
<div class="container">
|
||||||
|
<div>
|
||||||
|
<p><strong>DataTools</strong> — local data-cleaning for Shopify, bookkeepers, and RevOps teams.</p>
|
||||||
|
<p class="muted">© 2026 · Built solo · Shipped from a small office.</p>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<p>
|
||||||
|
<a href="../bookkeeper/">For bookkeepers</a> ·
|
||||||
|
<a href="../revops/">For RevOps agencies</a><br />
|
||||||
|
<a href="https://gumroad.com/l/datatools?from=shopify-pet">Buy on Gumroad</a> ·
|
||||||
|
<a href="mailto:hello@datatools.app">Email support</a>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</footer>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
187
layout-review/01_deduplicator.html
Normal file
187
layout-review/01_deduplicator.html
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Find Duplicates</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="01_deduplicator">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Find Duplicates</strong>, shown with a file imported and a completed run (results + match-group review). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Find Duplicates</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Find rows that repeat, then keep one and remove the extras.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding & delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">customers_export.csv</span>
|
||||||
|
<span class="size">2.1 MB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Delimiter selector (CSV) -->
|
||||||
|
<div class="dt-field" style="max-width:320px">
|
||||||
|
<label class="dt-label">Delimiter</label>
|
||||||
|
<div class="dt-select">Comma (,)</div>
|
||||||
|
<div class="dt-help-text">Auto-detected on upload. Change if the preview looks wrong.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>jane doe</td><td>JANE@ACME.IO</td><td>austin</td><td>(512) 555-0190</td><td>01/04/2024</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R. Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Options expander -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<details class="dt-expander" style="margin-top:0">
|
||||||
|
<summary>Advanced Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Match on columns</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Leave empty to auto-detect</span></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Strong keys</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">email <span class="x">✕</span></span></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Fuzzy columns</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">name <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Fuzzy algorithm</label><div class="dt-select">jaro_winkler</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Similarity threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:70%"></div><div class="knob" style="left:70%"></div></div><div class="val">85</div></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Survivor rule</label><div class="dt-select">most-complete</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on" style="margin-top:6px"><span class="box"><span class="dt-mi">check</span></span> Merge mode — fill missing fields in the surviving row</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Find Duplicates</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Original rows</div><div class="value">18,442</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Duplicate rows</div><div class="value">312</div><div class="delta down">−312 removed</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Match groups</div><div class="value">147</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Rows kept</div><div class="value">18,130</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-btn-row" style="max-width:560px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download deduplicated CSV</button>
|
||||||
|
<button class="dt-btn">Download removed rows</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Match groups -->
|
||||||
|
<h2>Match Groups</h2>
|
||||||
|
<div class="dt-cols-3" style="max-width:520px">
|
||||||
|
<button class="dt-btn">Accept All</button>
|
||||||
|
<button class="dt-btn">Reject All</button>
|
||||||
|
<button class="dt-btn">Clear Decisions</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Match group card 1 -->
|
||||||
|
<div class="dt-match-card">
|
||||||
|
<div class="dt-match-head">
|
||||||
|
<span class="title">Group 1 · 2 rows</span>
|
||||||
|
<span class="conf"><span class="dt-count-pill success">98% match</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-match-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>keep</th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="dt-keep-row"><td><span class="dt-keep-tag">keep</span></td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td><span class="dt-caption">remove</span></td><td class="dt-cell-flag">jane doe</td><td class="dt-cell-flag">JANE@ACME.IO</td><td class="dt-cell-flag">austin</td><td>(512) 555-0190</td><td class="dt-cell-flag">01/04/2024</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Differing columns highlighted. The survivor row is kept; uncheck rows to split the group.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Match group card 2 -->
|
||||||
|
<div class="dt-match-card">
|
||||||
|
<div class="dt-match-head">
|
||||||
|
<span class="title">Group 2 · 2 rows</span>
|
||||||
|
<span class="conf"><span class="dt-count-pill warn">87% match</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-match-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>keep</th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="dt-keep-row"><td><span class="dt-keep-tag">keep</span></td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td><span class="dt-caption">remove</span></td><td class="dt-cell-flag">R. Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption" style="margin-top:14px">Decisions: 1 merged, 1 pending</p>
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block" style="margin-top:8px">Apply Review Decisions & Download</button>
|
||||||
|
|
||||||
|
<!-- Processing log -->
|
||||||
|
<details class="dt-expander" style="margin-top:18px">
|
||||||
|
<summary>Processing Log</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-code">[00:00.01] Loaded 18,442 rows from customers_export.csv
|
||||||
|
[00:00.04] Strategy: exact(email) + fuzzy(name, jaro_winkler ≥ 85)
|
||||||
|
[00:00.91] Compared 18,442 rows → 147 match groups
|
||||||
|
[00:01.02] Survivor rule: most-complete · merge=on
|
||||||
|
[00:01.05] 312 rows flagged for removal</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
208
layout-review/02_text_cleaner.html
Normal file
208
layout-review/02_text_cleaner.html
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Clean Text</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
<style>
|
||||||
|
/* Hidden-character badges — mirrors src/core/text_clean.py:hidden_char_css(),
|
||||||
|
not part of app.css so reproduced inline against the same palette. */
|
||||||
|
.hidden-char { display: inline-block; padding: 0 2px; margin: 0 1px; border-radius: 3px; font-family: var(--font-mono); font-size: 0.85em; cursor: help; }
|
||||||
|
.hidden-char.hidden-whitespace { background: #fff3cd; color: #856404; border: 1px solid #ffeaa7; }
|
||||||
|
.hidden-char.hidden-special { background: #d1ecf1; color: #0c5460; border: 1px solid #bee5eb; }
|
||||||
|
.hidden-char.hidden-control { background: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body data-page="02_text_cleaner">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Clean Text</strong>, shown with a file imported and a completed run (results metrics, changes-by-column, before/after examples, cleaned preview, downloads). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Clean Text</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Trim extra spaces and strip out odd characters.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">contacts_messy.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: contacts_messy.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">4,120 rows, 4 columns</p>
|
||||||
|
<div class="dt-check on" style="margin-top:2px"><span class="box"><span class="dt-mi">check</span></span> Show hidden characters in preview</div>
|
||||||
|
<div class="dt-table-wrap" style="margin-top:8px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>company</th><th>notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td><span class="hidden-char hidden-whitespace" title="U+0020 SP LEAD">·</span>Jane Doe<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>jane@acme.io</td><td>Acme<span class="hidden-char hidden-whitespace" title="U+00A0 NBSP">·</span>Inc.</td><td>VIP<span class="hidden-char hidden-special" title="U+201D RIGHT DOUBLE QUOTE">”</span></td></tr>
|
||||||
|
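<tr><td class="idx">1</td><td>Bob<span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span><span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span>Smith</td><td>bob@globex.com<span class="hidden-char hidden-special" title="U+200B ZWSP">∅</span></td><td>Globex</td><td>—<span class="hidden-char hidden-control" title="U+0007 CTRL">␣</span></td></tr>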
|
||||||
|
<tr><td class="idx">2</td><td>Ana López</td><td>ana@initech.com</td><td>Initech<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>follow up</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td><span class="hidden-char hidden-whitespace" title="U+0009 TAB">→</span>Wei Chen</td><td>WEI@umbrella.co</td><td>Umbrella</td><td>“key<span class="hidden-char hidden-special" title="U+2014 EM DASH">—</span>account”</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> excel-hygiene (recommended)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> minimal</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> paranoid</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">excel-hygiene: trim, collapse whitespace, fold smart quotes, strip invisible chars, normalize line endings, NFC.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Trim leading/trailing whitespace</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Collapse internal whitespace</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Normalize line endings (\r\n → \n)</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip control characters</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip BOM</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Fold smart characters (curly quotes, em-dash, NBSP)</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip zero-width / invisible characters</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Unicode NFC normalization</div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Unicode NFKC compat fold (lossy: ① → 1, ﬁ → fi)</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Scope</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to clean (default: all string columns)</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">name <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">email <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">company <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">notes <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to skip even if they look like text</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Choose columns to leave untouched</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Case conversion</h4>
|
||||||
|
<div class="dt-field" style="max-width:360px">
|
||||||
|
<label class="dt-label">Apply case conversion to selected columns</label>
|
||||||
|
<div class="dt-select">None</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Clean Text</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Cells scanned</div><div class="value">16,480</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells changed</div><div class="value">3,947</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% changed</div><div class="value">24.0%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Columns processed</div><div class="value">4</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)</div>
|
||||||
|
|
||||||
|
<h4>Changes by column</h4>
|
||||||
|
<div class="dt-table-wrap" style="max-width:360px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>cells_changed</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">company</td><td>1,604</td></tr>
|
||||||
|
<tr><td class="idx">name</td><td>1,210</td></tr>
|
||||||
|
<tr><td class="idx">notes</td><td>982</td></tr>
|
||||||
|
<tr><td class="idx">email</td><td>151</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Examples (first 25 changes)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Row</th><th>Column</th><th>Before</th><th>After</th><th>Ops applied</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>1</td><td>name</td><td><span class="hidden-char hidden-whitespace" title="U+0020 SP LEAD">·</span>Jane Doe<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>Jane Doe</td><td>trim</td></tr>
|
||||||
|
<tr><td>1</td><td>company</td><td>Acme<span class="hidden-char hidden-whitespace" title="U+00A0 NBSP">·</span>Inc.</td><td>Acme Inc.</td><td>fold_smart</td></tr>
|
||||||
|
<tr><td>1</td><td>notes</td><td>VIP<span class="hidden-char hidden-special" title="U+201D RIGHT DOUBLE QUOTE">”</span></td><td>VIP"</td><td>fold_smart</td></tr>
|
||||||
|
<tr><td>2</td><td>name</td><td>Bob<span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span><span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span>Smith</td><td>Bob Smith</td><td>collapse_ws</td></tr>
|
||||||
|
<tr><td>2</td><td>email</td><td>bob@globex.com<span class="hidden-char hidden-special" title="U+200B ZWSP">∅</span></td><td>bob@globex.com</td><td>strip_zero_width</td></tr>
|
||||||
|
<tr><td>2</td><td>notes</td><td>—<span class="hidden-char hidden-control" title="U+0007 CTRL">␣</span></td><td>—</td><td>strip_control</td></tr>
|
||||||
|
<tr><td>3</td><td>company</td><td>Initech<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>Initech</td><td>trim</td></tr>
|
||||||
|
<tr><td>4</td><td>name</td><td><span class="hidden-char hidden-whitespace" title="U+0009 TAB">→</span>Wei Chen</td><td>Wei Chen</td><td>trim</td></tr>
|
||||||
|
<tr><td>4</td><td>notes</td><td>“key<span class="hidden-char hidden-special" title="U+2014 EM DASH">—</span>account”</td><td>"key-account"</td><td>fold_smart, nfc</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Cleaned preview (first 10 rows)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>company</th><th>notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td class="dt-cell-add">Jane Doe</td><td>jane@acme.io</td><td class="dt-cell-add">Acme Inc.</td><td class="dt-cell-add">VIP"</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td class="dt-cell-add">Bob Smith</td><td class="dt-cell-add">bob@globex.com</td><td>Globex</td><td class="dt-cell-add">—</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Ana López</td><td>ana@initech.com</td><td class="dt-cell-add">Initech</td><td>follow up</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td class="dt-cell-add">Wei Chen</td><td>WEI@umbrella.co</td><td>Umbrella</td><td class="dt-cell-add">"key-account"</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Changed cells highlighted. Toggle “Show hidden characters” to inspect the invisibles being removed.</p>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download cleaned CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
224
layout-review/03_format_standardizer.html
Normal file
224
layout-review/03_format_standardizer.html
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Standardize Formats</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="03_format_standardizer">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Standardize Formats</strong>, shown with a file imported from the upload screen and a completed run (results + changes audit + standardized preview). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Standardize Formats</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Make dates, phones, currency, and names look the same throughout.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- File pickup banner (using file from upload screen) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">description</span>
|
||||||
|
<span>Using <strong>customers_export.csv</strong> from the upload screen.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn" style="margin-bottom:4px">Use a different file</button>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>phone</th><th>amount</th><th>signup_date</th><th>active</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>jane DOE</td><td>(512) 555-0190</td><td>$1,234.5</td><td>01/04/2024</td><td>Y</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>bob smith</td><td>720.555.7781</td><td>$99</td><td>2024-2-11</td><td>yes</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>ALICIA REYES</td><td>+1 415 555 2233</td><td>$45,000</td><td>Mar 3, 2024</td><td>n</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>m. okafor</td><td>2125550148</td><td>$7.999</td><td>2024/04/22</td><td>true</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (collapsed after run; opened here to show the most informative content) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<h3 style="margin-top:0">Column types</h3>
|
||||||
|
<p class="dt-caption">Assign each column to a field type. Auto-detected suggestions are pre-filled; pick <strong>(skip)</strong> to leave a column untouched.</p>
|
||||||
|
|
||||||
|
<!-- Per-column type selectboxes, 3 per row -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">full_name</label><div class="dt-select">Name</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">phone</label><div class="dt-select">Phone</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">amount</label><div class="dt-select">Currency</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">signup_date</label><div class="dt-select">Date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">active</label><div class="dt-select">Boolean</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">notes</label><div class="dt-select">(skip)</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<h3>Format options</h3>
|
||||||
|
|
||||||
|
<!-- Standards preset radio (vertical) -->
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Standards preset</label>
|
||||||
|
<div style="display:flex;flex-direction:column;gap:8px;margin-top:4px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> US (default) — ISO 8601 dates · E.164 phones · USD</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> European — DMY input · INTL phones · EUR comma decimal</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> UK — DD/MM/YYYY · GB phones · Yes/No booleans</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> ISO Strict — ISO 8601 · bare-number currency · true/false</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Legacy US — MM/DD/YYYY · National phones · Yes/No</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Custom — keep current settings</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">Pick a published standard or regional convention as the baseline. Every option below is still individually overridable.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Two-column format options -->
|
||||||
|
<div class="dt-cols-2" style="margin-top:14px">
|
||||||
|
<!-- Left column: Dates + Phones -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0"><strong>Dates</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output format</label><div class="dt-select">YYYY-MM-DD (ISO)</div></div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Ambiguous input order (e.g. 01/02/2024)</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> MDY (US)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> DMY (EU)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4><strong>Phones</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output format</label><div class="dt-select">E.164 (+15551234567)</div></div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Default region (ISO-2)</label>
|
||||||
|
<div class="dt-input">US</div>
|
||||||
|
<div class="dt-help-text">Region used when the input has no country code. US, GB, DE, etc.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Right column: Currency + Names + Booleans -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0"><strong>Currency</strong></h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Decimal separator in input</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> dot (1,234.56)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> comma (1.234,56)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field" style="max-width:200px"><label class="dt-label">Round to decimals</label><div class="dt-input">2</div></div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Preserve original precision (don't round)</div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Preserve currency code (emit <code>USD 1234.56</code>, <code>EUR 99.00</code>, etc.)</div>
|
||||||
|
|
||||||
|
<h4><strong>Names</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Casing</label><div class="dt-select">Title Case</div></div>
|
||||||
|
|
||||||
|
<h4><strong>Booleans</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output style</label><div class="dt-select">True/False</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Standardize Formats</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Cells scanned</div><div class="value">92,210</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells changed</div><div class="value">61,838</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% changed</div><div class="value">67.1%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unparseable</div><div class="value">47</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>47 cell(s) in typed columns didn't match a recognizable shape and were left as-is. Check the changes audit below to find them, or re-classify the column to <strong>(skip)</strong>.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Changes by column -->
|
||||||
|
<p style="margin-bottom:6px"><strong>Changes by column</strong></p>
|
||||||
|
<div class="dt-table-wrap" style="max-width:520px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>field_type</th><th>cells_changed</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>amount</td><td>currency</td><td>17,902</td></tr>
|
||||||
|
<tr><td>full_name</td><td>name</td><td>16,041</td></tr>
|
||||||
|
<tr><td>phone</td><td>phone</td><td>14,388</td></tr>
|
||||||
|
<tr><td>signup_date</td><td>date</td><td>11,205</td></tr>
|
||||||
|
<tr><td>active</td><td>boolean</td><td>2,302</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Examples (first 25 changes) -->
|
||||||
|
<p style="margin:14px 0 6px"><strong>Examples (first 25 changes)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>row</th><th>column</th><th>field_type</th><th>before</th><th>after</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>1</td><td>full_name</td><td>name</td><td class="dt-cell-del">jane DOE</td><td class="dt-cell-add">Jane Doe</td></tr>
|
||||||
|
<tr><td>1</td><td>phone</td><td>phone</td><td class="dt-cell-del">(512) 555-0190</td><td class="dt-cell-add">+15125550190</td></tr>
|
||||||
|
<tr><td>1</td><td>amount</td><td>currency</td><td class="dt-cell-del">$1,234.5</td><td class="dt-cell-add">1234.50</td></tr>
|
||||||
|
<tr><td>1</td><td>signup_date</td><td>date</td><td class="dt-cell-del">01/04/2024</td><td class="dt-cell-add">2024-01-04</td></tr>
|
||||||
|
<tr><td>1</td><td>active</td><td>boolean</td><td class="dt-cell-del">Y</td><td class="dt-cell-add">True</td></tr>
|
||||||
|
<tr><td>2</td><td>full_name</td><td>name</td><td class="dt-cell-del">bob smith</td><td class="dt-cell-add">Bob Smith</td></tr>
|
||||||
|
<tr><td>2</td><td>phone</td><td>phone</td><td class="dt-cell-del">720.555.7781</td><td class="dt-cell-add">+17205557781</td></tr>
|
||||||
|
<tr><td>2</td><td>signup_date</td><td>date</td><td class="dt-cell-del">2024-2-11</td><td class="dt-cell-add">2024-02-11</td></tr>
|
||||||
|
<tr><td>3</td><td>signup_date</td><td>date</td><td class="dt-cell-del">Mar 3, 2024</td><td class="dt-cell-add">2024-03-03</td></tr>
|
||||||
|
<tr><td>4</td><td>amount</td><td>currency</td><td class="dt-cell-del">$7.999</td><td class="dt-cell-add">8.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Standardized preview -->
|
||||||
|
<p style="margin:14px 0 6px"><strong>Standardized preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>phone</th><th>amount</th><th>signup_date</th><th>active</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>+15125550190</td><td>1234.50</td><td>2024-01-04</td><td>True</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>+17205557781</td><td>99.00</td><td>2024-02-11</td><td>True</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Alicia Reyes</td><td>+14155552233</td><td>45000.00</td><td>2024-03-03</td><td>False</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>M. Okafor</td><td>+12125550148</td><td>8.00</td><td>2024-04-22</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download standardized CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
271
layout-review/04_missing_handler.html
Normal file
271
layout-review/04_missing_handler.html
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Fix Missing Values</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="04_missing_handler">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Fix Missing Values</strong>, shown with a file imported and a completed run (per-column missingness profile + before/after results). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Fix Missing Values</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Find blank cells (even hidden ones) and fill them in or remove them.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<p class="dt-caption">Tip: files imported on the Home screen are picked up here automatically.</p>
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">survey_responses.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: survey_responses.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">2,150 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>respondent_id</th><th>age</th><th>region</th><th>income</th><th>satisfaction</th><th>comments</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>R-1001</td><td>34</td><td>West</td><td>52000</td><td>4</td><td>great service</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>R-1002</td><td class="dt-cell-flag">N/A</td><td>East</td><td class="dt-cell-flag"></td><td>3</td><td class="dt-cell-flag">?</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>R-1003</td><td>41</td><td class="dt-cell-flag">-</td><td>61000</td><td class="dt-cell-flag">NULL</td><td>none</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R-1004</td><td>29</td><td>South</td><td class="dt-cell-flag">N/A</td><td>5</td><td>quick</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (Missingness profile + Strategy) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<h3>Missingness profile</h3>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Rows</div><div class="value">2,150</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells missing</div><div class="value">1,043</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% cells missing</div><div class="value">8.1%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Complete rows</div><div class="value">1,388</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>dtype</th><th>missing</th><th>missing_pct</th><th>disguised</th><th>has_missing</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>respondent_id</td><td>object</td><td>0</td><td>0.0%</td><td>0</td><td>False</td></tr>
|
||||||
|
<tr><td>age</td><td>float64</td><td>187</td><td>8.7%</td><td>61</td><td>True</td></tr>
|
||||||
|
<tr><td>region</td><td>object</td><td>142</td><td>6.6%</td><td>142</td><td>True</td></tr>
|
||||||
|
<tr><td>income</td><td>float64</td><td>329</td><td>15.3%</td><td>118</td><td>True</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td>float64</td><td>95</td><td>4.4%</td><td>40</td><td>True</td></tr>
|
||||||
|
<tr><td>comments</td><td>object</td><td>290</td><td>13.5%</td><td>290</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<h3>Strategy</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column;gap:10px">
|
||||||
|
<span class="dt-radio"><span class="dot"></span> detect-only (standardize sentinels to NaN, no fill or drop)</span>
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> safe-fill (numeric → median, categorical → mode)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> drop-incomplete (drop any row with missing)</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. safe-fill: also fill — numeric columns with median, others with mode. drop-incomplete: also drop every row that has any missing cell.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Advanced options expander (open — most informative) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<h4>Detection</h4>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Standardize disguised nulls to NaN</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Sentinel values (comma-separated)</label>
|
||||||
|
<div class="dt-input">N/A, n/a, NA, NULL, null, None, -, --, ?, #N/A</div>
|
||||||
|
<div class="dt-help-text">Matched case-insensitively after stripping whitespace.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<h4>Strategy override</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Global strategy</label>
|
||||||
|
<div class="dt-select">(use preset)</div>
|
||||||
|
<div class="dt-help-text">drop_row / drop_col use the thresholds below. mean / median / interpolate are numeric only — non-numeric columns fall back to the categorical strategy.</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Categorical fallback (for non-numeric columns)</label>
|
||||||
|
<div class="dt-select">mode</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Drop thresholds</h4>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Row drop threshold (drop rows with ≥ this fraction missing across selected cols)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:100%"></div><div class="knob" style="left:100%"></div></div><div class="val">1.00</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Column drop threshold (drop columns with ≥ this fraction missing)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:100%"></div><div class="knob" style="left:100%"></div></div><div class="val">1.00</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Scope</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to handle (default: all)</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">respondent_id <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">age <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">region <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">income <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">satisfaction <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">comments <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to skip</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Choose columns</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Per-column strategy overrides (optional)</h4>
|
||||||
|
<p class="dt-caption">Set a different strategy for specific columns. Leave any row blank to use the global strategy.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Column</th><th>Override</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>age</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">median</span></td></tr>
|
||||||
|
<tr><td>region</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">mode</span></td></tr>
|
||||||
|
<tr><td>income</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px"></span></td></tr>
|
||||||
|
<tr><td>satisfaction</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px"></span></td></tr>
|
||||||
|
<tr><td>comments</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">constant</span></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Handle Missing Values</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<div id="missing-results-anchor"></div>
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Sentinels → NaN</div><div class="value">651</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells filled</div><div class="value">1,043</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Rows dropped</div><div class="value">0</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Columns dropped</div><div class="value">0</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Missingness — before vs. after</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>before_missing</th><th>before_pct</th><th>after_missing</th><th>after_pct</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>respondent_id</td><td>0</td><td>0.0</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>age</td><td class="dt-cell-flag">187</td><td>8.7</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>region</td><td class="dt-cell-flag">142</td><td>6.6</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>income</td><td class="dt-cell-flag">329</td><td>15.3</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td class="dt-cell-flag">95</td><td>4.4</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>comments</td><td class="dt-cell-flag">290</td><td>13.5</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Strategy applied per column</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>strategy</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>age</td><td>median</td></tr>
|
||||||
|
<tr><td>region</td><td>mode</td></tr>
|
||||||
|
<tr><td>income</td><td>median</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td>median</td></tr>
|
||||||
|
<tr><td>comments</td><td>constant</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Audit (first 50 changes)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>row</th><th>column</th><th>old_value</th><th>new_value</th><th>reason</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2</td><td>age</td><td class="dt-cell-flag">N/A</td><td class="dt-cell-add">37.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>2</td><td>income</td><td class="dt-cell-flag">(blank)</td><td class="dt-cell-add">54000.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>2</td><td>comments</td><td class="dt-cell-flag">?</td><td class="dt-cell-add">(no comment)</td><td>fill: constant</td></tr>
|
||||||
|
<tr><td>3</td><td>region</td><td class="dt-cell-flag">-</td><td class="dt-cell-add">West</td><td>fill: mode</td></tr>
|
||||||
|
<tr><td>3</td><td>satisfaction</td><td class="dt-cell-flag">NULL</td><td class="dt-cell-add">4.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>4</td><td>income</td><td class="dt-cell-flag">N/A</td><td class="dt-cell-add">54000.0</td><td>fill: median</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">… and 1,037 more (download the full audit below).</p>
|
||||||
|
|
||||||
|
<p><strong>Handled preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>respondent_id</th><th>age</th><th>region</th><th>income</th><th>satisfaction</th><th>comments</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>R-1001</td><td>34.0</td><td>West</td><td>52000.0</td><td>4.0</td><td>great service</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>R-1002</td><td class="dt-cell-add">37.0</td><td>East</td><td class="dt-cell-add">54000.0</td><td>3.0</td><td class="dt-cell-add">(no comment)</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>R-1003</td><td>41.0</td><td class="dt-cell-add">West</td><td>61000.0</td><td class="dt-cell-add">4.0</td><td>none</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R-1004</td><td>29.0</td><td>South</td><td class="dt-cell-add">54000.0</td><td>5.0</td><td>quick</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (html_download_button anchors) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download handled CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
222
layout-review/05_column_mapper.html
Normal file
222
layout-review/05_column_mapper.html
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Map Columns</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="05_column_mapper">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Map Columns</strong>, shown with a file imported, an interactive target schema + mapping configured, and a completed run (results + mapped preview). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Map Columns</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Rename columns, change their order, and set each one as text, number, or date.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<p class="dt-caption">You can also import a file on the home screen and pick it up here.</p>
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding & delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">crm_contacts_raw.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: crm_contacts_raw.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">4,210 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>Full Name</th><th>EmailAddr</th><th>Phone #</th><th>Signup</th><th>Amount Spent</th><th>Notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>512-555-0190</td><td>01/04/2024</td><td>$1,204.50</td><td>VIP</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>720-555-7781</td><td>02/11/2024</td><td>$88.00</td><td></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.net</td><td>415-555-3322</td><td>03/02/2024</td><td>$612.10</td><td>renewal</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dev Patel</td><td>dev@umbrella.co</td><td>206-555-9043</td><td>03/19/2024</td><td>$0.00</td><td></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (open — heart of the tool) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<!-- ===== Target schema ===== -->
|
||||||
|
<h3 style="margin-top:0">Target schema</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">How would you like to define the target schema?</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column; gap:8px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> Build interactively (start from current columns)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Import schema JSON</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Skip (rename / coerce only — no schema)</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">An interactive build is fastest for one-off cleanup. Import a JSON when you have a fixed contract (a CRM import format, db schema). Skip when you only want to rename or coerce specific columns.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption">Edit the table to define your target schema. Add rows for fields the input doesn't have yet (with a default), or remove rows for columns you want to drop.</p>
|
||||||
|
|
||||||
|
<!-- Schema editor (st.data_editor, num_rows=dynamic) -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Target name</th><th>Type</th><th>Required</th><th>Default (for added cols)</th><th>Aliases (comma-sep, helps fuzzy-match)</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>full_name</td><td>string</td><td>✗</td><td></td><td>Full Name, name</td></tr>
|
||||||
|
<tr><td>email</td><td>string</td><td>✓</td><td></td><td>EmailAddr, email_address</td></tr>
|
||||||
|
<tr><td>phone</td><td>string</td><td>✗</td><td></td><td>Phone #, tel</td></tr>
|
||||||
|
<tr><td>signup_date</td><td>date</td><td>✗</td><td></td><td>Signup</td></tr>
|
||||||
|
<tr><td>amount_spent</td><td>float</td><td>✗</td><td>0.0</td><td>Amount Spent</td></tr>
|
||||||
|
<tr><td>source</td><td>string</td><td>✗</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx" style="color:var(--ink-tertiary)"><span class="dt-mi" style="font-size:16px;vertical-align:-3px">add</span> add row</td><td></td><td></td><td></td><td></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">6 target fields · 1 added field (<code>source</code>) not present in the input.</p>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- ===== Strategy ===== -->
|
||||||
|
<h3>Strategy</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column; gap:8px">
|
||||||
|
<span class="dt-radio"><span class="dot"></span> rename-only (just rename, leave types alone, keep extras)</span>
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> lenient-schema (rename + coerce + reorder, keep extras)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> strict-schema (rename + coerce + reorder, drop extras)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Advanced options expander -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Unmapped source columns</label>
|
||||||
|
<div class="dt-select">keep</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Coerce types per schema</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Reorder to schema order</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Auto-infer mapping (fuzzy match)</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Fuzzy match threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:80%"></div><div class="knob" style="left:80%"></div></div><div class="val">0.80</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Enforce required fields</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- ===== Mapping ===== -->
|
||||||
|
<h3>Mapping</h3>
|
||||||
|
<!-- schema is set → source→target selectbox editor with auto-suggested flag -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Source</th><th>Target</th><th>Auto-suggested</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Full Name</td><td>full_name</td><td>✓</td></tr>
|
||||||
|
<tr><td>EmailAddr</td><td>email</td><td>✓</td></tr>
|
||||||
|
<tr><td>Phone #</td><td>phone</td><td>✓</td></tr>
|
||||||
|
<tr><td>Signup</td><td>signup_date</td><td>✓</td></tr>
|
||||||
|
<tr><td>Amount Spent</td><td>amount_spent</td><td>✓</td></tr>
|
||||||
|
<tr><td>Notes</td><td>(unmapped)</td><td>✗</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Pick a target for each source column. <code>Notes</code> stays unmapped — with the lenient preset it is kept as-is. <code>source</code> is added from the schema default.</p>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Apply Column Mapping</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- ===== Results ===== -->
|
||||||
|
<div id="colmap-results-anchor" style="height:1px"></div>
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Renamed</div><div class="value">5</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Dropped</div><div class="value">0</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Added</div><div class="value">1</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Coerce fails</div><div class="value">3</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-alert info"><span class="dt-mi">info</span><span>Added (with defaults): <code>source</code></span></div>
|
||||||
|
<div class="dt-alert warn"><span class="dt-mi">warning</span><span>Some cells could not be coerced and were left as NaN: amount_spent (3)</span></div>
|
||||||
|
|
||||||
|
<p><strong>Resolved mapping</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>source</th><th>target</th><th>auto</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Full Name</td><td>full_name</td><td>True</td></tr>
|
||||||
|
<tr><td>EmailAddr</td><td>email</td><td>True</td></tr>
|
||||||
|
<tr><td>Phone #</td><td>phone</td><td>True</td></tr>
|
||||||
|
<tr><td>Signup</td><td>signup_date</td><td>True</td></tr>
|
||||||
|
<tr><td>Amount Spent</td><td>amount_spent</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Mapped preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>email</th><th>phone</th><th>signup_date</th><th>amount_spent</th><th class="dt-cell-add">source</th><th>Notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>512-555-0190</td><td>2024-01-04</td><td>1204.5</td><td>crm-import</td><td>VIP</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>720-555-7781</td><td>2024-02-11</td><td>88.0</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.net</td><td>415-555-3322</td><td>2024-03-02</td><td>612.1</td><td>crm-import</td><td>renewal</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dev Patel</td><td>dev@umbrella.co</td><td>206-555-9043</td><td>2024-03-19</td><td>0.0</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx">4</td><td>Mei Lin</td><td>mei@hooli.com</td><td>503-555-1188</td><td>2024-04-07</td><td class="dt-cell-flag">NaN</td><td>crm-import</td><td>trial</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download mapped CSV</button>
|
||||||
|
<button class="dt-btn">Download mapping audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
91
layout-review/06_outlier_detector.html
Normal file
91
layout-review/06_outlier_detector.html
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Find Unusual Values</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="06_outlier_detector">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Find Unusual Values</strong> — a <strong>Coming Soon</strong> tool. The page is a stub/teaser: an "under development" notice, a list of planned features, and disabled placeholder controls (only the file uploader is live). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Find Unusual Values</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Spot values that look wrong — way too high, too low, or breaking your rules.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- st.info: under development -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Planned features (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li>Z-score detection (configurable threshold)</li>
|
||||||
|
<li>IQR (interquartile range) detection</li>
|
||||||
|
<li>MAD (median absolute deviation) detection</li>
|
||||||
|
<li>Domain-rule violations (e.g., age &lt; 0, price &gt; $1M)</li>
|
||||||
|
<li>Visual outlier highlighting in data preview</li>
|
||||||
|
<li>Handling: flag only, remove, cap/winsorize to bounds</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- File upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS · Import a file to preview. Processing is not yet available.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options (all disabled) -->
|
||||||
|
<h3>Detection Method</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px">
|
||||||
|
<label class="dt-label">Method</label>
|
||||||
|
<div class="dt-select" style="opacity:.55;cursor:not-allowed">Z-Score</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px;opacity:.55">
|
||||||
|
<label class="dt-label">Z-Score threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:50%"></div><div class="knob" style="left:50%"></div></div><div class="val">3.0</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px;opacity:.55">
|
||||||
|
<label class="dt-label">IQR multiplier</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:25%"></div><div class="knob" style="left:25%"></div></div><div class="val">1.5</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Handling</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px">
|
||||||
|
<label class="dt-label">Action</label>
|
||||||
|
<div class="dt-select" style="opacity:.55;cursor:not-allowed">Flag only (add column)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Detect Outliers</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
83
layout-review/07_multi_file_merger.html
Normal file
83
layout-review/07_multi_file_merger.html
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Combine Files</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="07_multi_file_merger">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Combine Files</strong> — a Coming-Soon tool. The page is a stub: an "under development" notice, a planned-features list, a working multi-file uploader, and disabled placeholder options. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Combine Files</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Combine several CSV or Excel files into one — even if columns differ.</p>
|
||||||
|
|
||||||
|
<!-- Under-development notice (st.info) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Planned features (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul style="font-size:14px;line-height:1.55;color:var(--ink);margin:0 0 0.6rem;padding-left:22px">
|
||||||
|
<li>Import multiple CSV/Excel files at once</li>
|
||||||
|
<li>Automatic schema alignment (matching columns by name)</li>
|
||||||
|
<li>Append mode: stack files vertically (union)</li>
|
||||||
|
<li>Join mode: merge files on shared key columns</li>
|
||||||
|
<li>Handle mismatched columns (fill missing with nulls or drop)</li>
|
||||||
|
<li>Source file tracking column</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Multi-file upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel files</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop files here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS · multiple files allowed</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">Import multiple files to preview. Processing is not yet available.</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options (all disabled) -->
|
||||||
|
<h3>Merge Strategy</h3>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Mode</label>
|
||||||
|
<div class="dt-select" style="color:var(--ink-tertiary);background-color:var(--surface-hover)">Append (stack vertically)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Mismatched columns</label>
|
||||||
|
<div class="dt-select" style="color:var(--ink-tertiary);background-color:var(--surface-hover)">Fill with null</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-check on" style="opacity:0.6">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span> Add source filename column
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Merge Files</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
93
layout-review/08_validator_reporter.html
Normal file
93
layout-review/08_validator_reporter.html
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Quality Check</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="08_validator_reporter">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Quality Check</strong>, a Coming-Soon tool. The page is a stub: an "under development" notice, a feature list, a working file uploader, and disabled placeholder controls. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Quality Check</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Check your file against rules you set, and export a PDF or Excel report.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Under-development notice (st.info) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Features list (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li>Column-level validation rules (not null, unique, regex pattern, range, enum)</li>
|
||||||
|
<li>Cross-column validation (e.g., start_date &lt; end_date)</li>
|
||||||
|
<li>Data quality score per column and overall</li>
|
||||||
|
<li>Generate PDF quality report</li>
|
||||||
|
<li>Generate Excel report with flagged rows highlighted</li>
|
||||||
|
<li>Summary dashboard: pass/fail counts, severity breakdown</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- File upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Import a file to preview. Processing is not yet available.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options -->
|
||||||
|
<h3>Validation Rules</h3>
|
||||||
|
|
||||||
|
<label class="dt-label">Load rules file (JSON)</label>
|
||||||
|
<div class="dt-uploader" style="opacity:0.55">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">JSON</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn is-disabled" disabled>Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Quick checks</label>
|
||||||
|
<div class="dt-multiselect" style="opacity:0.55">
|
||||||
|
<span class="dt-ms-placeholder">Choose options</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Report Format</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:320px">
|
||||||
|
<label class="dt-label">Output format</label>
|
||||||
|
<div class="dt-select" style="opacity:0.55">Excel (flagged rows)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Validate &amp; Generate Report</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
231
layout-review/09_pipeline_runner.html
Normal file
231
layout-review/09_pipeline_runner.html
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Automated Workflows</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="09_pipeline_runner">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Automated Workflows</strong> (Pipeline Runner), shown with a file imported, a four-step pipeline configured, and a completed run (results + per-step summary). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Automated Workflows</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Run several tools in a row — save the steps once, reuse them anytime.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding &amp; delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">customers_export.csv</span>
|
||||||
|
<span class="size">2.1 MB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td> Jane Doe </td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>jane doe</td><td>JANE@ACME.IO</td><td>austin</td><td>(512) 555-0190</td><td>01/04/2024</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720.555.7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R. Smith</td><td>bob@globex.com</td><td>—</td><td>720-555-7781</td><td>Feb 11 2024</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options: pipeline builder (collapsed once a result exists; opened here to show structure) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<!-- Mode radio -->
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">How would you like to define the pipeline?</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column;gap:9px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> Use the recommended default (text-clean → format → missing → dedup)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Build interactively</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Import a saved pipeline JSON</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption" style="margin:10px 0">
|
||||||
|
Edit the table to add, remove, reorder (drag the row index), enable, or configure each step.
|
||||||
|
Tool order is recommended, not enforced — violations surface as warnings below the table.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<!-- Pipeline editor (st.data_editor: Tool selectbox · Enabled checkbox · Options JSON) -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="idx"></th>
|
||||||
|
<th>Tool</th>
|
||||||
|
<th>Enabled</th>
|
||||||
|
<th>Options (JSON)</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 0</td>
|
||||||
|
<td>text_clean <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"trim": true, "collapse_whitespace": true}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 1</td>
|
||||||
|
<td>format_standardize <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"column_types": {"phone": "phone", "signup_date": "date"}}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 2</td>
|
||||||
|
<td>missing <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"strategy": "flag", "sentinels": ["N/A", "—"]}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 3</td>
|
||||||
|
<td>dedup <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"survivor_rule": "most_complete", "merge": true}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx" style="color:var(--ink-tertiary)">+</td>
|
||||||
|
<td colspan="3" style="color:var(--ink-tertiary);font-family:var(--font-sans)">Add row</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Validation: pipeline is in recommended order, so no warning shown (warning block omitted) -->
|
||||||
|
|
||||||
|
<!-- Nested explainer expander -->
|
||||||
|
<details class="dt-expander" open style="margin-top:14px">
|
||||||
|
<summary>Recommended tool order — why each step belongs where it does</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p><strong>text_clean</strong> before <strong>format_standardize</strong> — format parsers (phone / currency / date) fail on smart-quote-contaminated or NBSP-padded input — clean text first</p>
|
||||||
|
<p><strong>text_clean</strong> before <strong>missing</strong> — sentinel detection misses cells padded with NBSP / zero-width characters — clean text first</p>
|
||||||
|
<p><strong>text_clean</strong> before <strong>dedup</strong> — fuzzy matching treats NBSP-padded values as different — clean text first</p>
|
||||||
|
<p><strong>format_standardize</strong> before <strong>missing</strong> — numeric imputation needs numeric dtypes; canonical phones / currencies improve sentinel detection</p>
|
||||||
|
<p><strong>format_standardize</strong> before <strong>dedup</strong> — canonical phones / lowercase emails enable cross-format duplicate matching</p>
|
||||||
|
<p style="margin-bottom:0"><strong>missing</strong> before <strong>dedup</strong> — deduping rows with mixed NaN sentinels produces brittle merges — resolve missing values first</p>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Run -->
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Run Pipeline</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Initial rows</div><div class="value">18,442</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Final rows</div><div class="value">18,130</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Steps run</div><div class="value">4</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Elapsed</div><div class="value">1.84 s</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Per-step summary</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr><th>step</th><th>status</th><th>elapsed_ms</th><th>summary</th><th>error</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>text_clean</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>214</td>
|
||||||
|
<td>{"cells_changed": 1204, "columns": ["name", "city"]}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>format_standardize</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>388</td>
|
||||||
|
<td>{"phone": 18301, "signup_date": 17996}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>missing</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>121</td>
|
||||||
|
<td>{"flagged_cells": 642, "sentinels_found": ["—"]}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>dedup</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>911</td>
|
||||||
|
<td>{"input_rows": 18442, "output_rows": 18130, "duplicates_removed": 312, "groups": 147}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Output preview (first 10 rows)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td class="dt-cell-add">+1 512-555-0190</td><td class="dt-cell-add">2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td class="dt-cell-add">+1 720-555-7781</td><td class="dt-cell-add">2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.co</td><td>Phoenix</td><td class="dt-cell-add">+1 480-555-3320</td><td class="dt-cell-add">2024-03-02</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dan Okafor</td><td>dan@umbrella.net</td><td><span class="dt-cell-flag">⚑ missing</span></td><td class="dt-cell-add">+1 206-555-7745</td><td class="dt-cell-add">2024-03-18</td></tr>
|
||||||
|
<tr><td class="idx">4</td><td>Emily Tran</td><td>emily@hooli.com</td><td>Seattle</td><td class="dt-cell-add">+1 206-555-1182</td><td class="dt-cell-add">2024-04-05</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary"><span class="dt-mi">download</span> Download cleaned CSV</button>
|
||||||
|
<button class="dt-btn"><span class="dt-mi">download</span> Download pipeline JSON</button>
|
||||||
|
<button class="dt-btn"><span class="dt-mi">download</span> Download run audit</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
189
layout-review/10_pdf_extractor.html
Normal file
189
layout-review/10_pdf_extractor.html
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — PDF to CSV</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="10_pdf_extractor">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>PDF to CSV</strong>, shown with two bank-statement PDFs imported and a completed scan (candidate transactions in the editable preview table). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>PDF to CSV</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Pull transactions out of bank-statement PDFs into a clean CSV file.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Scan options expander (collapsed by default) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Scan options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-check on">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span>
|
||||||
|
Treat (4.50) as negative
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span>
|
||||||
|
Use OCR for scanned pages
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-help-text" style="margin:0 0 10px">OCR status: ready (bundled Tesseract). Most modern bank PDFs are text-based and don't need OCR — only enable for image-based scans.</p>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Output date format</label>
|
||||||
|
<div class="dt-select">YYYY-MM-DD (2026-01-13)</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Override year for short dates (optional)</label>
|
||||||
|
<input class="dt-input" type="text" placeholder="" value="" disabled>
|
||||||
|
<div class="dt-help-text">Leave blank for automatic (statement period → filename year → this override).</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Files section head -->
|
||||||
|
<div class="dt-files-section-head">
|
||||||
|
<h2>Files</h2>
|
||||||
|
<span class="dt-section-meta">2 files · 318.4 KB total</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Files card (Home-style bordered list + Add more files) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:0">
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove statement-jan-2026.pdf">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">statement-jan-2026.pdf</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">171.2 KB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove statement-feb-2026.pdf">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">statement-feb-2026.pdf</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">147.2 KB</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-file-add">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M12 5v14M5 12h14"/></svg> Add more files
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Action buttons -->
|
||||||
|
<div class="dt-btn-row" style="margin-top:16px;max-width:340px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Scan</button>
|
||||||
|
<button class="dt-btn">Clear all files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Warnings expander (collapsed) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Warnings (1)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-alert warn">
|
||||||
|
<span class="dt-mi">warning</span>
|
||||||
|
<span>[statement-feb-2026.pdf] 2 lines matched a date but no amount — skipped (likely a wrapped description). Check the source if a transaction looks missing.</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h4>47 candidate transaction(s) from 2 file(s)</h4>
|
||||||
|
<p class="dt-caption">Uncheck rows to exclude. Edit any cell to fix a value the scanner got wrong. The <code>raw</code> column shows the original PDF text for that row.</p>
|
||||||
|
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Include</th>
|
||||||
|
<th>date</th>
|
||||||
|
<th>description</th>
|
||||||
|
<th>amount_debit</th>
|
||||||
|
<th>amount_credit</th>
|
||||||
|
<th>account_number</th>
|
||||||
|
<th>source_file</th>
|
||||||
|
<th>page</th>
|
||||||
|
<th>raw</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-03</td><td>OPENING BALANCE</td><td></td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/03 OPENING BALANCE 2,140.55</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-05</td><td>POS PURCHASE WHOLE FOODS MKT</td><td>84.12</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/05 POS PURCHASE WHOLE FOODS MKT (84.12)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-08</td><td>ACH DEPOSIT PAYROLL ACME CORP</td><td></td><td>3,250.00</td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/08 ACH DEPOSIT PAYROLL ACME CORP 3,250.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-11</td><td>ONLINE TRANSFER TO SAVINGS</td><td>500.00</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/11 ONLINE TRANSFER TO SAVINGS (500.00)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check" style="margin:0"><span class="box"></span></span></td>
|
||||||
|
<td class="dt-cell-flag">2026-01-12</td><td class="dt-cell-flag">INTEREST RATE 0.50% APY DETAIL</td><td></td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/12 INTEREST RATE 0.50% APY 0.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-14</td><td>DEBIT CARD SHELL OIL #2287</td><td>52.40</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/14 DEBIT CARD SHELL OIL #2287 (52.40)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-02</td><td>POS PURCHASE TRADER JOES #511</td><td>61.88</td><td></td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">1</td><td>02/02 POS PURCHASE TRADER JOES #511 (61.88)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-06</td><td>ACH DEPOSIT PAYROLL ACME CORP</td><td></td><td>3,250.00</td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">2</td><td>02/06 ACH DEPOSIT PAYROLL ACME CORP 3,250.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-09</td><td>CHECK #1043</td><td>1,200.00</td><td></td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">2</td><td>02/09 CHECK #1043 (1,200.00)</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Download row: download button (left) + columns multiselect (right) -->
|
||||||
|
<div class="dt-row" style="margin-top:14px;align-items:flex-start">
|
||||||
|
<div style="flex:2">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Download 46 rows as CSV</button>
|
||||||
|
<p class="dt-caption" style="margin-top:8px">46 of 47 rows selected.</p>
|
||||||
|
</div>
|
||||||
|
<div style="flex:3">
|
||||||
|
<div class="dt-field" style="margin:0">
|
||||||
|
<label class="dt-label">Columns to include in CSV</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">date <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">description <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">amount_debit <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">amount_credit <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">account_number <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">source_file <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text"><code>page</code> and <code>raw</code> are left out by default; add them to the selection above if you want them in the file.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
251
layout-review/11_reconciler.html
Normal file
251
layout-review/11_reconciler.html
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Reconcile Two Files</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="11_reconciler">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Reconcile Two Files</strong>, shown with both files imported, key columns mapped, and a completed reconciliation (matched / review / unmatched results). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Reconcile Two Files</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Compare two lists of transactions (e.g. bank vs. ledger) and flag what doesn't match.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Side-by-side upload (st.columns(2) → two _side_panel) -->
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<!-- Left side -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Left (e.g. bank feed)</h4>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">bank_feed_may.csv</span>
|
||||||
|
<span class="size">214 KB</span>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption" style="margin-top:6px"><code>bank_feed_may.csv</code> — 1,204 rows, 4 columns</p>
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview left (e.g. bank feed)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>posted_date</th><th>description</th><th>amount</th><th>ref</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>ACME SUPPLIES</td><td>-1240.00</td><td>CHK1041</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>PAYROLL RUN</td><td>-8800.00</td><td>ACH5520</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>CLIENT GLOBEX</td><td>5200.00</td><td>DEP0090</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>UTILITY CO</td><td>-318.42</td><td>CHK1042</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
<!-- Right side -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Right (e.g. ledger)</h4>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">ledger_may.xlsx</span>
|
||||||
|
<span class="size">96 KB</span>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption" style="margin-top:6px"><code>ledger_may.xlsx</code> — 1,198 rows, 5 columns</p>
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview right (e.g. ledger)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>txn_date</th><th>memo</th><th>value</th><th>invoice_no</th><th>account</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>Acme Supplies Inc</td><td>-1240.00</td><td>INV-1041</td><td>5000</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>Monthly payroll</td><td>-8800.00</td><td>INV-5520</td><td>6000</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>Globex retainer</td><td>5200.00</td><td>INV-0090</td><td>4000</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>City Utilities</td><td>-318.40</td><td>INV-1042</td><td>6100</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Match settings -->
|
||||||
|
<h2>Match settings</h2>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<!-- Left pickers (file order: posted_date, description, amount → date, desc, amount) -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Left columns</h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date column (optional)</label><div class="dt-select">posted_date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description column (optional)</label><div class="dt-select">description</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount column</label><div class="dt-select">amount</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Reference columns (optional, e.g. check / invoice no.)</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">ref <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
<!-- Right pickers (file order: txn_date, memo, value → date, desc, amount) -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Right columns</h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date column (optional)</label><div class="dt-select">txn_date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description column (optional)</label><div class="dt-select">memo</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount column</label><div class="dt-select">value</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Reference columns (must match left count)</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">invoice_no <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Tolerances & options (expanded=True) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Tolerances & options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount tolerance</label>
|
||||||
|
<div class="dt-input">0.0200</div>
|
||||||
|
<div class="dt-help-text">Absolute tolerance on amount (e.g. 0.01 to absorb cent rounding).</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date tolerance (days)</label>
|
||||||
|
<div class="dt-input">1</div>
|
||||||
|
<div class="dt-help-text">Allow N calendar days of drift between posting dates.</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Invert right amount sign</label>
|
||||||
|
<div class="dt-check" style="margin-top:8px"><span class="box"></span> Invert right amount sign</div>
|
||||||
|
<div class="dt-help-text">Use when one side records debits as positive and the other as negative.</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description similarity boost (0 disables)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:80%"></div><div class="knob" style="left:80%"></div></div><div class="val">80</div></div>
|
||||||
|
<div class="dt-help-text">When both sides have a description column set, accept matches with this minimum fuzzy similarity even if amount/date are merely within tolerance. Lower = more permissive.</div></div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Reconcile</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Matched</div><div class="value">1,173</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Review</div><div class="value">9</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unmatched left</div><div class="value">22</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unmatched right</div><div class="value">16</div></div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Coverage: 97.4% of the larger side</p>
|
||||||
|
|
||||||
|
<!-- Tabs (st.tabs) — Matched active -->
|
||||||
|
<div class="dt-tabs">
|
||||||
|
<span class="dt-tab is-active">Matched (1,173)</span>
|
||||||
|
<span class="dt-tab">Review (9)</span>
|
||||||
|
<span class="dt-tab">Unmatched left (22)</span>
|
||||||
|
<span class="dt-tab">Unmatched right (16)</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Matched tab content -->
|
||||||
|
<p class="dt-caption">Preview of first 25 of 1,173 rows — download the CSV below for the full set.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr>
|
||||||
|
<th>left_posted_date</th><th>left_description</th><th>left_amount</th>
|
||||||
|
<th>right_txn_date</th><th>right_memo</th><th>right_value</th><th>amount_diff</th>
|
||||||
|
</tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>ACME SUPPLIES</td><td>-1240.00</td><td>2026-05-01</td><td>Acme Supplies Inc</td><td>-1240.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>PAYROLL RUN</td><td>-8800.00</td><td>2026-05-02</td><td>Monthly payroll</td><td>-8800.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>CLIENT GLOBEX</td><td>5200.00</td><td>2026-05-03</td><td>Globex retainer</td><td>5200.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>UTILITY CO</td><td>-318.42</td><td>2026-05-04</td><td>City Utilities</td><td>-318.40</td><td class="dt-cell-flag">0.02</td></tr>
|
||||||
|
<tr><td>2026-05-06</td><td>OFFICE DEPOT</td><td>-89.15</td><td>2026-05-07</td><td>Office supplies</td><td>-89.15</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Other tab previews shown as collapsed expanders for review context -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Review (9) — ambiguous candidates</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Pairs flagged because the algorithm couldn't pick a single best match (e.g. multiple equally good candidates). Use the left/right indices to disambiguate manually.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>left_idx</th><th>left_amount</th><th>right_idx</th><th>right_value</th><th>candidates</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>118</td><td>-450.00</td><td>121, 209</td><td>-450.00</td><td class="dt-cell-flag">2 equal</td></tr>
|
||||||
|
<tr><td>203</td><td>1000.00</td><td>198, 244</td><td>1000.00</td><td class="dt-cell-flag">2 equal</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Unmatched left (22) — only in bank_feed_may.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Showing all 22 rows.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>posted_date</th><th>description</th><th>amount</th><th>ref</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-09</td><td class="dt-cell-del">BANK FEE</td><td class="dt-cell-del">-12.00</td><td class="dt-cell-del">FEE0001</td></tr>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-14</td><td class="dt-cell-del">ATM WITHDRAWAL</td><td class="dt-cell-del">-200.00</td><td class="dt-cell-del">ATM7781</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Unmatched right (16) — only in ledger_may.xlsx</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Showing all 16 rows.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>txn_date</th><th>memo</th><th>value</th><th>invoice_no</th><th>account</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-11</td><td class="dt-cell-del">Accrued interest</td><td class="dt-cell-del">37.50</td><td class="dt-cell-del">INV-9001</td><td class="dt-cell-del">7000</td></tr>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-22</td><td class="dt-cell-del">Depreciation</td><td class="dt-cell-del">-410.00</td><td class="dt-cell-del">INV-9044</td><td class="dt-cell-del">8000</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (st.columns(4) of html_download_button) -->
|
||||||
|
<div class="dt-btn-row">
|
||||||
|
<button class="dt-btn dt-btn-primary">Matched CSV</button>
|
||||||
|
<button class="dt-btn">Review CSV</button>
|
||||||
|
<button class="dt-btn">Unmatched left</button>
|
||||||
|
<button class="dt-btn">Unmatched right</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
473
layout-review/app.css
Normal file
473
layout-review/app.css
Normal file
@@ -0,0 +1,473 @@
|
|||||||
|
/* ===========================================================================
|
||||||
|
DataTools — static layout-review stylesheet
|
||||||
|
---------------------------------------------------------------------------
|
||||||
|
Faithful reproduction of the live Streamlit app's design system for human
|
||||||
|
review of page layouts. Tokens are copied verbatim from src/gui/theme.py
|
||||||
|
(§3 color + type scale) and the component values from
|
||||||
|
src/gui/components/_legacy.py:_DESIGN_TOKENS_CSS.
|
||||||
|
|
||||||
|
The live app applies these styles to Streamlit's data-testid DOM; here we
|
||||||
|
re-express the same look against clean semantic classes so the static HTML
|
||||||
|
stays readable. Where the app uses real .dt-* classes (page header, files
|
||||||
|
card, findings, stats) the class names are kept identical.
|
||||||
|
=========================================================================== */
|
||||||
|
|
||||||
|
@import url("https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600;700&family=Geist+Mono:wght@400;500&display=swap");
|
||||||
|
@import url("https://fonts.googleapis.com/css2?family=Material+Symbols+Outlined:opsz,wght,FILL,GRAD@20..48,400,0,0&display=block");
|
||||||
|
|
||||||
|
:root {
|
||||||
|
--font-sans: "Geist", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||||
|
--font-mono: "Geist Mono", ui-monospace, "SF Mono", Menlo, monospace;
|
||||||
|
|
||||||
|
--ink: #1c1917;
|
||||||
|
--ink-secondary: #57534e;
|
||||||
|
--ink-tertiary: #a8a29e;
|
||||||
|
--bg: #fafaf7;
|
||||||
|
--surface: #ffffff;
|
||||||
|
--surface-hover: #f8f7f3;
|
||||||
|
--border: #e7e5dc;
|
||||||
|
--border-strong: #d6d3c7;
|
||||||
|
--accent: #c2410c;
|
||||||
|
--accent-hover: #9a3412;
|
||||||
|
--accent-fill: #fef4ed;
|
||||||
|
--accent-fill-strong: #fde4d3;
|
||||||
|
|
||||||
|
--warn: #b45309;
|
||||||
|
--warn-fill: #fef3c7;
|
||||||
|
--info: #0369a1;
|
||||||
|
--info-fill: #e0f2fe;
|
||||||
|
--success: #15803d;
|
||||||
|
--success-fill: #dcfce7;
|
||||||
|
--danger: #b91c1c;
|
||||||
|
--danger-fill: #fee2e2;
|
||||||
|
|
||||||
|
--r-sm: 6px;
|
||||||
|
--r-md: 10px;
|
||||||
|
--r-lg: 14px;
|
||||||
|
|
||||||
|
--sidebar-w: 264px;
|
||||||
|
}
|
||||||
|
|
||||||
|
* { box-sizing: border-box; }
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
background: var(--bg);
|
||||||
|
color: var(--ink);
|
||||||
|
font-family: var(--font-sans);
|
||||||
|
font-feature-settings: "ss01", "cv01", "cv11";
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- Type scale (theme.py §4) ---------- */
|
||||||
|
h1 { font-size: 32px; font-weight: 600; letter-spacing: -0.035em; line-height: 1.1; margin: 0 0 4px; }
|
||||||
|
h2 { font-size: 22px; font-weight: 600; letter-spacing: -0.025em; line-height: 1.2; margin: 1.5rem 0 0.75rem; }
|
||||||
|
h3 { font-size: 18px; font-weight: 500; letter-spacing: -0.018em; line-height: 1.25; margin: 1.25rem 0 0.5rem; }
|
||||||
|
h4 { font-size: 15px; font-weight: 500; letter-spacing: -0.012em; line-height: 1.35; margin: 1rem 0 0.5rem; }
|
||||||
|
p { font-size: 14px; font-weight: 400; line-height: 1.55; color: var(--ink); margin: 0 0 0.6rem; }
|
||||||
|
strong { font-weight: 500; color: var(--ink); }
|
||||||
|
a { color: var(--accent); text-decoration: none; }
|
||||||
|
a:hover { color: var(--accent-hover); text-decoration: underline; }
|
||||||
|
code, .dt-mono { font-family: var(--font-mono); font-size: 0.92em; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
App frame — sidebar + main + sticky footer
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-app { display: flex; min-height: 100vh; }
|
||||||
|
|
||||||
|
/* ---------- Sidebar (cream paper) ---------- */
|
||||||
|
.dt-sidebar {
|
||||||
|
width: var(--sidebar-w);
|
||||||
|
flex-shrink: 0;
|
||||||
|
background: #f5f4ef;
|
||||||
|
border-right: 1px solid var(--border);
|
||||||
|
padding: 18px 14px 90px;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
align-self: flex-start;
|
||||||
|
height: 100vh;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.dt-brand { display: flex; align-items: center; gap: 10px; padding: 0 4px 18px; }
|
||||||
|
.dt-brand-mark {
|
||||||
|
width: 28px; height: 28px; border-radius: 7px;
|
||||||
|
background: var(--ink); color: var(--accent-fill);
|
||||||
|
display: inline-flex; align-items: center; justify-content: center;
|
||||||
|
font-weight: 700; font-size: 16px; letter-spacing: -0.04em; line-height: 1; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-brand-name { display: flex; flex-direction: column; gap: 1px; line-height: 1.05; }
|
||||||
|
.dt-brand-eyebrow {
|
||||||
|
font-size: 9.5px; font-weight: 600; letter-spacing: 0.14em;
|
||||||
|
text-transform: uppercase; color: var(--ink-tertiary); line-height: 1;
|
||||||
|
}
|
||||||
|
.dt-brand-word { font-weight: 600; font-size: 15px; letter-spacing: -0.02em; color: var(--ink); }
|
||||||
|
|
||||||
|
.dt-nav { display: flex; flex-direction: column; }
|
||||||
|
.dt-nav-section {
|
||||||
|
font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em;
|
||||||
|
color: var(--ink-tertiary); font-weight: 500;
|
||||||
|
padding: 14px 10px 4px; margin: 0;
|
||||||
|
display: flex; align-items: center; justify-content: space-between;
|
||||||
|
}
|
||||||
|
.dt-nav-section .dt-nav-indicator { font-size: 16px; color: var(--ink-tertiary); }
|
||||||
|
.dt-nav-link {
|
||||||
|
display: flex; align-items: center; gap: 8px;
|
||||||
|
color: var(--ink-secondary); font-size: 13px; font-weight: 500; line-height: 1.3;
|
||||||
|
padding: 5px 10px; border-radius: var(--r-sm); margin-bottom: 1px;
|
||||||
|
text-decoration: none; transition: background 0.12s ease, color 0.12s ease;
|
||||||
|
}
|
||||||
|
.dt-nav-link:hover { background: rgba(0,0,0,0.04); color: var(--ink); text-decoration: none; }
|
||||||
|
.dt-nav-link.is-active { background: rgba(0,0,0,0.04); color: var(--ink); font-weight: 600; }
|
||||||
|
.dt-nav-link .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; color: var(--ink-secondary); line-height: 1; }
|
||||||
|
.dt-nav-link.is-active .dt-mi { color: var(--ink); }
|
||||||
|
.dt-nav-link.is-soon { opacity: 0.55; }
|
||||||
|
.dt-nav-soon-tag {
|
||||||
|
margin-left: auto; font-size: 9px; font-weight: 600; letter-spacing: 0.06em;
|
||||||
|
text-transform: uppercase; color: var(--ink-tertiary);
|
||||||
|
border: 1px solid var(--border-strong); border-radius: 999px; padding: 1px 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dt-sidebar-foot { margin-top: 22px; padding-top: 16px; border-top: 1px solid var(--border); display: flex; flex-direction: column; gap: 10px; }
|
||||||
|
.dt-sidebar-label { font-size: 11.5px; font-weight: 500; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); margin-bottom: 4px; }
|
||||||
|
.dt-license-badge { font-size: 12.5px; color: var(--ink-secondary); }
|
||||||
|
|
||||||
|
/* ---------- Main column ---------- */
|
||||||
|
.dt-main { flex: 1; min-width: 0; padding: 40px 56px 96px; }
|
||||||
|
.dt-main-inner { max-width: 920px; margin: 0 auto; }
|
||||||
|
|
||||||
|
/* Review banner above every mockup */
|
||||||
|
.dt-review-banner {
|
||||||
|
max-width: 920px; margin: 0 auto 20px; display: flex; gap: 10px; align-items: center;
|
||||||
|
background: var(--info-fill); color: var(--info);
|
||||||
|
border: 1px solid transparent; border-radius: var(--r-md);
|
||||||
|
padding: 8px 14px; font-size: 12.5px; line-height: 1.4;
|
||||||
|
}
|
||||||
|
.dt-review-banner a { color: var(--info); text-decoration: underline; }
|
||||||
|
.dt-review-banner .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
|
||||||
|
/* ---------- Sticky footer ---------- */
|
||||||
|
.dt-footer {
|
||||||
|
position: fixed; bottom: 0; left: var(--sidebar-w); right: 0;
|
||||||
|
background: rgba(255,255,255,0.97); backdrop-filter: blur(8px);
|
||||||
|
border-top: 1px solid var(--border-strong);
|
||||||
|
padding: 8px 20px; z-index: 50;
|
||||||
|
display: flex; align-items: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-footer-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 8px;
|
||||||
|
color: var(--ink-secondary); font-size: 13px; font-weight: 500; line-height: 1.3;
|
||||||
|
padding: 5px 10px; border-radius: var(--r-sm);
|
||||||
|
background: transparent; border: none; cursor: pointer; text-decoration: none;
|
||||||
|
}
|
||||||
|
.dt-footer-btn:hover { background: rgba(0,0,0,0.04); color: var(--ink); text-decoration: none; }
|
||||||
|
.dt-footer-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 16px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Page header (brand + privacy pill) — .dt-page-* mirror the live app
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-page-header {
|
||||||
|
display: flex; align-items: center; justify-content: space-between; gap: 24px;
|
||||||
|
margin: 0 0 24px; padding-bottom: 22px; border-bottom: 1px solid var(--border);
|
||||||
|
}
|
||||||
|
.dt-page-brand { display: flex; flex-direction: column; gap: 8px; }
|
||||||
|
.dt-page-brand-row { display: flex; align-items: center; gap: 18px; }
|
||||||
|
.dt-page-brand-mark {
|
||||||
|
width: 56px; height: 56px; border-radius: 14px; background: var(--ink);
|
||||||
|
color: var(--accent-fill); display: inline-flex; align-items: center; justify-content: center;
|
||||||
|
font-weight: 700; font-size: 32px; letter-spacing: -0.04em; line-height: 1; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-page-brand-words { display: flex; flex-direction: column; gap: 2px; line-height: 1; }
|
||||||
|
.dt-page-eyebrow { font-size: 11.5px; font-weight: 600; letter-spacing: 0.14em; text-transform: uppercase; color: var(--ink-tertiary); line-height: 1.2; }
|
||||||
|
.dt-page-wordmark { margin: 0; font-weight: 600; font-size: 32px; letter-spacing: -0.035em; line-height: 1.1; color: var(--ink); }
|
||||||
|
.dt-page-subtitle { margin: 4px 0 0; color: var(--ink-secondary); font-size: 14px; line-height: 1.5; }
|
||||||
|
.dt-privacy-pill {
|
||||||
|
display: inline-flex; align-items: center; gap: 6px; padding: 6px 11px;
|
||||||
|
background: var(--success-fill); color: var(--success); border-radius: 999px;
|
||||||
|
font-size: 12px; font-weight: 500; white-space: nowrap; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-privacy-pill svg { width: 13px; height: 13px; stroke-width: 2; }
|
||||||
|
|
||||||
|
/* ---------- Tool header (title + Help popover) ---------- */
|
||||||
|
.dt-tool-header { display: flex; align-items: flex-start; justify-content: space-between; gap: 16px; }
|
||||||
|
.dt-tool-header h1 { margin: 0; }
|
||||||
|
.dt-help-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 6px; white-space: nowrap;
|
||||||
|
background: var(--surface); color: var(--ink); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-md); padding: 9px 16px; font-size: 13.5px; font-weight: 500;
|
||||||
|
cursor: pointer; flex-shrink: 0; margin-top: 6px;
|
||||||
|
}
|
||||||
|
.dt-help-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
.dt-tool-caption { font-size: 12.5px; color: var(--ink-tertiary); line-height: 1.5; margin: 2px 0 0; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Buttons
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-btn {
|
||||||
|
border-radius: var(--r-md); font-family: var(--font-sans); font-weight: 500;
|
||||||
|
font-size: 13.5px; letter-spacing: -0.005em; line-height: 1; padding: 9px 16px;
|
||||||
|
border: 1px solid var(--border-strong); background: var(--surface); color: var(--ink);
|
||||||
|
cursor: pointer; transition: background 0.12s ease, border-color 0.12s ease, color 0.12s ease;
|
||||||
|
display: inline-flex; align-items: center; justify-content: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-btn:hover { background: var(--surface-hover); border-color: var(--ink-tertiary); }
|
||||||
|
.dt-btn-primary { background: var(--ink); color: var(--bg); border-color: var(--ink); }
|
||||||
|
.dt-btn-primary:hover { background: #292524; border-color: #292524; color: var(--bg); }
|
||||||
|
.dt-btn-tertiary { background: transparent; border: none; color: var(--ink-tertiary); padding: 4px 8px; }
|
||||||
|
.dt-btn-tertiary:hover { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-btn:disabled, .dt-btn.is-disabled {
|
||||||
|
background: var(--surface-hover); color: var(--ink-tertiary);
|
||||||
|
border: 1px solid var(--border); cursor: not-allowed;
|
||||||
|
}
|
||||||
|
.dt-btn-block { width: 100%; }
|
||||||
|
.dt-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
|
||||||
|
.dt-btn-row { display: flex; gap: 10px; flex-wrap: wrap; }
|
||||||
|
.dt-btn-row > .dt-btn { flex: 1; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
File uploader (cream dropzone)
|
||||||
|
=========================================================================== */
|
||||||
|
/* Dashed dropzone: children are spread across a single centred flex row. */
.dt-uploader {
  display: flex;
  align-items: center;
  justify-content: space-between;
  gap: 16px;
  padding: 22px 20px;
  background: var(--surface-hover);
  border: 1px dashed var(--border-strong);
  border-radius: var(--r-md);
}
|
||||||
|
.dt-uploader-text { display: flex; flex-direction: column; gap: 2px; }
|
||||||
|
.dt-uploader-text .hint { font-size: 14px; color: var(--ink); }
|
||||||
|
.dt-uploader-text .sub { font-size: 12.5px; color: var(--ink-tertiary); }
|
||||||
|
.dt-uploader .dt-mi { font-family: "Material Symbols Outlined"; font-size: 24px; color: var(--ink-tertiary); }
|
||||||
|
|
||||||
|
/* Staged-file chip */
|
||||||
|
.dt-file-chip {
|
||||||
|
display: flex; align-items: center; gap: 12px;
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-sm);
|
||||||
|
padding: 10px 14px; margin-top: 10px;
|
||||||
|
}
|
||||||
|
.dt-file-chip .name { font-family: var(--font-mono); font-size: 13px; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-chip .size { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); margin-left: auto; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Expanders / bordered cards
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-expander {
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg);
|
||||||
|
overflow: hidden; box-shadow: 0 1px 2px rgba(28,25,23,0.03); margin: 10px 0;
|
||||||
|
}
|
||||||
|
.dt-expander > summary, .dt-expander-head {
|
||||||
|
background: var(--surface-hover); border-bottom: 1px solid var(--border);
|
||||||
|
padding: 12px 16px; font-weight: 500; color: var(--ink); font-size: 14px;
|
||||||
|
cursor: pointer; list-style: none; display: flex; align-items: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-expander > summary::-webkit-details-marker { display: none; }
|
||||||
|
/* Disclosure chevron rendered before the summary content. The icon name is
   typed as text and drawn by the Material Symbols font; the [open] rule
   rotates it. */
.dt-expander > summary::before {
  font-family: "Material Symbols Outlined";
  font-size: 20px;
  content: "expand_more";
  color: var(--ink-tertiary);
  transition: transform 0.15s ease;
}
|
||||||
|
.dt-expander[open] > summary::before { transform: rotate(180deg); }
|
||||||
|
/* Padding for expander content. The second selector matches a subset of the
   first; its only effect is higher specificity for direct children of
   .dt-expander. */
.dt-expander-body, .dt-expander > .dt-expander-body { padding: 14px 16px; }
|
||||||
|
.dt-expander:not([open]) > summary { border-bottom: none; }
|
||||||
|
|
||||||
|
.dt-card {
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg);
|
||||||
|
box-shadow: 0 1px 2px rgba(28,25,23,0.03); padding: 16px; margin: 10px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Alerts
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-alert {
|
||||||
|
border-radius: var(--r-md); border: 1px solid transparent;
|
||||||
|
padding: 10px 14px; font-size: 13.5px; line-height: 1.45; margin: 10px 0;
|
||||||
|
display: flex; gap: 10px; align-items: flex-start;
|
||||||
|
}
|
||||||
|
.dt-alert .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; flex-shrink: 0; margin-top: 1px; }
|
||||||
|
.dt-alert.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-alert.success { background: var(--success-fill); color: var(--success); }
|
||||||
|
.dt-alert.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-alert.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-alert code { background: rgba(0,0,0,0.05); padding: 1px 5px; border-radius: 4px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Inputs (static representations of Streamlit widgets)
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-field { margin: 10px 0; }
|
||||||
|
.dt-label { font-size: 13px; font-weight: 500; color: var(--ink); margin-bottom: 5px; display: block; }
|
||||||
|
.dt-label .req { color: var(--accent); }
|
||||||
|
.dt-input, .dt-select, .dt-textarea {
|
||||||
|
width: 100%; background: var(--surface); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-sm); padding: 8px 11px; font-family: var(--font-sans);
|
||||||
|
font-size: 13.5px; color: var(--ink);
|
||||||
|
}
|
||||||
|
/* Select box with a CSS-drawn caret (two 5px gradient triangles near the right
   edge). Vendor-prefixed `appearance` is included for engines that only honour
   the prefixed form, and the right padding reserves room so long values do not
   run underneath the caret. */
.dt-select {
  -webkit-appearance: none;
  -moz-appearance: none;
  appearance: none;
  padding-right: 30px;
  background-image: linear-gradient(45deg, transparent 50%, var(--ink-tertiary) 50%), linear-gradient(135deg, var(--ink-tertiary) 50%, transparent 50%);
  background-position: calc(100% - 16px) 14px, calc(100% - 11px) 14px;
  background-size: 5px 5px, 5px 5px;
  background-repeat: no-repeat;
}
|
||||||
|
.dt-textarea { min-height: 76px; resize: vertical; font-family: var(--font-mono); font-size: 13px; }
|
||||||
|
.dt-help-text { font-size: 12px; color: var(--ink-tertiary); margin-top: 4px; }
|
||||||
|
|
||||||
|
/* Multiselect — chips inside a box */
|
||||||
|
.dt-multiselect {
|
||||||
|
width: 100%; background: var(--surface); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-sm); padding: 6px 8px; min-height: 38px;
|
||||||
|
display: flex; flex-wrap: wrap; gap: 6px; align-items: center;
|
||||||
|
}
|
||||||
|
.dt-ms-chip {
|
||||||
|
display: inline-flex; align-items: center; gap: 5px; background: var(--accent-fill);
|
||||||
|
color: var(--accent-hover); border-radius: var(--r-sm); padding: 3px 8px;
|
||||||
|
font-size: 12.5px; font-weight: 500;
|
||||||
|
}
|
||||||
|
.dt-ms-chip .x { color: var(--accent); font-size: 13px; }
|
||||||
|
.dt-ms-placeholder { color: var(--ink-tertiary); font-size: 13px; padding: 2px 4px; }
|
||||||
|
|
||||||
|
/* Checkbox / radio */
|
||||||
|
.dt-check { display: flex; align-items: center; gap: 9px; margin: 8px 0; font-size: 13.5px; color: var(--ink); }
|
||||||
|
.dt-check .box {
|
||||||
|
width: 18px; height: 18px; border-radius: 5px; border: 1px solid var(--border-strong);
|
||||||
|
background: var(--surface); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-check.on .box { background: var(--ink); border-color: var(--ink); color: var(--bg); }
|
||||||
|
.dt-check.on .box .dt-mi { font-family: "Material Symbols Outlined"; font-size: 14px; }
|
||||||
|
.dt-radio-row { display: flex; gap: 18px; flex-wrap: wrap; margin: 8px 0; }
|
||||||
|
.dt-radio { display: inline-flex; align-items: center; gap: 7px; font-size: 13.5px; }
|
||||||
|
/* Radio dot. box-sizing is pinned to border-box so the dot stays 16px in both
   states: the `.dt-radio.on .dot` rule thickens the border from 1px to 5px,
   which would otherwise enlarge the element.
   NOTE(review): a global box-sizing reset may already cover this; it is not
   visible in this part of the stylesheet. */
.dt-radio .dot { box-sizing: border-box; width: 16px; height: 16px; border-radius: 50%; border: 1px solid var(--border-strong); display: inline-block; flex-shrink: 0; }
|
||||||
|
.dt-radio.on .dot { border: 5px solid var(--ink); }
|
||||||
|
|
||||||
|
/* Slider */
|
||||||
|
.dt-slider { margin: 14px 0 6px; }
|
||||||
|
.dt-slider .track { position: relative; height: 4px; background: var(--border-strong); border-radius: 2px; }
|
||||||
|
.dt-slider .fill { position: absolute; left: 0; top: 0; height: 4px; background: var(--ink); border-radius: 2px; }
|
||||||
|
.dt-slider .knob { position: absolute; top: 50%; width: 16px; height: 16px; border-radius: 50%; background: var(--ink); transform: translate(-50%, -50%); }
|
||||||
|
.dt-slider .val { font-family: var(--font-mono); font-size: 12px; color: var(--ink-secondary); margin-top: 8px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Layout helpers
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-row { display: flex; gap: 16px; }
|
||||||
|
.dt-row > * { flex: 1; min-width: 0; }
|
||||||
|
.dt-cols-2 { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
|
.dt-cols-3 { display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; }
|
||||||
|
.dt-divider { border: none; border-top: 1px solid var(--border); margin: 22px 0; }
|
||||||
|
.dt-caption { font-size: 12.5px; color: var(--ink-tertiary); line-height: 1.5; }
|
||||||
|
.dt-spacer { height: 12px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
DataFrame / preview table
|
||||||
|
=========================================================================== */
|
||||||
|
/* Rounded frame around a preview table. Cells are white-space: nowrap, so a
   wide table can exceed the frame; horizontal overflow scrolls instead of being
   clipped, while the frame still clips to its rounded corners. */
.dt-table-wrap { border: 1px solid var(--border); border-radius: var(--r-md); overflow-x: auto; overflow-y: hidden; margin: 8px 0; }
|
||||||
|
table.dt-table { width: 100%; border-collapse: collapse; font-size: 13px; }
|
||||||
|
table.dt-table th {
|
||||||
|
background: var(--surface-hover); color: var(--ink-secondary); font-weight: 500;
|
||||||
|
text-align: left; padding: 8px 12px; border-bottom: 1px solid var(--border);
|
||||||
|
font-size: 12px; text-transform: none; white-space: nowrap;
|
||||||
|
}
|
||||||
|
table.dt-table td {
|
||||||
|
padding: 7px 12px; border-bottom: 1px solid var(--border);
|
||||||
|
font-family: var(--font-mono); font-size: 12.5px; color: var(--ink); font-feature-settings: "ss02"; white-space: nowrap;
|
||||||
|
}
|
||||||
|
table.dt-table tr:last-child td { border-bottom: none; }
|
||||||
|
table.dt-table tr:nth-child(even) td { background: #fcfbf8; }
|
||||||
|
table.dt-table td.idx { color: var(--ink-tertiary); background: var(--surface-hover); }
|
||||||
|
.dt-cell-flag { color: var(--warn); }
|
||||||
|
.dt-cell-del { color: var(--danger); text-decoration: line-through; }
|
||||||
|
.dt-cell-add { color: var(--success); }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Stats overview (home) — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-stats { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin: 8px 0 20px; }
|
||||||
|
.dt-stat { background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); padding: 16px 18px; box-shadow: 0 1px 2px rgba(28,25,23,0.03); }
|
||||||
|
.dt-stat-label { font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); font-weight: 500; margin-bottom: 6px; line-height: 1.4; }
|
||||||
|
.dt-stat-value { font-size: 28px; font-weight: 600; letter-spacing: -0.03em; line-height: 1; color: var(--ink); display: flex; align-items: baseline; gap: 6px; }
|
||||||
|
.dt-stat-unit { font-size: 12px; font-weight: 400; color: var(--ink-tertiary); letter-spacing: 0; }
|
||||||
|
.dt-stat.is-warn .dt-stat-value { color: var(--warn); }
|
||||||
|
.dt-stat.is-info .dt-stat-value { color: var(--info); }
|
||||||
|
.dt-stat.is-success .dt-stat-value { color: var(--success); }
|
||||||
|
@media (max-width: 900px) { .dt-stats { grid-template-columns: repeat(2, 1fr); } }
|
||||||
|
|
||||||
|
/* Metric (st.metric) */
|
||||||
|
.dt-metrics { display: flex; gap: 28px; flex-wrap: wrap; margin: 6px 0 14px; }
|
||||||
|
.dt-metric .label { font-size: 12.5px; color: var(--ink-tertiary); margin-bottom: 4px; }
|
||||||
|
.dt-metric .value { font-size: 26px; font-weight: 600; letter-spacing: -0.03em; color: var(--ink); line-height: 1; }
|
||||||
|
.dt-metric .delta { font-size: 12.5px; margin-top: 3px; }
|
||||||
|
.dt-metric .delta.up { color: var(--success); }
|
||||||
|
.dt-metric .delta.down { color: var(--danger); }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Files card (home) — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-files-section-head { display: flex; align-items: baseline; justify-content: space-between; margin: 4px 0 10px; gap: 12px; }
|
||||||
|
.dt-files-section-head h2 { margin: 0; }
|
||||||
|
.dt-section-meta { font-size: 12.5px; color: var(--ink-tertiary); }
|
||||||
|
.dt-file-row { display: flex; align-items: center; gap: 12px; }
|
||||||
|
.dt-file-icon-chip { width: 28px; height: 28px; border-radius: var(--r-sm); background: var(--accent-fill); color: var(--accent); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.dt-file-icon-chip svg { width: 14px; height: 14px; stroke-width: 1.8; }
|
||||||
|
.dt-file-name { font-family: var(--font-mono); font-size: 13px; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-size { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-add {
|
||||||
|
display: flex; align-items: center; justify-content: center; gap: 8px;
|
||||||
|
width: 100%; padding: 12px 16px; background: var(--surface-hover);
|
||||||
|
border: none; border-top: 1px dashed var(--border-strong);
|
||||||
|
border-radius: 0 0 var(--r-lg) var(--r-lg); cursor: pointer;
|
||||||
|
font-size: 13px; font-weight: 500; color: var(--ink-secondary); margin-top: 14px;
|
||||||
|
}
|
||||||
|
.dt-file-add:hover { background: var(--accent-fill); color: var(--accent); }
|
||||||
|
.dt-file-add svg { width: 14px; height: 14px; stroke-width: 2; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Findings panel — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
/* Header strip of a per-file findings card. The negative margins cancel the
   16px padding of the enclosing .dt-card so the strip reaches the card's top
   and side edges. */
.dt-finding-group-head {
  display: flex;
  align-items: center;
  gap: 12px;
  margin: -16px -16px 1.2rem;
  padding: 16px 22px;
  background: var(--surface-hover);
  border-bottom: 1px solid var(--border);
  border-radius: var(--r-lg) var(--r-lg) 0 0;
  cursor: pointer;
  user-select: none;
}
|
||||||
|
.dt-finding-group-chevron { color: var(--ink-tertiary); font-family: "Material Symbols Outlined"; font-size: 20px; line-height: 1; flex-shrink: 0; }
|
||||||
|
.dt-severity-dot { width: 8px; height: 8px; border-radius: 50%; flex-shrink: 0; display: inline-block; }
|
||||||
|
.dt-severity-dot.warn { background: var(--warn); }
|
||||||
|
.dt-severity-dot.info { background: var(--info); }
|
||||||
|
.dt-severity-dot.error { background: var(--danger); }
|
||||||
|
.dt-severity-dot.success { background: var(--success); }
|
||||||
|
.dt-group-filename { font-family: var(--font-mono); font-size: 13.5px; font-weight: 500; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-group-counts { margin-left: auto; display: flex; align-items: center; gap: 8px; }
|
||||||
|
.dt-count-pill { display: inline-flex; align-items: center; padding: 3px 9px; border-radius: 999px; font-size: 11.5px; font-weight: 500; line-height: 1.4; white-space: nowrap; }
|
||||||
|
.dt-count-pill.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-count-pill.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-count-pill.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-count-pill.success { background: var(--success-fill); color: var(--success); }
|
||||||
|
.dt-finding-row { display: flex; align-items: flex-start; gap: 12px; padding: 12px 0; border-top: 1px solid var(--border); }
|
||||||
|
/* The first finding row has no separator above it. :first-of-type only matches
   when the row is the first <div> among its siblings; when a
   .dt-finding-group-head <div> precedes it (as in the findings cards), the
   adjacent-sibling selector covers that case. */
.dt-finding-row:first-of-type,
.dt-finding-group-head + .dt-finding-row { border-top: none; }
|
||||||
|
.dt-finding-icon { width: 24px; height: 24px; border-radius: var(--r-sm); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.dt-finding-icon.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-finding-icon.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-finding-icon.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-finding-icon .dt-mi { font-family: "Material Symbols Outlined"; font-size: 16px; line-height: 1; }
|
||||||
|
.dt-finding-body { flex: 1; min-width: 0; }
|
||||||
|
.dt-finding-title { font-size: 14px; color: var(--ink); margin: 0 0 2px; line-height: 1.4; letter-spacing: -0.005em; }
|
||||||
|
.dt-finding-title strong { font-weight: 500; }
|
||||||
|
.dt-finding-meta { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); line-height: 1.4; margin: 0; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* Match-group review card (dedup) */
|
||||||
|
.dt-match-card { background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); box-shadow: 0 1px 2px rgba(28,25,23,0.03); margin: 12px 0; overflow: hidden; }
|
||||||
|
.dt-match-head { background: var(--surface-hover); border-bottom: 1px solid var(--border); padding: 12px 16px; display: flex; align-items: center; gap: 12px; }
|
||||||
|
.dt-match-head .title { font-weight: 500; font-size: 14px; }
|
||||||
|
.dt-match-head .conf { margin-left: auto; }
|
||||||
|
.dt-match-body { padding: 14px 16px; }
|
||||||
|
.dt-keep-row { background: var(--success-fill); }
|
||||||
|
.dt-keep-tag { display: inline-flex; align-items: center; gap: 4px; background: var(--success-fill); color: var(--success); border-radius: 999px; padding: 2px 8px; font-size: 11px; font-weight: 500; }
|
||||||
|
|
||||||
|
/* Progress bar */
|
||||||
|
.dt-progress { height: 6px; background: var(--border); border-radius: 3px; overflow: hidden; margin: 10px 0; }
|
||||||
|
.dt-progress .bar { height: 100%; background: var(--ink); border-radius: 3px; }
|
||||||
|
|
||||||
|
/* Tabs */
|
||||||
|
.dt-tabs { display: flex; gap: 18px; border-bottom: 1px solid var(--border); margin: 10px 0 16px; }
|
||||||
|
.dt-tab { font-size: 13.5px; color: var(--ink-secondary); padding: 8px 2px; border-bottom: 2px solid transparent; cursor: pointer; }
|
||||||
|
.dt-tab.is-active { color: var(--ink); font-weight: 500; border-bottom-color: var(--accent); }
|
||||||
|
|
||||||
|
/* Code block */
|
||||||
|
.dt-code { background: var(--surface-hover); border: 1px solid var(--border); border-radius: var(--r-md); padding: 12px 14px; font-family: var(--font-mono); font-size: 12.5px; color: var(--ink); white-space: pre; overflow-x: auto; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* Viewports up to 1100px wide: hide the sidebar, pin the footer's left edge to
   0, and apply narrower main padding. */
@media (max-width: 1100px) {
  .dt-sidebar {
    display: none;
  }

  .dt-footer {
    left: 0;
  }

  .dt-main {
    padding: 28px 24px 96px;
  }
}
|
||||||
164
layout-review/home.html
Normal file
164
layout-review/home.html
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — File Analysis (Home)</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="home">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<!-- Review banner. The icon span holds a Material Symbols icon name as text,
     so it is aria-hidden to keep the word "visibility" out of what assistive
     tech reads. -->
<div class="dt-review-banner">
  <span class="dt-mi" aria-hidden="true">visibility</span>
  <span>Static layout preview of the <strong>Home / File Analysis</strong> page, shown with three imported files in the post-analysis state. <a href="index.html">All pages →</a></span>
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Page header: brand block + privacy pill -->
|
||||||
|
<header class="dt-page-header">
|
||||||
|
<div class="dt-page-brand">
|
||||||
|
<div class="dt-page-brand-row">
|
||||||
|
<div class="dt-page-brand-mark">D</div>
|
||||||
|
<div class="dt-page-brand-words">
|
||||||
|
<span class="dt-page-eyebrow">UNALOGIX</span>
|
||||||
|
<h1 class="dt-page-wordmark">DataTools</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-page-subtitle">Clean. Normalize. Transform.</p>
|
||||||
|
</div>
|
||||||
|
<!-- Privacy pill: the padlock SVG is decorative (the text carries the
     meaning), so it is removed from the accessibility tree and from focus
     order. -->
<span class="dt-privacy-pill">
  <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" aria-hidden="true" focusable="false">
    <rect x="4" y="11" width="16" height="10" rx="2"/>
    <path d="M8 11V7a4 4 0 018 0v4"/>
  </svg>
  Runs 100% locally
</span>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- Files section head -->
|
||||||
|
<div class="dt-files-section-head">
|
||||||
|
<h2>Files</h2>
|
||||||
|
<span class="dt-section-meta">3 files · 4.7 MB total</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Files card -->
|
||||||
|
<!-- Imported-files list. Each remove button names its file in aria-label so
     the three otherwise identical "✕" controls can be told apart by assistive
     tech; buttons declare type="button" so they never act as submit buttons;
     decorative SVG icons are hidden from the accessibility tree. -->
<div class="dt-card" style="padding-bottom:0">
  <div class="dt-file-row" style="padding:6px 0">
    <button type="button" class="dt-btn dt-btn-tertiary" title="Remove" aria-label="Remove customers_export.csv">✕</button>
    <span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" aria-hidden="true" focusable="false"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
    <span class="dt-file-name">customers_export.csv</span>
    <span class="dt-file-size" style="margin-left:auto">2.1 MB</span>
  </div>
  <div class="dt-file-row" style="padding:6px 0">
    <button type="button" class="dt-btn dt-btn-tertiary" title="Remove" aria-label="Remove q3_transactions.xlsx">✕</button>
    <span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" aria-hidden="true" focusable="false"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
    <span class="dt-file-name">q3_transactions.xlsx</span>
    <span class="dt-file-size" style="margin-left:auto">1.8 MB</span>
  </div>
  <div class="dt-file-row" style="padding:6px 0">
    <button type="button" class="dt-btn dt-btn-tertiary" title="Remove" aria-label="Remove vendor_list.csv">✕</button>
    <span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor" aria-hidden="true" focusable="false"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
    <span class="dt-file-name">vendor_list.csv</span>
    <span class="dt-file-size" style="margin-left:auto">0.8 MB</span>
  </div>
  <button type="button" class="dt-file-add" style="margin-left:-16px;margin-right:-16px;width:calc(100% + 32px)">
    <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" aria-hidden="true" focusable="false"><path d="M12 5v14M5 12h14"/></svg> Add more files
  </button>
</div>
|
||||||
|
|
||||||
|
<!-- Action bar -->
|
||||||
|
<div class="dt-btn-row" style="margin-top:16px;max-width:340px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Run analysis</button>
|
||||||
|
<button class="dt-btn">Clear results</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Stats overview -->
|
||||||
|
<div class="dt-stats">
|
||||||
|
<div class="dt-stat">
|
||||||
|
<div class="dt-stat-label">Files analyzed</div>
|
||||||
|
<div class="dt-stat-value">3</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat">
|
||||||
|
<div class="dt-stat-label">Total findings</div>
|
||||||
|
<div class="dt-stat-value">14</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat is-warn">
|
||||||
|
<div class="dt-stat-label">Warnings</div>
|
||||||
|
<div class="dt-stat-value">9 <span class="dt-stat-unit">to review</span></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat is-info">
|
||||||
|
<div class="dt-stat-label">Info</div>
|
||||||
|
<div class="dt-stat-value">5 <span class="dt-stat-unit">suggestions</span></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #1 -->
|
||||||
|
<!-- Expanded findings panel. The chevron and finding icons hold Material
     Symbols icon names as text (e.g. "chevron_right"), so those spans are
     aria-hidden to keep the icon names out of what assistive tech reads. -->
<div class="dt-card">
  <div class="dt-finding-group-head">
    <span class="dt-finding-group-chevron" style="transform:rotate(90deg)" aria-hidden="true">chevron_right</span>
    <span class="dt-severity-dot warn"></span>
    <span class="dt-group-filename">customers_export.csv</span>
    <div class="dt-group-counts">
      <span class="dt-count-pill warn">6 warnings</span>
      <span class="dt-count-pill info">2 info</span>
    </div>
  </div>

  <div class="dt-finding-row">
    <span class="dt-finding-icon warn" aria-hidden="true"><span class="dt-mi">priority_high</span></span>
    <div class="dt-finding-body">
      <p class="dt-finding-title"><strong>312 duplicate rows</strong> across exact + near matches</p>
      <p class="dt-finding-meta">column: email · Find Duplicates →</p>
    </div>
  </div>
  <div class="dt-finding-row">
    <span class="dt-finding-icon warn" aria-hidden="true"><span class="dt-mi">format_color_text</span></span>
    <div class="dt-finding-body">
      <p class="dt-finding-title"><strong>1,204 cells</strong> with leading / trailing whitespace</p>
      <p class="dt-finding-meta">columns: name, city · Clean Text →</p>
    </div>
  </div>
  <div class="dt-finding-row">
    <span class="dt-finding-icon info" aria-hidden="true"><span class="dt-mi">event</span></span>
    <div class="dt-finding-body">
      <p class="dt-finding-title">Mixed date formats in <strong>signup_date</strong></p>
      <p class="dt-finding-meta">3 formats detected · Standardize Formats →</p>
    </div>
  </div>
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #2 (collapsed) -->
|
||||||
|
<!-- Collapsed findings panel: header only. The inline styles remove the
     header's bottom border and round all four corners. The chevron span holds
     a Material Symbols icon name as text, so it is aria-hidden. -->
<div class="dt-card" style="padding-bottom:16px">
  <div class="dt-finding-group-head" style="margin-bottom:-16px;border-radius:var(--r-lg);border-bottom:none">
    <span class="dt-finding-group-chevron" aria-hidden="true">chevron_right</span>
    <span class="dt-severity-dot warn"></span>
    <span class="dt-group-filename">q3_transactions.xlsx</span>
    <div class="dt-group-counts">
      <span class="dt-count-pill warn">3 warnings</span>
      <span class="dt-count-pill info">3 info</span>
    </div>
  </div>
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #3 (clean) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:16px">
|
||||||
|
<div class="dt-finding-group-head" style="margin-bottom:-16px;border-radius:var(--r-lg);border-bottom:none">
|
||||||
|
<span class="dt-severity-dot success"></span>
|
||||||
|
<span class="dt-group-filename">vendor_list.csv</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill success">no issues</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
71
layout-review/index.html
Normal file
71
layout-review/index.html
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>DataTools — Layout Review</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
<style>
|
||||||
|
.lr-wrap { max-width: 960px; margin: 0 auto; padding: 48px 32px 80px; }
|
||||||
|
.lr-grid { display: grid; grid-template-columns: repeat(2, 1fr); gap: 14px; margin-top: 18px; }
|
||||||
|
.lr-card { display: flex; align-items: center; gap: 14px; background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); padding: 16px 18px; box-shadow: 0 1px 2px rgba(28,25,23,0.03); text-decoration: none; transition: border-color .12s ease, box-shadow .12s ease; }
|
||||||
|
.lr-card:hover { border-color: var(--border-strong); box-shadow: 0 2px 8px rgba(28,25,23,0.06); text-decoration: none; }
|
||||||
|
.lr-ico { width: 40px; height: 40px; border-radius: var(--r-md); background: var(--accent-fill); color: var(--accent); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.lr-ico .dt-mi { font-family: "Material Symbols Outlined"; font-size: 22px; }
|
||||||
|
.lr-body { min-width: 0; }
|
||||||
|
.lr-name { font-size: 15px; font-weight: 600; color: var(--ink); letter-spacing: -0.01em; display:flex; align-items:center; gap:8px; }
|
||||||
|
/* Card description. It is used on a <span>, and vertical margins have no
   effect on inline boxes, so display: block is needed for margin-top to
   apply. */
.lr-desc { display: block; font-size: 12.5px; color: var(--ink-secondary); margin-top: 2px; line-height: 1.45; }
|
||||||
|
.lr-sec { font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); font-weight: 600; margin: 26px 0 2px; }
|
||||||
|
.lr-soon { font-size: 9px; font-weight: 600; letter-spacing: .06em; text-transform: uppercase; color: var(--ink-tertiary); border: 1px solid var(--border-strong); border-radius: 999px; padding: 1px 6px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="lr-wrap">
|
||||||
|
<header class="dt-page-header">
|
||||||
|
<div class="dt-page-brand">
|
||||||
|
<div class="dt-page-brand-row">
|
||||||
|
<div class="dt-page-brand-mark">D</div>
|
||||||
|
<div class="dt-page-brand-words">
|
||||||
|
<span class="dt-page-eyebrow">UNALOGIX · LAYOUT REVIEW</span>
|
||||||
|
<h1 class="dt-page-wordmark">DataTools</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-page-subtitle">Static HTML reproductions of every tool page, built from the live app's design tokens for human review of layouts.</p>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>These are faithful static mockups — not the running Streamlit app. Colors, type scale, spacing, and components are copied verbatim from <code>theme.py</code> and <code>components/_legacy.py</code>. Each page is shown in a representative <strong>populated</strong> state so the layout can be reviewed end-to-end. Fonts load from Google Fonts (needs network); the chrome (sidebar + footer) is shared across every page.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Analysis</div>
|
||||||
|
<!-- Analysis tools. Icon wrappers are aria-hidden so the Material Symbols icon
     name does not become part of each link's accessible name. -->
<div class="lr-grid">
  <a class="lr-card" href="home.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">insert_chart_outlined</span></span><span class="lr-body"><span class="lr-name">File Analysis (Home)</span><span class="lr-desc">Import files, run the analyzer, browse per-file findings.</span></span></a>
  <a class="lr-card" href="11_reconciler.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">compare_arrows</span></span><span class="lr-body"><span class="lr-name">Reconcile Two Files</span><span class="lr-desc">Compare two lists of transactions and flag what doesn't match.</span></span></a>
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Data Cleaners</div>
|
||||||
|
<!-- Data-cleaner tools. Icon wrappers are aria-hidden so the Material Symbols
     icon name does not become part of each link's accessible name. -->
<div class="lr-grid">
  <a class="lr-card" href="04_missing_handler.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">help_outline</span></span><span class="lr-body"><span class="lr-name">Fix Missing Values</span><span class="lr-desc">Find blank cells (even hidden ones) and fill them in or remove them.</span></span></a>
  <a class="lr-card" href="06_outlier_detector.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">insights</span></span><span class="lr-body"><span class="lr-name">Find Unusual Values <span class="lr-soon">Soon</span></span><span class="lr-desc">Spot values that look wrong — too high, too low, or rule-breaking.</span></span></a>
  <a class="lr-card" href="02_text_cleaner.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">text_format</span></span><span class="lr-body"><span class="lr-name">Clean Text</span><span class="lr-desc">Trim extra spaces and strip out odd characters.</span></span></a>
  <a class="lr-card" href="03_format_standardizer.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">format_list_bulleted</span></span><span class="lr-body"><span class="lr-name">Standardize Formats</span><span class="lr-desc">Make dates, phones, currency, and names look the same throughout.</span></span></a>
  <a class="lr-card" href="01_deduplicator.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">search</span></span><span class="lr-body"><span class="lr-name">Find Duplicates</span><span class="lr-desc">Find rows that repeat, then keep one and remove the extras.</span></span></a>
  <a class="lr-card" href="08_validator_reporter.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">check_circle</span></span><span class="lr-body"><span class="lr-name">Quality Check <span class="lr-soon">Soon</span></span><span class="lr-desc">Check your file against rules and export a PDF or Excel report.</span></span></a>
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Transformations</div>
|
||||||
|
<!-- Transformation tools. Icon wrappers are aria-hidden so the Material
     Symbols icon name does not become part of each link's accessible name. -->
<div class="lr-grid">
  <a class="lr-card" href="05_column_mapper.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">view_column</span></span><span class="lr-body"><span class="lr-name">Map Columns</span><span class="lr-desc">Rename columns, reorder, and set each one as text, number, or date.</span></span></a>
  <a class="lr-card" href="07_multi_file_merger.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">account_tree</span></span><span class="lr-body"><span class="lr-name">Combine Files <span class="lr-soon">Soon</span></span><span class="lr-desc">Combine several CSV or Excel files into one — even if columns differ.</span></span></a>
  <a class="lr-card" href="10_pdf_extractor.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">picture_as_pdf</span></span><span class="lr-body"><span class="lr-name">PDF to CSV</span><span class="lr-desc">Pull transactions out of bank-statement PDFs into a clean CSV file.</span></span></a>
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Automations</div>
|
||||||
|
<!-- Automation tools. The icon wrapper is aria-hidden so the Material Symbols
     icon name does not become part of the link's accessible name. -->
<div class="lr-grid">
  <a class="lr-card" href="09_pipeline_runner.html"><span class="lr-ico" aria-hidden="true"><span class="dt-mi">auto_awesome</span></span><span class="lr-body"><span class="lr-name">Automated Workflows</span><span class="lr-desc">Run several tools in a row — save the steps and reuse them anytime.</span></span></a>
</div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
74
layout-review/shell.js
Normal file
74
layout-review/shell.js
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
/* Shared app chrome (sidebar nav + sticky footer) for the static layout
   review pages. Mirrors src/gui/app.py:_build_navigation() ordering and
   src/gui/components/_legacy.py:render_sticky_footer(). Each page sets
   <body data-page="<tool_id|home>"> to mark the active nav item.

   The script fills two optional host elements, looked up by id:
     #dt-sidebar  - brand link, sectioned navigation, language/license foot
     #dt-footer   - Close / Help buttons and the preview caption
   A page that omits either element simply does not get that piece. */
(function () {
  // Sections + entries in the same order app.py registers them.
  // `soon: true` adds the "is-soon" class and a "Soon" tag to the link.
  var NAV = [
    { label: "Analysis", items: [
      { id: "home", icon: "insert_chart_outlined", name: "File Analysis", href: "home.html" },
      { id: "11_reconciler", icon: "compare_arrows", name: "Reconcile Two Files", href: "11_reconciler.html" },
    ]},
    { label: "Data Cleaners", items: [
      { id: "04_missing_handler", icon: "help_outline", name: "Fix Missing Values", href: "04_missing_handler.html" },
      { id: "06_outlier_detector", icon: "insights", name: "Find Unusual Values", href: "06_outlier_detector.html", soon: true },
      { id: "02_text_cleaner", icon: "text_format", name: "Clean Text", href: "02_text_cleaner.html" },
      { id: "03_format_standardizer", icon: "format_list_bulleted", name: "Standardize Formats", href: "03_format_standardizer.html" },
      { id: "01_deduplicator", icon: "search", name: "Find Duplicates", href: "01_deduplicator.html" },
      { id: "08_validator_reporter", icon: "check_circle", name: "Quality Check", href: "08_validator_reporter.html", soon: true },
    ]},
    { label: "Transformations", items: [
      { id: "05_column_mapper", icon: "view_column", name: "Map Columns", href: "05_column_mapper.html" },
      { id: "07_multi_file_merger", icon: "account_tree", name: "Combine Files", href: "07_multi_file_merger.html", soon: true },
      { id: "10_pdf_extractor", icon: "picture_as_pdf", name: "PDF to CSV", href: "10_pdf_extractor.html" },
    ]},
    { label: "Automations", items: [
      { id: "09_pipeline_runner", icon: "auto_awesome", name: "Automated Workflows", href: "09_pipeline_runner.html" },
    ]},
  ];

  // Every section shows the same indicator glyph in this static preview.
  // (The previous per-section ternary returned "−" on both branches.)
  var SECTION_INDICATOR = "−";

  // document.body is null when the script runs before <body> is parsed;
  // fall back to "no active page" instead of throwing a TypeError.
  var active = (document.body && document.body.getAttribute("data-page")) || "";

  // ---- Sidebar -----------------------------------------------------------
  var sb = document.getElementById("dt-sidebar");
  if (sb) {
    // Brand block, then the opening <nav>; sections and links are appended below.
    var html = '' +
      '<a class="dt-brand" href="index.html" style="text-decoration:none">' +
        '<span class="dt-brand-mark">D</span>' +
        '<span class="dt-brand-name">' +
          '<span class="dt-brand-eyebrow">UNALOGIX</span>' +
          '<span class="dt-brand-word">DataTools</span>' +
        '</span>' +
      '</a>' +
      '<nav class="dt-nav">';

    NAV.forEach(function (sec) {
      html += '<div class="dt-nav-section">' + sec.label +
        '<span class="dt-nav-indicator">' + SECTION_INDICATOR + '</span></div>';

      sec.items.forEach(function (it) {
        // The link whose id matches <body data-page> is highlighted.
        var cls = "dt-nav-link" + (it.id === active ? " is-active" : "") + (it.soon ? " is-soon" : "");
        html += '<a class="' + cls + '" href="' + it.href + '">' +
          '<span class="dt-mi">' + it.icon + '</span>' +
          '<span>' + it.name + '</span>' +
          (it.soon ? '<span class="dt-nav-soon-tag">Soon</span>' : '') +
          '</a>';
      });
    });

    // Static, non-interactive language picker and license badge.
    html += '</nav>' +
      '<div class="dt-sidebar-foot">' +
        '<div><div class="dt-sidebar-label">Language</div>' +
        '<div class="dt-select" style="pointer-events:none">English</div></div>' +
        '<div class="dt-license-badge">Core · 1,820 days left</div>' +
      '</div>';

    // All interpolated values come from the NAV literal above, never from
    // user input, so assigning through innerHTML is safe here.
    sb.innerHTML = html;
  }

  // ---- Sticky footer -----------------------------------------------------
  var ft = document.getElementById("dt-footer");
  if (ft) {
    ft.innerHTML =
      '<a class="dt-footer-btn" href="index.html"><span class="dt-mi">close</span>Close</a>' +
      '<button class="dt-footer-btn" type="button"><span class="dt-mi">help_outline</span>Help</button>' +
      '<span style="margin-left:auto;font-size:11.5px;color:var(--ink-tertiary)">DataTools · local-first · static layout preview</span>';
  }
})();
|
||||||
192
marketing/COPY.md
Normal file
192
marketing/COPY.md
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
# DataTools — copy single-source-of-truth
|
||||||
|
|
||||||
|
Every customer-facing string lives here. If it appears on a landing
|
||||||
|
page, in an email, on Gumroad, in the GUI's marketing chrome, or in a
|
||||||
|
community post — change it here first, then propagate.
|
||||||
|
|
||||||
|
Why a SoT: positioning drift across 3 niches × 4 surfaces (landing,
|
||||||
|
email, Gumroad, social) is the single biggest source of buyer confusion
|
||||||
|
in v1. One file means one diff to ship a wording change everywhere.
|
||||||
|
|
||||||
|
How to use: copy a row's value into the target surface verbatim. If a
|
||||||
|
surface needs a variation, add it as a sub-row (e.g. `H1 → bookkeeper
|
||||||
|
short`) rather than editing in place.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0 · Universal (all niches)
|
||||||
|
|
||||||
|
| Slot | Value |
|
||||||
|
|------|-------|
|
||||||
|
| Product name | DataTools |
|
||||||
|
| Product tagline (one-liner) | Six CSV tools that turn a 4-hour cleanup job into a 30-second pipeline. Local. No subscription. |
|
||||||
|
| Price (display) | **$49** |
|
||||||
|
| Price (qualifier) | one-time, lifetime updates for v1.x |
|
||||||
|
| Refund window | 30-day no-questions refund |
|
||||||
|
| Privacy claim | Your data never leaves your computer. |
|
||||||
|
| Audit claim | Every change logged to a CSV-format audit trail. |
|
||||||
|
| Format claim | $ £ € ¥ R$ kr zł and 50+ phone-country codes — handled. |
|
||||||
|
| Language claim | GUI available in English and Español. |
|
||||||
|
| Support email | support@datatools.app |
|
||||||
|
| Distribution URL | https://datatools.gumroad.com/l/datatools |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1 · Niche positioning
|
||||||
|
|
||||||
|
| Niche | Audience | One-line pain | One-line promise |
|
||||||
|
|-------|----------|---------------|-------------------|
|
||||||
|
| **bookkeeper** | Solo bookkeepers, small-firm partners doing client reconciliations | Bank exports come in 50 different shapes; QuickBooks won't import them; you can't show your client what you changed | Reconcile messy bank exports — and hand your client an audit trail |
|
||||||
|
| **revops** | RevOps / SDR-ops at 5-50-person SaaS, doing list hygiene before HubSpot/Salesforce import | You're paying per-contact for duplicates you imported last campaign | Dedupe lead lists across HubSpot, LinkedIn, and manual scrapes — locally |
|
||||||
|
| **shopify-pet** | Shopify store owners (pet niche is the lead vertical), prepping Klaviyo / Mailchimp imports | Customer exports are full of duplicates and bad phone numbers; Klaviyo silently drops them | Klaviyo-import-ready customer lists in 30 seconds — locally |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2 · Landing page strings
|
||||||
|
|
||||||
|
Each niche page uses the same skeleton. Edits to a slot go to all 3
|
||||||
|
unless marked `(niche-only)`.
|
||||||
|
|
||||||
|
### Hero — H1 (per niche)
|
||||||
|
|
||||||
|
| Niche | H1 |
|
||||||
|
|-------|----|
|
||||||
|
| bookkeeper | Reconcile messy bank exports.<br>**Hand your client an audit trail.** |
|
||||||
|
| revops | Dedupe lead lists across HubSpot, LinkedIn,<br>**and manual scrapes — locally.** |
|
||||||
|
| shopify-pet | Klaviyo-import-ready customer lists.<br>**In 30 seconds. Locally.** |
|
||||||
|
|
||||||
|
### Hero — sub-head (per niche)
|
||||||
|
|
||||||
|
| Niche | Sub-head |
|
||||||
|
|-------|----------|
|
||||||
|
| bookkeeper | Six tools, one pipeline, one $49 download. Runs on your laptop — your client's books never touch a server. |
|
||||||
|
| revops | Six tools, one pipeline, one $49 download. Runs on your laptop — prospect data never leaves your machine. |
|
||||||
|
| shopify-pet | Six tools, one pipeline, one $49 download. Runs on your laptop — customer data never leaves your machine. |
|
||||||
|
|
||||||
|
### CTAs
|
||||||
|
|
||||||
|
| Surface | Label |
|
||||||
|
|---------|-------|
|
||||||
|
| Hero primary | Buy DataTools — $49 |
|
||||||
|
| Hero secondary | Try the demo (no install) |
|
||||||
|
| Mid-page | Run it on your own file → $49 |
|
||||||
|
| Footer | Get DataTools |
|
||||||
|
| FAQ-end | Still on the fence? Try the demo. |
|
||||||
|
|
||||||
|
### Sections (universal H2s, copy verbatim)
|
||||||
|
|
||||||
|
- Five pains DataTools fixes in one pass *(revops uses: "before you import to HubSpot")*
|
||||||
|
- Try it on a real-looking sample *(per niche; bookkeeper: "bank export with a known overlap"; revops: "3-vendor lead list"; shopify-pet: "Shopify customer export")*
|
||||||
|
- Workflows you run every week *(bookkeeper: "the rest of the industry tax-codes around"; revops: "every campaign")*
|
||||||
|
- Your data never leaves your computer.
|
||||||
|
- Every change auditable. Period.
|
||||||
|
- $ £ € ¥ R$ kr zł — handled.
|
||||||
|
- Six tools. One pipeline. One $49 download.
|
||||||
|
- $49. No subscription. *(append per niche: bookkeeper "No per-client license."; revops "No per-campaign fee."; shopify-pet "No ceiling on rows or files.")*
|
||||||
|
- Questions
|
||||||
|
- *(closing CTA banner — see below)*
|
||||||
|
|
||||||
|
### Closing CTA banner (per niche)
|
||||||
|
|
||||||
|
| Niche | Banner |
|
||||||
|
|-------|--------|
|
||||||
|
| bookkeeper | Stop reconciling bank exports by hand. |
|
||||||
|
| revops | Stop paying twice for the same contact. |
|
||||||
|
| shopify-pet | Stop deduplicating customers by hand. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3 · Demo CTAs (in-app)
|
||||||
|
|
||||||
|
The hosted demo at `/demo` shows live tool runs. CTAs sit at the top
|
||||||
|
of the demo page and after each tool completes.
|
||||||
|
|
||||||
|
| Slot | Copy |
|
||||||
|
|------|------|
|
||||||
|
| Demo banner top | You're using the hosted demo. To run this on your own files, get the $49 desktop version. |
|
||||||
|
| Per-tool footer | Liked what just happened? Run it on your own file → **$49 desktop install** |
|
||||||
|
| Demo end-of-flow | That's six tools in one pass. Get the desktop version — $49, no subscription. |
|
||||||
|
| Demo "buy" button | Get DataTools — $49 |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4 · Email subject lines (per niche)
|
||||||
|
|
||||||
|
Subjects are the highest-leverage copy. One per touch, per niche.
|
||||||
|
Body copy lives in `marketing/emails/<niche>/`.
|
||||||
|
|
||||||
|
### Gumroad delivery (Day 0)
|
||||||
|
|
||||||
|
| Niche | Subject |
|
||||||
|
|-------|---------|
|
||||||
|
| bookkeeper | Your DataTools download (start here) |
|
||||||
|
| revops | Your DataTools download (start here) |
|
||||||
|
| shopify-pet | Your DataTools download (start here) |
|
||||||
|
|
||||||
|
### 5-touch onboarding sequence (Days 1, 3, 7, 14, 30)
|
||||||
|
|
||||||
|
| # | Day | bookkeeper | revops | shopify-pet |
|
||||||
|
|---|-----|------------|--------|-------------|
|
||||||
|
| 1 | 1 | Try it on this messy bank export first | Try it on this 3-vendor lead list first | Try it on this Shopify customer export first |
|
||||||
|
| 2 | 3 | The audit trail your client will actually open | The dedupe rule that catches LinkedIn drift | The phone-format step Klaviyo cares about |
|
||||||
|
| 3 | 7 | One pipeline, every client, every month | Run it before every HubSpot import | Run it before every Klaviyo sync |
|
||||||
|
| 4 | 14 | Two-minute trick: the gate report | Two-minute trick: the confidence tiers | Two-minute trick: hidden-character cleanup |
|
||||||
|
| 5 | 30 | Heard from a fellow bookkeeper? | Heard from another RevOps lead? | Heard from another store owner? |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5 · Gumroad listing
|
||||||
|
|
||||||
|
| Slot | Value |
|
||||||
|
|------|-------|
|
||||||
|
| Product title | DataTools — Local CSV cleanup pipeline · $49 |
|
||||||
|
| Tagline | Six CSV tools that turn a 4-hour cleanup job into a 30-second pipeline. Runs on your laptop. No subscription. |
|
||||||
|
| Cover image alt | Six DataTools panels — analyzer, dedupe, format, gate, text-clean, splitter — running locally |
|
||||||
|
| Description (H2 1) | What you get |
|
||||||
|
| Description body 1 | A desktop install (Mac, Windows, Linux) bundling six CSV tools you'd otherwise stitch together from Excel macros, regex, and luck. One pipeline. Audit trail per file. Files up to 1 GB. |
|
||||||
|
| Description (H2 2) | Why local |
|
||||||
|
| Description body 2 | Your data never touches a server. No upload. No "we promise we won't look." Run the pipeline, get the cleaned CSV + the audit log, close the app. Done. |
|
||||||
|
| Description (H2 3) | What's in the box |
|
||||||
|
| Description bullets | Analyzer (find what's broken) · Format standardizer (phones, addresses, currencies) · Dedupe (fuzzy matching across columns) · Gate (block bad rows from your import) · Text cleaner (hidden chars, encoding) · Splitter (chunk huge files for upload limits) |
|
||||||
|
| Description (H2 4) | Who it's for |
|
||||||
|
| Description body 4 | Bookkeepers reconciling client bank exports. RevOps deduping lead lists before HubSpot. Shopify owners prepping customer data for Klaviyo. Anyone with a 50k-row CSV they don't want to clean by hand again. |
|
||||||
|
| Refund text | 30-day no-questions refund. Email support@datatools.app. |
|
||||||
|
| Tags | csv, data cleaning, dedupe, bookkeeping, revops, shopify, local, privacy |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6 · One-liners (for social, signatures, podcasts)
|
||||||
|
|
||||||
|
Pick the line that matches the medium. Don't mix-and-match across one
|
||||||
|
campaign — pick one and let it land.
|
||||||
|
|
||||||
|
- "Six CSV tools that turn a 4-hour cleanup job into a 30-second pipeline."
|
||||||
|
- "Local CSV cleanup. Your data never leaves your computer."
|
||||||
|
- "$49, one-time, six tools, one pipeline. Mac/Win/Linux."
|
||||||
|
- "I built the CSV cleanup pipeline I wanted to stop doing by hand."
|
||||||
|
- "Bank exports, lead lists, Shopify customers — same six steps, every time."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7 · Banned phrases
|
||||||
|
|
||||||
|
These over-promise or trip professional buyers' BS detector. Don't use:
|
||||||
|
|
||||||
|
- ~~"AI-powered"~~ — not what we do; sets the wrong expectation.
|
||||||
|
- ~~"Enterprise-grade"~~ — meaningless; says "expensive" without backing it up.
|
||||||
|
- ~~"Revolutionary" / "game-changing"~~ — every SaaS landing page uses these. Skip.
|
||||||
|
- ~~"99.9% uptime"~~ — local app; not relevant; reads as cargo-culted.
|
||||||
|
- ~~"GDPR-compliant"~~ — true (local, no transfer) but the claim invites legal scrutiny we don't need; say "local" instead.
|
||||||
|
- ~~"Free trial"~~ — there's the demo, but the desktop app is paid-only; "trial" implies time-bombed and we don't ship that.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8 · Change log
|
||||||
|
|
||||||
|
When you change a slot here, add a line below so the next person
|
||||||
|
ships from a known state.
|
||||||
|
|
||||||
|
| Date | Slot | Old → New | Why |
|
||||||
|
|------|------|-----------|-----|
|
||||||
|
| 2026-05-01 | (initial) | — | First SoT extracted from landing pages 1.0 |
|
||||||
|
| 2026-05-13 | Language claim (new) | — → "GUI available in English and Español." | Ships v1.6 i18n: EN + ES packs in GUI sidebar. Expands addressable market without a CLI/copy rebuild. |
|
||||||
32
marketing/community-posts/README.md
Normal file
32
marketing/community-posts/README.md
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Community posts
|
||||||
|
|
||||||
|
Three drafts per niche, each value-first, ready to personalize:
|
||||||
|
|
||||||
|
1. **`01-story.md`** — "Here's how I solved X." Concrete, narrative, no
|
||||||
|
pitch in the body. The product gets one mention at the end, in
|
||||||
|
context. Goes in subreddits / Slacks / forums where direct
|
||||||
|
promotion is banned. Lead with usefulness; the link is dessert.
|
||||||
|
|
||||||
|
2. **`02-tip.md`** — A standalone tactical tip the reader can use
|
||||||
|
*without* DataTools. The product appears as "if you don't want to
|
||||||
|
do this by hand…" — earned, not pushed. Cross-post-safe.
|
||||||
|
|
||||||
|
3. **`03-soft-offer.md`** — The one post where the product is the
|
||||||
|
subject. Goes in `/r/<niche>` "what are you working on" threads,
|
||||||
|
IndieHackers launches, and niche newsletters that allow paid-tool
|
||||||
|
posts. Still leads with the problem, not the features.
|
||||||
|
|
||||||
|
## Personalization checklist before posting
|
||||||
|
|
||||||
|
- [ ] Replace `{{your-name}}` and `{{your-context}}` in the opener
|
||||||
|
- [ ] Match the community's tone (Reddit ≠ LinkedIn ≠ niche Slack)
|
||||||
|
- [ ] Add a community-specific opener line ("Long-time lurker, first post" / "Saw the thread about X yesterday — figured I'd share")
|
||||||
|
- [ ] Confirm the community's promo rules; if no-promo, drop the link from `01` / `02` and only mention "I built a thing, DM me if curious"
|
||||||
|
- [ ] Vary the URL (use the niche-specific landing page, not the generic Gumroad URL)
|
||||||
|
|
||||||
|
## Cadence guidance
|
||||||
|
|
||||||
|
- Don't post all 3 drafts in the same community in the same week. Stagger:
|
||||||
|
Week 1 → `01-story`. Week 4 → `02-tip`. Week 8 → `03-soft-offer`.
|
||||||
|
- Reply to commenters within 24h. The post itself sells less than the
|
||||||
|
comment thread that follows.
|
||||||
39
marketing/community-posts/bookkeeper/01-story.md
Normal file
39
marketing/community-posts/bookkeeper/01-story.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# Bookkeeper · Post 1 — Story
|
||||||
|
|
||||||
|
**Where to post:** r/Bookkeeping, r/QuickBooks, AAT forums, ICB
|
||||||
|
member groups, Bookkeeping Slacks/Discords.
|
||||||
|
|
||||||
|
**Format:** longish post, ~400 words. Subject line / title goes
|
||||||
|
first; everything below is the body.
|
||||||
|
|
||||||
|
**Tone:** "fellow bookkeeper venting + sharing what worked" — not
|
||||||
|
salesy, not preachy.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
How I cut my month-end bank reconciliation from 4 hours to 30 minutes (the boring 3-step version)
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
I've been doing month-end reconciliation for {{your-client-count}} clients for {{your-years}} years and the part I hated most was the bank export cleanup. Not the reconciliation itself — the *cleanup before* the reconciliation.
|
||||||
|
|
||||||
|
You know the drill: client sends you a CSV from their bank. Half the dates are `MM/DD/YYYY`, the other half `DD-MM-YY`. The merchant column has trailing whitespace, weird unicode hyphens, and the same vendor spelled four ways ("Amzn Mktp", "AMAZON MARKETPLACE", "Amazon.com*1A2B3", "AMZN Mktplace"). QuickBooks chokes on the import, so you fix it by hand. Every. Single. Month.
|
||||||
|
|
||||||
|
Last quarter I sat down and wrote out the steps I do every single time. There were 11. I automated the 8 that were deterministic. Here are the 3 that matter most — you can do these with built-in tools, no purchase required:
|
||||||
|
|
||||||
|
**1. Normalize dates first, before anything else.**
|
||||||
|
Excel's `TEXT(DATEVALUE(A2), "yyyy-mm-dd")` works for ~80% of bank exports. The other 20% have at least one row with a value Excel parses wrong (it'll silently swap day/month). Sort by date afterwards and *visually scan* for any row that's now in the wrong year — that's your tell.
|
||||||
|
|
||||||
|
**2. Standardize merchant names with a fuzzy match, not a regex.**
|
||||||
|
A regex won't catch "Amzn Mktp" → "Amazon". A fuzzy-match function (Excel doesn't have one natively; Google Sheets has `=FUZZYMATCH` via add-ons) will. The threshold I use is 0.85 — high enough to avoid false positives, low enough to catch the spelling drift.
|
||||||
|
|
||||||
|
**3. Keep an audit trail of every change.**
|
||||||
|
This is the one most bookkeepers skip and then regret 6 months later when the client asks "wait, why did you re-classify that?". Add a sidecar CSV: `original_value, new_value, rule_applied, timestamp`. Four columns, append-only, never delete.
|
||||||
|
|
||||||
|
Doing those three turned a 4-hour job into roughly 30 minutes for me. The rest I eventually wrapped into a desktop tool I built called DataTools (the audit trail thing was the bit I needed and couldn't find anywhere — figured other bookkeepers might want it too). It's $49 if you want to skip the spreadsheet wrangling, but the 3 steps above will get you most of the way without it.
|
||||||
|
|
||||||
|
Happy to share the audit-trail CSV template if anyone wants it — just reply.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
27
marketing/community-posts/bookkeeper/02-tip.md
Normal file
27
marketing/community-posts/bookkeeper/02-tip.md
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# Bookkeeper · Post 2 — Tip
|
||||||
|
|
||||||
|
**Where to post:** LinkedIn (your own feed), AAT/ICB Facebook
|
||||||
|
groups, accountancy newsletters' "tip submission" inboxes.
|
||||||
|
|
||||||
|
**Format:** short, ~150 words. Practical. Reads as "thing I learned"
|
||||||
|
not "thing I'm selling".
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
The 30-second check that catches 90% of bank-export errors before they hit QuickBooks
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
If you do client bank reconciliations, do this once before every import:
|
||||||
|
|
||||||
|
Open the export. Sort by amount. Scroll to the bottom. Look at the totals row.
|
||||||
|
|
||||||
|
Most banks add a totals row at the bottom of the CSV that *isn't* a transaction. If you import it, QuickBooks treats it as a real entry and your books are off by exactly the value of the totals row — usually a five-figure number that takes you 40 minutes to track down.
|
||||||
|
|
||||||
|
Same trick catches blank rows the bank inserts as section breaks (especially Wells Fargo, Chase, and most UK challenger banks). One sort, one scroll, two seconds of looking — saves the rest of your evening.
|
||||||
|
|
||||||
|
If you're doing this for 20+ clients a month and want to automate the whole pre-import scrub (this trick + ~10 others), I built a $49 desktop tool called DataTools that does it: datatools.gumroad.com. No subscription, runs locally so client data stays on your machine.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
39
marketing/community-posts/bookkeeper/03-soft-offer.md
Normal file
39
marketing/community-posts/bookkeeper/03-soft-offer.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# Bookkeeper · Post 3 — Soft offer
|
||||||
|
|
||||||
|
**Where to post:** IndieHackers "show what you're working on", r/SideProject,
|
||||||
|
r/Bookkeeping (only in monthly "self-promo" threads — read each
|
||||||
|
sub's rules), bookkeeping newsletter "tools" sections.
|
||||||
|
|
||||||
|
**Format:** ~250 words. Pitches the product but leads with the
|
||||||
|
problem and is honest about the scope.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
I built a desktop CSV cleanup tool for bookkeepers who hate the bank-export reconciliation grind
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Quick context: I do {{your-context — e.g., "books for 12 small clients" or "side-bookkeeping for a few non-profits"}} and the part I dreaded most every month was cleaning bank exports before importing them to QuickBooks. Different bank, different format, every time.
|
||||||
|
|
||||||
|
I built **DataTools** — a desktop app (Mac/Win/Linux) that runs the same six cleanup steps every export needs:
|
||||||
|
|
||||||
|
- Normalizes dates, currencies, account-number formats
|
||||||
|
- Fuzzy-matches merchant-name variants ("Amzn Mktp" = "Amazon")
|
||||||
|
- Flags duplicate transactions across re-exported date ranges
|
||||||
|
- Strips trailing whitespace, hidden chars, BOM markers — the stuff QuickBooks chokes on silently
|
||||||
|
- Generates a per-file audit trail your client can open in Excel: every change, every rule that fired, timestamped
|
||||||
|
- Splits oversized exports for tools with row limits
|
||||||
|
|
||||||
|
It runs **locally** — your client's bank data never goes to a server. (This was the whole reason I built it instead of using one of the cloud "data cleaning" SaaS tools.)
|
||||||
|
|
||||||
|
It's **$49 one-time**, no subscription, no per-client license. v1.x updates included.
|
||||||
|
|
||||||
|
If you want to try before you buy: there's a hosted demo with sample bank exports at the link below. The demo is identical to the desktop app — same UI, same six tools, just running in your browser on synthetic data.
|
||||||
|
|
||||||
|
→ datatools.gumroad.com (or the bookkeeper landing page: datatools.app/bookkeeper)
|
||||||
|
|
||||||
|
Happy to answer questions in the thread.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
39
marketing/community-posts/revops/01-story.md
Normal file
39
marketing/community-posts/revops/01-story.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# RevOps · Post 1 — Story
|
||||||
|
|
||||||
|
**Where to post:** r/revops, r/sales, RevGenius Slack, Modern Sales Pros,
|
||||||
|
Pavilion communities, LinkedIn (your own feed).
|
||||||
|
|
||||||
|
**Format:** ~400 words. Tactical war-story style. Don't pitch in the body.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
We were paying HubSpot for 4,200 duplicate contacts. Here's the dedupe pipeline that caught them.
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Last quarter I ran a count on our HubSpot instance: ~4,200 contacts that were almost-certainly the same person as another contact already in the system. Our HubSpot bill is per-marketing-contact, so this was a real number. ($X/month — pick your tier.)
|
||||||
|
|
||||||
|
The problem is that HubSpot's native "find duplicates" tool is exact-match-only on a small set of fields. It misses:
|
||||||
|
|
||||||
|
- "Sarah O'Brien" vs "Sarah Obrien" (apostrophe / no-apostrophe)
|
||||||
|
- "+1 (415) 555-0143" vs "415-555-0143" vs "4155550143" (phone formats)
|
||||||
|
- "sarah@acme.com" vs "Sarah@acme.com" (case)
|
||||||
|
- Same person from a LinkedIn scrape (no phone) + a webform fill (no LinkedIn URL) + a trade-show import (only email + company)
|
||||||
|
|
||||||
|
Here's the 4-step pipeline I run before *every* HubSpot import now. You can build the first 3 with Python + pandas + rapidfuzz; the 4th is the one that matters and is the easiest to skip:
|
||||||
|
|
||||||
|
**Step 1 — Normalize before comparing.** Lowercase emails, strip phone formatting to E.164, trim whitespace, normalize unicode (NFKC). This alone catches ~40% of dupes.
|
||||||
|
|
||||||
|
**Step 2 — Fuzzy-match on name + company, blocked by email domain.** Don't fuzzy-match across the whole list (O(n²) and full of false positives). Block by email domain first — only compare contacts within the same company. Use rapidfuzz token-set ratio at threshold 85.
|
||||||
|
|
||||||
|
**Step 3 — Cross-source merge logic.** When LinkedIn-source and webform-source records match, *the LinkedIn one wins on title/company* (more recent), *the webform one wins on phone/email* (verified). Document this rule somewhere your team can read it.
|
||||||
|
|
||||||
|
**Step 4 — Confidence tiers, not yes/no.** Auto-merge only at 95% confidence or higher. Queue matches from 85% up to (but not including) 95% for manual review. Discard match candidates below 85% — leave those records as separate contacts. The manual queue is the magic — it catches the cases the algorithm doesn't dare touch and trains you on what your data actually looks like.
|
||||||
|
|
||||||
|
I eventually wrapped all this into a desktop tool I called DataTools because I got tired of re-running the script every campaign. Local-only, $49 if anyone wants it: datatools.app/revops. But the 4-step framework above is the real takeaway — works regardless of what tool you use.
|
||||||
|
|
||||||
|
What's your dedupe pipeline look like?
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
27
marketing/community-posts/revops/02-tip.md
Normal file
27
marketing/community-posts/revops/02-tip.md
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
# RevOps · Post 2 — Tip
|
||||||
|
|
||||||
|
**Where to post:** LinkedIn, RevGenius Slack #tips channel,
|
||||||
|
RevOps Co-op, Modern Sales Pros.
|
||||||
|
|
||||||
|
**Format:** ~150 words. Tactical. One idea, one sentence-of-pitch
|
||||||
|
at the bottom.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
The 30-second pre-import check that catches LinkedIn-scrape duplicates before they hit HubSpot
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Before you import a LinkedIn scrape (Apollo, Lusha, Cognism — same problem) into HubSpot:
|
||||||
|
|
||||||
|
Open the file. Sort by `email`. Look for blanks.
|
||||||
|
|
||||||
|
LinkedIn-sourced rows often have *no email* — just name + company + LinkedIn URL. If you import them as-is, HubSpot creates a new contact for each one. The next time someone fills your webform with the same name + company, HubSpot creates *another* new contact, because there's no key to match on.
|
||||||
|
|
||||||
|
Two-minute fix: before import, generate a synthetic dedupe key as `lower(first_name)|lower(last_name)|domain(company_url)`. Sort by it. Anything with >1 row is a likely dupe — review and merge before HubSpot ever sees it.
|
||||||
|
|
||||||
|
If you're doing this monthly across multiple lead sources and want to automate it (plus phone normalization, fuzzy matching, the whole pipeline), I built a $49 desktop tool: datatools.app/revops. Local — your prospect list never goes to a server.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
35
marketing/community-posts/revops/03-soft-offer.md
Normal file
35
marketing/community-posts/revops/03-soft-offer.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# RevOps · Post 3 — Soft offer
|
||||||
|
|
||||||
|
**Where to post:** IndieHackers, r/revops monthly self-promo,
|
||||||
|
RevGenius #tools-and-software, LinkedIn (your own feed).
|
||||||
|
|
||||||
|
**Format:** ~250 words.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
DataTools — a $49 desktop CSV pipeline for the lead-list cleanup you do before every HubSpot import
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Built this for myself first. {{your-context — e.g., "I run RevOps at a 30-person SaaS"}} and the part of the job I dreaded was the pre-import scrub: LinkedIn export + Apollo pull + last quarter's webform list, deduped against each other and against what's already in HubSpot. Six tabs in a Google Sheet, regexes I half-remember, vlookups, an hour and a half.
|
||||||
|
|
||||||
|
**DataTools** does the six steps as one pipeline:
|
||||||
|
|
||||||
|
- **Format standardizer** — phones to E.164 (50+ country codes, per-row country awareness), emails lowercased, URLs canonicalized
|
||||||
|
- **Dedupe** — fuzzy matching with confidence tiers (95+ auto, 85-95 manual queue, <85 dropped), blocked by email domain so it scales to 50k-row lists
|
||||||
|
- **Gate** — block bad rows from your import with a per-rule report ("142 rows missing email, 38 rows with malformed phones, 12 rows with corporate-blacklist domains")
|
||||||
|
- **Text cleaner** — strips hidden chars, BOMs, weird unicode
|
||||||
|
- **Analyzer** — finds problems before you process (mixed encodings, inconsistent delimiters, near-duplicate rows)
|
||||||
|
- **Splitter** — chunk huge files for tools with row limits
|
||||||
|
|
||||||
|
Runs **locally** — Mac/Win/Linux. Your prospect data never goes to a server. (This was the actual reason I shipped it instead of using Clearbit / cloud tools — legal didn't want third-party touching prospect data after the {{2024 / 2025}} compliance review.)
|
||||||
|
|
||||||
|
**$49 one-time.** No subscription. No per-record fee. v1.x updates included.
|
||||||
|
|
||||||
|
Demo (with synthetic data) and download: datatools.app/revops
|
||||||
|
|
||||||
|
Happy to answer questions in the thread.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
49
marketing/community-posts/shopify-pet/01-story.md
Normal file
49
marketing/community-posts/shopify-pet/01-story.md
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
# Shopify-pet · Post 1 — Story
|
||||||
|
|
||||||
|
**Where to post:** r/shopify, r/ecommerce, Shopify community forums,
|
||||||
|
pet-business Facebook groups (Pet Industry Distributors Association,
|
||||||
|
Pet Boss Nation), Klaviyo community Slack.
|
||||||
|
|
||||||
|
**Format:** ~400 words. Owner-to-owner tone.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
Why my Klaviyo flows were skipping 18% of my customers (and the CSV cleanup that fixed it)
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Background: I run {{your-store-context — e.g., "a 4-year-old pet supplements store doing about $X/month"}}. Last summer I noticed the open rate on my "abandoned cart" Klaviyo flow was lower than usual. Klaviyo's dashboard said the flow was firing fine. Took me a week to figure out the actual problem:
|
||||||
|
|
||||||
|
**Klaviyo was silently dropping 18% of my customers because their phone numbers weren't formatted correctly.** Not "wrong" — just not in the format Klaviyo's SMS module accepts. So the SMS part of the flow never sent, and the email-only fallback didn't kick in for half of those.
|
||||||
|
|
||||||
|
The root cause was the Shopify customer export. Customers had entered their phones every which way:
|
||||||
|
|
||||||
|
- `(415) 555-0143` — works
|
||||||
|
- `415.555.0143` — Klaviyo: "invalid"
|
||||||
|
- `4155550143` — Klaviyo: "invalid for this country"
|
||||||
|
- `+44 20 7946 0958` — works only if the country field is set; for ~30% of my customers it wasn't
|
||||||
|
- `415-555-0143 ext 12` — Klaviyo: "invalid"
|
||||||
|
|
||||||
|
The fix is a quick CSV cleanup before each Klaviyo sync:
|
||||||
|
|
||||||
|
**1. Pull the Shopify customer export.**
|
||||||
|
Customers > Export > "All customers" > CSV.
|
||||||
|
|
||||||
|
**2. Run every phone number through E.164 normalization.**
|
||||||
|
E.164 is the international format Klaviyo (and basically every other SMS platform) wants: `+14155550143`. Python's `phonenumbers` library does this if you're scripting; spreadsheet add-ons exist but they're painful at >5k rows.
|
||||||
|
|
||||||
|
**3. Default the country code per row.**
|
||||||
|
If the customer's address country is "United States", default the phone country to US. This catches the rows that are missing `+1` but are obviously American.
|
||||||
|
|
||||||
|
**4. Drop or quarantine anything still un-parseable.**
|
||||||
|
Don't import broken rows hoping Klaviyo will figure it out. It won't.
|
||||||
|
|
||||||
|
**5. Re-import the cleaned CSV to a Shopify customer segment** (or push directly to Klaviyo via their API).
|
||||||
|
|
||||||
|
I eventually wrapped this whole pipeline into a desktop app called DataTools because doing it monthly was tedious. $49, runs locally so customer data stays on my machine, datatools.app/shopify-pet if you're curious. But the 5 steps above are what actually matters — works regardless of tool.
|
||||||
|
|
||||||
|
Anyone else seeing low SMS deliverability? I'd bet money it's this.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
28
marketing/community-posts/shopify-pet/02-tip.md
Normal file
28
marketing/community-posts/shopify-pet/02-tip.md
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Shopify-pet · Post 2 — Tip
|
||||||
|
|
||||||
|
**Where to post:** LinkedIn, Shopify Discord, pet-business Facebook
|
||||||
|
groups, niche e-comm newsletters' "tip" inboxes.
|
||||||
|
|
||||||
|
**Format:** ~150 words.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
The hidden character in your Shopify customer export that breaks Klaviyo imports (and how to spot it)
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Open your Shopify customer export. Look at the email column.
|
||||||
|
|
||||||
|
Some of your emails have an invisible character in them — usually a zero-width space (`U+200B`) or a non-breaking space (`U+00A0`) — typically introduced when a customer typed or pasted their address on a phone. They look identical to a normal email, but Klaviyo treats them as different addresses, so:
|
||||||
|
|
||||||
|
- Your "duplicate customer" check passes when it shouldn't
|
||||||
|
- The customer gets emailed twice
|
||||||
|
- Your unsubscribes don't propagate (the unsub list has the *clean* email; the next campaign send reaches them via the *invisible-char* email)
|
||||||
|
|
||||||
|
Spot it: in Excel, put `=LEN(A2)` in the column next to your emails and fill it down. Any row where the result is larger than the number of characters you can see has a hidden char in it.
|
||||||
|
|
||||||
|
If you want to automate the cleanup (plus phone normalization, dedupe, the whole pre-Klaviyo scrub), I built a $49 desktop tool: datatools.app/shopify-pet. Local — your customer list never leaves your computer.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
35
marketing/community-posts/shopify-pet/03-soft-offer.md
Normal file
35
marketing/community-posts/shopify-pet/03-soft-offer.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Shopify-pet · Post 3 — Soft offer
|
||||||
|
|
||||||
|
**Where to post:** IndieHackers, r/shopify monthly self-promo, Shopify
|
||||||
|
community "apps & tools" forum, pet-business newsletters.
|
||||||
|
|
||||||
|
**Format:** ~250 words.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Title
|
||||||
|
|
||||||
|
DataTools — a $49 desktop tool that gets your Shopify customer export Klaviyo-import-ready in 30 seconds
|
||||||
|
|
||||||
|
## Body
|
||||||
|
|
||||||
|
Built this for my own store and figured fellow Shopify owners might want it.
|
||||||
|
|
||||||
|
The problem: Shopify's customer CSV export is *almost* Klaviyo-ready, but not quite. Phones in five different formats. Hidden whitespace in addresses. Duplicate-customer rows from the same person ordering twice with slightly different emails. Country fields blank for half your international orders. You either fix it by hand every month or accept that ~15-20% of your list is broken.
|
||||||
|
|
||||||
|
**DataTools** is six CSV tools as one pipeline:
|
||||||
|
|
||||||
|
- **Format standardizer** — phones to E.164 (Klaviyo-ready), addresses normalized, currencies in your store's locale
|
||||||
|
- **Dedupe** — fuzzy matching catches "Sarah O'Brien" = "sarah obrien" = "Sarah OBrien" before they become 3 customers in Klaviyo
|
||||||
|
- **Text cleaner** — strips zero-width spaces, BOMs, weird unicode the customer typed on their phone
|
||||||
|
- **Gate** — quarantine rows that won't survive the import (missing email, malformed phone) so you know what got dropped and why
|
||||||
|
- **Analyzer** — runs first, tells you what's wrong before you start fixing
|
||||||
|
- **Splitter** — chunks oversized exports for tools with row limits
|
||||||
|
|
||||||
|
Runs **locally** on Mac/Win/Linux. Customer data never goes to a server — that was the whole point. No subscription. **$49 one-time**, v1.x updates included.
|
||||||
|
|
||||||
|
Demo (with synthetic data) and download: datatools.app/shopify-pet
|
||||||
|
|
||||||
|
Built by a fellow Shopify store owner. Happy to answer questions in the thread.
|
||||||
|
|
||||||
|
— {{your-name}}
|
||||||
60
marketing/emails/README.md
Normal file
60
marketing/emails/README.md
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
# Email sequences
|
||||||
|
|
||||||
|
Per niche (`bookkeeper/`, `revops/`, `shopify-pet/`):
|
||||||
|
|
||||||
|
- **`00-delivery.md`** — Day 0 Gumroad delivery email. Triggered when
|
||||||
|
Gumroad confirms the purchase. Job #1: get the buyer to download
|
||||||
|
and open the app inside the first 24h. Buyers who don't open within
|
||||||
|
72h refund at ~3× the rate of buyers who do.
|
||||||
|
- **`01-day1.md`** — Day 1 nudge with a sample file matched to the
|
||||||
|
niche. The Day-1 email is the highest-leverage one in the
|
||||||
|
sequence; it converts "I bought it" into "I used it".
|
||||||
|
- **`02-day3.md`** — Day 3 deep-dive on one specific feature the
|
||||||
|
niche cares about most.
|
||||||
|
- **`03-day7.md`** — Day 7 workflow framing. "Use it every {month /
|
||||||
|
campaign / sync}, not as a one-off."
|
||||||
|
- **`04-day14.md`** — Day 14 power-user tip. Surfaces a non-obvious
|
||||||
|
feature; converts "I use it" into "I rely on it".
|
||||||
|
- **`05-day30.md`** — Day 30 referral / review ask.
|
||||||
|
|
||||||
|
## Sender setup
|
||||||
|
|
||||||
|
- **From:** `support@datatools.app` (single-sender to keep replies in
|
||||||
|
one inbox; don't fan out to per-niche aliases until volume warrants)
|
||||||
|
- **Reply-To:** same — every email expects a reply pathway
|
||||||
|
- **List provider:** Gumroad's built-in for delivery; Buttondown or
|
||||||
|
ConvertKit for the 5-touch sequence (Gumroad's drip is too crude
|
||||||
|
for niche segmentation)
|
||||||
|
- **Segmentation:** customers self-tag at checkout (Gumroad custom
|
||||||
|
field "What do you do?"). Map: `bookkeeper`, `revops`,
|
||||||
|
`shopify-pet`, `other`. `other` gets a generic sequence (not
|
||||||
|
drafted yet — Tier C).
|
||||||
|
|
||||||
|
## Variables
|
||||||
|
|
||||||
|
All emails use these placeholders. Set them at sequence-import time,
|
||||||
|
not per-email:
|
||||||
|
|
||||||
|
- `{{first_name}}` — Gumroad provides; fall back to "there" if blank
|
||||||
|
- `{{download_url}}` — niche-specific download URL from Gumroad
|
||||||
|
- `{{sample_file_url}}` — niche-specific sample CSV (`samples/demo/...`)
|
||||||
|
- `{{landing_page}}` — niche-specific landing page URL
|
||||||
|
- `{{support_email}}` — `support@datatools.app`
|
||||||
|
|
||||||
|
## Cadence and quiet rules
|
||||||
|
|
||||||
|
- Don't send between 10pm and 7am buyer-local-time (Buttondown supports
|
||||||
|
TZ-aware send; ConvertKit doesn't out of the box)
|
||||||
|
- If the buyer replies to *any* email in the sequence, pause the
|
||||||
|
remaining touches until you've replied to them. A drip that
|
||||||
|
ignores a customer reply reads as worse than no drip.
|
||||||
|
- If the buyer requests a refund, kill the sequence immediately.
|
||||||
|
- Day 14 + Day 30 emails are skippable if the buyer has already
|
||||||
|
emailed support with a feature request or bug report — they're
|
||||||
|
engaged enough; don't pile on.
|
||||||
|
|
||||||
|
## Subject lines
|
||||||
|
|
||||||
|
Subjects are owned by `marketing/COPY.md` § 4. Don't edit subjects
|
||||||
|
in-line in the email files; edit COPY.md and re-propagate. Same
|
||||||
|
discipline applies to the closing CTA — owned by COPY.md § 0.
|
||||||
34
marketing/emails/bookkeeper/00-delivery.md
Normal file
34
marketing/emails/bookkeeper/00-delivery.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Bookkeeper · Day 0 — Delivery email
|
||||||
|
|
||||||
|
**Subject:** Your DataTools download (start here)
|
||||||
|
**Send:** immediately on Gumroad purchase confirmation
|
||||||
|
**Goal:** buyer downloads + opens the app within 24h
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Thanks for buying DataTools. Your download:
|
||||||
|
|
||||||
|
→ **{{download_url}}**
|
||||||
|
|
||||||
|
Three things to do in the next 5 minutes so you don't lose this email under the next 200:
|
||||||
|
|
||||||
|
**1. Download the installer for your OS** (Mac `.dmg`, Windows `.exe`, or Linux `.tar.gz`). About 280 MB. The link above auto-detects.
|
||||||
|
|
||||||
|
**2. Run it.** First launch takes ~5 seconds; a browser tab opens to `127.0.0.1:8501`. That's the app — running locally on your machine, no network calls. If your browser doesn't open automatically, the terminal window shows the URL.
|
||||||
|
|
||||||
|
**3. Drop in a real bank export.** Don't bother with the bundled samples — DataTools is built for messy real-world files. Pull last month's bank export from any client, drag it into the analyzer, and click "Run all". You'll see what the pipeline catches in about 20 seconds.
|
||||||
|
|
||||||
|
If something doesn't work: just reply to this email. I read every reply (it goes to my own inbox, not a queue).
|
||||||
|
|
||||||
|
If you want to refund: also just reply. 30-day no-questions; no form to fill out.
|
||||||
|
|
||||||
|
Tomorrow I'll send a sample bank export with a few of the tricky cases pre-built in, so you can see what the gate report looks like on a known input. After that you'll get four more emails spread over the next month (days 3, 7, 14, and 30) with one tip each — feel free to unsubscribe at the bottom of any of them.
|
||||||
|
|
||||||
|
Welcome aboard.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
|
|
||||||
|
P.S. If you have a bookkeeper friend who'd find this useful, the share-friendly landing page is {{landing_page}}.
|
||||||
31
marketing/emails/bookkeeper/01-day1.md
Normal file
31
marketing/emails/bookkeeper/01-day1.md
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
# Bookkeeper · Day 1 — Try it on this messy bank export first
|
||||||
|
|
||||||
|
**Subject:** Try it on this messy bank export first
|
||||||
|
**Send:** Day 1, ~9am buyer-local-time
|
||||||
|
**Goal:** convert "I bought it" → "I ran it on something"
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Yesterday's email had your download. Today's email has a *file* — a sample bank export I built specifically to break things.
|
||||||
|
|
||||||
|
→ **{{sample_file_url}}** (260 KB CSV, 1,400 rows of synthetic data — no real account info)
|
||||||
|
|
||||||
|
It's modeled after real exports I've seen from US, UK, and Canadian banks. Hidden in there:
|
||||||
|
|
||||||
|
- Mixed date formats (some `MM/DD/YYYY`, some `DD-MM-YY`, one row in `YYYY-MM-DD`)
|
||||||
|
- Six different spellings of "Amazon" across the merchant column
|
||||||
|
- Trailing whitespace + non-breaking spaces in the description column
|
||||||
|
- Three obvious duplicate transactions and two non-obvious ones (different timestamps, same amount + merchant)
|
||||||
|
- A totals row at the bottom that's not a transaction
|
||||||
|
- One row with currency in `€` instead of `$`
|
||||||
|
|
||||||
|
Drop it into DataTools, click **"Run all"** in the analyzer, and look at the gate report. It'll catch all of the above and tell you exactly what changed and why.
|
||||||
|
|
||||||
|
The audit trail (a sidecar CSV called `<filename>.audit.csv`) is the part most bookkeepers are surprised by. Open it in Excel — every change has a row: original value, new value, rule that fired, timestamp. That's the file you hand to your client when they ask "wait, why did you re-classify that?".
|
||||||
|
|
||||||
|
Try it once on the sample, then once on a real client export. Reply and tell me what it caught (or missed) — I'm building the v1.1 detector list from real-world feedback.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
35
marketing/emails/bookkeeper/02-day3.md
Normal file
35
marketing/emails/bookkeeper/02-day3.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Bookkeeper · Day 3 — The audit trail your client will actually open
|
||||||
|
|
||||||
|
**Subject:** The audit trail your client will actually open
|
||||||
|
**Send:** Day 3
|
||||||
|
**Goal:** deepen feature understanding around the audit trail (the
|
||||||
|
real differentiator vs. spreadsheet workflow)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Most "data cleaning" tools spit out a clean file and call it done. The thing your *client* needs — and what protects you in a year when they ask "why did you change that?" — is the audit trail.
|
||||||
|
|
||||||
|
Here's the file DataTools writes alongside every cleaned export. It's a CSV called `<filename>.audit.csv` and it sits next to the cleaned file in your output folder.
|
||||||
|
|
||||||
|
Five columns, append-only:
|
||||||
|
|
||||||
|
| original_value | new_value | rule_applied | confidence | timestamp |
|
||||||
|
|----------------|-----------|--------------|------------|-----------|
|
||||||
|
| `AMZN Mktp` | `Amazon` | `merchant_canonicalize` | 0.94 | 2026-05-04T09:12:03 |
|
||||||
|
| ` Starbucks ` | `Starbucks` | `whitespace_strip` | 1.00 | 2026-05-04T09:12:03 |
|
||||||
|
| `01/02/26` | `2026-02-01` | `date_normalize_dmy` | 0.88 | 2026-05-04T09:12:03 |
|
||||||
|
|
||||||
|
Why this matters in a real client conversation:
|
||||||
|
|
||||||
|
- **The client asks "why is this Amazon when my statement says AMZN Mktp?"** — open the audit CSV, point at the `merchant_canonicalize` row. Done in 10 seconds.
|
||||||
|
- **A reviewer (auditor, accountant, you in 6 months) asks "what changed?"** — the audit CSV is the answer. Diffable, openable in Excel, no proprietary format.
|
||||||
|
- **You spot a wrong rule firing** — the `confidence` column tells you which rules to tune. Anything <0.90 is worth eyeballing.
|
||||||
|
|
||||||
|
One workflow change worth making: when you send the cleaned file to QuickBooks, send the audit CSV to the client at the same time, in a folder labeled "month-end audit trail". Most clients won't open it. The 10% that do will trust you forever.
|
||||||
|
|
||||||
|
Reply if you want me to walk through the audit format on a call — happy to do a quick screen-share for any buyer in the first 30 days.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
32
marketing/emails/bookkeeper/03-day7.md
Normal file
32
marketing/emails/bookkeeper/03-day7.md
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Bookkeeper · Day 7 — One pipeline, every client, every month
|
||||||
|
|
||||||
|
**Subject:** One pipeline, every client, every month
|
||||||
|
**Send:** Day 7
|
||||||
|
**Goal:** reframe from one-off tool to monthly workflow
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
A week in. By now you've probably run DataTools on 1-2 client exports and confirmed it does what the landing page promised.
|
||||||
|
|
||||||
|
The thing buyers tell me they wish they'd done from day one: **set it up as a workflow, not a one-off.**
|
||||||
|
|
||||||
|
The pattern that works:
|
||||||
|
|
||||||
|
**1. Make a folder per client.** Inside each client folder, a subfolder per month: `Acme Co/2026-05/`. Drop the raw export here.
|
||||||
|
|
||||||
|
**2. Save your DataTools settings as a per-client preset.** The "Save settings" button in the analyzer drops a `.datatools-preset.json` file. Stash that in the client folder. Next month, load the preset and the analyzer pre-configures with the rules you tuned for that client (e.g., your "Amazon Marketplace" canonical name, your client's specific merchant aliases).
|
||||||
|
|
||||||
|
**3. Run the pipeline. Get three files back:** the cleaned CSV, the audit CSV, the gate report. Move them into `Acme Co/2026-05/cleaned/`.
|
||||||
|
|
||||||
|
**4. Import the cleaned CSV to QuickBooks. Email the audit CSV to the client.**
|
||||||
|
|
||||||
|
Total elapsed time per client per month, after the first: 3-5 minutes. The first month per client is longer (~15 min) because you're tuning the preset.
|
||||||
|
|
||||||
|
The buyers who do this are the ones still emailing me 3 months later — usually with feature requests for the next client they want to onboard. The buyers who only ever run it ad-hoc tend to drift back to spreadsheets within 2 months.
|
||||||
|
|
||||||
|
If you want, reply with a sanitized export and I'll show you what your starting preset should look like — happy to do this for the first 50 buyers.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
35
marketing/emails/bookkeeper/04-day14.md
Normal file
35
marketing/emails/bookkeeper/04-day14.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Bookkeeper · Day 14 — Two-minute trick: the gate report
|
||||||
|
|
||||||
|
**Subject:** Two-minute trick: the gate report
|
||||||
|
**Send:** Day 14
|
||||||
|
**Goal:** surface the gate tool — non-obvious, high-value once seen
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
The tool inside DataTools that buyers find last is the **gate** — and it's the one that quietly does the most for you.
|
||||||
|
|
||||||
|
What it does: before any row gets written to the cleaned CSV, the gate runs a per-row pass/fail check. Rows that fail get *quarantined* into a separate file (`<filename>.quarantine.csv`) instead of being silently dropped or silently passed through.
|
||||||
|
|
||||||
|
Default rules (you can add your own):
|
||||||
|
|
||||||
|
- Missing required fields (date, amount)
|
||||||
|
- Amount in unexpected currency without a flag
|
||||||
|
- Date outside the export's stated range (catches the "totals row" issue from Day 1)
|
||||||
|
- Duplicate of another row already in the file (per the dedupe pass)
|
||||||
|
- Confidence below your threshold on a field that got auto-corrected
|
||||||
|
|
||||||
|
The 2-minute workflow:
|
||||||
|
|
||||||
|
1. Run the pipeline as usual.
|
||||||
|
2. Open `<filename>.quarantine.csv`. (It'll be tiny — typically 0-5% of rows.)
|
||||||
|
3. Eyeball it. Anything that's a real transaction, fix-and-re-include manually. Anything that's a totals row / blank row / corrupt row — confirm it's correctly quarantined and delete it.
|
||||||
|
4. Re-run the pipeline on the fixed-up version (or just append the manually-fixed rows to the cleaned CSV).
|
||||||
|
|
||||||
|
The reason this matters: silent drops are the worst possible failure mode for a bookkeeper. You'd rather a row come out wrong (you'll catch it on review) than disappear (you won't catch it for months). The gate makes the silent-drop case impossible.
|
||||||
|
|
||||||
|
Set the gate's confidence threshold to `0.85` for client work. Lower (0.75) for personal / exploratory; higher (0.92+) only if you've spent time tuning your client's preset.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
26
marketing/emails/bookkeeper/05-day30.md
Normal file
26
marketing/emails/bookkeeper/05-day30.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Bookkeeper · Day 30 — Heard from a fellow bookkeeper?
|
||||||
|
|
||||||
|
**Subject:** Heard from a fellow bookkeeper?
|
||||||
|
**Send:** Day 30
|
||||||
|
**Goal:** referral / review ask. Last touch in the sequence.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
A month in. If DataTools earned its $49 — would you do me one (very small) favor?
|
||||||
|
|
||||||
|
**Pick one of these. Whichever is easiest.**
|
||||||
|
|
||||||
|
1. **Gumroad review** (60 seconds): {{download_url}}#reviews — even a single line helps the next bookkeeper trust the listing enough to click "buy".
|
||||||
|
2. **Reply to this email with one sentence I can quote** on the bookkeeper landing page. Anonymous if you prefer; I'll never use a name without explicit permission.
|
||||||
|
3. **Share the landing page** with one bookkeeper friend who'd benefit: {{landing_page}}. No referral commission scheme, just a link.
|
||||||
|
|
||||||
|
If DataTools *didn't* earn its $49 — also reply. Tell me what's missing or what's broken. The 30-day refund window is still open and I'd rather refund a buyer who didn't get value than have an unhappy customer in the wild.
|
||||||
|
|
||||||
|
Either way, this is the last automated email you'll get from me. After this you only hear from me when there's a v1.x update or if you reply to one of the previous emails.
|
||||||
|
|
||||||
|
Thanks for being an early buyer — the first 50 customers shape the next 5,000.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
34
marketing/emails/revops/00-delivery.md
Normal file
34
marketing/emails/revops/00-delivery.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# RevOps · Day 0 — Delivery email
|
||||||
|
|
||||||
|
**Subject:** Your DataTools download (start here)
|
||||||
|
**Send:** immediately on Gumroad purchase confirmation
|
||||||
|
**Goal:** download + first run within 24h
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Thanks for buying DataTools. Your download:
|
||||||
|
|
||||||
|
→ **{{download_url}}**
|
||||||
|
|
||||||
|
Three things to do in the next 5 minutes:
|
||||||
|
|
||||||
|
**1. Download the installer for your OS** (Mac `.dmg`, Windows `.exe`, or Linux `.tar.gz`). About 280 MB. The link auto-detects.
|
||||||
|
|
||||||
|
**2. Run it.** First launch takes ~5 seconds; a browser tab opens to `127.0.0.1:8501`. That's the app — running locally on your machine. No data leaves the box. (Yes, even if you're on the corporate VPN. Especially then.)
|
||||||
|
|
||||||
|
**3. Drop in a real lead list.** Don't bother with the bundled samples — the gate report only gets interesting when the data is real. Pull last quarter's webform export, or your most recent Apollo / LinkedIn pull, drag it into the analyzer, and click **"Run all"**. You'll see what the dedupe + format pipeline does in about 30 seconds.
|
||||||
|
|
||||||
|
If something doesn't work: just reply. I read every reply.
|
||||||
|
|
||||||
|
Refund: also just reply. 30-day no-questions; no form.
|
||||||
|
|
||||||
|
Tomorrow I'll send a sample 3-vendor lead list (HubSpot + LinkedIn + Apollo, synthetic data) so you can see the dedupe confidence tiers in action on a known input. After that you'll get four more emails spread over the next month (days 3, 7, 14, and 30) — practical tips, no upsell. Unsubscribe at the bottom of any of them.
|
||||||
|
|
||||||
|
Welcome aboard.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
|
|
||||||
|
P.S. If you have a RevOps friend who'd find this useful: {{landing_page}}.
|
||||||
36
marketing/emails/revops/01-day1.md
Normal file
36
marketing/emails/revops/01-day1.md
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# RevOps · Day 1 — Try it on this 3-vendor lead list first
|
||||||
|
|
||||||
|
**Subject:** Try it on this 3-vendor lead list first
|
||||||
|
**Send:** Day 1, ~9am buyer-local-time
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Yesterday's email had your download. Today's email has a *file* — a synthetic 3-vendor lead list (HubSpot + LinkedIn scrape + Apollo pull) that I built specifically to break naive dedupe.
|
||||||
|
|
||||||
|
→ **{{sample_file_url}}** (1.2 MB CSV, 4,800 rows — fully synthetic, no real prospects)
|
||||||
|
|
||||||
|
What's hidden in there:
|
||||||
|
|
||||||
|
- The same person from 3 sources, with intentionally inconsistent fields:
|
||||||
|
- HubSpot row: full email + company; no LinkedIn URL
|
||||||
|
- LinkedIn row: name + title + LinkedIn URL; no email
|
||||||
|
- Apollo row: email + phone + company; misspelled name
|
||||||
|
- ~120 obvious duplicates (same email, different case)
|
||||||
|
- ~80 cross-source duplicates (different keys, same person — these are the ones HubSpot's native dedupe misses)
|
||||||
|
- ~40 phone numbers in 5 different formats per country (+1, +44, +61)
|
||||||
|
- One row in every 200 with a hidden zero-width space in the email
|
||||||
|
|
||||||
|
Drop it into DataTools, click **"Run all"** in the analyzer, then run the **dedupe** tool with the default 0.85 threshold.
|
||||||
|
|
||||||
|
Look at three things in the output:
|
||||||
|
|
||||||
|
1. **The cleaned CSV** — what your import would look like
|
||||||
|
2. **The audit CSV** — every change, every rule, confidence per change
|
||||||
|
3. **The manual-review queue** (`<filename>.review.csv`) — the 0.85-0.95 confidence range. This is where the real dedupe value is; auto-merging this range is what gets people in trouble.
|
||||||
|
|
||||||
|
Try it once on the sample, then once on a real list. Reply and tell me what it caught (or missed) — the v1.1 fuzzy-matching tuning comes from real-world feedback.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
36
marketing/emails/revops/02-day3.md
Normal file
36
marketing/emails/revops/02-day3.md
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
# RevOps · Day 3 — The dedupe rule that catches LinkedIn drift
|
||||||
|
|
||||||
|
**Subject:** The dedupe rule that catches LinkedIn drift
|
||||||
|
**Send:** Day 3
|
||||||
|
**Goal:** deepen feature understanding around the cross-source dedupe
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
The thing native HubSpot / Salesforce dedupe can't do, and the thing DataTools is actually best at: **cross-source matching**, where the same person shows up via LinkedIn, a webform, and a trade-show import — with no shared key.
|
||||||
|
|
||||||
|
The rule that does the work is in the dedupe tool's **"Block by domain, fuzzy on name+title"** mode. Here's what it does:
|
||||||
|
|
||||||
|
**Step 1 — Block.** Group rows by email domain. (LinkedIn rows with no email get bucketed by `domain(linkedin_url)` — usually their company website if they listed it.) This avoids the O(n²) explosion and rules out cross-company false positives.
|
||||||
|
|
||||||
|
**Step 2 — Within each block, fuzzy-match on `first_name + last_name + title`.** Token-set ratio at 0.85 default. Catches:
|
||||||
|
|
||||||
|
- "Sarah O'Brien, VP Marketing" = "sarah obrien, vp of marketing"
|
||||||
|
- "Mike Chen, Head of Sales" = "Michael Chen, Sales Lead" (this one needs a 0.78 threshold; configurable)
|
||||||
|
- "J. Smith, Director" = "Jane Smith, Director" (only with a strong company-name match)
|
||||||
|
|
||||||
|
**Step 3 — Confidence-tier the merge.** ≥0.95 auto-merges. 0.85-0.95 goes to `<filename>.review.csv` for you to eyeball. <0.85 stays unmerged.
|
||||||
|
|
||||||
|
**Step 4 — Field-precedence on merge.** When records merge, you choose which source wins per field. Default precedence (configurable):
|
||||||
|
|
||||||
|
- `title`, `company`, `linkedin_url` → LinkedIn wins (more recent)
|
||||||
|
- `email`, `phone` → Webform wins (verified)
|
||||||
|
- `lifecycle_stage`, `owner` → HubSpot wins (your CRM is canonical)
|
||||||
|
|
||||||
|
**One trap to avoid:** don't run dedupe before format standardization. If phone formats are inconsistent across sources, the dedupe tool sees "+14155550143" and "(415) 555-0143" as different keys. Always run **format → analyzer → dedupe → gate** in that order. The pipeline UI enforces this; the per-tool runs don't.
|
||||||
|
|
||||||
|
Reply if you want me to walk through the precedence config on a screen-share — happy to do this for any buyer in the first 30 days.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
34
marketing/emails/revops/03-day7.md
Normal file
34
marketing/emails/revops/03-day7.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# RevOps · Day 7 — Run it before every HubSpot import
|
||||||
|
|
||||||
|
**Subject:** Run it before every HubSpot import
|
||||||
|
**Send:** Day 7
|
||||||
|
**Goal:** reframe from one-off tool to per-campaign workflow
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
A week in. By now you've probably run DataTools on a real list once or twice and confirmed the dedupe catches more than HubSpot's native check.
|
||||||
|
|
||||||
|
The thing that turns DataTools from a one-off purchase into a tool that saves you money every month: **make it the gate on every import.**
|
||||||
|
|
||||||
|
The pattern that works:
|
||||||
|
|
||||||
|
**1. One DataTools run per campaign source.** Webform pull → DataTools. LinkedIn scrape → DataTools. Apollo export → DataTools. Each run produces a "clean" CSV.
|
||||||
|
|
||||||
|
**2. Concatenate the cleaned CSVs.** Standard pandas `concat` or just paste in Excel.
|
||||||
|
|
||||||
|
**3. One more DataTools run on the concatenation.** This is the cross-source dedupe pass — the one that catches the same person across the three sources.
|
||||||
|
|
||||||
|
**4. Compare against your current HubSpot export.** Run DataTools' dedupe with your existing CRM export as the second source; it catches the people you already paid for last quarter and don't need to import again.
|
||||||
|
|
||||||
|
**5. Import only the residue** — the rows that survived every pass above — into HubSpot.
|
||||||
|
|
||||||
|
The buyers running this pipeline tell me they've cut their HubSpot marketing-contact bill 15-25% within two months. Not because their pipeline got smaller — because they stopped paying for duplicates.
|
||||||
|
|
||||||
|
**One thing to set up once:** save your dedupe settings as a `.datatools-preset.json` and commit it to your RevOps team's repo (or a shared Drive folder). Same preset every campaign means consistent results across whoever's running it that week.
|
||||||
|
|
||||||
|
If you want, reply with a sanitized lead list and I'll suggest a starting preset for your sources — happy to do this for the first 50 buyers.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
34
marketing/emails/revops/04-day14.md
Normal file
34
marketing/emails/revops/04-day14.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# RevOps · Day 14 — Two-minute trick: the confidence tiers
|
||||||
|
|
||||||
|
**Subject:** Two-minute trick: the confidence tiers
|
||||||
|
**Send:** Day 14
|
||||||
|
**Goal:** surface the manual-review queue — non-obvious, high-value
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
The single most-skipped feature in DataTools is also the one with the highest payoff per minute: the **manual-review queue**.
|
||||||
|
|
||||||
|
Here's what's happening under the hood: every dedupe decision DataTools makes has a confidence score (0.0 to 1.0). The dedupe tool by default puts decisions into three buckets:
|
||||||
|
|
||||||
|
- **≥0.95** → auto-merge (cleaned CSV)
|
||||||
|
- **0.85 - 0.95** → manual-review queue (`<filename>.review.csv`)
|
||||||
|
- **<0.85** → unmerged (kept as separate rows)
|
||||||
|
|
||||||
|
The 0.85-0.95 bucket is the magic. It's the range where a tuned algorithm catches *most* duplicates but where the wrong choice is a real cost (merging two genuinely different people = lost prospect; not merging two duplicates = paid contact you didn't need).
|
||||||
|
|
||||||
|
The 2-minute workflow:
|
||||||
|
|
||||||
|
1. Run dedupe.
|
||||||
|
2. Open `<filename>.review.csv`. Each row is a candidate merge with: confidence, the two records side-by-side, the rule that fired.
|
||||||
|
3. Eyeball each row. Mark `keep_merge` (Y/N) in the rightmost column.
|
||||||
|
4. Re-run dedupe with the `--apply-review-decisions <filename>.review.csv` flag (or click "Apply review decisions" in the GUI).
|
||||||
|
5. Final cleaned CSV reflects your manual choices.
|
||||||
|
|
||||||
|
For a 5,000-row lead list, the review queue is typically 20-60 rows — about 2 minutes of work. The output is dramatically better than auto-merge-everything-≥0.85, which is what most tools (including HubSpot's) do silently.
|
||||||
|
|
||||||
|
**Pro move:** save your `keep_merge` decisions over time. After 3-4 campaigns you'll have a corpus of "yes-merges" and "no-merges" you can use to retune the auto-merge threshold for *your* data. Most teams find their sweet spot is somewhere in 0.88-0.92.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
26
marketing/emails/revops/05-day30.md
Normal file
26
marketing/emails/revops/05-day30.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# RevOps · Day 30 — Heard from another RevOps lead?
|
||||||
|
|
||||||
|
**Subject:** Heard from another RevOps lead?
|
||||||
|
**Send:** Day 30
|
||||||
|
**Goal:** referral / review ask
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
A month in. If DataTools earned its $49 — would you do me one small favor?
|
||||||
|
|
||||||
|
**Pick the one that's easiest.**
|
||||||
|
|
||||||
|
1. **Gumroad review** (60 seconds): {{download_url}}#reviews — every line helps the next RevOps lead trust the listing enough to click "buy".
|
||||||
|
2. **Reply to this email with one sentence I can quote** on the RevOps landing page. Anonymous if you prefer; I'll never use a name without explicit permission.
|
||||||
|
3. **Share the landing page** with one RevOps friend who'd benefit: {{landing_page}}. No referral commission, just a link.
|
||||||
|
|
||||||
|
If DataTools *didn't* earn its $49 — also reply. Tell me what's missing or broken. The 30-day refund window is still open and I'd rather refund than have an unhappy customer in the wild.
|
||||||
|
|
||||||
|
Either way, this is the last automated email you'll get from me. After this, you'll only hear from me when there's a v1.x update or if you reply to one of the previous emails.
|
||||||
|
|
||||||
|
Thanks for being an early buyer — the first 50 customers shape the next 5,000.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
34
marketing/emails/shopify-pet/00-delivery.md
Normal file
34
marketing/emails/shopify-pet/00-delivery.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Shopify-pet · Day 0 — Delivery email
|
||||||
|
|
||||||
|
**Subject:** Your DataTools download (start here)
|
||||||
|
**Send:** immediately on Gumroad purchase confirmation
|
||||||
|
**Goal:** download + first run within 24h
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Thanks for buying DataTools. Your download:
|
||||||
|
|
||||||
|
→ **{{download_url}}**
|
||||||
|
|
||||||
|
Three things to do in the next 5 minutes:
|
||||||
|
|
||||||
|
**1. Download the installer for your OS** (Mac `.dmg`, Windows `.exe`, or Linux `.tar.gz`). About 280 MB. The link auto-detects.
|
||||||
|
|
||||||
|
**2. Run it.** First launch takes ~5 seconds; a browser tab opens to `127.0.0.1:8501`. That's the app — running locally on your machine. No data leaves the box. Your customer list never goes to a server.
|
||||||
|
|
||||||
|
**3. Drop in a real Shopify customer export.** Don't bother with the bundled samples. Customers > Export > "All customers" > CSV in Shopify admin. Drag it into DataTools' analyzer, click **"Run all"**. You'll see what it catches — typically a few hundred phone-format issues, some hidden-character emails, and a handful of cross-row duplicates — in about 30 seconds.
|
||||||
|
|
||||||
|
If something doesn't work: reply to this email. Goes to my inbox.
|
||||||
|
|
||||||
|
Refund: also reply. 30-day no-questions; no form.
|
||||||
|
|
||||||
|
Tomorrow I'll send a sample Shopify customer export with the tricky cases pre-built in, so you can see what the cleanup catches on a known input. After that you'll get four more emails over the next month — three with one tip each, then a final check-in. Unsubscribe at the bottom of any of them.
|
||||||
|
|
||||||
|
Welcome aboard.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
|
|
||||||
|
P.S. Got a fellow store owner who'd find this useful? {{landing_page}}.
|
||||||
32
marketing/emails/shopify-pet/01-day1.md
Normal file
32
marketing/emails/shopify-pet/01-day1.md
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Shopify-pet · Day 1 — Try it on this Shopify customer export first
|
||||||
|
|
||||||
|
**Subject:** Try it on this Shopify customer export first
|
||||||
|
**Send:** Day 1, ~9am buyer-local-time
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
Yesterday's email had your download. Today's email has a *file* — a synthetic Shopify customer export I built specifically to break things Klaviyo silently chokes on.
|
||||||
|
|
||||||
|
→ **{{sample_file_url}}** (480 KB CSV, 2,200 rows — fully synthetic, no real customer data)
|
||||||
|
|
||||||
|
What's hidden in there:
|
||||||
|
|
||||||
|
- Phone numbers in 6 different formats (`(415) 555-0143`, `415.555.0143`, `4155550143`, `+44 20 7946 0958` without country field, `+1-415-555-0143 ext 12`, `415 555 0143`)
|
||||||
|
- Email addresses with embedded zero-width spaces (they look identical to clean emails; Klaviyo treats them as different addresses)
|
||||||
|
- ~80 obvious customer duplicates (same email, different case)
|
||||||
|
- ~40 cross-row duplicates (different email, same name + same shipping address — usually the same person ordering with two emails)
|
||||||
|
- Shipping addresses with mixed `St.` / `Street` / `St` / `STREET` for the same street name
|
||||||
|
- 12 customers from outside North America with country field blank
|
||||||
|
|
||||||
|
Drop it into DataTools. Click **"Run all"** in the analyzer. Then run **format → text-clean → dedupe → gate** in that order.
|
||||||
|
|
||||||
|
Look at the **gate report** at the end — it'll tell you exactly which rows would have broken Klaviyo, with a one-line "why" per row.
|
||||||
|
|
||||||
|
If you want to see the difference: import the **raw** file to a test Klaviyo list, then import the **cleaned** file to a different test list. Compare the SMS-deliverable count. The delta is what you've been losing every month.
|
||||||
|
|
||||||
|
Reply and tell me what it caught (or missed) — v1.1 detector improvements come from real-world feedback.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
33
marketing/emails/shopify-pet/02-day3.md
Normal file
33
marketing/emails/shopify-pet/02-day3.md
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Shopify-pet · Day 3 — The phone-format step Klaviyo cares about
|
||||||
|
|
||||||
|
**Subject:** The phone-format step Klaviyo cares about
|
||||||
|
**Send:** Day 3
|
||||||
|
**Goal:** deepen feature understanding around the format standardizer
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
The single biggest source of "Klaviyo dropped this customer silently" is phone formatting. DataTools fixes this in one tool — the **format standardizer** — but the *settings* matter.
|
||||||
|
|
||||||
|
Klaviyo (and basically every modern SMS platform) wants phones in **E.164** format: `+` then country code then number, no spaces, no dashes, no extension. Like: `+14155550143`.
|
||||||
|
|
||||||
|
Three settings in DataTools' format standardizer that get this right:
|
||||||
|
|
||||||
|
**1. Set "Phone output format" to `E.164`.** Default is `national` (`(415) 555-0143`) — fine for display, broken for Klaviyo. Change it once; the preset remembers.
|
||||||
|
|
||||||
|
**2. Set "Default country" per row, not per file.** This is the non-obvious one. For each customer:
|
||||||
|
- If the `country` field has a value (e.g., "Canada", "CA", "Canadá"), use it.
|
||||||
|
- If blank, fall back to the country in the *shipping address*.
|
||||||
|
- If still blank, fall back to the file-level default (you set this — typically your store's primary market).
|
||||||
|
|
||||||
|
DataTools does this automatically when you check "Use per-row country detection". *Skip this and ~30% of international customers will end up with US country codes prepended to their numbers — which Klaviyo accepts but routes wrong, and your SMS never arrives.*
|
||||||
|
|
||||||
|
**3. Set "Quarantine un-parseable phones" to ON.** Don't drop them silently; don't pass them to Klaviyo broken. Send them to `<filename>.quarantine.csv` so you can fix the worst 10-20 by hand and re-include them.
|
||||||
|
|
||||||
|
The combination — E.164 + per-row country + quarantine — typically takes a Shopify export from "60-70% of phones survive Klaviyo's import" to "97-99%". On a 10,000-customer list, that's 2,500 - 3,500 more customers reachable per campaign.
|
||||||
|
|
||||||
|
Reply if you want me to walk through these settings on a screen-share — happy to do this for any buyer in the first 30 days.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
35
marketing/emails/shopify-pet/03-day7.md
Normal file
35
marketing/emails/shopify-pet/03-day7.md
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
# Shopify-pet · Day 7 — Run it before every Klaviyo sync
|
||||||
|
|
||||||
|
**Subject:** Run it before every Klaviyo sync
|
||||||
|
**Send:** Day 7
|
||||||
|
**Goal:** reframe from one-off tool to per-sync workflow
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
A week in. By now you've probably run DataTools on a real customer export once or twice and seen the cleanup catch things you'd been losing in Klaviyo for months.
|
||||||
|
|
||||||
|
The thing that turns DataTools into a recurring win instead of a one-off purchase: **run it before every sync, not just the first time.**
|
||||||
|
|
||||||
|
The pattern that works for most stores:
|
||||||
|
|
||||||
|
**1. Pick a cadence.** Most stores I talk to do this monthly; high-volume stores do it weekly. The cadence should match your "I'm planning a campaign" rhythm.
|
||||||
|
|
||||||
|
**2. The Sunday-morning ritual:**
|
||||||
|
- Pull a fresh customer export from Shopify (Customers > Export > "All customers")
|
||||||
|
- Drop into DataTools
|
||||||
|
- Run the pipeline (analyzer → format → text-clean → dedupe → gate)
|
||||||
|
- Review the gate quarantine file (typically 0.5-2% of rows)
|
||||||
|
- Push the cleaned CSV to Klaviyo (their CSV import or via their API)
|
||||||
|
|
||||||
|
**3. Save your settings as a preset.** The "Save settings" button writes a `.datatools-preset.json`. Keep it in your store's Drive / Notion / wherever your shop docs live. Next month, load preset, run pipeline, done in 4 minutes.
|
||||||
|
|
||||||
|
**4. After 3 months, retune the preset.** Look at your manual-review queue across the 3 runs. If you're consistently approving 0.86-confidence merges, drop the auto-merge threshold to 0.85. If you're rejecting 0.92 merges, raise it to 0.94. The preset improves with use.
|
||||||
|
|
||||||
|
The store owners doing this monthly tell me their open rates go up 8-15% in the first 90 days — not from new content, just from the email actually reaching the inbox.
|
||||||
|
|
||||||
|
If you want, reply with a sanitized export and I'll suggest a starting preset for your store — happy to do this for the first 50 buyers.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
32
marketing/emails/shopify-pet/04-day14.md
Normal file
32
marketing/emails/shopify-pet/04-day14.md
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Shopify-pet · Day 14 — Two-minute trick: hidden-character cleanup
|
||||||
|
|
||||||
|
**Subject:** Two-minute trick: hidden-character cleanup
|
||||||
|
**Send:** Day 14
|
||||||
|
**Goal:** surface the text cleaner — non-obvious, high-value
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
The tool inside DataTools that buyers find last is the **text cleaner** — and on Shopify customer exports it's usually the one with the most "wait, that was a problem?" moments.
|
||||||
|
|
||||||
|
What it catches: invisible characters that got into your customer data when customers typed on their phones. The most common offenders:
|
||||||
|
|
||||||
|
- **Zero-width space** (`U+200B`) inside emails — Klaviyo treats `sarah@acme.com` (with hidden char) and `sarah@acme.com` (without) as different addresses
|
||||||
|
- **Non-breaking space** (`U+00A0`) inside addresses — Shopify accepts it, Klaviyo accepts it, but USPS address validation fails on it
|
||||||
|
- **Byte-order mark (BOM)** (`U+FEFF`) at the start of CSV cells — usually from a customer pasting from Word or a PDF
|
||||||
|
- **Right-to-left mark** (`U+200F`) — rare, but appears in customer names from Hebrew/Arabic locales
|
||||||
|
|
||||||
|
The 2-minute workflow:
|
||||||
|
|
||||||
|
1. After the format standardizer pass, run the text cleaner.
|
||||||
|
2. It produces an additional sidecar file: `<filename>.hidden-chars.csv` — every cell where it found a hidden char, with a "what was hidden where" annotation.
|
||||||
|
3. Skim it. Most are fine to silently strip (zero-width spaces, BOMs). For rare ones (right-to-left marks in a name), confirm before stripping — sometimes they're load-bearing.
|
||||||
|
4. Click "Apply cleanup". The text cleaner replaces the hidden chars in the cleaned CSV.
|
||||||
|
|
||||||
|
The reason this matters: **dedupe runs after text-clean.** Two emails with a hidden char difference look identical in the GUI but get treated as two separate customers — and your dedupe pass won't catch them unless the text cleaner ran first.
|
||||||
|
|
||||||
|
The pipeline order baked into the GUI is: `analyzer → format → text-clean → dedupe → gate`. Stick to it; per-tool runs out of order are the most common source of "wait, why didn't dedupe catch this?".
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
26
marketing/emails/shopify-pet/05-day30.md
Normal file
26
marketing/emails/shopify-pet/05-day30.md
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Shopify-pet · Day 30 — Heard from another store owner?
|
||||||
|
|
||||||
|
**Subject:** Heard from another store owner?
|
||||||
|
**Send:** Day 30
|
||||||
|
**Goal:** referral / review ask
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Hi {{first_name}},
|
||||||
|
|
||||||
|
A month in. If DataTools earned its $49 — would you do me one small favor?
|
||||||
|
|
||||||
|
**Pick the one that's easiest.**
|
||||||
|
|
||||||
|
1. **Gumroad review** (60 seconds): {{download_url}}#reviews — every line helps the next Shopify owner trust the listing enough to click "buy".
|
||||||
|
2. **Reply to this email with one sentence I can quote** on the landing page. Anonymous if you prefer; I'll never use a name without explicit permission.
|
||||||
|
3. **Share the landing page** with one fellow store owner who'd benefit: {{landing_page}}. No referral commission, just a link.
|
||||||
|
|
||||||
|
If DataTools *didn't* earn its $49 — also reply. Tell me what's missing or broken. The 30-day refund window is still open and I'd rather refund than have an unhappy customer in the wild.
|
||||||
|
|
||||||
|
Either way, this is the last automated email you'll get from me. After this, you'll only hear from me when there's a v1.x update or if you reply to one of the previous emails.
|
||||||
|
|
||||||
|
Thanks for being an early buyer — the first 50 customers shape the next 5,000.
|
||||||
|
|
||||||
|
— Michael
|
||||||
|
{{support_email}}
|
||||||
@@ -12,9 +12,14 @@ markers =
|
|||||||
e2e: end-to-end CLI / integration tests
|
e2e: end-to-end CLI / integration tests
|
||||||
install: import / dependency sanity tests
|
install: import / dependency sanity tests
|
||||||
fixture_sweep: parametrized sweep over the test-cases/ folder
|
fixture_sweep: parametrized sweep over the test-cases/ folder
|
||||||
|
gui: Streamlit AppTest-driven tests (live in tests/gui/)
|
||||||
|
|
||||||
# Warnings discipline: fail on unexpected DeprecationWarning from our own
|
# Warnings discipline: fail on any DeprecationWarning *or* ResourceWarning
|
||||||
# code, but tolerate third-party deprecations that we can't fix.
|
# from our own ``src`` package so a leaked file handle or stale stdlib call
|
||||||
|
# can't slip in unnoticed. Tolerate third-party deprecations / resource
|
||||||
|
# warnings — we can't fix pandas / openpyxl / streamlit churn from here.
|
||||||
filterwarnings =
|
filterwarnings =
|
||||||
error::DeprecationWarning:src
|
error::DeprecationWarning:src
|
||||||
|
error::ResourceWarning:src
|
||||||
ignore::DeprecationWarning
|
ignore::DeprecationWarning
|
||||||
|
ignore::ResourceWarning
|
||||||
|
|||||||
@@ -1,2 +1,6 @@
|
|||||||
pytest>=8.0,<9
|
pytest>=8.0,<9
|
||||||
pytest-cov>=5.0,<6
|
pytest-cov>=5.0,<6
|
||||||
|
# Test-only: generate small fixture PDFs in
|
||||||
|
# tests/test_pdf_extract_smoke.py so we can exercise pdfplumber +
|
||||||
|
# pypdfium2 end-to-end without committing binary fixtures.
|
||||||
|
fpdf2==2.8.7
|
||||||
|
|||||||
@@ -8,3 +8,16 @@ tqdm>=4.66,<5
|
|||||||
typer>=0.12,<1
|
typer>=0.12,<1
|
||||||
phonenumbers>=8.13,<9
|
phonenumbers>=8.13,<9
|
||||||
streamlit>=1.35,<2
|
streamlit>=1.35,<2
|
||||||
|
cryptography>=41,<49
|
||||||
|
# PDF Extractor stack — pinned to exact tested versions so a future
|
||||||
|
# upstream release can't quietly change pdfplumber's word-position
|
||||||
|
# behavior or pypdfium2's OCR rendering mid-build. Bump these
|
||||||
|
# explicitly when re-testing against a new release.
|
||||||
|
#
|
||||||
|
# ``pypdfium2`` is here for the OCR fallback path only (rasterizing
|
||||||
|
# pages to images for Tesseract). The drawable-canvas dep was
|
||||||
|
# removed when the visual picker was ripped out — the scanner is
|
||||||
|
# pure heuristic now, no coordinate UI.
|
||||||
|
pdfplumber==0.11.9
|
||||||
|
pypdfium2==5.8.0
|
||||||
|
pytesseract==0.3.13
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user