Compare commits
158 Commits
d0423a8912
...
rollback-2
| Author | SHA1 | Date | |
|---|---|---|---|
| 58d0009849 | |||
| b6c39d7a09 | |||
| b2fa8503e6 | |||
| b703911df3 | |||
| 93ccada974 | |||
| 17faf84aed | |||
| 4d8513b1a3 | |||
| ac94208d8f | |||
| 4955fb239b | |||
| 4a8961d58a | |||
| fe4b5dc755 | |||
| 209b5fb1aa | |||
| 904356f4e8 | |||
| 7203a81af7 | |||
| dd3b9bd59d | |||
| 2bd94c4441 | |||
| 9c426194b1 | |||
| 6627895a10 | |||
| ea99e292d2 | |||
| 0be59c0f03 | |||
| 3a3a9a895b | |||
| d090f8cb5e | |||
| e44af3a45e | |||
| 450d4fc9a8 | |||
| a0042d4aba | |||
| a18b126885 | |||
| 981a1a9cba | |||
| dbcf4d4048 | |||
| 34b56b404a | |||
| ad7c22d7fb | |||
| 6f2ad57490 | |||
| a1824b8dc4 | |||
| 155dd30746 | |||
| 3cf935c999 | |||
| 263af3c7c2 | |||
| bece2b4030 | |||
| 60969c0770 | |||
| 48cd9e8249 | |||
| d80befd05a | |||
| 10015c40e1 | |||
| e6ee2e3481 | |||
| 538e23d219 | |||
| 2d927bc95f | |||
| 967d3f6a11 | |||
| b86828d791 | |||
| 5a8e2ec9e1 | |||
| 2f349e8191 | |||
| aea520d2f7 | |||
| b8aff862ed | |||
| c16e2a5e29 | |||
| 7c9139f199 | |||
| b3ae913bb9 | |||
| ba07dcb6c7 | |||
| 76c9f5a679 | |||
| a8ff8f4bd0 | |||
| 4451f74895 | |||
| a022059b1e | |||
| 69240fc922 | |||
| 9a7d861903 | |||
| 1016a4d2c4 | |||
| 6c3939d21b | |||
| d436e34a45 | |||
| 0bb72ecd7e | |||
| 74d0ee270f | |||
| 06f1ea6cf7 | |||
| 784695e3a7 | |||
| 4816da1ad6 | |||
| 6703e2c15c | |||
| a9788ba712 | |||
| da7d86f457 | |||
| 2501119ac2 | |||
| 444dffbc63 | |||
| 3c4b80895e | |||
| b0ee65e922 | |||
| 65b663be97 | |||
| c942b8aa19 | |||
| 61e63913cb | |||
| e011c0b6e6 | |||
| 2fe324279e | |||
| 04dc326020 | |||
| d487a44170 | |||
| f106275643 | |||
| 8232ab1ca7 | |||
| 4c8e1199a4 | |||
| e282f061dc | |||
| 5daae9e5fa | |||
| 48cb802dfb | |||
| d022167ba2 | |||
| 24ee021314 | |||
| add3b866ee | |||
| b568773a1f | |||
| 4a7f99f0ec | |||
| b2449d3139 | |||
| d840230e48 | |||
| 9e8b4b2ca9 | |||
| dd231f5a38 | |||
| 143c775cdf | |||
| d1b9f642e2 | |||
| 65c85107b6 | |||
| d9e32e578b | |||
| 7cb1bc922d | |||
| be7191a5d1 | |||
| 2d2ff43754 | |||
| 36510eee7b | |||
| 1caedbbbc7 | |||
| c0bfd4dbc9 | |||
| 59c6d0f914 | |||
| ee0b1f6f6b | |||
| c73d716d06 | |||
| f0885aeb1e | |||
| 229e1afd45 | |||
| 7ad19ac7f4 | |||
| 84e4665ab0 | |||
| 4685bb4289 | |||
| e96d5901f4 | |||
| ecfc52499f | |||
| 21fd8a4cd7 | |||
| 42f8d78dd5 | |||
| 0f89d7ba66 | |||
| b9147f3b66 | |||
| 5128d35961 | |||
| 696996c119 | |||
| ae9d4a2db5 | |||
| ef9f8b5de4 | |||
| aeead05e4c | |||
| 6415be8bf4 | |||
| d1aaf3c2b9 | |||
| 27f0648093 | |||
| 0a61d52200 | |||
| ca14ce2952 | |||
| 502a72cd46 | |||
| 604debb9a9 | |||
| c575efd26e | |||
| 175389219f | |||
| c568aec8a7 | |||
| ff2eaeb6c4 | |||
| dad744f17f | |||
| fc6c22c6a7 | |||
| db5ec084da | |||
| 93e43fc0d9 | |||
| 624f99653e | |||
| 86ad21db79 | |||
| 2bbaba954b | |||
| b5cd74d474 | |||
| 1cf69dd23b | |||
| 673b902377 | |||
| bab2c9468c | |||
| 4179cb5156 | |||
| 52e04f63a9 | |||
| 23c51fd759 | |||
| 65e17e0a70 | |||
| e534fb4989 | |||
| d32b58e61a | |||
| e612c751a8 | |||
| e435103113 | |||
| b2c7b94fe9 | |||
| 070e3c9f06 | |||
| 35d46a0c1a |
120
.github/workflows/build.yml
vendored
120
.github/workflows/build.yml
vendored
@@ -1,8 +1,18 @@
|
|||||||
name: Build installers
|
name: Build installers
|
||||||
|
|
||||||
# Triggers:
|
# Triggers:
|
||||||
# * Tag push (v*) → produces installers, attaches to a GitHub Release.
|
# * Tag push (v*) → produces installers + portable zips, attaches them
|
||||||
# * Manual dispatch → produces installers as workflow artifacts only.
|
# to a GitHub Release.
|
||||||
|
# * Manual dispatch → uploads everything as workflow artifacts only.
|
||||||
|
#
|
||||||
|
# Outputs per platform (downloadable by buyers):
|
||||||
|
# * macOS: .dmg installer + portable .zip (signed .app inside).
|
||||||
|
# * Windows: .exe installer + portable .zip (no-install).
|
||||||
|
# * Linux: .AppImage (already portable; no separate zip).
|
||||||
|
#
|
||||||
|
# Self-contained: every artifact ships its own Python interpreter + every
|
||||||
|
# runtime dep through PyInstaller. No pre/post install steps on the
|
||||||
|
# buyer's machine.
|
||||||
#
|
#
|
||||||
# What this workflow doesn't do (yet):
|
# What this workflow doesn't do (yet):
|
||||||
# * Code signing (Mac Developer ID, Windows code-signing cert).
|
# * Code signing (Mac Developer ID, Windows code-signing cert).
|
||||||
@@ -29,14 +39,17 @@ jobs:
|
|||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- os: macos-latest
|
- os: macos-latest
|
||||||
artifact_name: DataTools-mac.dmg
|
platform: mac
|
||||||
artifact_path: dist/DataTools-*-mac.dmg
|
installer_glob: dist/DataTools-*-mac.dmg
|
||||||
|
portable_glob: dist/DataTools-*-mac-portable.zip
|
||||||
- os: windows-latest
|
- os: windows-latest
|
||||||
artifact_name: DataTools-win.exe
|
platform: win
|
||||||
artifact_path: dist/DataTools-*-win-setup.exe
|
installer_glob: dist/DataTools-*-win-setup.exe
|
||||||
|
portable_glob: dist/DataTools-*-win-portable.zip
|
||||||
- os: ubuntu-latest
|
- os: ubuntu-latest
|
||||||
artifact_name: DataTools-linux.AppImage
|
platform: linux
|
||||||
artifact_path: dist/DataTools-*-linux-x86_64.AppImage
|
installer_glob: dist/DataTools-*-linux-x86_64.AppImage
|
||||||
|
portable_glob: '' # AppImage is already a portable single file
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
@@ -50,7 +63,31 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
pip install pyinstaller
|
pip install pyinstaller pillow
|
||||||
|
|
||||||
|
# ---- Tesseract bundling cache --------------------------------
|
||||||
|
# The fetch logic inside build/make_release.py downloads:
|
||||||
|
# * build/vendor/tessdata/eng.traineddata (~16 MB, shared)
|
||||||
|
# * build/_tesseract/<platform>/ (binary + libs, 30-120 MB)
|
||||||
|
# Cache both so iterative CI runs don't re-download. The
|
||||||
|
# cache key bakes in the pinned Tesseract version + tessdata
|
||||||
|
# URL; the key is hard-coded, so update it by hand whenever a pin changes.
|
||||||
|
- name: Cache Tesseract bundle inputs
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
build/_tesseract
|
||||||
|
build/vendor/tessdata
|
||||||
|
key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1
|
||||||
|
|
||||||
|
# ---- Linux: install patchelf so make_release.py can rewrite
|
||||||
|
# RPATH on the bundled tesseract binary. apt-get install
|
||||||
|
# tesseract-ocr is handled inside make_release.py itself. -----
|
||||||
|
- name: Install Linux build prereqs for Tesseract bundling
|
||||||
|
if: matrix.os == 'ubuntu-latest'
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y patchelf
|
||||||
|
|
||||||
- name: Read version
|
- name: Read version
|
||||||
id: version
|
id: version
|
||||||
@@ -59,15 +96,47 @@ jobs:
|
|||||||
VER=$(python -c "import re; print(re.search(r'__version__\s*=\s*\"([^\"]+)\"', open('src/__init__.py').read()).group(1))")
|
VER=$(python -c "import re; print(re.search(r'__version__\s*=\s*\"([^\"]+)\"', open('src/__init__.py').read()).group(1))")
|
||||||
echo "version=$VER" >> "$GITHUB_OUTPUT"
|
echo "version=$VER" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
|
- name: Generate platform icons
|
||||||
|
run: python build/generate_icons.py
|
||||||
|
|
||||||
|
# Stage Tesseract before PyInstaller. The make_release.py
|
||||||
|
# helpers handle the per-platform fetch (UB-Mannheim on Win,
|
||||||
|
# brew on Mac, apt on Linux) and stage the binary + libs into
|
||||||
|
# build/_tesseract/<platform>/ where the spec picks them up.
|
||||||
|
# We invoke a tiny inline Python script so the workflow doesn't have
|
||||||
|
# to know the per-platform target string.
|
||||||
|
- name: Stage Tesseract binary + tessdata
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
DATATOOLS_PLATFORM: ${{ matrix.platform }}
|
||||||
|
run: |
|
||||||
|
python - <<'PY'
|
||||||
|
import os, sys
|
||||||
|
sys.path.insert(0, "build")
|
||||||
|
from make_release import fetch_tessdata, fetch_tesseract_for_platform
|
||||||
|
target = os.environ["DATATOOLS_PLATFORM"]
|
||||||
|
fetch_tessdata()
|
||||||
|
fetch_tesseract_for_platform(target)
|
||||||
|
PY
|
||||||
|
|
||||||
- name: Build PyInstaller bundle
|
- name: Build PyInstaller bundle
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
# The spec reads this to find the per-platform staging dir;
|
||||||
|
# see build/datatools.spec for the contract.
|
||||||
|
DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }}
|
||||||
run: pyinstaller build/datatools.spec --clean --noconfirm
|
run: pyinstaller build/datatools.spec --clean --noconfirm
|
||||||
|
|
||||||
# ---- Per-platform packaging ----------------------------------
|
# ---- Per-platform installer packaging ------------------------
|
||||||
|
|
||||||
- name: Package macOS DMG
|
- name: Package macOS DMG (installer)
|
||||||
if: matrix.os == 'macos-latest'
|
if: matrix.os == 'macos-latest'
|
||||||
run: bash build/macos/build_dmg.sh "${{ steps.version.outputs.version }}"
|
run: bash build/macos/build_dmg.sh "${{ steps.version.outputs.version }}"
|
||||||
|
|
||||||
|
- name: Package macOS portable .zip
|
||||||
|
if: matrix.os == 'macos-latest'
|
||||||
|
run: bash build/macos/build_zip.sh "${{ steps.version.outputs.version }}"
|
||||||
|
|
||||||
- name: Install Inno Setup (Windows)
|
- name: Install Inno Setup (Windows)
|
||||||
if: matrix.os == 'windows-latest'
|
if: matrix.os == 'windows-latest'
|
||||||
run: choco install innosetup --no-progress -y
|
run: choco install innosetup --no-progress -y
|
||||||
@@ -78,6 +147,10 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
iscc /DAppVersion=${{ steps.version.outputs.version }} build\installer.iss
|
iscc /DAppVersion=${{ steps.version.outputs.version }} build\installer.iss
|
||||||
|
|
||||||
|
- name: Package Windows portable .zip
|
||||||
|
if: matrix.os == 'windows-latest'
|
||||||
|
run: python build/build_portable_zip.py win ${{ steps.version.outputs.version }}
|
||||||
|
|
||||||
- name: Install AppImage tooling (Linux)
|
- name: Install AppImage tooling (Linux)
|
||||||
if: matrix.os == 'ubuntu-latest'
|
if: matrix.os == 'ubuntu-latest'
|
||||||
run: |
|
run: |
|
||||||
@@ -92,17 +165,32 @@ jobs:
|
|||||||
|
|
||||||
# ---- Upload + release ----------------------------------------
|
# ---- Upload + release ----------------------------------------
|
||||||
|
|
||||||
- name: Upload artifact
|
- name: Upload installer artifact
|
||||||
uses: actions/upload-artifact@v4
|
uses: actions/upload-artifact@v4
|
||||||
with:
|
with:
|
||||||
name: ${{ matrix.artifact_name }}
|
name: DataTools-${{ matrix.platform }}-installer
|
||||||
path: ${{ matrix.artifact_path }}
|
path: ${{ matrix.installer_glob }}
|
||||||
if-no-files-found: error
|
if-no-files-found: error
|
||||||
|
|
||||||
- name: Attach to Release (tag push only)
|
- name: Upload portable artifact
|
||||||
|
if: matrix.portable_glob != ''
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: DataTools-${{ matrix.platform }}-portable
|
||||||
|
path: ${{ matrix.portable_glob }}
|
||||||
|
if-no-files-found: error
|
||||||
|
|
||||||
|
- name: Attach installer to Release (tag push only)
|
||||||
if: startsWith(github.ref, 'refs/tags/v')
|
if: startsWith(github.ref, 'refs/tags/v')
|
||||||
uses: softprops/action-gh-release@v2
|
uses: softprops/action-gh-release@v2
|
||||||
with:
|
with:
|
||||||
files: ${{ matrix.artifact_path }}
|
files: ${{ matrix.installer_glob }}
|
||||||
fail_on_unmatched_files: true
|
fail_on_unmatched_files: true
|
||||||
generate_release_notes: true
|
generate_release_notes: true
|
||||||
|
|
||||||
|
- name: Attach portable to Release (tag push only)
|
||||||
|
if: startsWith(github.ref, 'refs/tags/v') && matrix.portable_glob != ''
|
||||||
|
uses: softprops/action-gh-release@v2
|
||||||
|
with:
|
||||||
|
files: ${{ matrix.portable_glob }}
|
||||||
|
fail_on_unmatched_files: true
|
||||||
|
|||||||
13
.gitignore
vendored
13
.gitignore
vendored
@@ -11,6 +11,19 @@ dist/
|
|||||||
build/build/
|
build/build/
|
||||||
build/__pycache__/
|
build/__pycache__/
|
||||||
build/dist/
|
build/dist/
|
||||||
|
# Generated by build/generate_icons.py from src/gui/assets/datatools_icon_256.png.
|
||||||
|
# Build artifacts, not source — regenerated each CI run.
|
||||||
|
build/icon.ico
|
||||||
|
build/icon.icns
|
||||||
|
build/icon.png
|
||||||
|
|
||||||
|
# Tesseract bundling — fetched at build time, not committed. See
|
||||||
|
# build/vendor/README.md for the canonical URLs and rationale.
|
||||||
|
# - build/_tesseract/ : per-platform binary + DLLs/dylibs staging dir
|
||||||
|
# - build/vendor/tessdata/eng.traineddata : ~16 MB language data
|
||||||
|
build/_tesseract/
|
||||||
|
build/vendor/tessdata/*.traineddata
|
||||||
|
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
|
||||||
# Claude Code agent worktrees + local settings
|
# Claude Code agent worktrees + local settings
|
||||||
|
|||||||
@@ -1,5 +1,8 @@
|
|||||||
[client]
|
[client]
|
||||||
toolbarMode = "minimal"
|
# ``viewer`` is the most aggressive — hides Streamlit's running
|
||||||
|
# indicator, deploy button, and status icons. Keeps the main content
|
||||||
|
# area's top-right corner clean.
|
||||||
|
toolbarMode = "viewer"
|
||||||
|
|
||||||
[browser]
|
[browser]
|
||||||
gatherUsageStats = false
|
gatherUsageStats = false
|
||||||
@@ -9,3 +12,17 @@ gatherUsageStats = false
|
|||||||
# reads "Limit 1024MB per file" — matches the analyzer + gate's stated
|
# reads "Limit 1024MB per file" — matches the analyzer + gate's stated
|
||||||
# 1 GB efficiency target. See docs/REQUIREMENTS.md §1.1.
|
# 1 GB efficiency target. See docs/REQUIREMENTS.md §1.1.
|
||||||
maxUploadSize = 1024
|
maxUploadSize = 1024
|
||||||
|
|
||||||
|
# Warm, editorial palette inspired by the
|
||||||
|
# ``datatools_layout_redesign.html`` mockup — cream paper background,
|
||||||
|
# stone ink, burnt-orange accent. Streamlit reads these on startup and
|
||||||
|
# threads them through its widget chrome (file uploader, focus rings,
|
||||||
|
# primary buttons, links). Heavier visual restyling rides on the CSS
|
||||||
|
# in ``_legacy.py:_DESIGN_TOKENS_CSS``.
|
||||||
|
[theme]
|
||||||
|
base = "light"
|
||||||
|
primaryColor = "#c2410c"
|
||||||
|
backgroundColor = "#fafaf7"
|
||||||
|
secondaryBackgroundColor = "#f5f4ef"
|
||||||
|
textColor = "#1c1917"
|
||||||
|
font = "sans serif"
|
||||||
|
|||||||
220
LICENSE_TESSERACT.txt
Normal file
220
LICENSE_TESSERACT.txt
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
This license applies to the bundled Tesseract OCR binary distributed
|
||||||
|
inside DataTools installer artifacts (Windows .exe, macOS .dmg, Linux
|
||||||
|
.AppImage) and the corresponding portable .zip downloads.
|
||||||
|
|
||||||
|
Tesseract OCR upstream: https://github.com/tesseract-ocr/tesseract
|
||||||
|
Copyright (C) 2006-2024 Google Inc. and the Tesseract OCR contributors
|
||||||
|
|
||||||
|
The Tesseract OCR binary is distributed under the Apache License,
|
||||||
|
Version 2.0, the full text of which is reproduced verbatim below.
|
||||||
|
|
||||||
|
The bundled `eng.traineddata` data file is the "best" English model
|
||||||
|
from https://github.com/tesseract-ocr/tessdata_best and is licensed
|
||||||
|
under the Apache License, Version 2.0 as well.
|
||||||
|
|
||||||
|
DataTools itself is proprietary and is NOT covered by this license;
|
||||||
|
see LICENSE.txt at the repository root for DataTools' own license.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for describing the origin of the Work and
|
||||||
|
reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may accept and charge a
|
||||||
|
fee for, acceptance of support, warranty, indemnity, or other
|
||||||
|
liability obligations and/or rights consistent with this License.
|
||||||
|
However, in accepting such obligations, You may act only on Your
|
||||||
|
own behalf and on Your sole responsibility, not on behalf of any
|
||||||
|
other Contributor, and only if You agree to indemnify, defend,
|
||||||
|
and hold each Contributor harmless for any liability incurred by,
|
||||||
|
or claims asserted against, such Contributor by reason of your
|
||||||
|
accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied. See the License for the specific language governing
|
||||||
|
permissions and limitations under the License.
|
||||||
40
README.es.md
40
README.es.md
@@ -8,27 +8,37 @@ Limpieza local de CSV / Excel. CLI + GUI en el navegador, sin nube, sin ceremoni
|
|||||||
|
|
||||||
| # | Herramienta | Estado |
|
| # | Herramienta | Estado |
|
||||||
|---|------|--------|
|
|---|------|--------|
|
||||||
| 01 | **Eliminador de duplicados** — coincidencia exacta + difusa, 5 normalizadores, reglas de superviviente, auditoría | Listo |
|
| 01 | **Buscar duplicados** — coincidencia exacta + difusa, 5 normalizadores, reglas de superviviente, auditoría | Listo |
|
||||||
| 02 | **Limpiador de texto** — espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo |
|
| 02 | **Limpiar texto** — espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo |
|
||||||
| 03 | **Estandarizador de formatos** — fechas, teléfonos, correos, direcciones, nombres, monedas, booleanos | Listo |
|
| 03 | **Estandarizar formatos** — fechas, teléfonos, correos, direcciones, nombres, monedas, booleanos | Listo |
|
||||||
| 04 | **Gestor de valores faltantes** — detección de nulos disfrazados, perfil, media/mediana/moda/ffill/bfill/interpolación, estrategias de descarte | Listo |
|
| 04 | **Corregir valores faltantes** — detección de nulos disfrazados, perfil, media/mediana/moda/ffill/bfill/interpolación, estrategias de descarte | Listo |
|
||||||
| 05 | **Mapeador de columnas** — autodetección difusa de renombrados, esquema objetivo con coerción de tipos, campos requeridos con valores por defecto, descartar/reordenar | Listo |
|
| 05 | **Mapear columnas** — autodetección difusa de renombrados, esquema objetivo con coerción de tipos, campos requeridos con valores por defecto, descartar/reordenar | Listo |
|
||||||
| 06 | Detector de valores atípicos | Próximamente |
|
| 06 | Detectar valores atípicos | Próximamente |
|
||||||
| 07 | Combinador de varios archivos | Próximamente |
|
| 07 | Combinar archivos | Próximamente |
|
||||||
| 08 | Validador e informes | Próximamente |
|
| 08 | Verificación de calidad | Próximamente |
|
||||||
| 09 | **Ejecutor de canalizaciones** — encadena herramientas en un orden recomendado (no forzado), guarda/carga JSON, automatiza limpiezas semanales | Listo |
|
| 09 | **Flujos automatizados** — encadena herramientas en un orden recomendado (no forzado), guarda/carga JSON, automatiza limpiezas semanales | Listo |
|
||||||
|
|
||||||
|
Cada página de herramienta incluye una ventana emergente de **Help** (a la derecha del título) con una guía compacta de Cuándo usarla / Pasos / Ejemplos / Consejo. El texto vive en los paquetes de idioma (`tools.<id>.help_md`).
|
||||||
|
|
||||||
## Descarga (usuarios no técnicos)
|
## Descarga (usuarios no técnicos)
|
||||||
|
|
||||||
Instaladores precompilados — no se requiere Python:
|
Paquetes precompilados — sin instalar Python, sin permisos de administrador, sin internet en ejecución. Cada versión ofrece dos formatos por sistema operativo: un **instalador** que crea accesos directos en el escritorio + menú Inicio / Launchpad, y un **.zip portable** que descomprimes y haces doble clic. Elige el que te permita tu política de TI.
|
||||||
|
|
||||||
| Plataforma | Descarga | Nota de primer arranque |
|
| Plataforma | Instalador (recomendado) | Portable (sin instalar) |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| **macOS** | `DataTools-X.Y.Z-mac.dmg` | Arrastra DataTools.app a /Applications y haz doble clic. |
|
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — ábrelo, arrastra DataTools.app a /Applications, ejecútalo desde Launchpad. | `DataTools-X.Y.Z-mac-portable.zip` — descomprime donde quieras, doble clic en `DataTools.app`. |
|
||||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` | Ejecuta el instalador; se inicia desde el menú Inicio. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. | `DataTools-X.Y.Z-win-portable.zip` — descomprime donde quieras, doble clic en `DataTools.exe`. |
|
||||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` | `chmod +x` al archivo y luego doble clic. |
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. | El AppImage ya es portable. |
|
||||||
|
|
||||||
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Los instaladores ocupan ~150–200 MB; el lanzador arranca un servidor local en http://127.0.0.1:8501 y abre tu navegador. Nada se envía a la nube.
|
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~300 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
||||||
|
|
||||||
|
**Tesseract OCR viene incluido.** El soporte para PDFs escaneados del Extractor de PDF funciona sin configuración adicional en las tres plataformas — no hace falta instalar Tesseract por separado. Atribución de licencia: ver [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
|
**Avisos del primer arranque (una sola vez):**
|
||||||
|
- **macOS** sin firma: clic derecho → **Abrir** → confirma. (Las compilaciones firmadas se lo saltan.)
|
||||||
|
- **Windows** SmartScreen: pulsa **Más información** → **Ejecutar de todas formas**.
|
||||||
|
|
||||||
|
Guía detallada de instalación y resolución de problemas: [Guía del usuario §1](docs/USER-GUIDE.es.md#1-instalaci%C3%B3n).
|
||||||
|
|
||||||
## Instalar desde el código (desarrolladores)
|
## Instalar desde el código (desarrolladores)
|
||||||
|
|
||||||
|
|||||||
40
README.md
40
README.md
@@ -8,27 +8,37 @@ Local CSV / Excel cleaning. CLI + browser GUI, no cloud, no install ceremony. GU
|
|||||||
|
|
||||||
| # | Tool | Status |
|
| # | Tool | Status |
|
||||||
|---|------|--------|
|
|---|------|--------|
|
||||||
| 01 | **Deduplicator** — exact + fuzzy match, 5 normalizers, survivor rules, audit | Ready |
|
| 01 | **Find Duplicates** — exact + fuzzy match, 5 normalizers, survivor rules, audit | Ready |
|
||||||
| 02 | **Text Cleaner** — whitespace, smart chars, BOM, line endings, case ops | Ready |
|
| 02 | **Clean Text** — whitespace, smart chars, BOM, line endings, case ops | Ready |
|
||||||
| 03 | **Format Standardizer** — dates, phones, emails, addresses, names, currencies, booleans | Ready |
|
| 03 | **Standardize Formats** — dates, phones, emails, addresses, names, currencies, booleans | Ready |
|
||||||
| 04 | **Missing Value Handler** — disguised-null detection, profile, mean/median/mode/ffill/bfill/interpolate, drop strategies | Ready |
|
| 04 | **Fix Missing Values** — disguised-null detection, profile, mean/median/mode/ffill/bfill/interpolate, drop strategies | Ready |
|
||||||
| 05 | **Column Mapper** — fuzzy auto-rename, target schema with type coercion, required fields with defaults, drop/reorder | Ready |
|
| 05 | **Map Columns** — fuzzy auto-rename, target schema with type coercion, required fields with defaults, drop/reorder | Ready |
|
||||||
| 06 | Outlier Detector | Coming Soon |
|
| 06 | Find Unusual Values | Coming Soon |
|
||||||
| 07 | Multi-File Merger | Coming Soon |
|
| 07 | Combine Files | Coming Soon |
|
||||||
| 08 | Validator & Reporter | Coming Soon |
|
| 08 | Quality Check | Coming Soon |
|
||||||
| 09 | **Pipeline Runner** — chain tools with recommended (not forced) order, save/load JSON, automate weekly cleanups | Ready |
|
| 09 | **Automated Workflows** — chain tools with recommended (not forced) order, save/load JSON, automate weekly cleanups | Ready |
|
||||||
|
|
||||||
|
Every tool page has an in-tool **Help** popover (right of the title) with a compact When-to-use / Steps / Examples / Tip card. Copy lives in the language packs (`tools.<id>.help_md`).
|
||||||
|
|
||||||
## Download (non-technical users)
|
## Download (non-technical users)
|
||||||
|
|
||||||
Pre-built installers — no Python required:
|
Pre-built bundles — no Python install, no admin rights, no internet at runtime. On macOS and Windows each release ships two flavors: an **installer** that wires up Desktop + Start Menu / Launchpad shortcuts, and a **portable .zip** you unzip and double-click; on Linux a single AppImage serves as both. Pick whichever your IT policy allows.
|
||||||
|
|
||||||
| Platform | Download | First-launch note |
|
| Platform | Installer (recommended) | Portable (no install) |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| **macOS** | `DataTools-X.Y.Z-mac.dmg` | Drag DataTools.app into /Applications, then double-click. |
|
| **macOS** | `DataTools-X.Y.Z-mac.dmg` — open, drag DataTools.app into /Applications, launch from Launchpad. | `DataTools-X.Y.Z-mac-portable.zip` — unzip anywhere, double-click `DataTools.app`. |
|
||||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` | Run the installer; launches from Start Menu. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. | `DataTools-X.Y.Z-win-portable.zip` — unzip anywhere, double-click `DataTools.exe`. |
|
||||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` | `chmod +x` the file, then double-click. |
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. | The AppImage is already portable. |
|
||||||
|
|
||||||
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). The installers are ~150–200 MB; the launcher boots a local server at http://127.0.0.1:8501 and opens your browser. Nothing is sent to the cloud.
|
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~300 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
||||||
|
|
||||||
|
**Tesseract OCR is bundled.** Scanned-PDF support in the PDF Extractor works out of the box on all three platforms — no separate Tesseract install required. License attribution: see [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
|
**First-launch warnings (one-time):**
|
||||||
|
- **macOS** unsigned builds: right-click → **Open** → confirm. (Signed builds skip this.)
|
||||||
|
- **Windows** SmartScreen: click **More info** → **Run anyway**.
|
||||||
|
|
||||||
|
Detailed install + troubleshooting walkthrough: [User Guide §1](docs/USER-GUIDE.md#1-install).
|
||||||
|
|
||||||
## Install from source (developers)
|
## Install from source (developers)
|
||||||
|
|
||||||
|
|||||||
139
build/README.md
139
build/README.md
@@ -19,23 +19,56 @@ build/
|
|||||||
│ Mac .app bundle config. Reads the version
|
│ Mac .app bundle config. Reads the version
|
||||||
│ from src/__init__.py.
|
│ from src/__init__.py.
|
||||||
├── installer.iss Inno Setup script — Windows .exe installer.
|
├── installer.iss Inno Setup script — Windows .exe installer.
|
||||||
|
│ Adds Start Menu + Desktop + App Paths entries.
|
||||||
|
├── generate_icons.py Builds icon.ico / icon.icns / icon.png from
|
||||||
|
│ src/gui/assets/datatools_icon_256.png. Run
|
||||||
|
│ once before pyinstaller (CI does this).
|
||||||
|
├── build_portable_zip.py Cross-platform: zips dist/DataTools/ into a
|
||||||
|
│ no-install portable download. Used by the
|
||||||
|
│ Windows + Linux portable artifacts.
|
||||||
├── macos/
|
├── macos/
|
||||||
│ └── build_dmg.sh Wraps dist/DataTools.app into a .dmg with a
|
│ ├── build_dmg.sh Wraps dist/DataTools.app into a .dmg with a
|
||||||
│ drag-to-/Applications layout.
|
│ │ drag-to-/Applications layout (installer).
|
||||||
|
│ └── build_zip.sh Wraps dist/DataTools.app into a portable
|
||||||
|
│ .zip via ditto (preserves bundle metadata).
|
||||||
├── appimage/
|
├── appimage/
|
||||||
│ ├── AppRun Entry point invoked when the AppImage runs.
|
│ ├── AppRun Entry point invoked when the AppImage runs.
|
||||||
│ ├── datatools.desktop Linux desktop-entry metadata.
|
│ ├── datatools.desktop Linux desktop-entry metadata.
|
||||||
│ └── build.sh Wraps dist/DataTools/ into an .AppImage.
|
│ └── build.sh Wraps dist/DataTools/ into an .AppImage.
|
||||||
├── hooks/ PyInstaller hooks for libs the static analyser
|
├── hooks/ PyInstaller hooks for libs the static analyser
|
||||||
│ └── hook-streamlit.py misses (Streamlit's dynamic imports).
|
│ └── hook-streamlit.py misses (Streamlit's dynamic imports).
|
||||||
├── icon.icns macOS app icon (TODO: produce from a 1024×1024
|
├── icon.{ico,icns,png} Generated by generate_icons.py — gitignored.
|
||||||
│ PNG. Optional — bundle still builds without).
|
|
||||||
├── icon.ico Windows app icon (TODO).
|
|
||||||
├── icon.png Linux AppImage icon (TODO — build.sh generates
|
|
||||||
│ a placeholder if missing).
|
|
||||||
└── README.md this file
|
└── README.md this file
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Distribution outputs per platform
|
||||||
|
|
||||||
|
Each CI run produces up to two downloads per platform — an installer for
|
||||||
|
buyers who want shortcuts wired automatically, and a portable .zip
|
||||||
|
for buyers (or IT-locked-down machines) that can't run installers:
|
||||||
|
|
||||||
|
| Platform | Installer | Portable |
|
||||||
|
|----------|----------------------------------------|------------------------------------------------|
|
||||||
|
| macOS | `DataTools-<ver>-mac.dmg` | `DataTools-<ver>-mac-portable.zip` (ditto .app)|
|
||||||
|
| Windows | `DataTools-<ver>-win-setup.exe` | `DataTools-<ver>-win-portable.zip` |
|
||||||
|
| Linux | `DataTools-<ver>-linux-x86_64.AppImage`| (the AppImage IS the portable) |
|
||||||
|
|
||||||
|
All five outputs are self-contained: every dependency (Python, pandas,
|
||||||
|
streamlit, pdfplumber, **Tesseract OCR + `eng.traineddata`**, the lot)
|
||||||
|
is frozen into the bundle. The buyer does not need to install Python,
|
||||||
|
pip, Tesseract, or anything else first. With Tesseract bundled, each
|
||||||
|
artifact is roughly **250–300 MB** on disk (up from ~120 MB pre-OCR);
|
||||||
|
unpacked installs run ~300–400 MB once scratch space is counted.
|
||||||
|
|
||||||
|
## Easy-launch surface
|
||||||
|
|
||||||
|
| Affordance | Windows | macOS |
|
||||||
|
|------------------|--------------------------------------------------|------------------------------------------------------|
|
||||||
|
| Desktop shortcut | Inno Setup `desktopicon` task (checked default) | The .app bundle in /Applications is the icon |
|
||||||
|
| App menu | Start Menu → DataTools (always installed) | Launchpad + Spotlight (auto from /Applications) |
|
||||||
|
| Taskbar / Dock | User pins manually (OS forbids programmatic pin) | User pins manually after first launch |
|
||||||
|
| Run from terminal| `DataTools` (registered via App Paths) | `open -a DataTools` (auto from .app bundle) |
|
||||||
|
|
||||||
CI: `.github/workflows/build.yml` runs the full pipeline on tag push
|
CI: `.github/workflows/build.yml` runs the full pipeline on tag push
|
||||||
(matrix: macos-latest, windows-latest, ubuntu-latest) and attaches
|
(matrix: macos-latest, windows-latest, ubuntu-latest) and attaches
|
||||||
the resulting installers to a GitHub Release. Manual
|
the resulting installers to a GitHub Release. Manual
|
||||||
@@ -43,12 +76,46 @@ the resulting installers to a GitHub Release. Manual
|
|||||||
|
|
||||||
## Releasing
|
## Releasing
|
||||||
|
|
||||||
|
### Single-command local build (recommended for one-developer workflow)
|
||||||
|
|
||||||
|
PyInstaller can't cross-compile, so a single machine produces one
|
||||||
|
platform's packages. Run this on each target OS:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# One-time setup per machine:
|
||||||
|
pip install -r requirements.txt
|
||||||
|
pip install pyinstaller pillow
|
||||||
|
# Windows only: install Inno Setup from https://jrsoftware.org/isdl.php
|
||||||
|
# Linux only: drop appimagetool onto PATH (see preflight output)
|
||||||
|
|
||||||
|
# Build everything for the current OS:
|
||||||
|
python build/make_release.py
|
||||||
|
```
|
||||||
|
|
||||||
|
Outputs land in `dist/`:
|
||||||
|
- Windows host → `DataTools-<ver>-win-setup.exe` + `DataTools-<ver>-win-portable.zip`
|
||||||
|
- macOS host → `DataTools-<ver>-mac.dmg` + `DataTools-<ver>-mac-portable.zip`
|
||||||
|
- Linux host → `DataTools-<ver>-linux-x86_64.AppImage`
|
||||||
|
|
||||||
|
Useful flags:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python build/make_release.py --preflight # check tooling, build nothing
|
||||||
|
python build/make_release.py --clean # wipe dist/ first
|
||||||
|
python build/make_release.py --skip-installer # just the portable zip
|
||||||
|
python build/make_release.py --skip-portable # just the installer
|
||||||
|
```
|
||||||
|
|
||||||
|
### CI build (push tag → GitHub Release)
|
||||||
|
|
||||||
|
If you have CI runners for all three OSes:
|
||||||
|
|
||||||
1. Bump `__version__` in `src/__init__.py`.
|
1. Bump `__version__` in `src/__init__.py`.
|
||||||
2. `git commit -am "release: vX.Y.Z" && git tag vX.Y.Z`.
|
2. `git commit -am "release: vX.Y.Z" && git tag vX.Y.Z`.
|
||||||
3. `git push && git push --tags`.
|
3. `git push && git push --tags`.
|
||||||
4. CI builds all three platforms and creates a GitHub Release with
|
4. CI builds all three platforms and creates a Release with the
|
||||||
the installers attached.
|
installers + portable zips attached.
|
||||||
5. Mirror the GitHub Release assets to Gumroad (manual until v2).
|
5. Mirror the Release assets to Gumroad (manual until v2).
|
||||||
|
|
||||||
## Signing (Phase 2 — needs accounts/credentials)
|
## Signing (Phase 2 — needs accounts/credentials)
|
||||||
|
|
||||||
@@ -223,6 +290,56 @@ Mac code-signing in CI requires the cert + private key as a GitHub
|
|||||||
secret (encoded with `base64`). Detailed walkthrough belongs in a
|
secret (encoded with `base64`). Detailed walkthrough belongs in a
|
||||||
later doc — for v1, sign locally and upload to GitHub Releases.
|
later doc — for v1, sign locally and upload to GitHub Releases.
|
||||||
|
|
||||||
|
## Tesseract bundling (PDF Extractor OCR)
|
||||||
|
|
||||||
|
Frozen artifacts ship a per-platform Tesseract binary plus the English
|
||||||
|
`eng.traineddata` model so scanned-PDF support in the PDF Extractor
|
||||||
|
works out of the box — no separate user install. Source / pip
|
||||||
|
developer setups still need system Tesseract on `PATH`.
|
||||||
|
|
||||||
|
**Layout inside the bundle**:
|
||||||
|
|
||||||
|
```
|
||||||
|
DataTools/ (or DataTools.app/Contents/MacOS/)
|
||||||
|
└── tesseract/
|
||||||
|
├── tesseract (Linux/macOS binary; tesseract.exe on Windows)
|
||||||
|
└── tessdata/
|
||||||
|
└── eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
The runtime resolver (in `src/`, owned by the runtime team) walks:
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||||
|
2. `Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"` — frozen
|
||||||
|
bundles only.
|
||||||
|
3. `tesseract` on `PATH`.
|
||||||
|
4. Windows well-known paths.
|
||||||
|
|
||||||
|
**Where the bytes come from**:
|
||||||
|
|
||||||
|
- **Tessdata** — vendored in-repo at `build/vendor/tessdata/eng.traineddata`
|
||||||
|
(sourced from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best)).
|
||||||
|
`datatools.spec` copies it into `tesseract/tessdata/`.
|
||||||
|
- **Binary** — fetched per-platform at build time by
|
||||||
|
`build/make_release.py` from pinned upstream URLs. Current pin:
|
||||||
|
**Tesseract 5.5.0**.
|
||||||
|
|
||||||
|
**Updating Tesseract**:
|
||||||
|
|
||||||
|
1. Bump the version pin and the per-platform fetch URLs in
|
||||||
|
`build/make_release.py`.
|
||||||
|
2. If the model schema changed upstream, refresh
|
||||||
|
`build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the
|
||||||
|
matching tag.
|
||||||
|
3. Rebuild on each platform (`python build/make_release.py`) and
|
||||||
|
smoke-test a scanned PDF through the PDF Extractor.
|
||||||
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if upstream license
|
||||||
|
terms change (Apache-2.0 today).
|
||||||
|
|
||||||
|
License attribution for the bundled binary lives at
|
||||||
|
`LICENSE_TESSERACT.txt` at the repo root — it must ship alongside any
|
||||||
|
binary that contains Tesseract.
|
||||||
|
|
||||||
## Common pitfalls
|
## Common pitfalls
|
||||||
|
|
||||||
| Symptom | Fix |
|
| Symptom | Fix |
|
||||||
@@ -246,7 +363,7 @@ much state to trust:
|
|||||||
4. Double-click the app icon.
|
4. Double-click the app icon.
|
||||||
5. Browser should open to http://127.0.0.1:850x within 5 seconds.
|
5. Browser should open to http://127.0.0.1:850x within 5 seconds.
|
||||||
6. Drop samples/demo/shopify_pet_customers.csv into the
|
6. Drop samples/demo/shopify_pet_customers.csv into the
|
||||||
Pipeline Runner page; click Run; AFTER preview should appear.
|
Automated Workflows page; click Run; AFTER preview should appear.
|
||||||
7. Confirm in the network tab: zero outbound calls except to
|
7. Confirm in the network tab: zero outbound calls except to
|
||||||
127.0.0.1 and the Streamlit static asset paths (also local).
|
127.0.0.1 and the Streamlit static asset paths (also local).
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -9,6 +9,11 @@
|
|||||||
# latest release from https://github.com/AppImage/AppImageKit/releases).
|
# latest release from https://github.com/AppImage/AppImageKit/releases).
|
||||||
#
|
#
|
||||||
# Output: dist/DataTools-<version>-linux-x86_64.AppImage
|
# Output: dist/DataTools-<version>-linux-x86_64.AppImage
|
||||||
|
#
|
||||||
|
# Tesseract bundling: no-op here. The PyInstaller bundle in
|
||||||
|
# dist/DataTools/ already contains tesseract/{tesseract, *.so,
|
||||||
|
# tessdata/eng.traineddata} from the spec's datas; ``cp -R``
|
||||||
|
# below carries it along into the AppDir.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
|||||||
69
build/build_portable_zip.py
Normal file
69
build/build_portable_zip.py
Normal file
@@ -0,0 +1,69 @@
|
|||||||
|
"""Wrap the PyInstaller folder build into a portable .zip.
|
||||||
|
|
||||||
|
Self-contained download: unzip → double-click the launcher → app runs.
|
||||||
|
No installer, no Python install, no admin rights required.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python build/build_portable_zip.py <platform> <version>
|
||||||
|
|
||||||
|
Where ``platform`` is one of ``win`` / ``mac`` / ``linux``. The
|
||||||
|
script just produces a generic ``dist/DataTools/`` zip; on macOS the
|
||||||
|
preferred portable format is the ``ditto``-wrapped .app — see
|
||||||
|
``build/macos/build_zip.sh`` for that flow. This helper exists mainly
|
||||||
|
for Windows + Linux, where there's no .app bundle to wrap.
|
||||||
|
|
||||||
|
Output:
|
||||||
|
dist/DataTools-<version>-<platform>-portable.zip
|
||||||
|
|
||||||
|
The zip root is the ``DataTools/`` folder so an unzip produces a
|
||||||
|
self-contained dir the user can drop anywhere (Desktop, USB stick,
|
||||||
|
network share). On Windows, the launcher is ``DataTools.exe`` inside
|
||||||
|
that folder; on Linux, ``DataTools``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
REPO = Path(__file__).resolve().parent.parent
|
||||||
|
DIST_DIR = REPO / "dist"
|
||||||
|
BUNDLE_DIR = DIST_DIR / "DataTools"
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Zip ``dist/DataTools/`` into a portable archive.

    Reads ``<platform>`` and ``<version>`` from ``sys.argv[1]`` and
    ``sys.argv[2]`` and writes
    ``dist/DataTools-<version>-<platform>-portable.zip``.

    Returns:
        0 on success, 1 when the PyInstaller bundle directory is
        missing, 2 on a usage error (too few or blank arguments).
    """
    usage = "usage: python build/build_portable_zip.py <platform> <version>\n"
    if len(sys.argv) < 3:
        sys.stderr.write(usage)
        return 2
    platform = sys.argv[1].strip()
    version = sys.argv[2].strip()

    # A blank argument would otherwise yield a misnamed artifact such as
    # ``DataTools--win-portable.zip`` — treat it as a usage error.
    if not platform or not version:
        sys.stderr.write(usage)
        return 2

    # The documented tokens are win / mac / linux. Anything else is most
    # likely a typo that would produce an artifact the release tooling
    # does not expect, so flag it — but keep going, in case a caller
    # deliberately passes a custom token.
    if platform not in ("win", "mac", "linux"):
        sys.stderr.write(
            f"warning: unrecognized platform {platform!r}; "
            "expected one of win / mac / linux.\n"
        )

    if not BUNDLE_DIR.is_dir():
        sys.stderr.write(
            f"Bundle dir not found at {BUNDLE_DIR}.\n"
            "Run ``pyinstaller build/datatools.spec --clean --noconfirm`` first.\n"
        )
        return 1

    out_stem = DIST_DIR / f"DataTools-{version}-{platform}-portable"
    # ``make_archive`` takes a base name (no extension) and produces
    # ``<base>.zip``. ``root_dir`` = parent of what we want compressed,
    # ``base_dir`` = the folder name inside the archive root. This
    # combo yields a single top-level ``DataTools/`` directory inside
    # the .zip rather than dumping its contents loose.
    archive = shutil.make_archive(
        base_name=str(out_stem),
        format="zip",
        root_dir=str(DIST_DIR),
        base_dir="DataTools",
    )
    size_mb = Path(archive).stat().st_size / (1024 * 1024)
    print(f"wrote {archive} ({size_mb:.1f} MB)")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
@@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
# -*- mode: python ; coding: utf-8 -*-
|
# -*- mode: python ; coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from PyInstaller.utils.hooks import (
|
from PyInstaller.utils.hooks import (
|
||||||
collect_all,
|
collect_all,
|
||||||
@@ -58,6 +59,15 @@ hidden_imports += collect_submodules("charset_normalizer")
|
|||||||
hidden_imports += collect_submodules("openpyxl")
|
hidden_imports += collect_submodules("openpyxl")
|
||||||
hidden_imports += collect_submodules("loguru")
|
hidden_imports += collect_submodules("loguru")
|
||||||
|
|
||||||
|
# PDF Extractor stack. ``pypdfium2`` has its own PyInstaller hook
|
||||||
|
# under ``build/hooks/`` that pulls in the native PDFium binary —
|
||||||
|
# keep the ``collect_submodules`` calls here for belt-and-braces.
|
||||||
|
hidden_imports += collect_submodules("pdfplumber")
|
||||||
|
hidden_imports += collect_submodules("pdfminer")
|
||||||
|
hidden_imports += collect_submodules("pypdfium2")
|
||||||
|
hidden_imports += collect_submodules("PIL")
|
||||||
|
hidden_imports += collect_submodules("pytesseract")
|
||||||
|
|
||||||
# Our own engine + GUI modules. Even though we import them directly
|
# Our own engine + GUI modules. Even though we import them directly
|
||||||
# at the top of ``launcher.py`` / ``app.py``, the Streamlit
|
# at the top of ``launcher.py`` / ``app.py``, the Streamlit
|
||||||
# session-state and per-page page discovery layers re-import via
|
# session-state and per-page page discovery layers re-import via
|
||||||
@@ -77,6 +87,14 @@ datas += collect_data_files("streamlit", include_py_files=False)
|
|||||||
# phonenumbers ships its country/area-code metadata as resources.
|
# phonenumbers ships its country/area-code metadata as resources.
|
||||||
datas += collect_data_files("phonenumbers", include_py_files=False)
|
datas += collect_data_files("phonenumbers", include_py_files=False)
|
||||||
|
|
||||||
|
# PDF Extractor data files. ``pypdfium2`` ships a native PDFium
|
||||||
|
# shared library (``.dll`` / ``.so`` / ``.dylib``) under its package
|
||||||
|
# dir; ``pdfminer`` ships the Adobe CMap tables it uses for
|
||||||
|
# character mapping. The drawable-canvas frontend bundle is gone
|
||||||
|
# now that the visual picker was removed.
|
||||||
|
datas += collect_data_files("pypdfium2", include_py_files=False)
|
||||||
|
datas += collect_data_files("pdfminer", include_py_files=False)
|
||||||
|
|
||||||
# Our application files. PyInstaller's bundler treats source as code
|
# Our application files. PyInstaller's bundler treats source as code
|
||||||
# (.pyc) by default; we add it again as data so the launcher's
|
# (.pyc) by default; we add it again as data so the launcher's
|
||||||
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
|
# ``Path(sys._MEIPASS) / "src" / "gui" / "app.py"`` resolution works.
|
||||||
@@ -86,6 +104,78 @@ datas += [
|
|||||||
(str(REPO / ".streamlit" / "config.toml"),".streamlit"),
|
(str(REPO / ".streamlit" / "config.toml"),".streamlit"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# ----- Tesseract OCR bundle ----------------------------------------
|
||||||
|
# ``build/make_release.py`` stages the per-platform Tesseract binary
|
||||||
|
# + its runtime libs (DLLs/dylibs/sos) into
|
||||||
|
# ``build/_tesseract/<target>/`` and the shared eng.traineddata into
|
||||||
|
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
|
||||||
|
# drops them at the path the runtime expects:
|
||||||
|
#
|
||||||
|
# <bundle>/tesseract/tesseract[.exe]
|
||||||
|
# <bundle>/tesseract/<all dll/dylib/so deps>
|
||||||
|
# <bundle>/tesseract/tessdata/eng.traineddata
|
||||||
|
#
|
||||||
|
# The runtime discovery code in src/pdf_extract.py reads this layout
|
||||||
|
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
|
||||||
|
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
|
||||||
|
#
|
||||||
|
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
|
||||||
|
# the right per-platform dir before invoking PyInstaller. For ad-hoc
|
||||||
|
# `pyinstaller build/datatools.spec` runs without the orchestrator,
|
||||||
|
# fall back to the canonical staging path.
|
||||||
|
# Resolve the Tesseract staging dir: the orchestrator's env var wins,
# otherwise guess the per-host dir under build/_tesseract/.
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
if _tess_staging_env:
    _tess_staging = Path(_tess_staging_env)
else:
    # Pick the obvious per-host staging dir as a fallback so spec-only
    # builds (without the orchestrator) still work in dev.
    import sys as _sys_for_target
    _target_guess = (
        "win" if _sys_for_target.platform.startswith("win")
        else "mac" if _sys_for_target.platform == "darwin"
        else "linux"
    )
    _tess_staging = REPO / "build" / "_tesseract" / _target_guess

_tessdata = REPO / "build" / "vendor" / "tessdata"

if _tess_staging.is_dir() and any(_tess_staging.iterdir()):
    # Drop every file in the staging dir directly under
    # ``<bundle>/tesseract/`` (binary + DLL/dylib/so siblings).
    datas += [(str(_tess_staging), "tesseract")]
    # A non-empty dir is not proof the binary is staged: stray files
    # (.DS_Store, a partial fetch) pass the check above. Warn when
    # neither launcher name is present so the gap is visible at build
    # time instead of surfacing as a runtime OCR failure.
    if not any(
        (_tess_staging / _name).is_file()
        for _name in ("tesseract", "tesseract.exe")
    ):
        print(
            f"WARNING: {_tess_staging} contains no tesseract[.exe] "
            "binary — OCR will fail at runtime. Re-run "
            "build/make_release.py to re-stage it."
        )
else:
    # Don't hard-fail spec parse — useful for first-time devs running
    # PyInstaller before fetching binaries. Surface a loud warning
    # though, since the OCR feature will silently fail at runtime.
    print(
        f"WARNING: {_tess_staging} is empty or missing — OCR will be "
        "disabled in the bundle. Run build/make_release.py (which "
        "calls fetch_tesseract_for_platform) before pyinstaller, or "
        "pre-stage the binary manually."
    )

if (_tessdata / "eng.traineddata").exists():
    datas += [(str(_tessdata), "tesseract/tessdata")]
else:
    print(
        f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
        "have no language data at runtime. Run build/make_release.py "
        "or fetch manually per build/vendor/README.md."
    )

# Bundle the Apache-2.0 LICENSE text alongside the binary. The docs
# agent maintains LICENSE_TESSERACT.txt at the repo root; PyInstaller
# drops it at the bundle root next to DataTools[.exe].
_tess_license = REPO / "LICENSE_TESSERACT.txt"
if _tess_license.exists():
    datas += [(str(_tess_license), ".")]
else:
    print(
        "WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
        "by Apache-2.0 for redistribution; the docs agent should "
        "create it. Continuing without it for now."
    )
|
||||||
|
|
||||||
# ----- Analysis ------------------------------------------------------
|
# ----- Analysis ------------------------------------------------------
|
||||||
|
|
||||||
a = Analysis(
|
a = Analysis(
|
||||||
@@ -141,6 +231,13 @@ coll = COLLECT(
|
|||||||
|
|
||||||
# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
|
# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
|
||||||
# this block is a no-op on Win/Linux.
|
# this block is a no-op on Win/Linux.
|
||||||
|
#
|
||||||
|
# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire
|
||||||
|
# COLLECT output (binaries + datas) into the .app's
|
||||||
|
# Contents/Resources tree, so the ``tesseract/`` subdir we built up
|
||||||
|
# in ``datas`` lands at ``DataTools.app/Contents/Resources/tesseract/``
|
||||||
|
# and the runtime ``sys._MEIPASS`` resolves there. No extra plumbing
|
||||||
|
# needed.
|
||||||
import sys as _sys
|
import sys as _sys
|
||||||
if _sys.platform == "darwin":
|
if _sys.platform == "darwin":
|
||||||
app = BUNDLE(
|
app = BUNDLE(
|
||||||
|
|||||||
78
build/generate_icons.py
Normal file
78
build/generate_icons.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
"""Generate platform-specific app icons from the source PNG asset.
|
||||||
|
|
||||||
|
Outputs:
|
||||||
|
build/icon.ico Windows multi-resolution icon (16..256 px sizes).
|
||||||
|
build/icon.icns macOS icon bundle (16..1024 px scaled tiers).
|
||||||
|
build/icon.png Plain 256x256 PNG used by the Linux AppImage.
|
||||||
|
|
||||||
|
Source: ``src/gui/assets/datatools_icon_256.png`` (the same icon
|
||||||
|
``st.set_page_config`` uses, so the installer / Dock / Taskbar match
|
||||||
|
the in-app tab favicon).
|
||||||
|
|
||||||
|
Run manually:
|
||||||
|
python build/generate_icons.py
|
||||||
|
|
||||||
|
CI runs this automatically before invoking PyInstaller (see
|
||||||
|
``.github/workflows/build.yml``). All three files are .gitignored — they
|
||||||
|
are build artifacts derived from the committed PNG.
|
||||||
|
|
||||||
|
Self-contained: pulls only Pillow (already a transitive dep of
|
||||||
|
``pdfplumber``) so no extra installs are required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
# Repo layout: this script lives at <REPO>/build/. The source PNG is at
|
||||||
|
# <REPO>/src/gui/assets/datatools_icon_256.png.
|
||||||
|
BUILD_DIR = Path(__file__).resolve().parent
|
||||||
|
REPO = BUILD_DIR.parent
|
||||||
|
SOURCE_PNG = REPO / "src" / "gui" / "assets" / "datatools_icon_256.png"
|
||||||
|
|
||||||
|
# Windows ICO needs every size the OS might render at: taskbar (16/24),
|
||||||
|
# Start Menu (32/48), tile (64/128), shell properties dialog (256).
|
||||||
|
ICO_SIZES = [(16, 16), (24, 24), (32, 32), (48, 48), (64, 64),
|
||||||
|
(128, 128), (256, 256)]
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Write ``icon.ico``, ``icon.icns`` and ``icon.png`` into ``build/``.

    All three are rendered from ``SOURCE_PNG`` after converting it to
    RGBA. A source smaller than 256x256 only triggers a warning on
    stderr; generation still proceeds.

    Returns:
        0 on success, 1 when the source PNG does not exist.
    """
    if not SOURCE_PNG.exists():
        sys.stderr.write(
            f"Source icon not found at {SOURCE_PNG}.\n"
            "Add a 256x256 (or larger) RGBA PNG there and re-run.\n"
        )
        return 1

    # ``Image.open`` is lazy and keeps the underlying file open.
    # ``convert`` loads the pixel data into a new image, so the handle
    # can be released as soon as the conversion is done.
    with Image.open(SOURCE_PNG) as raw:
        src = raw.convert("RGBA")

    if src.size[0] < 256 or src.size[1] < 256:
        sys.stderr.write(
            f"Source icon is {src.size}; recommend 256x256 or larger "
            "so downscaled tiers look crisp.\n"
        )

    ico_path = BUILD_DIR / "icon.ico"
    src.save(ico_path, format="ICO", sizes=ICO_SIZES)
    print(f"wrote {ico_path} ({ico_path.stat().st_size:,} bytes)")

    icns_path = BUILD_DIR / "icon.icns"
    # Pillow's ICNS writer derives the per-tier sizes from the source
    # image, so no explicit ``sizes`` argument is passed here.
    # NOTE(review): the exact set of icon types written depends on the
    # installed Pillow version — confirm if a specific tier is required.
    src.save(icns_path, format="ICNS")
    print(f"wrote {icns_path} ({icns_path.stat().st_size:,} bytes)")

    # AppImage uses a plain PNG for its desktop entry. Re-save the
    # (RGBA-converted) source next to the other icons so the AppImage
    # build script doesn't have to know the asset path.
    png_path = BUILD_DIR / "icon.png"
    src.save(png_path, format="PNG")
    print(f"wrote {png_path} ({png_path.stat().st_size:,} bytes)")

    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
31
build/hooks/hook-pypdfium2.py
Normal file
31
build/hooks/hook-pypdfium2.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
"""PyInstaller hook for pypdfium2.
|
||||||
|
|
||||||
|
``pypdfium2`` ships the native PDFium shared library as a data file
|
||||||
|
inside its package directory (``pdfium``-prefixed ``.dll`` on
|
||||||
|
Windows, ``.so`` on Linux, ``.dylib`` on macOS). PyInstaller's
|
||||||
|
default discovery picks up Python ``.py``/``.pyc`` but can miss
|
||||||
|
the binary if the package is wheel-installed and the shared lib
|
||||||
|
isn't on the ``__init__``'s module-level path it scans.
|
||||||
|
|
||||||
|
This hook is belt-and-braces — the main spec already calls
|
||||||
|
``collect_data_files("pypdfium2")`` and ``collect_submodules``,
|
||||||
|
but PyInstaller's hook-discovery-by-name is the documented
|
||||||
|
escape hatch for native-bundled libraries. Without this, the
|
||||||
|
visual picker (which renders PDF pages via
|
||||||
|
``pypdfium2.PdfDocument(...).render(...)``) silently fails on
|
||||||
|
installed builds with a ``FileNotFoundError`` for the PDFium
|
||||||
|
shared library.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from PyInstaller.utils.hooks import (
    collect_all,
    collect_data_files,
    collect_dynamic_libs,
)

# ``collect_all`` gathers the package's data files, dynamic libraries and
# submodules in a single call.
datas, binaries, hiddenimports = collect_all("pypdfium2")

# Belt-and-braces: make absolutely sure the bundled PDFium .dll/.so/.dylib
# is carried over (PyInstaller treats it as a dynamic lib, not data).
# Only entries that ``collect_all`` did not already return are appended,
# so the same file is never listed twice.
binaries += [
    entry
    for entry in collect_dynamic_libs("pypdfium2")
    if entry not in binaries
]

# Same treatment for the package's non-Python data files.
datas += [
    entry
    for entry in collect_data_files("pypdfium2", include_py_files=False)
    if entry not in datas
]
|
||||||
@@ -1,11 +1,26 @@
|
|||||||
; Inno Setup script for DataTools — Windows installer.
|
; Inno Setup script for DataTools — Windows installer.
|
||||||
;
|
;
|
||||||
; Compile from the repo root:
|
; Compile from the repo root:
|
||||||
; iscc /DAppVersion=1.0.0 build\installer.iss
|
; iscc /DAppVersion=3.0 build\installer.iss
|
||||||
;
|
;
|
||||||
; CI passes the version via /DAppVersion to keep src/__init__.py the
|
; CI passes the version via /DAppVersion to keep src/__init__.py the
|
||||||
; single source of truth. Local manual builds: pass /DAppVersion or
|
; single source of truth. Local manual builds: pass /DAppVersion or
|
||||||
; let the default kick in.
|
; let the default kick in.
|
||||||
|
;
|
||||||
|
; What this installer wires up (covers the "easy launch" surface):
|
||||||
|
; * Start Menu group: Start → DataTools → DataTools / Uninstall
|
||||||
|
; * Desktop shortcut: optional, checked by default during install
|
||||||
|
; * Quick Launch: optional, off by default (legacy Win 7 + power
|
||||||
|
; users who keep the bar enabled). Windows 10/11
|
||||||
|
; users pin to taskbar manually via right-click —
|
||||||
|
; OS security policy forbids programmatic pinning.
|
||||||
|
; * App Paths entry: so ``DataTools`` typed into Win+R / cmd works.
|
||||||
|
;
|
||||||
|
; Self-contained: the installer contains a frozen PyInstaller bundle
|
||||||
|
; (Python + every runtime dep). No pre-install or post-install steps
|
||||||
|
; on the buyer's machine. UAC is NOT required because we install
|
||||||
|
; per-user by default; the prompt only fires if the buyer asks for an
|
||||||
|
; all-users install.
|
||||||
|
|
||||||
#ifndef AppVersion
|
#ifndef AppVersion
|
||||||
#define AppVersion "0.0.0-dev"
|
#define AppVersion "0.0.0-dev"
|
||||||
@@ -18,11 +33,15 @@ AppVersion={#AppVersion}
|
|||||||
AppVerName=DataTools {#AppVersion}
|
AppVerName=DataTools {#AppVersion}
|
||||||
AppPublisher=DataTools
|
AppPublisher=DataTools
|
||||||
AppPublisherURL=https://datatools.app
|
AppPublisherURL=https://datatools.app
|
||||||
|
AppSupportURL=https://datatools.app/support
|
||||||
|
AppUpdatesURL=https://datatools.app/releases
|
||||||
DefaultDirName={autopf}\DataTools
|
DefaultDirName={autopf}\DataTools
|
||||||
DefaultGroupName=DataTools
|
DefaultGroupName=DataTools
|
||||||
DisableProgramGroupPage=yes
|
DisableProgramGroupPage=yes
|
||||||
OutputDir=..\dist
|
OutputDir=..\dist
|
||||||
OutputBaseFilename=DataTools-{#AppVersion}-win-setup
|
OutputBaseFilename=DataTools-{#AppVersion}-win-setup
|
||||||
|
SetupIconFile=icon.ico
|
||||||
|
UninstallDisplayIcon={app}\DataTools.exe
|
||||||
Compression=lzma2/max
|
Compression=lzma2/max
|
||||||
SolidCompression=yes
|
SolidCompression=yes
|
||||||
WizardStyle=modern
|
WizardStyle=modern
|
||||||
@@ -30,20 +49,45 @@ ArchitecturesInstallIn64BitMode=x64
|
|||||||
PrivilegesRequired=lowest
|
PrivilegesRequired=lowest
|
||||||
PrivilegesRequiredOverridesAllowed=dialog
|
PrivilegesRequiredOverridesAllowed=dialog
|
||||||
; Allow per-user install (no UAC prompt) when admin isn't available.
|
; Allow per-user install (no UAC prompt) when admin isn't available.
|
||||||
|
; Buyers without admin rights can still install without IT involvement.
|
||||||
|
|
||||||
|
ChangesAssociations=no
|
||||||
|
CloseApplications=force
|
||||||
|
RestartApplications=no
|
||||||
|
|
||||||
[Languages]
|
[Languages]
|
||||||
Name: "english"; MessagesFile: "compiler:Default.isl"
|
Name: "english"; MessagesFile: "compiler:Default.isl"
|
||||||
|
|
||||||
[Tasks]
|
[Tasks]
|
||||||
Name: "desktopicon"; Description: "Create a &desktop shortcut"; GroupDescription: "Additional shortcuts:"
|
Name: "desktopicon"; Description: "Create a &desktop shortcut"; GroupDescription: "Additional shortcuts:"
|
||||||
|
Name: "quicklaunchicon"; Description: "Create a &Quick Launch shortcut"; GroupDescription: "Additional shortcuts:"; Flags: unchecked; OnlyBelowVersion: 6.1
|
||||||
|
|
||||||
[Files]
|
[Files]
|
||||||
|
; PyInstaller's dist/DataTools/ tree includes:
|
||||||
|
; * DataTools.exe + frozen Python runtime
|
||||||
|
; * tesseract/tesseract.exe + DLLs + tessdata/eng.traineddata
|
||||||
|
; (bundled via build/datatools.spec datas; runtime discovery in
|
||||||
|
; src/pdf_extract.py reads sys._MEIPASS / "tesseract" / ...).
|
||||||
|
; * LICENSE_TESSERACT.txt at the bundle root (Apache-2.0).
|
||||||
|
; The recursesubdirs flag below picks all of those up — no separate
|
||||||
|
; Files: entry needed for tesseract/.
|
||||||
Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion
|
Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion
|
||||||
|
|
||||||
[Icons]
|
[Icons]
|
||||||
Name: "{group}\DataTools"; Filename: "{app}\DataTools.exe"
|
; Start Menu entries — created unconditionally so the app is always
|
||||||
|
; discoverable via Start search.
|
||||||
|
Name: "{group}\DataTools"; Filename: "{app}\DataTools.exe"; IconFilename: "{app}\DataTools.exe"
|
||||||
Name: "{group}\Uninstall DataTools"; Filename: "{uninstallexe}"
|
Name: "{group}\Uninstall DataTools"; Filename: "{uninstallexe}"
|
||||||
Name: "{autodesktop}\DataTools"; Filename: "{app}\DataTools.exe"; Tasks: desktopicon
|
; Desktop shortcut — opt-in via the Tasks page.
|
||||||
|
Name: "{autodesktop}\DataTools"; Filename: "{app}\DataTools.exe"; IconFilename: "{app}\DataTools.exe"; Tasks: desktopicon
|
||||||
|
; Quick Launch (legacy) — the task is gated by OnlyBelowVersion: 6.1, so it is only offered on Windows versions older than Windows 7.
|
||||||
|
Name: "{userappdata}\Microsoft\Internet Explorer\Quick Launch\DataTools"; Filename: "{app}\DataTools.exe"; IconFilename: "{app}\DataTools.exe"; Tasks: quicklaunchicon
|
||||||
|
|
||||||
|
[Registry]
; App Paths — lets the buyer launch from Win+R or cmd with just
; "DataTools" instead of a full path. HKA resolves to HKCU for the
; default per-user install (no admin needed) and to HKLM when the buyer
; chooses an all-users install, so the entry always lands in the hive
; that matches the install mode instead of the elevating admin's HKCU.
Root: HKA; Subkey: "Software\Microsoft\Windows\CurrentVersion\App Paths\DataTools.exe"; ValueType: string; ValueName: ""; ValueData: "{app}\DataTools.exe"; Flags: uninsdeletekey
|
||||||
|
|
||||||
[Run]
|
[Run]
|
||||||
Filename: "{app}\DataTools.exe"; Description: "Launch DataTools"; Flags: nowait postinstall skipifsilent
|
Filename: "{app}\DataTools.exe"; Description: "Launch DataTools"; Flags: nowait postinstall skipifsilent
|
||||||
|
|||||||
@@ -10,6 +10,11 @@
|
|||||||
#
|
#
|
||||||
# Code signing + notarization happen separately (see build/README.md
|
# Code signing + notarization happen separately (see build/README.md
|
||||||
# "Signing"). This script only handles the packaging step.
|
# "Signing"). This script only handles the packaging step.
|
||||||
|
#
|
||||||
|
# Tesseract bundling: no-op here. The .app already contains
|
||||||
|
# Contents/Resources/tesseract/{tesseract, *.dylib, tessdata/} thanks
|
||||||
|
# to PyInstaller's BUNDLE() carrying the spec's datas through. This
|
||||||
|
# script just wraps the finished .app — no extra steps for OCR.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
|||||||
43
build/macos/build_zip.sh
Executable file
43
build/macos/build_zip.sh
Executable file
@@ -0,0 +1,43 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Wrap dist/DataTools.app into a no-install portable .zip.
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# bash build/macos/build_zip.sh <version>
|
||||||
|
#
|
||||||
|
# Why a portable .zip in addition to the .dmg:
|
||||||
|
# * Buyers who don't want an installer can unzip and double-click the
|
||||||
|
# .app directly — no drag-to-/Applications step, no installer
|
||||||
|
# chrome. Self-contained: the .app holds Python + every dep.
|
||||||
|
# * IT-locked-down machines often block .dmg auto-mount but allow
|
||||||
|
# .zip download + extraction.
|
||||||
|
#
|
||||||
|
# Run after ``pyinstaller build/datatools.spec --clean --noconfirm``
|
||||||
|
# has produced ``dist/DataTools.app``. Output goes to
|
||||||
|
# ``dist/DataTools-<version>-mac-portable.zip``.
|
||||||
|
#
|
||||||
|
# Tesseract bundling: no-op here. The bundled Tesseract binary +
|
||||||
|
# dylibs + tessdata are already inside DataTools.app/Contents/Resources/tesseract/
|
||||||
|
# (placed by PyInstaller's BUNDLE/datas mechanism). ``ditto -c -k``
|
||||||
|
# preserves the whole .app tree.
|
||||||
|
|
||||||
|
set -euo pipefail

VERSION="${1:-0.0.0-dev}"
APP="dist/DataTools.app"
ZIP="dist/DataTools-${VERSION}-mac-portable.zip"

# Nothing to package unless PyInstaller has already produced the .app.
if ! [[ -d "$APP" ]]; then
  echo "Error: $APP not found. Run pyinstaller build/datatools.spec first." >&2
  exit 1
fi

# Use ``ditto`` rather than plain ``zip``: it keeps the bundle's extended
# attributes and resource forks, which a plain ``zip`` strips (that can
# break code signatures + Info.plist resolution on the buyer's machine).
#
# --sequesterRsrc stores the AppleDouble metadata inside the archive
# instead of leaving parallel ._ files on disk after extraction.
# --keepParent makes the archive contain the .app directory itself.
rm -f "$ZIP"
ditto -c -k --sequesterRsrc --keepParent "$APP" "$ZIP"

printf 'Built %s (%s)\n' "$ZIP" "$(du -h "$ZIP" | cut -f1)"
|
||||||
757
build/make_release.py
Normal file
757
build/make_release.py
Normal file
@@ -0,0 +1,757 @@
|
|||||||
|
"""Single-command release builder for DataTools.
|
||||||
|
|
||||||
|
PyInstaller can't cross-compile — to produce a Windows .exe you run
|
||||||
|
this on Windows, for a Mac .dmg you run it on macOS, for a Linux
|
||||||
|
AppImage you run it on Linux. One script, one OS at a time.
|
||||||
|
|
||||||
|
What this script does (in order):
|
||||||
|
1. Preflight — checks PyInstaller, Pillow, and the platform's
|
||||||
|
packager (Inno Setup on Win / hdiutil + ditto on Mac /
|
||||||
|
appimagetool on Linux) are reachable. Bails with install
|
||||||
|
instructions if anything is missing.
|
||||||
|
2. Generates icon.ico / icon.icns / icon.png from the PNG asset.
|
||||||
|
3. Runs PyInstaller against build/datatools.spec.
|
||||||
|
4. Wraps the PyInstaller output into:
|
||||||
|
* Windows: DataTools-<ver>-win-setup.exe (Inno Setup)
|
||||||
|
+ DataTools-<ver>-win-portable.zip
|
||||||
|
* macOS: DataTools-<ver>-mac.dmg
|
||||||
|
+ DataTools-<ver>-mac-portable.zip
|
||||||
|
* Linux: DataTools-<ver>-linux-x86_64.AppImage
|
||||||
|
5. Prints what landed in dist/ and the byte sizes.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python build/make_release.py # build everything for this OS
|
||||||
|
python build/make_release.py --preflight # check tooling, don't build
|
||||||
|
python build/make_release.py --skip-installer # only the portable zip
|
||||||
|
python build/make_release.py --skip-portable # only the installer
|
||||||
|
python build/make_release.py --clean # wipe dist/ first
|
||||||
|
|
||||||
|
Run from the repo root or from build/ — either works.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import urllib.request
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Repository layout, resolved from this file's location so the script
# does not depend on the current working directory.
REPO = Path(__file__).resolve().parents[1]
BUILD = REPO / "build"
DIST = REPO / "dist"

# Tesseract bundling. The runtime discovery code in ``src/pdf_extract.py``
# looks for the binary at
# ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and for tessdata
# at ``... / "tesseract" / "tessdata" / "eng.traineddata"``. Everything is
# staged under ``build/_tesseract/<platform>/`` (gitignored); the
# PyInstaller spec adds that staging dir to ``datas=`` so it lands at the
# right place inside the frozen bundle.
TESSERACT_VERSION = "5.5.0"
TESSERACT_STAGING = BUILD / "_tesseract"
TESSDATA_DIR = BUILD / "vendor" / "tessdata"
TESSDATA_URL = "https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Output helpers — colourless so logs stay readable in any terminal/CI tail.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _step(msg: str) -> None:
|
||||||
|
print(f"\n==> {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _ok(msg: str) -> None:
|
||||||
|
print(f" ok: {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _warn(msg: str) -> None:
|
||||||
|
print(f" warn: {msg}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _err(msg: str) -> None:
|
||||||
|
print(f" ERROR: {msg}", file=sys.stderr, flush=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _run(cmd: list[str], cwd: Path | None = None, env: dict | None = None) -> None:
    """Echo and execute *cmd*, terminating the script if it fails.

    The child's output is not captured, so it streams straight to the
    console. *cwd* falls back to the repo root when not given; *env* is
    passed through to ``subprocess.run`` unchanged.

    Exits with the child's return code on a non-zero exit, or with 127
    when the executable cannot be found.
    """
    shown = " ".join(str(part) for part in cmd)
    print(f" $ {shown}", flush=True)
    workdir = cwd if cwd else REPO
    try:
        subprocess.run(cmd, check=True, cwd=workdir, env=env)
    except FileNotFoundError:
        _err(f"command not found: {cmd[0]}")
        sys.exit(127)
    except subprocess.CalledProcessError as failure:
        _err(f"command failed (exit {failure.returncode}): {shown}")
        sys.exit(failure.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Platform detection
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_platform() -> str:
|
||||||
|
"""Return ``win`` / ``mac`` / ``linux`` based on sys.platform."""
|
||||||
|
p = sys.platform
|
||||||
|
if p.startswith("win"):
|
||||||
|
return "win"
|
||||||
|
if p == "darwin":
|
||||||
|
return "mac"
|
||||||
|
if p.startswith("linux"):
|
||||||
|
return "linux"
|
||||||
|
_err(f"unsupported platform {p!r}; this script handles win/mac/linux only.")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Version — single source of truth in src/__init__.py
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _read_version() -> str:
    """Return the ``__version__`` string declared in ``src/__init__.py``.

    Exits with status 1 (after printing an error) when no
    ``__version__ = "..."`` assignment can be found.
    """
    source = REPO.joinpath("src", "__init__.py").read_text(encoding="utf-8")
    found = re.search(r'__version__\s*=\s*["\']([^"\']+)["\']', source)
    if found is not None:
        return found.group(1)
    _err("could not parse __version__ from src/__init__.py")
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preflight — check tooling before doing anything destructive
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _have_module(name: str) -> bool:
|
||||||
|
try:
|
||||||
|
__import__(name)
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _have_command(name: str) -> bool:
|
||||||
|
return shutil.which(name) is not None
|
||||||
|
|
||||||
|
|
||||||
|
# Per-platform install hints. The error messages quote these so a buyer
|
||||||
|
# building from source isn't left guessing what to install next.
|
||||||
|
_INSTALL_HINTS = {
|
||||||
|
"pyinstaller": "pip install pyinstaller",
|
||||||
|
"pil": "pip install pillow",
|
||||||
|
"iscc": "Inno Setup (Windows): https://jrsoftware.org/isdl.php — install, then re-open the shell so iscc lands on PATH.",
|
||||||
|
"hdiutil": "ships with macOS — if it's missing your Mac install is broken.",
|
||||||
|
"ditto": "ships with macOS — if it's missing your Mac install is broken.",
|
||||||
|
"appimagetool": "Linux: download appimagetool-x86_64.AppImage from https://github.com/AppImage/AppImageKit/releases, chmod +x, drop on PATH.",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def preflight(target: str) -> None:
    """Check that the tooling needed to build *target* is available.

    *target* is one of ``win`` / ``mac`` / ``linux``. Every check is
    reported as it runs; if anything required is missing, the full list
    is printed to stderr with install hints and the script exits with
    status 1.
    """
    _step(f"preflight ({target})")

    problems: list[tuple[str, str]] = []

    # Python packages are required on every platform. Hints are keyed by
    # the lowercase module name, so the import-name casing doesn't matter.
    for module_name in ("PyInstaller", "PIL"):
        if _have_module(module_name):
            _ok(f"{module_name} importable")
            continue
        key = module_name.lower()
        problems.append((key, _INSTALL_HINTS.get(key, f"pip install {module_name}")))

    # A missing ``pyinstaller`` executable is only a warning because
    # ``python -m PyInstaller`` works as a fallback.
    if not _have_command("pyinstaller"):
        _warn("pyinstaller binary not on PATH — will fall back to `python -m PyInstaller`")
    else:
        _ok("pyinstaller on PATH")

    # Packager executables per platform, paired with their success message.
    packagers = {
        "win": (("iscc", "Inno Setup (iscc) on PATH"),),
        "mac": (("hdiutil", "hdiutil on PATH"), ("ditto", "ditto on PATH")),
        "linux": (("appimagetool", "appimagetool on PATH"),),
    }
    for tool, ok_message in packagers.get(target, ()):
        if _have_command(tool):
            _ok(ok_message)
        else:
            problems.append((tool, _INSTALL_HINTS[tool]))

    if problems:
        _err("missing prerequisites:")
        for name, hint in problems:
            print(f" - {name}: {hint}", file=sys.stderr)
        sys.exit(1)

    _ok("all prerequisites present")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tesseract bundling — fetch the binary + tessdata at build time.
|
||||||
|
#
|
||||||
|
# We download (not vendor) because:
|
||||||
|
# * Binaries are large (5-40 MB per platform) and license-encumbered
|
||||||
|
# to keep current in git.
|
||||||
|
# * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
|
||||||
|
# bloats clones for contributors who don't touch OCR.
|
||||||
|
#
|
||||||
|
# Caching layout:
|
||||||
|
# build/_tesseract/win/tesseract.exe + DLLs
|
||||||
|
# build/_tesseract/mac/tesseract + dylibs
|
||||||
|
# build/_tesseract/linux/tesseract + libs
|
||||||
|
# build/vendor/tessdata/eng.traineddata (shared across platforms)
|
||||||
|
#
|
||||||
|
# The PyInstaller spec reads ``build/_tesseract/<platform>/`` and the
|
||||||
|
# tessdata dir, then bundles them under ``<bundle>/tesseract/``.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _download(url: str, dest: Path, *, expected_min_bytes: int = 1024) -> None:
    """Fetch *url* into *dest* via a temporary ``.part`` file.

    The payload is streamed to ``<dest>.part`` and only moved onto *dest*
    once it has been fully written and is at least *expected_min_bytes*
    long, so *dest* never holds a partial download.

    Raises:
        RuntimeError: the downloaded file is smaller than
            *expected_min_bytes*.
        Exception: any error raised while downloading is logged and
            re-raised unchanged.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_suffix(f"{dest.suffix}.part")
    print(f" GET {url}", flush=True)
    try:
        with urllib.request.urlopen(url, timeout=120) as response:
            with open(partial, "wb") as sink:
                shutil.copyfileobj(response, sink)
    except Exception as exc:  # noqa: BLE001 — bubble any network error up
        # Don't leave a truncated .part file behind.
        if partial.exists():
            partial.unlink()
        _err(f"download failed: {url}\n {exc}")
        raise

    received = partial.stat().st_size
    if received < expected_min_bytes:
        partial.unlink()
        raise RuntimeError(
            f"downloaded file too small ({received} bytes < {expected_min_bytes}); "
            f"the URL probably 404'd into an HTML error page."
        )

    partial.replace(dest)
    _ok(f"downloaded {dest.name} ({received / (1024 * 1024):.1f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tessdata() -> Path:
    """Return the path to ``eng.traineddata``, downloading it if needed.

    The file lives in ``TESSDATA_DIR`` and is shared by all platforms.
    An existing copy larger than 1,000,000 bytes is treated as a valid
    cache hit; otherwise the model is downloaded from ``TESSDATA_URL``.
    The runtime expects the file at
    ``<bundle>/tesseract/tessdata/eng.traineddata``; placing it there is
    the PyInstaller spec's job.
    """
    _step("fetch tessdata (eng.traineddata)")
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    model = TESSDATA_DIR / "eng.traineddata"

    cached = model.exists() and model.stat().st_size > 1_000_000
    if not cached:
        # The 3 MB floor is there to reject an HTML error page saved in
        # place of the model.
        _download(TESSDATA_URL, model, expected_min_bytes=3 * 1024 * 1024)
        return model

    size_mb = model.stat().st_size / (1024 * 1024)
    _ok(f"already cached: {model.relative_to(REPO)} ({size_mb:.1f} MB)")
    return model
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage ``tesseract.exe`` and its DLLs into *staging*.

    Steps:

    1. Download the pinned UB-Mannheim installer into
       ``TESSERACT_STAGING`` unless a copy is already cached there.
    2. Unpack it with 7-Zip (looked up on PATH, then at the default
       ``C:\\Program Files\\7-Zip\\7z.exe`` location) into a fresh
       ``win_extract`` directory.
    3. Walk the unpacked tree and copy ``tesseract.exe`` plus every
       ``.dll`` into *staging*. Nothing else is copied.

    NOTE(review): this relies on 7-Zip being able to unpack the
    installer format UB-Mannheim ships — confirm when bumping the pin.

    Raises:
        FileNotFoundError: 7-Zip could not be located.
        RuntimeError: the unpacked installer contains no ``tesseract.exe``.
    """
    # Pin an exact build revision so the download URL is stable.
    rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    installer_name = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{rev}.exe"
    installer = TESSERACT_STAGING / installer_name
    if not installer.exists():
        _download(
            f"https://digi.bib.uni-mannheim.de/tesseract/{installer_name}",
            installer,
            expected_min_bytes=20 * 1024 * 1024,
        )

    # Locate 7-Zip: PATH first, then the default install location.
    sevenz = (
        shutil.which("7z")
        or shutil.which("7z.exe")
        or r"C:\Program Files\7-Zip\7z.exe"
    )
    if not (Path(sevenz).exists() or shutil.which("7z")):
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{installer} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    # Always unpack into an empty directory.
    unpacked = TESSERACT_STAGING / "win_extract"
    if unpacked.exists():
        shutil.rmtree(unpacked)
    unpacked.mkdir(parents=True)
    _run([str(sevenz), "x", "-y", f"-o{unpacked}", str(installer)])

    staging.mkdir(parents=True, exist_ok=True)
    # No assumption is made about where the payload sits inside the
    # extraction: the whole tree is searched.
    found_exe = False
    for folder, _subdirs, filenames in os.walk(unpacked):
        for filename in filenames:
            lowered = filename.lower()
            source = Path(folder) / filename
            if lowered == "tesseract.exe":
                shutil.copy2(source, staging / "tesseract.exe")
                found_exe = True
            elif lowered.endswith(".dll"):
                shutil.copy2(source, staging / filename)

    if not found_exe:
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {unpacked}"
        )
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage tesseract + dylibs into *staging* on macOS.

    Strategy: install tesseract through Homebrew, copy the binary and
    every Homebrew-provided dylib it links against (directly or
    transitively) into *staging*, then use ``install_name_tool`` to
    rewrite all Homebrew load paths — in the binary AND in each copied
    dylib — to ``@loader_path/<name>`` so the set works after relocation
    into the .app bundle.

    Raises:
        FileNotFoundError: ``brew`` is not on PATH.
        RuntimeError: tesseract is not on PATH after the install, or
            ``otool`` fails on the staged binary.

    NOTE(review): rewriting load commands modifies the Mach-O files and
    may invalidate an existing code signature — confirm that a later
    build step re-signs the staged files.
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")

    # ``brew install`` is idempotent — fine to run on every build. The
    # version is not pinned through brew (brew tracks its own taps) and
    # no version check is performed here, so the staged build may differ
    # from TESSERACT_VERSION.
    _run(["brew", "install", "tesseract"])

    # Find the binary brew just installed.
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    # Homebrew lives under /opt/homebrew/ (Apple Silicon) or /usr/local/
    # (Intel). Anything else (/usr/lib/*, /System/*) ships with macOS and
    # is left alone.
    brew_prefixes = ("/opt/homebrew/", "/usr/local/")

    def _make_writable(path: Path) -> None:
        # Files copied out of a Homebrew keg are typically read-only, and
        # copy2 preserves that mode; install_name_tool needs write access.
        path.chmod(path.stat().st_mode | 0o200)

    def _brew_links(macho) -> list[str]:
        """Return the Homebrew-prefixed paths listed by ``otool -L``."""
        listing = subprocess.run(
            ["otool", "-L", str(macho)],
            check=True, capture_output=True, text=True,
        )
        # First output line is the file name itself, not a dependency.
        names = (ln.strip().split(" ", 1)[0] for ln in listing.stdout.splitlines()[1:])
        return [name for name in names if name.startswith(brew_prefixes)]

    def _retarget(target: Path, *args: str) -> None:
        """Run ``install_name_tool`` on *target*; warn rather than abort on failure."""
        try:
            subprocess.run(
                ["install_name_tool", *args, str(target)],
                check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as e:
            _warn(
                f"install_name_tool {' '.join(args)} failed on "
                f"{target.name}: {(e.stderr or '').strip()}"
            )

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)
    _make_writable(bin_path)

    try:
        deps = _brew_links(bin_path)
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"otool failed: {e.stderr}") from e

    # Tracked by file name: every dylib lands flat in *staging*, so two
    # paths with the same name refer to the same staged file.
    copied: set[str] = set()

    def _copy_with_deps(libpath: str) -> None:
        """Copy *libpath* and, recursively, its Homebrew deps; relocate them."""
        lib_name = Path(libpath).name
        if lib_name in copied or not Path(libpath).exists():
            return
        copied.add(lib_name)
        dest = staging / lib_name
        shutil.copy2(libpath, dest)
        _make_writable(dest)
        # Give the staged copy a relocatable install name.
        _retarget(dest, "-id", f"@loader_path/{lib_name}")
        try:
            sub_deps = _brew_links(libpath)
        except subprocess.CalledProcessError as e:
            _warn(f"otool failed on {libpath}: {(e.stderr or '').strip()}")
            return
        for sub_path in sub_deps:
            sub_name = Path(sub_path).name
            if sub_name == lib_name:
                # otool also lists the library's own install name.
                continue
            _copy_with_deps(sub_path)
            # Point this dylib at the staged copy of its dependency;
            # without this it would still load from the Homebrew prefix.
            _retarget(dest, "-change", sub_path, f"@loader_path/{sub_name}")

    for dep in deps:
        _copy_with_deps(dep)
        # Rewrite the tesseract binary's own reference so it finds the
        # dependency inside the bundle.
        _retarget(bin_path, "-change", dep, f"@loader_path/{Path(dep).name}")

    _ok(f"staged macOS tesseract + {len(copied)} dylibs into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage the tesseract binary and its shared libraries into *staging* on Linux.

    Strategy: use the ``tesseract`` already on ``PATH`` (preinstalled on
    most ubuntu-latest images). Only when it is missing and ``apt-get``
    is available do we run
    ``apt-get install tesseract-ocr libtesseract5``. Then copy the
    binary plus every ``.so`` that ``ldd`` resolves for it (minus the
    core system libraries in ``SKIP_PREFIXES``) into *staging*. When
    ``patchelf`` is installed, the binary's RPATH is rewritten to
    ``$ORIGIN`` so the bundle is relocatable.

    Args:
        staging: Destination directory; created if it does not exist.

    Raises:
        FileNotFoundError: Neither ``apt-get`` nor ``tesseract`` is on
            ``PATH``.
        RuntimeError: ``tesseract`` is still not on ``PATH`` after the
            install attempt, or ``ldd`` exits non-zero.
    """
    if not shutil.which("apt-get") and not shutil.which("tesseract"):
        _err(
            "Neither apt-get nor a pre-installed tesseract found. On "
            "ubuntu-latest runners both are present. On other distros "
            "install tesseract-ocr via your package manager and re-run "
            "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
        )
        raise FileNotFoundError("tesseract")

    # Install only when tesseract is not already available.
    if shutil.which("apt-get") and not shutil.which("tesseract"):
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])

    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    shutil.copy2(tess_path, staging / "tesseract")

    # Collect .so dependencies via ldd. Skip the dynamic linker and the
    # core glibc pieces (libc/libpthread/libdl/libm/librt/libnsl/libutil)
    # — those are expected on every Linux target and shipping them can
    # cause GLIBC mismatch errors on older distros. The interesting
    # tesseract-specific deps are libtesseract, libleptonica, and the
    # image format libs (libpng, libjpeg, libtiff, libwebp, libgif).
    # NOTE(review): the previous wording of this comment also listed
    # libstdc++/libgcc_s as skipped, but they are not in SKIP_PREFIXES,
    # so they are still bundled — confirm which behaviour is intended.
    SKIP_PREFIXES = (
        "linux-vdso", "/lib64/ld-linux", "/lib/ld-linux",
        "libc.so", "libdl.so", "libpthread.so", "libm.so",
        "librt.so", "libnsl.so", "libutil.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(staging / "tesseract")],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    copied = 0
    for line in ldd.stdout.splitlines():
        # Format: " libfoo.so.N => /path/to/libfoo.so.N (0x...)"
        parts = line.split("=>")
        if len(parts) != 2:
            continue
        soname = parts[0].strip()
        if soname.startswith(SKIP_PREFIXES):
            continue
        resolved = parts[1].strip()
        path_part = resolved.split(" ", 1)[0]
        if not path_part or not Path(path_part).exists():
            # ldd prints "libfoo.so => not found" for unresolved deps.
            # Surface that instead of silently shipping an incomplete
            # bundle; staging itself stays best-effort.
            if resolved.startswith("not found"):
                _warn(f"ldd could not resolve {soname} — it will not be bundled")
            continue
        shutil.copy2(path_part, staging / Path(path_part).name)
        copied += 1

    # patchelf is optional — if present, rewrite RPATH to $ORIGIN so
    # the binary finds its bundled .so files. If absent, the
    # PyInstaller LD_LIBRARY_PATH that the launcher sets will cover
    # it (we already chdir into _MEIPASS for the runtime).
    if shutil.which("patchelf"):
        try:
            _run(["patchelf", "--set-rpath", "$ORIGIN", str(staging / "tesseract")])
        except SystemExit:
            # Presumably _run exits the process on a failed command;
            # treat a failed rewrite as non-fatal.
            _warn("patchelf rpath rewrite failed — relying on LD_LIBRARY_PATH at runtime")

    _ok(f"staged Linux tesseract + {copied} .so files into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tesseract_for_platform(target: str) -> Path:
    """Make sure the Tesseract binary for *target* is staged and return its dir.

    The staging dir is ``TESSERACT_STAGING / target``. Three outcomes:

    * ``TESSERACT_SKIP_FETCH=1`` — nothing is fetched; the binary must
      already be in the staging dir (pre-staged by hand for offline
      builds, proxies, custom tesseract builds, ...), otherwise the
      script reports the problem and exits with status 1.
    * The binary is already staged — reuse it.
    * Otherwise run the per-platform fetcher (``win`` / ``mac`` /
      ``linux``); an unrecognised target exits with status 2, and a
      fetcher that finishes without producing the binary exits with
      status 1.

    Returns:
        The staging directory path.
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_path = staging / ("tesseract.exe" if target == "win" else "tesseract")

    skip_requested = os.environ.get("TESSERACT_SKIP_FETCH") == "1"
    if skip_requested and not exe_path.exists():
        _err(
            f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
            "Pre-stage the binary + its libs into that dir, then re-run."
        )
        sys.exit(1)
    if skip_requested:
        _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
        return staging

    if exe_path.exists():
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    # Per-platform fetchers, looked up at call time.
    fetchers = {
        "win": _fetch_tesseract_windows,
        "mac": _fetch_tesseract_macos,
        "linux": _fetch_tesseract_linux,
    }
    fetcher = fetchers.get(target)
    if fetcher is None:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)
    fetcher(staging)

    # The fetchers are best-effort in places, so verify the result.
    if exe_path.exists():
        return staging
    _err(
        f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
        "Inspect the logs above; you may need to pre-stage the binary manually."
    )
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Build steps
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def step_generate_icons() -> None:
    """Run ``generate_icons.py`` from the build dir with the current interpreter."""
    _step("generate icons")
    icon_script = BUILD / "generate_icons.py"
    _run([sys.executable, str(icon_script)])
|
||||||
|
|
||||||
|
|
||||||
|
def step_pyinstaller(clean: bool, *, target: str | None = None) -> None:
    """Run PyInstaller against ``datatools.spec``.

    Args:
        clean: When true, pass ``--clean`` to PyInstaller.
        target: Platform key. When given, the matching tesseract staging
            dir is exported to the spec as ``DATATOOLS_TESS_STAGING``.
    """
    _step("pyinstaller bundle")

    # ``python -m PyInstaller`` avoids depending on the PyInstaller
    # binary being on PATH (common on Windows, where pip's Scripts/ dir
    # isn't auto-added).
    spec_file = BUILD / "datatools.spec"
    argv = [sys.executable, "-m", "PyInstaller", str(spec_file), "--noconfirm"]
    if clean:
        argv.append("--clean")

    # The spec reads ``DATATOOLS_TESS_STAGING`` to locate the
    # per-platform tesseract staging dir. Handing it over through the
    # environment keeps the spec platform-agnostic — the orchestrator
    # has already worked out win/mac/linux.
    child_env = dict(os.environ)
    if target:
        child_env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target)

    _run(argv, env=child_env)
|
||||||
|
|
||||||
|
|
||||||
|
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested Windows artifacts and return their expected paths.

    Args:
        version: Version string embedded in the artifact file names.
        do_installer: Build the Inno Setup installer.
        do_portable: Build the portable ``.zip``.

    Returns:
        Expected output paths under ``DIST``, installer first.
    """
    artifacts: list[Path] = []

    if do_installer:
        _step("Windows installer (Inno Setup)")
        iss_script = BUILD / "installer.iss"
        _run(["iscc", f"/DAppVersion={version}", str(iss_script)])
        artifacts.append(DIST / f"DataTools-{version}-win-setup.exe")

    if do_portable:
        _step("Windows portable .zip")
        zip_script = BUILD / "build_portable_zip.py"
        _run([sys.executable, str(zip_script), "win", version])
        artifacts.append(DIST / f"DataTools-{version}-win-portable.zip")

    return artifacts
|
||||||
|
|
||||||
|
|
||||||
|
def step_package_mac(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the requested macOS artifacts and return their expected paths.

    Args:
        version: Version string passed to the shell scripts and embedded
            in the artifact file names.
        do_installer: Build the DMG.
        do_portable: Build the portable ``.zip``.

    Returns:
        Expected output paths under ``DIST``, DMG first.
    """
    mac_scripts = BUILD / "macos"
    artifacts: list[Path] = []

    if do_installer:
        _step("macOS DMG (installer)")
        _run(["bash", str(mac_scripts / "build_dmg.sh"), version])
        artifacts.append(DIST / f"DataTools-{version}-mac.dmg")

    if do_portable:
        _step("macOS portable .zip")
        _run(["bash", str(mac_scripts / "build_zip.sh"), version])
        artifacts.append(DIST / f"DataTools-{version}-mac-portable.zip")

    return artifacts
|
||||||
|
|
||||||
|
|
||||||
|
def step_package_linux(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
    """Build the Linux AppImage and return its expected path.

    On Linux the AppImage serves as both installer and portable build,
    so the two flags are not handled separately: if either is set the
    single AppImage is produced, and only when both are off is the step
    skipped.

    Returns:
        A one-element list with the expected AppImage path under
        ``DIST``, or an empty list when both flags are false.
    """
    if not do_installer and not do_portable:
        return []

    _step("Linux AppImage")
    build_script = BUILD / "appimage" / "build.sh"
    _run(["bash", str(build_script), version])
    appimage = DIST / f"DataTools-{version}-linux-x86_64.AppImage"
    return [appimage]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Orchestration
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _summarise(outputs: list[Path]) -> None:
    """Print each expected artifact with its size, warning about any that are missing.

    Args:
        outputs: Expected artifact paths; an empty list produces a
            single "nothing produced" warning.
    """
    _step("done — outputs")
    if len(outputs) == 0:
        _warn("no files produced (everything skipped via flags)")
        return

    bytes_per_mb = 1 << 20
    for artifact in outputs:
        if not artifact.exists():
            _warn(f"expected output missing: {artifact.relative_to(REPO)}")
            continue
        size_mb = artifact.stat().st_size / bytes_per_mb
        print(f" {artifact.relative_to(REPO)} ({size_mb:.1f} MB)")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Parse the CLI flags and run the release pipeline for one platform.

    Pipeline order: preflight checks, optional wipe of ``DIST``, icon
    generation, tessdata + tesseract staging, PyInstaller bundle, then
    the platform-specific packaging step and a summary of the outputs.

    Returns:
        ``0`` on success, including the ``--preflight``-only run. The
        individual steps are presumably responsible for aborting the
        process on failure — nothing here inspects their results.
    """
    parser = argparse.ArgumentParser(
        prog="make_release.py",
        description=(
            "Build the installer + portable zip for the current OS. "
            "Cross-compilation isn't supported by PyInstaller — run "
            "this once per platform you want to target."
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--platform", choices=("auto", "win", "mac", "linux"), default="auto",
        help="Override OS detection (mostly for testing). Default: auto.",
    )
    parser.add_argument(
        "--preflight", action="store_true",
        help="Check tooling and exit without building.",
    )
    parser.add_argument(
        "--clean", action="store_true",
        help="Wipe dist/ before building.",
    )
    parser.add_argument(
        "--skip-installer", action="store_true",
        help="Don't build the OS installer (.exe / .dmg).",
    )
    parser.add_argument(
        "--skip-portable", action="store_true",
        help="Don't build the portable .zip.",
    )
    args = parser.parse_args()

    # Detect the host once and reuse it for both the default target and
    # the override warning below.
    host = _detect_platform()
    target = host if args.platform == "auto" else args.platform
    version = _read_version()
    do_installer = not args.skip_installer
    do_portable = not args.skip_portable

    print("DataTools release builder")
    print(f" target: {target} (host: {platform.platform()})")
    print(f" version: {version}")
    print(f" installer: {'yes' if do_installer else 'no'}")
    print(f" portable: {'yes' if do_portable else 'no'}")
    print(f" dist dir: {DIST}")

    if target != host:
        _warn(
            f"--platform {target} but host is {host}. "
            "PyInstaller can't cross-compile — the bundle will be for "
            "the HOST, only the packaging step will follow your override. "
            "Useful only for testing the packager paths."
        )

    preflight(target)
    if args.preflight:
        return 0

    if args.clean and DIST.exists():
        _step(f"cleaning {DIST}")
        shutil.rmtree(DIST)

    step_generate_icons()

    # Stage Tesseract OCR before PyInstaller runs. The spec reads
    # ``build/_tesseract/<target>/`` + ``build/vendor/tessdata/`` and
    # bundles them under ``<bundle>/tesseract/`` so the runtime
    # discovery in src/pdf_extract.py finds them at:
    #   Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"
    #   Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"
    fetch_tessdata()
    fetch_tesseract_for_platform(target)

    step_pyinstaller(clean=args.clean, target=target)

    # Any target other than win/mac is packaged as Linux.
    if target == "win":
        outputs = step_package_win(version, do_installer, do_portable)
    elif target == "mac":
        outputs = step_package_mac(version, do_installer, do_portable)
    else:
        outputs = step_package_linux(version, do_installer, do_portable)

    _summarise(outputs)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|
||||||
62
build/vendor/README.md
vendored
Normal file
62
build/vendor/README.md
vendored
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# build/vendor/ — third-party bundle inputs (fetched at build time)
|
||||||
|
|
||||||
|
This tree holds the third-party assets that get bundled into the
|
||||||
|
PyInstaller artifacts but that we deliberately do **not** keep in git
|
||||||
|
(too large / license-encumbered / re-fetchable on demand).
|
||||||
|
|
||||||
|
The build pipeline (`build/make_release.py`) populates everything in
|
||||||
|
here before the PyInstaller step. The contents are git-ignored except
|
||||||
|
for this README.
|
||||||
|
|
||||||
|
## tessdata/
|
||||||
|
|
||||||
|
Holds the Tesseract language data file(s) used by the PDF Extractor
|
||||||
|
OCR fallback. Only English is bundled today.
|
||||||
|
|
||||||
|
### Canonical source
|
||||||
|
|
||||||
|
We use the **"best" model** from `tesseract-ocr/tessdata_best` (LSTM,
|
||||||
|
slower but higher accuracy than the legacy `tessdata` set, and only
|
||||||
|
~12 MB compressed → ~16 MB uncompressed):
|
||||||
|
|
||||||
|
```
|
||||||
|
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
There is also `tessdata_fast/` (~4 MB, lower accuracy) if you ever
|
||||||
|
want to optimise for bundle size over recognition quality. For bank
|
||||||
|
statements (the only OCR use case so far), the extra accuracy of the
|
||||||
|
`_best` model is worth the extra ~12 MB.
|
||||||
|
|
||||||
|
### Why we don't vendor it in git
|
||||||
|
|
||||||
|
* ~16 MB binary file — bloats clone times for everyone, including
|
||||||
|
contributors who never touch the OCR code path.
|
||||||
|
* Apache-2.0-licensed and stable; the file rarely changes upstream
|
||||||
|
(last touched 2021), so a build-time fetch is safe.
|
||||||
|
* The Tesseract project explicitly distributes these via GitHub
|
||||||
|
raw URLs — they're meant to be downloaded, not redistributed
|
||||||
|
through other repos.
|
||||||
|
|
||||||
|
### How it gets populated
|
||||||
|
|
||||||
|
`build/make_release.py::fetch_tessdata()` checks for
|
||||||
|
`build/vendor/tessdata/eng.traineddata` on every run. If it's
|
||||||
|
missing, the script downloads it from the canonical URL above and
|
||||||
|
caches it here. Subsequent builds reuse the cached file.
|
||||||
|
|
||||||
|
On CI, the directory is restored from the GitHub Actions cache so we
|
||||||
|
don't pay the download cost on every run (`.github/workflows/build.yml`
|
||||||
|
caches `build/vendor/tessdata/` keyed on the URL above).
|
||||||
|
|
||||||
|
## Manual one-time fetch (if you're offline or behind a proxy)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p build/vendor/tessdata
|
||||||
|
curl -L -o build/vendor/tessdata/eng.traineddata \
|
||||||
|
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify the file is non-empty and starts with the magic bytes
|
||||||
|
`b"\x00\x00\x00\x00"` followed by a header that `pytesseract` can
|
||||||
|
read; the script does a basic sanity check after download.
|
||||||
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
481
docs/ADMIN.md
Normal file
481
docs/ADMIN.md
Normal file
@@ -0,0 +1,481 @@
|
|||||||
|
# ADMIN — Internal license operations
|
||||||
|
|
||||||
|
Creator/operator-only reference. End users should read `USER-GUIDE.md` instead.
|
||||||
|
|
||||||
|
This doc covers everything the creator does that buyers never see: minting
|
||||||
|
through the live server, where state lives on the box, how to rotate secrets,
|
||||||
|
generating the signing keypair, the dev vs. production key story, and how to
|
||||||
|
recover from key loss.
|
||||||
|
|
||||||
|
For the end-to-end system + tech stack diagrams, see `ARCHITECTURE.md`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Live deployment (PR 1)
|
||||||
|
|
||||||
|
The license server is running at:
|
||||||
|
|
||||||
|
| URL | What it serves |
|
||||||
|
|---|---|
|
||||||
|
| `https://datatools.unalogix.com/` | Marketing site (placeholder — "DataTools — coming soon") |
|
||||||
|
| `https://licenses.datatools.unalogix.com/health` | Liveness + DB reachability probe |
|
||||||
|
| `https://licenses.datatools.unalogix.com/internal/*` | nginx-blocked on the public side — accessible only via SSH tunnel |
|
||||||
|
| Postgres @ `127.0.0.1:5433` (localhost) | DB containing the authoritative `licenses` table |
|
||||||
|
|
||||||
|
**Host**: `46.225.166.142` (Ubuntu 24.04), nginx 1.24, Postgres 16-alpine + FastAPI in Docker.
|
||||||
|
|
||||||
|
**Cert**: Let's Encrypt, covers both subdomains, expires 2026-08-12, auto-renews via `certbot.timer`.
|
||||||
|
|
||||||
|
### On-box state
|
||||||
|
|
||||||
|
| Path | Contents |
|
||||||
|
|---|---|
|
||||||
|
| `/srv/datatools-license/` | Deploy root, mode 750, owned by `datatools-api` |
|
||||||
|
| `/srv/datatools-license/compose.yml` | Production docker-compose definition |
|
||||||
|
| `/srv/datatools-license/app/` | Git clone of this repo (re-clone or `git pull` to update) |
|
||||||
|
| `/srv/datatools-license/secrets/` | Mode 750 dir holding `pg_password`, `admin_token`. Files are mode 400, owned UID 10001 (container app user) |
|
||||||
|
| `/srv/datatools-license/backups/` | Postgres dumps land here (cron not yet wired — see §"Backups" below) |
|
||||||
|
| `/etc/nginx/sites-available/unalogix` | nginx config for both subdomains |
|
||||||
|
| `/etc/letsencrypt/live/datatools.unalogix.com/` | TLS cert + key |
|
||||||
|
|
||||||
|
Container names: `datatools-api`, `datatools-postgres`. Both use
|
||||||
|
`restart: unless-stopped`.
|
||||||
|
|
||||||
|
### Get the admin token
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh michael@46.225.166.142 'sudo cat /srv/datatools-license/secrets/admin_token'
|
||||||
|
```
|
||||||
|
|
||||||
|
The token is **never** in git, in environment-variable dumps, or in
|
||||||
|
`docker inspect`. It lives on disk under mode 400 / UID 10001 (so only
|
||||||
|
root and the container app user can read it).
|
||||||
|
|
||||||
|
### Rotate the admin token
|
||||||
|
|
||||||
|
Any time it's been shown somewhere it shouldn't, or as routine hygiene (run as root on the server — the secret files are mode 400 and `chown` needs root):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
openssl rand -hex 32 > secrets/admin_token
|
||||||
|
chown 10001:10001 secrets/admin_token
|
||||||
|
chmod 400 secrets/admin_token
|
||||||
|
docker compose restart api # ~3 seconds; old token stops working immediately
|
||||||
|
```
|
||||||
|
|
||||||
|
### Mint a license from your laptop
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Open the SSH tunnel (leave running in a background terminal)
|
||||||
|
ssh -L 8090:127.0.0.1:8090 michael@46.225.166.142 -N &
|
||||||
|
|
||||||
|
# 2. Set the auth env
|
||||||
|
export DATATOOLS_ADMIN_TOKEN="$(ssh michael@46.225.166.142 'sudo cat /srv/datatools-license/secrets/admin_token')"
|
||||||
|
export DATATOOLS_ADMIN_URL=http://127.0.0.1:8090
|
||||||
|
|
||||||
|
# 3. Mint
|
||||||
|
python3 -m src.admin_cli mint \
|
||||||
|
--name "Buyer Name" \
|
||||||
|
--email buyer@example.com \
|
||||||
|
--tier core
|
||||||
|
|
||||||
|
# 4. (optional) List or revoke
|
||||||
|
python3 -m src.admin_cli list --email buyer@example.com
|
||||||
|
python3 -m src.admin_cli revoke DT1-CORE-xxxx-yyyy --reason "refund"
|
||||||
|
```
|
||||||
|
|
||||||
|
The blob lands in the response (and in the `licenses` table). Deliver it
|
||||||
|
to the buyer however suits — copy-paste into email, attach as `.dtlic`.
|
||||||
|
|
||||||
|
### Inspect / debug
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Container status + recent logs
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose ps && docker compose logs api --tail 30'
|
||||||
|
|
||||||
|
# Query the licenses table directly
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose exec -T postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses -c "SELECT license_key, email, tier, source, expires_at FROM licenses ORDER BY created_at DESC LIMIT 20;"'
|
||||||
|
|
||||||
|
# Public-side health
|
||||||
|
curl https://licenses.datatools.unalogix.com/health
|
||||||
|
```
|
||||||
|
|
||||||
|
### Bring it down / back up / rebuild
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
|
||||||
|
# Restart just the API (e.g. after rotating a secret)
|
||||||
|
docker compose restart api
|
||||||
|
|
||||||
|
# Restart everything
|
||||||
|
docker compose restart
|
||||||
|
|
||||||
|
# Bring down (DB volume PRESERVED)
|
||||||
|
docker compose down
|
||||||
|
|
||||||
|
# Bring up
|
||||||
|
docker compose up -d
|
||||||
|
|
||||||
|
# Rebuild the image after a git pull
|
||||||
|
cd app && git pull
|
||||||
|
cd ..
|
||||||
|
docker compose build && docker compose up -d
|
||||||
|
docker compose exec api alembic upgrade head # if new migrations
|
||||||
|
```
|
||||||
|
|
||||||
|
### Backups (not yet automated)
|
||||||
|
|
||||||
|
Postgres state is the system of record for the customer list — once PR 2
|
||||||
|
auto-mints from Gumroad webhooks, losing the DB would mean losing every
|
||||||
|
buyer record. Schedule a daily dump:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# /etc/cron.daily/datatools-license-backup — see SETUP-LICENSE-SERVER.md §9
|
||||||
|
```
|
||||||
|
|
||||||
|
Until that's in place, dump manually before any risky operation:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec -T postgres \
|
||||||
|
pg_dump -U datatools_api datatools_licenses \
|
||||||
|
| gzip > backups/db-$(date -u +%Y%m%dT%H%M%SZ).sql.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
### Production signing key (not yet rotated)
|
||||||
|
|
||||||
|
The server currently signs with the in-tree dev keypair (no
|
||||||
|
`DATATOOLS_LICENSE_PRIVKEY_FILE` configured → falls back to
|
||||||
|
`src/license/_dev_keypair.py`). That matches what the desktop currently
|
||||||
|
verifies against, so existing buyers continue to work.
|
||||||
|
|
||||||
|
**Before shipping v1.0 to paying buyers**, rotate to a production keypair:
|
||||||
|
|
||||||
|
1. `python scripts/generate_keypair.py` (on a trusted machine).
|
||||||
|
2. Save the private hex to `/srv/datatools-license/secrets/license_privkey`,
|
||||||
|
chmod 400, chown 10001:10001.
|
||||||
|
3. Bake the public hex into the PyInstaller build's
|
||||||
|
`DATATOOLS_LICENSE_PUBKEY` env.
|
||||||
|
4. Wire `DATATOOLS_LICENSE_PRIVKEY_FILE` + `DATATOOLS_LICENSE_PUBKEY`
|
||||||
|
into compose.yml's `api.environment` and add `license_privkey` to
|
||||||
|
the secrets block.
|
||||||
|
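5. `docker compose up -d api` — recreates the container so the `compose.yml` changes from step 4 take effect (a plain `docker compose restart api` does not pick up `compose.yml` edits).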
|
||||||
|
|
||||||
|
### What's deployed (PR 1) vs queued (PR 2 / 3)
|
||||||
|
|
||||||
|
| Capability | Status |
|
||||||
|
|---|---|
|
||||||
|
| Mint API + Postgres + auth | **Live** |
|
||||||
|
| `datatools-admin` CLI (manual mints) | **Live** |
|
||||||
|
| `licenses.datatools.unalogix.com/health` public | **Live** |
|
||||||
|
| Gumroad webhook receiver | **PR 2 — code merged, deploy pending** |
|
||||||
|
| Postmark transactional email | **PR 2 — code merged, deploy pending** |
|
||||||
|
| Buyer renewal / re-delivery portal | **PR 3** |
|
||||||
|
| Cloudflare in front (DDoS / WAF) | Deferred (DNS at supercp/cPanel) |
|
||||||
|
| Production signing keypair | Deferred (still using dev key) |
|
||||||
|
| Automated DB backups | **Pending** — see §"Backups" |
|
||||||
|
|
||||||
|
### Running a Gumroad webhook (PR 2)
|
||||||
|
|
||||||
|
Once PR 2 is deployed, sales fire `POST` to
|
||||||
|
`https://licenses.datatools.unalogix.com/webhooks/gumroad?secret=<gumroad_secret>`.
|
||||||
|
Auth is the URL secret (Gumroad's recommended pattern). The handler
|
||||||
|
audit-logs the raw payload, mints idempotently keyed on `sale_id`,
|
||||||
|
sends the buyer their blob via Postmark, and returns 200 (always —
|
||||||
|
non-2xx would trigger 3-day retry storms).
|
||||||
|
|
||||||
|
**Adding a new SKU:**
|
||||||
|
|
||||||
|
1. Create the product in Gumroad and copy its `product_id`.
|
||||||
|
2. Edit `/srv/datatools-license/app/server/config/products.yaml`,
|
||||||
|
add a row under `gumroad:` with that ID + the tier you sold.
|
||||||
|
3. `cd /srv/datatools-license && docker compose restart api` — the
|
||||||
|
config is read at startup and cached.
|
||||||
|
|
||||||
|
**Inspecting webhook activity:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Recent webhook deliveries (all storefronts share this table)
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose exec -T postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses -c \
|
||||||
|
"SELECT received_at, order_id, processed, error FROM gumroad_events ORDER BY received_at DESC LIMIT 20;"'
|
||||||
|
|
||||||
|
# Failures only (replay candidates)
|
||||||
|
ssh michael@46.225.166.142 'cd /srv/datatools-license && docker compose exec -T postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses -c \
|
||||||
|
"SELECT id, received_at, order_id, error FROM gumroad_events WHERE processed=false ORDER BY received_at DESC;"'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Replaying a failed webhook** (after fixing the products.yaml mapping
|
||||||
|
or whatever surfaced the error): the safest path is to ask the buyer
|
||||||
|
to re-trigger via Gumroad's "Send Test Ping" button in their order
|
||||||
|
record, *or* mint manually via `datatools-admin mint --source manual`
|
||||||
|
and add a note linking to the original `gumroad_events.id`.
|
||||||
|
|
||||||
|
**Testing without buyers:** Gumroad's seller dashboard has a "Send
|
||||||
|
Test Ping" button. It sets `test=true` in the payload; the adapter
|
||||||
|
tags the resulting license with `notes='gumroad test ping'` so it's
|
||||||
|
trivially filterable later.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## TL;DR — I just need a license for my dev machine
|
||||||
|
|
||||||
|
You're running from source, so the repo's embedded dev keypair signs and
|
||||||
|
verifies. No env vars needed.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_license.py \
|
||||||
|
--name "Michael Dombaugh" \
|
||||||
|
--email michael.dombaugh@gmail.com \
|
||||||
|
--tier core
|
||||||
|
```
|
||||||
|
|
||||||
|
Copy the `DTLIC2:…` blob from stdout, then activate:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli activate "DTLIC2:..." \
|
||||||
|
--name "Michael Dombaugh" \
|
||||||
|
--email michael.dombaugh@gmail.com
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli status
|
||||||
|
```
|
||||||
|
|
||||||
|
License lands at `~/.datatools/license.json`, valid 1 year.
|
||||||
|
|
||||||
|
> The `--name` / `--email` you pass to `activate` **must** match the values
|
||||||
|
> the blob was minted with — they're part of the signed payload.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Key model (Ed25519, asymmetric)
|
||||||
|
|
||||||
|
| Key | Lives where | Used for |
|
||||||
|
|-----|------------|---------|
|
||||||
|
| **Private** (32 bytes hex) | Creator's password manager / KMS only | Signing license blobs |
|
||||||
|
| **Public** (32 bytes hex) | Baked into the shipped binary | Verifying blobs at activation |
|
||||||
|
|
||||||
|
The split is the whole point: an attacker with a copy of the binary still
|
||||||
|
can't mint blobs — they'd need the private key, which never ships.
|
||||||
|
|
||||||
|
There's also an in-tree **dev keypair** (`src/license/_dev_keypair.py`)
|
||||||
|
derived deterministically from a seed. It's used when no env vars are set,
|
||||||
|
so devs/tests can sign and verify locally without juggling secrets. Frozen
|
||||||
|
builds that still use it are rejected at startup by
|
||||||
|
`assert_production_safe` — see `src/license/crypto.py:84`.
|
||||||
|
|
||||||
|
Blob format prefix: `DTLIC2:` (v1 was HMAC; v2 is Ed25519).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## One-time setup — generating the production keypair
|
||||||
|
|
||||||
|
Run once, before the first paid release.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_keypair.py --output keypair.env
|
||||||
|
```
|
||||||
|
|
||||||
|
You'll get:
|
||||||
|
|
||||||
|
```
|
||||||
|
DATATOOLS_LICENSE_PRIVKEY=<64 hex chars> # KEEP SECRET
|
||||||
|
DATATOOLS_LICENSE_PUBKEY=<64 hex chars> # BAKE INTO BUILD
|
||||||
|
```
|
||||||
|
|
||||||
|
Then:
|
||||||
|
|
||||||
|
1. **Stash the private key** in a password manager / KMS / hardware token.
|
||||||
|
Losing it means no more renewals — see "Recovery" below.
|
||||||
|
2. **Delete `keypair.env`** from disk once stored.
|
||||||
|
3. **Set the public key** as `DATATOOLS_LICENSE_PUBKEY` in the PyInstaller
|
||||||
|
build environment. The shipped binary embeds it via the env at freeze time.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Minting a buyer license (production)
|
||||||
|
|
||||||
|
With the production private key loaded:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
export DATATOOLS_LICENSE_PRIVKEY=<your-private-hex>
|
||||||
|
|
||||||
|
python scripts/generate_license.py \
|
||||||
|
--name "Buyer Name" \
|
||||||
|
--email buyer@example.com \
|
||||||
|
--tier core \
|
||||||
|
--years 1 \
|
||||||
|
--output buyer.dtlic
|
||||||
|
```
|
||||||
|
|
||||||
|
Flags:
|
||||||
|
|
||||||
|
| Flag | Default | Notes |
|
||||||
|
|------|---------|-------|
|
||||||
|
| `--name` | required | Buyer's full name. Goes into signed payload. |
|
||||||
|
| `--email` | required | Buyer's email. Goes into signed payload. |
|
||||||
|
| `--tier` | `core` | One of: `lite`, `core`, `pro` |
|
||||||
|
| `--years` | `1` | Lifetime in years |
|
||||||
|
| `--key` | random | Override the auto-generated license key |
|
||||||
|
| `--output` / `-o` | stdout | Write blob to file instead of printing |
|
||||||
|
|
||||||
|
Deliver the blob to the buyer either inline in the purchase email or as
|
||||||
|
the attached `.dtlic` file.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Tiers
|
||||||
|
|
||||||
|
| Tier | Features |
|
||||||
|
|------|---------|
|
||||||
|
| **lite** | Find Duplicates, Clean Text, Standardize Formats |
|
||||||
|
| **core** | All 9 tools |
|
||||||
|
| **pro** | All 9 tools + future Pro-only features |
|
||||||
|
|
||||||
|
Source of truth: `src/license/features.py::all_features_for_tier`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Useful one-liners
|
||||||
|
|
||||||
|
Mint a free internal/team license (dev key, no env needed):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_license.py --name "QA Bot" --email qa@datatools.app --tier core --years 5
|
||||||
|
```
|
||||||
|
|
||||||
|
Mint with a stable, human-readable key:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python scripts/generate_license.py --name "Acme Corp" --email ops@acme.com \
|
||||||
|
--tier pro --key "DT1-PRO-ACME-2026"
|
||||||
|
```
|
||||||
|
|
||||||
|
Renew an existing buyer (re-mint with the same email; the buyer then runs this with the
|
||||||
|
new blob):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli renew "DTLIC2:..."
|
||||||
|
```
|
||||||
|
|
||||||
|
Check what's active locally:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli status
|
||||||
|
```
|
||||||
|
|
||||||
|
Wipe a local license (move to a new machine, debug a buyer issue):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python -m src.license_cli deactivate
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Customer record-keeping — the issuance log
|
||||||
|
|
||||||
|
Every successful `scripts/generate_license.py` run appends one JSON
|
||||||
|
line to a local **issuance log**. This is the creator-side system of
|
||||||
|
record for "who has a license" until the server-side flow in
|
||||||
|
`docs/LICENSE-SERVER.md` lands.
|
||||||
|
|
||||||
|
**Path:** `~/.datatools-creator/issued.jsonl` (override with
|
||||||
|
`$DATATOOLS_ISSUANCE_LOG`). Mode 600. Outside the buyer-facing
|
||||||
|
`~/.datatools/` dir so it never gets bundled into a shipped install.
|
||||||
|
|
||||||
|
**Format** — one record per line:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"license_key": "DT1-CORE-5dd8e1db-d90c4656",
|
||||||
|
"name": "Michael Dombaugh",
|
||||||
|
"email": "michael.dombaugh@gmail.com",
|
||||||
|
"tier": "core",
|
||||||
|
"issued_at": "2026-05-13T22:10:27Z",
|
||||||
|
"expires_at": "2031-05-13T22:10:27Z",
|
||||||
|
"blob": "DTLIC2:..."
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
The full blob is stored so you can re-deliver to a buyer who lost
|
||||||
|
their email without re-minting (the re-minted blob would have a
|
||||||
|
different signature and would invalidate any device they'd already
|
||||||
|
activated against the old one).
|
||||||
|
|
||||||
|
**Useful operations:**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Full list of issued licenses
|
||||||
|
cat ~/.datatools-creator/issued.jsonl | jq
|
||||||
|
|
||||||
|
# Find by buyer email
|
||||||
|
jq -r 'select(.email == "buyer@example.com")' ~/.datatools-creator/issued.jsonl
|
||||||
|
|
||||||
|
# Count by tier
|
||||||
|
jq -r .tier ~/.datatools-creator/issued.jsonl | sort | uniq -c
|
||||||
|
|
||||||
|
# Licenses expiring within 30 days — also lists already-expired ones (GNU date; macOS: date -u -v+30d)
|
||||||
|
jq -r 'select(.expires_at < "'"$(date -u -d '+30 days' +%Y-%m-%dT%H:%M:%SZ)"'") | .email' \
|
||||||
|
~/.datatools-creator/issued.jsonl
|
||||||
|
|
||||||
|
# Re-deliver a buyer's blob
|
||||||
|
jq -r 'select(.email == "buyer@example.com") | .blob' \
|
||||||
|
~/.datatools-creator/issued.jsonl
|
||||||
|
```
|
||||||
|
|
||||||
|
**Skipping the log** for test mints: pass `--no-log`. Never use this
|
||||||
|
for real buyer fulfillment — an unlogged mint is invisible to every
|
||||||
|
future query and to the eventual server-side migration.
|
||||||
|
|
||||||
|
**Backup:** treat this file like a small business ledger. Copy it
|
||||||
|
into your password manager / encrypted cloud sync alongside the
|
||||||
|
private key. Losing it doesn't break anything cryptographically (you
|
||||||
|
can still mint new licenses) but it does lose the customer list.
|
||||||
|
|
||||||
|
**Migrating to the server:** the JSONL schema is intentionally close
|
||||||
|
to the planned `licenses` table in `docs/LICENSE-SERVER.md`. Once the
|
||||||
|
server is up, a one-shot import script will read the JSONL and
|
||||||
|
insert each row.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Recovery — what if the private key is lost?
|
||||||
|
|
||||||
|
Existing licenses keep working until they expire (the public key in the
|
||||||
|
shipped binary still verifies them). What breaks:
|
||||||
|
|
||||||
|
- **Renewals** — you can't mint a new blob for an existing buyer.
|
||||||
|
- **New sales** — you can't mint anything.
|
||||||
|
|
||||||
|
Path forward:
|
||||||
|
|
||||||
|
1. Generate a new keypair (`scripts/generate_keypair.py`).
|
||||||
|
2. Ship a new build with the new public key.
|
||||||
|
3. Re-issue every active buyer a new blob signed by the new private key.
|
||||||
|
4. Communicate the upgrade path to buyers.
|
||||||
|
|
||||||
|
Treat the private key like a code-signing cert — back it up to two
|
||||||
|
independent secure locations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Files & code pointers
|
||||||
|
|
||||||
|
| Path | Purpose |
|
||||||
|
|------|---------|
|
||||||
|
| `scripts/generate_keypair.py` | One-time keypair generation |
|
||||||
|
| `scripts/generate_license.py` | Mint a signed blob |
|
||||||
|
| `src/license/crypto.py` | Sign / verify / dev-key detection |
|
||||||
|
| `src/license/_dev_keypair.py` | In-tree dev keypair (never ships in prod) |
|
||||||
|
| `src/license/manager.py` | `assert_production_safe` startup check |
|
||||||
|
| `src/license/features.py` | Tier → features mapping |
|
||||||
|
| `src/license_cli.py` | End-user `activate` / `status` / `renew` / `deactivate` |
|
||||||
|
| `~/.datatools/license.json` | Where activated licenses are stored on each machine |
|
||||||
|
| `~/.datatools-creator/issued.jsonl` | Creator-side issuance log (one JSON line per mint) |
|
||||||
|
| `docs/LICENSE-SERVER.md` | Design for the future online issuance + record-keeping system |
|
||||||
|
| `docs/SETUP-LICENSE-SERVER.md` | Self-hosted server install runbook (DNS, Docker, nginx, TLS, backups) |
|
||||||
241
docs/ARCHITECTURE.md
Normal file
241
docs/ARCHITECTURE.md
Normal file
@@ -0,0 +1,241 @@
|
|||||||
|
# ARCHITECTURE — end-to-end view
|
||||||
|
|
||||||
|
Stitches the desktop app (`TECHNICAL.md`) and the license server
|
||||||
|
(`LICENSE-SERVER.md`) into a single picture. Read this first for "how
|
||||||
|
does it all fit together"; drill into the per-component docs for
|
||||||
|
detail.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. System diagram
|
||||||
|
|
||||||
|
```
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ OPERATOR / DEVELOPER LAPTOP │
|
||||||
|
│ │
|
||||||
|
│ git clone / push ←─── code lives in git.invixiom.com │
|
||||||
|
│ datatools-admin CLI ─── manual mints, list, revoke ─────┐ │
|
||||||
|
│ ssh -L 8090:127.0.0.1:8090 ───── tunnel for /internal/* ─────┤ │
|
||||||
|
└────────────────────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
┌─────────────────────────────────────────────────────────┘
|
||||||
|
│
|
||||||
|
│ internal Bearer-auth API (over SSH tunnel only)
|
||||||
|
▼
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ LICENSE SERVER — 46.225.166.142 │
|
||||||
|
│ ───────────────────────────────────────────────────────────────── │
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ nginx 1.24 (TLS termination, public reverse proxy) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ datatools.unalogix.com → static placeholder │ │
|
||||||
|
│ │ licenses.datatools.unalogix.com → 127.0.0.1:8090 (FastAPI) │ │
|
||||||
|
│ │ /internal/* on public surface → blocked (404) │ │
|
||||||
|
│ └────────────────────────────┬─────────────────────────────────────┘ │
|
||||||
|
│ │ │
|
||||||
|
│ ┌────────────────────────────▼─────────────────────────────────────┐ │
|
||||||
|
│ │ FastAPI app — datatools-api (Docker container, UID 10001) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ┌──────────────────┐ ┌──────────────────┐ ┌───────────────┐ │ │
|
||||||
|
│ │ │ /webhooks/* │ │ /internal/* │ │ /health │ │ │
|
||||||
|
│ │ │ (storefronts) │ │ (Bearer-auth) │ │ (public) │ │ │
|
||||||
|
│ │ └────────┬─────────┘ └────────┬─────────┘ └───────────────┘ │ │
|
||||||
|
│ │ │ │ │ │
|
||||||
|
│ │ ▼ ▼ │ │
|
||||||
|
│ │ ┌────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ SourceAdapter (Protocol) — normalized │ │ │
|
||||||
|
│ │ │ • ManualAdapter • GumroadAdapter │ │ │
|
||||||
|
│ │ │ • (LemonSqueezy, Stripe — future) │ │ │
|
||||||
|
│ │ └────────────────┬───────────────────────┘ │ │
|
||||||
|
│ │ │ SaleEvent / RefundEvent │ │
|
||||||
|
│ │ ▼ │ │
|
||||||
|
│ │ ┌────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ mint_from_sale() │ │ │
|
||||||
|
│ │ │ • Ed25519 sign via PyCA cryptography │ │ │
|
||||||
|
│ │ │ • idempotent on (source, order_id) │ │ │
|
||||||
|
│ │ └────────────────┬───────────────────────┘ │ │
|
||||||
|
│ └────────────────────┼─────────────────────────────────────────────┘ │
|
||||||
|
│ │ SQL │
|
||||||
|
│ ┌────────────────────▼─────────────────────────────────────────────┐ │
|
||||||
|
│ │ Postgres 16 — datatools-postgres (container, vol pg_data) │ │
|
||||||
|
│ │ • licenses — authoritative customer record │ │
|
||||||
|
│ │ • gumroad_events — webhook audit log (idempotency, replay) │ │
|
||||||
|
│ └──────────────────────────────────────────────────────────────────┘ │
|
||||||
|
└───────────────────────┬────────────────────────────────┬───────────────┘
|
||||||
|
│ │
|
||||||
|
┌───────────┘ └──────────┐
|
||||||
|
│ POST /email (httpx) Gumroad Ping│
|
||||||
|
▼ POST │
|
||||||
|
┌───────────────────┐ ┌─────────────▼──┐
|
||||||
|
│ Postmark │ │ Gumroad │
|
||||||
|
│ (transactional │ │ (storefront, │
|
||||||
|
│ email) │ │ payments) │
|
||||||
|
└───────┬───────────┘ └────────────────┘
|
||||||
|
│ DKIM-signed email with license blob ▲
|
||||||
|
▼ │
|
||||||
|
┌────────────────────────────────────────────────────────────────┴───────┐
|
||||||
|
│ BUYER'S MACHINE │
|
||||||
|
│ │
|
||||||
|
│ Receives email ──► copies DTLIC2: blob ──► pastes into desktop app │
|
||||||
|
│ │
|
||||||
|
│ ┌──────────────────────────────────────────────────────────────────┐ │
|
||||||
|
│ │ DataTools desktop (Python 3.12 + Streamlit + Typer CLIs) │ │
|
||||||
|
│ │ │ │
|
||||||
|
│ │ ┌────────────────────────────────────────────────────────────┐ │ │
|
||||||
|
│ │ │ Activate screen — verifies blob signature │ │ │
|
||||||
|
│ │ │ against EMBEDDED Ed25519 public key │ │ │
|
||||||
|
│ │ │ (NO network call to the license server, ever) │ │ │
|
||||||
|
│ │ └─────────────────────────┬──────────────────────────────────┘ │ │
|
||||||
|
│ │ ▼ │ │
|
||||||
|
│ │ ~/.datatools/license.json (signed blob, mode 644, on disk) │ │
|
||||||
|
│ └──────────────────────────────────────────────────────────────────┘ │
|
||||||
|
│ │
|
||||||
|
│ Pays via web browser ─────► Gumroad ────► (kicks off the Ping) │
|
||||||
|
└────────────────────────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
**Three primary flows**, distinguishable by where the arrows
|
||||||
|
start in the diagram:
|
||||||
|
|
||||||
|
1. **Sale → fulfillment** (the automated path)
|
||||||
|
Buyer pays at Gumroad → Gumroad fires Ping to
|
||||||
|
`licenses.datatools.unalogix.com/webhooks/gumroad?secret=…` → nginx
|
||||||
|
→ FastAPI → audit-log row → adapter normalizes payload → `mint_from_sale`
|
||||||
|
writes the `licenses` row + Ed25519-signs the blob → Postmark emails
|
||||||
|
the buyer their blob. End-to-end latency: a few hundred milliseconds.
|
||||||
|
|
||||||
|
2. **Manual mint** (operator path — comps, support replacements)
|
||||||
|
Operator opens SSH tunnel → `datatools-admin mint` → `/internal/mint`
|
||||||
|
(Bearer-authed, never publicly reachable) → same `mint_from_sale`
|
||||||
|
path → blob returned in HTTP response. Operator delivers to buyer
|
||||||
|
out-of-band.
|
||||||
|
|
||||||
|
3. **Activation** (buyer path — fully offline)
|
||||||
|
Buyer pastes blob into desktop's Activate screen → desktop verifies
|
||||||
|
the Ed25519 signature against the public key **embedded in the
|
||||||
|
shipped binary** → license written to `~/.datatools/license.json`.
|
||||||
|
The desktop app makes **no network calls** to the license server at
|
||||||
|
any point. This preserves the "your data never leaves your computer"
|
||||||
|
promise (`DECISIONS.md §9b`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Tech stack
|
||||||
|
|
||||||
|
Layered view of what technology lives where. "External SaaS" entries
|
||||||
|
are services we depend on but don't operate.
|
||||||
|
|
||||||
|
```
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ DESKTOP APP (shipped binary, runs on buyer's box) │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ GUI │ Streamlit 1.35 — local web server, browser opens │
|
||||||
|
│ CLI │ Typer 0.12 — per-tool entry points │
|
||||||
|
│ Core logic │ pandas 2.x, numpy, rapidfuzz, charset-normalizer │
|
||||||
|
│ Crypto (verify) │ PyCA cryptography — Ed25519 public-key verify only │
|
||||||
|
│ Storage │ ~/.datatools/license.json (file, mode 644) │
|
||||||
|
│ i18n             │ Internationalization via JSON catalogs in src/i18n/ │
|
||||||
|
│ Build │ PyInstaller — one-file binary, per OS │
|
||||||
|
│ Runtimes │ Python 3.12 (bundled into installer) │
|
||||||
|
│ Platforms │ Windows · macOS · Linux │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ LICENSE SERVER (this box; non-buyer-facing) │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ Edge │ nginx 1.24 + Let's Encrypt (auto-renew via timer) │
|
||||||
|
│ HTTP framework │ FastAPI 0.119 + Starlette + Pydantic v2 │
|
||||||
|
│ ASGI server │ uvicorn 0.39 (+uvloop, +httptools, +watchfiles) │
|
||||||
|
│ Form parsing │ python-multipart (for Gumroad form-encoded Pings) │
|
||||||
|
│ ORM │ SQLAlchemy 2.0 │
|
||||||
|
│ Migrations │ Alembic 1.18 (one initial migration so far) │
|
||||||
|
│ Database │ Postgres 16-alpine (containerized, single node) │
|
||||||
|
│ Database driver │ psycopg 3.3 (with binary wheel) │
|
||||||
|
│ Crypto (sign) │ PyCA cryptography — Ed25519 private-key sign │
|
||||||
|
│ HTTP client │ httpx 0.28 (Postmark calls, test mocking) │
|
||||||
|
│ Config │ Pydantic Settings + YAML (products.yaml) │
|
||||||
|
│ Container │ Docker + Docker Compose v2 plugin │
|
||||||
|
│ Image base │ python:3.12-slim │
|
||||||
|
│ Process user │ UID 10001 (non-root `app` user defined in image) │
|
||||||
|
│ Logging │ stdlib `logging` to container stdout → docker logs │
|
||||||
|
│ Host OS │ Ubuntu 24.04 LTS │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ OPERATOR / DEVELOPER MACHINE │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ Source control │ git → self-hosted Gitea (git.invixiom.com) │
|
||||||
|
│ Admin CLI │ Typer (src/admin_cli.py) │
|
||||||
|
│ Server access │ SSH tunnel for /internal/* (no public exposure) │
|
||||||
|
│ Break-glass │ scripts/generate_license.py (offline-only mints, │
|
||||||
|
│ │ used when the license server is unreachable) │
|
||||||
|
│ Test runner │ pytest 8.3 + SQLite in-memory (no docker required) │
|
||||||
|
│ Smoke test │ bash + docker compose (server/scripts/smoke.sh) │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
┌────────────────────────────────────────────────────────────────────────┐
|
||||||
|
│ EXTERNAL SaaS / dependencies │
|
||||||
|
├──────────────────┬─────────────────────────────────────────────────────┤
|
||||||
|
│ Storefront │ Gumroad — Ping webhook to /webhooks/gumroad │
|
||||||
|
│ Transactional │ Postmark — HTTP API for license-delivery emails │
|
||||||
|
│ email │ (LoggingEmailService fallback when token unset) │
|
||||||
|
│ TLS CA │ Let's Encrypt — ACME HTTP-01 challenge via certbot │
|
||||||
|
│ Authoritative │ supercp / cPanel (your DNS host for unalogix.com) │
|
||||||
|
│ DNS │ — Cloudflare front-door deferred │
|
||||||
|
│ Source hosting │ Self-hosted Gitea (git.invixiom.com) — not on the │
|
||||||
|
│ │ datatools box; shares the same physical host │
|
||||||
|
└──────────────────┴─────────────────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Trust + isolation boundaries
|
||||||
|
|
||||||
|
Worth tracing explicitly because the threat model differs at each
|
||||||
|
boundary:
|
||||||
|
|
||||||
|
| Boundary | What crosses it | Trust model |
|
||||||
|
|---|---|---|
|
||||||
|
| Buyer ↔ Gumroad | Payment, buyer details | Out of scope — Gumroad's problem |
|
||||||
|
| Gumroad → license server (webhook) | POST authenticated by a shared secret in the URL query string | URL secret check; non-matching = 404 (no info leak); audit-log everything regardless |
|
||||||
|
| License server → Postmark | DKIM-signed transactional mail | Postmark verified-sender domain; HTTP API auth via server token |
|
||||||
|
| License server → Postgres | SQL over local docker bridge | Same compose project; password from on-disk secret file |
|
||||||
|
| Operator → license server (`/internal/*`) | Bearer token over SSH tunnel | Token only on disk + in the operator's env; nginx blocks `/internal/*` publicly as defense-in-depth |
|
||||||
|
| License server → buyer (email) | Plaintext blob in inbox | Buyer's email account hygiene; we deliberately don't encrypt — blob is self-protecting (signature) |
|
||||||
|
| Buyer → desktop app (activation) | Signed blob pasted in | Verified against pubkey **embedded in the shipped binary**; no network call |
|
||||||
|
|
||||||
|
The single most important property to preserve: **the desktop app
|
||||||
|
never talks to the license server.** All trust in the desktop comes
|
||||||
|
from the embedded public key + the signed blob. This is what makes
|
||||||
|
the offline activation guarantee real, and what keeps a license-server
|
||||||
|
outage from breaking buyers who've already activated.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Where things are stored
|
||||||
|
|
||||||
|
| Lives on… | Path / location | Contents |
|
||||||
|
|---|---|---|
|
||||||
|
| Buyer's machine | `~/.datatools/license.json` | Activated license blob |
|
||||||
|
| Buyer's inbox | Email delivered via Postmark | Delivery copy of the blob |
|
||||||
|
| License server | `licenses` table (Postgres) | Authoritative customer record — name, email, tier, blob, source, order ID, promotion, amount paid |
|
||||||
|
| License server | `gumroad_events` table | Append-only webhook delivery audit log |
|
||||||
|
| License server | `/srv/datatools-license/secrets/` | Postgres password, admin Bearer token, (PR 2) Postmark token + Gumroad secret |
|
||||||
|
| License server | `/etc/letsencrypt/live/datatools.unalogix.com/` | TLS cert + key |
|
||||||
|
| Operator's laptop | `~/.datatools-creator/issued.jsonl` | Creator-side issuance log (pre-server era, kept as a break-glass backup) |
|
||||||
|
| Operator's laptop | Git clone of this repo | Source code, including `server/config/products.yaml` |
|
||||||
|
| Gitea | This repo's commits | Everything except secrets |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Related docs
|
||||||
|
|
||||||
|
| Doc | Scope |
|
||||||
|
|---|---|
|
||||||
|
| `TECHNICAL.md` | Desktop app internals (core libs, GUI, CLIs) |
|
||||||
|
| `LICENSE-SERVER.md` | Server architecture rationale + DB schema |
|
||||||
|
| `SETUP-LICENSE-SERVER.md` | Server install runbook (DNS, packages, nginx, TLS, Postgres) |
|
||||||
|
| `ADMIN.md` | Day-2 operations (minting, rotation, inspection) |
|
||||||
|
| `DECISIONS.md` | Architecture decision records — `§9b` = no online activation check |
|
||||||
|
| `USER-GUIDE.md` | Buyer-facing documentation |
|
||||||
@@ -47,7 +47,7 @@ Sell niche Python automation tools as one-time downloadable digital products. Ta
|
|||||||
|
|
||||||
**Surface**: desktop install per OS (PyInstaller) with Streamlit GUI + CLI. Constrained demo on Streamlit Community Cloud.
|
**Surface**: desktop install per OS (PyInstaller) with Streamlit GUI + CLI. Constrained demo on Streamlit Community Cloud.
|
||||||
|
|
||||||
## 4a. Lead bundle — Deduplicator
|
## 4a. Lead bundle — Find Duplicates
|
||||||
|
|
||||||
Highest pain density across all 4 personas. Feeds landing copy, demo design, feature priority. Tech spec: TECHNICAL.md §11.1.
|
Highest pain density across all 4 personas. Feeds landing copy, demo design, feature priority. Tech spec: TECHNICAL.md §11.1.
|
||||||
|
|
||||||
@@ -208,7 +208,7 @@ Headroom enables optional ad spend ($100-200/mo) once a bundle has proven conver
|
|||||||
|
|
||||||
## 13. Honest status (2026-05-01)
|
## 13. Honest status (2026-05-01)
|
||||||
|
|
||||||
- 3 of 9 tools shipped (Dedup, Text Cleaner, Format Standardizer).
|
- 3 of 9 tools shipped (Find Duplicates, Clean Text, Standardize Formats).
|
||||||
- Cross-platform build pipeline designed, not yet built.
|
- Cross-platform build pipeline designed, not yet built.
|
||||||
- macOS code signing not yet set up.
|
- macOS code signing not yet set up.
|
||||||
- Streamlit GUI shipped for the 3 ready tools.
|
- Streamlit GUI shipped for the 3 ready tools.
|
||||||
|
|||||||
@@ -8,15 +8,15 @@ Tres módulos de CLI, uno por cada herramienta Lista:
|
|||||||
|
|
||||||
| Módulo | Comando | Propósito |
|
| Módulo | Comando | Propósito |
|
||||||
|--------|---------|---------|
|
|--------|---------|---------|
|
||||||
| `src.cli` | `python -m src.cli FILE` | Eliminador de duplicados |
|
| `src.cli` | `python -m src.cli FILE` | Buscar duplicados |
|
||||||
| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Limpiador de texto |
|
| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Limpiar texto |
|
||||||
| `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analizador (escaneo de solo lectura) |
|
| `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analizador (escaneo de solo lectura) |
|
||||||
|
|
||||||
Cada comando es **previsualización por defecto** — añade `--apply` para escribir la salida.
|
Cada comando es **previsualización por defecto** — añade `--apply` para escribir la salida.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Eliminador de duplicados
|
# Buscar duplicados
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli ARCHIVO_ENTRADA [OPCIONES]
|
python -m src.cli ARCHIVO_ENTRADA [OPCIONES]
|
||||||
@@ -125,7 +125,7 @@ Registro: `logs/dedup_YYYYMMDD_HHMMSS.log`.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Limpiador de texto
|
# Limpiar texto
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli_text_clean ARCHIVO_ENTRADA [OPCIONES]
|
python -m src.cli_text_clean ARCHIVO_ENTRADA [OPCIONES]
|
||||||
@@ -156,7 +156,7 @@ Higiene a nivel de carácter. Ver [TECHNICAL.md §10.2](TECHNICAL.md) (solo en i
|
|||||||
- `--config RUTA` / `--save-config RUTA`.
|
- `--config RUTA` / `--save-config RUTA`.
|
||||||
|
|
||||||
### Archivo
|
### Archivo
|
||||||
- `--sheet`, `--encoding`, `--header-row` — iguales que en el Eliminador de duplicados.
|
- `--sheet`, `--encoding`, `--header-row` — iguales que en Buscar duplicados.
|
||||||
|
|
||||||
## Presets
|
## Presets
|
||||||
|
|
||||||
|
|||||||
@@ -6,15 +6,15 @@ Three CLI modules, one per Ready tool:
|
|||||||
|
|
||||||
| Module | Command | Purpose |
|
| Module | Command | Purpose |
|
||||||
|--------|---------|---------|
|
|--------|---------|---------|
|
||||||
| `src.cli` | `python -m src.cli FILE` | Deduplicator |
|
| `src.cli` | `python -m src.cli FILE` | Find Duplicates |
|
||||||
| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Text Cleaner |
|
| `src.cli_text_clean` | `python -m src.cli_text_clean FILE` | Clean Text |
|
||||||
| `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analyzer (read-only scan) |
|
| `src.cli_analyze` | `python -m src.cli_analyze FILE` | Analyzer (read-only scan) |
|
||||||
|
|
||||||
Every command is **preview-only by default** — add `--apply` to write output.
|
Every command is **preview-only by default** — add `--apply` to write output.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Deduplicator
|
# Find Duplicates
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli INPUT_FILE [OPTIONS]
|
python -m src.cli INPUT_FILE [OPTIONS]
|
||||||
@@ -123,7 +123,7 @@ Log: `logs/dedup_YYYYMMDD_HHMMSS.log`.
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# Text Cleaner
|
# Clean Text
|
||||||
|
|
||||||
```
|
```
|
||||||
python -m src.cli_text_clean INPUT_FILE [OPTIONS]
|
python -m src.cli_text_clean INPUT_FILE [OPTIONS]
|
||||||
@@ -154,7 +154,7 @@ Character-level hygiene. See [TECHNICAL.md §10.2](TECHNICAL.md) for the spec.
|
|||||||
- `--config PATH` / `--save-config PATH`.
|
- `--config PATH` / `--save-config PATH`.
|
||||||
|
|
||||||
### File
|
### File
|
||||||
- `--sheet`, `--encoding`, `--header-row` — same as Deduplicator.
|
- `--sheet`, `--encoding`, `--header-row` — same as Find Duplicates.
|
||||||
|
|
||||||
## Presets
|
## Presets
|
||||||
|
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ Each candidate scored 1-5 on 6 dimensions. Total /30 → verdict.
|
|||||||
|
|
||||||
**v1.2 rationale**:
|
**v1.2 rationale**:
|
||||||
- Buyer persona ("hates Excel work but can't code") won't learn a CLI. Refunds at this price.
|
- Buyer persona ("hates Excel work but can't code") won't learn a CLI. Refunds at this price.
|
||||||
- Deduplicator needs interactive review — not viable in pure CLI.
|
- Find Duplicates needs interactive review — not viable in pure CLI.
|
||||||
- Dual interface keeps CLI for automation without sacrificing primary buyer surface.
|
- Dual interface keeps CLI for automation without sacrificing primary buyer surface.
|
||||||
|
|
||||||
## 4a. Functional scope principle (v1.2)
|
## 4a. Functional scope principle (v1.2)
|
||||||
@@ -170,11 +170,60 @@ $49-79/bundle · $149 full suite (when 3+ exist).
|
|||||||
| Apr 28 (v1.3) | Add hosted browser demo as conversion lever | Direct consequence of Streamlit choice. See §5. |
|
| Apr 28 (v1.3) | Add hosted browser demo as conversion lever | Direct consequence of Streamlit choice. See §5. |
|
||||||
| Apr 28 (v1.4) | Re-apply 04/06 boundary work (silent-drift recovery) | Stream B v1.2 content overwritten in parallel v1.3 work. Restored per no-silent-drift rule. |
|
| Apr 28 (v1.4) | Re-apply 04/06 boundary work (silent-drift recovery) | Stream B v1.2 content overwritten in parallel v1.3 work. Restored per no-silent-drift rule. |
|
||||||
| Apr 28 (v1.5) | Add `02_text_cleaner.py`; renumber 02-08 → 03-09 | Character-level hygiene had no clear owner. See TECHNICAL §10. |
|
| Apr 28 (v1.5) | Add `02_text_cleaner.py`; renumber 02-08 → 03-09 | Character-level hygiene had no clear owner. See TECHNICAL §10. |
|
||||||
| Apr 29 (v1.7) | Adopt Text Cleaner Tier 1/2/3 spec; lock `excel-hygiene` default | Promotes from stub to buildable v1 target. Full spec in TECHNICAL §11.2. |
|
| Apr 29 (v1.7) | Adopt Clean Text Tier 1/2/3 spec; lock `excel-hygiene` default | Promotes from stub to buildable v1 target. Full spec in TECHNICAL §11.2. |
|
||||||
| Apr 28 (v1.6) | Fold conversation-history content into docs (deduplicator spec, lead bundle use cases, full GUI matrix, 04/06 examples, Streamlit-to-SaaS reasoning) | No new decisions; promote at-risk analysis from chat history per no-silent-drift rule. |
|
| Apr 28 (v1.6) | Fold conversation-history content into docs (deduplicator spec, lead bundle use cases, full GUI matrix, 04/06 examples, Streamlit-to-SaaS reasoning) | No new decisions; promote at-risk analysis from chat history per no-silent-drift rule. |
|
||||||
| May 1 (v1.6) | Mark Format Standardizer **Ready** | 199-row buyer corpus passing; Tier 1 + most Tier 2 built. |
|
| May 1 (v1.6) | Mark Standardize Formats **Ready** | 199-row buyer corpus passing; Tier 1 + most Tier 2 built. |
|
||||||
| May 1 (v1.6) | Add `src/core/errors.py` structured hierarchy | Uniform helpful messages across CLI + GUI. See TECHNICAL §7. |
|
| May 1 (v1.6) | Add `src/core/errors.py` structured hierarchy | Uniform helpful messages across CLI + GUI. See TECHNICAL §7. |
|
||||||
| May 13 (v1.6) | Ship in-house JSON i18n + EN/ES packs | Expand addressable market (Spanish-first buyers, LatAm bookkeepers) without a `gettext` build step. JSON packs editable by non-devs; parity test prevents drift. See TECHNICAL §10b. |
|
| May 13 (v1.6) | Ship in-house JSON i18n + EN/ES packs | Expand addressable market (Spanish-first buyers, LatAm bookkeepers) without a `gettext` build step. JSON packs editable by non-devs; parity test prevents drift. See TECHNICAL §10b. |
|
||||||
|
| May 13 (v1.6) | Ship licensing: 1-year HMAC-signed blobs, name+email registration, offline verification, tier-scaffolded for future SKUs | Unlock the lifetime-update business model without recurring infra. Honor-system DRM (HMAC + 30-day refund) — sufficient at $49. See §9b below. |
|
||||||
|
| May 13 (v1.6) | Add Lite SKU (Find Duplicates + Clean Text + Standardize Formats) | Lower-priced entry point for buyers who only need the three universal tools. Per-tool feature gating + lock badges on the home grid surface the upgrade path. See §9b. |
|
||||||
|
| May 13 (v1.6) | Remove user-facing free trial | A 1-year all-features trial undercut the paid Lite SKU. Paid-only keeps tier economics clean. Internal ``_mint`` API still exists for tests and the seller's key generator. See §9b. |
|
||||||
|
| May 13 (v1.6) | Upgrade license crypto: HMAC → Ed25519 (asymmetric) | HMAC's symmetric secret was extractable from the shipped binary — anyone with the binary could mint blobs. Ed25519 splits sign (seller) from verify (binary), so binary compromise doesn't let an attacker forge licenses. Blob prefix bumped DTLIC1 → DTLIC2. See §9b. |
|
||||||
|
| May 13 (v1.6) | Add ``assert_production_safe`` tripwire | A shipped build with ``DATATOOLS_DEV_MODE=1`` or the in-source dev pubkey would silently defeat licensing. The tripwire refuses to boot such a build. No-op in source / pytest runs. See §9b. |
|
||||||
|
|
||||||
|
## 9b. Licensing model
|
||||||
|
|
||||||
|
**Decision (v1.6)**: offline Ed25519-signed license blobs (originally HMAC; upgraded within v1.6 — see the threat model below), 1-year lifetime, name + email registration required. Tier-scaffolded so future SKUs (PRO, ENTERPRISE) can carve per-tool feature sets without code changes.
|
||||||
|
|
||||||
|
| Option | Verdict |
|
||||||
|
|---|---|
|
||||||
|
| **Offline signed blob (chosen)** | **CHOSEN.** No server, no internet, fits the no-touch constraint. Honor-system at this price point. |
|
||||||
|
| Online activation check | Rejected. Conflicts with the "your data never leaves your computer" promise; introduces support load (server downtime, network issues). |
|
||||||
|
| No license at all | Rejected. The lifetime-update value prop requires *some* gating to make renewal meaningful. |
|
||||||
|
| Time-bombed binary (PyInstaller --no-license) | Rejected. Can't deliver renewals without re-shipping the installer. |
|
||||||
|
| Hardware-locked license | Rejected. Friction on legitimate device-swaps; doesn't match the buyer persona's tolerance. |
|
||||||
|
|
||||||
|
**Threat model** (v1.6 — Ed25519): the binary ships only the public key. A motivated reverse engineer who pulls everything out of the binary has the verification key but not the signing key — they can't mint new licenses. The earlier HMAC scheme had this hole; the asymmetric upgrade closes it. The remaining attack surface is:
|
||||||
|
|
||||||
|
- Re-signing with a forked binary that ships an attacker-controlled pubkey + auto-grants licenses. Costs more effort than the price of a legitimate copy and the result is per-fork, not shareable.
|
||||||
|
- Hooking the verification call to always return True. Defeats DRM entirely but only on the attacker's own machine — they could just write down "I unlocked DataTools" and skip the work.
|
||||||
|
- Setting ``DATATOOLS_DEV_MODE=1`` to bypass checks. **Refused in shipped builds** by ``assert_production_safe``; works in source/test runs only.
|
||||||
|
|
||||||
|
The 30-day refund window covers casual blob sharing from a different angle (anyone who shares their blob is implicitly authorizing the buyer to issue them a refund-on-demand).
|
||||||
|
|
||||||
|
**What's enforced**:
|
||||||
|
- License blob signature must verify (Ed25519, checked against the public key embedded in the build).
|
||||||
|
- Buyer-entered name + email must match the values embedded in the blob.
|
||||||
|
- Expiry date must be in the future.
|
||||||
|
- Tier must include the requested feature.
|
||||||
|
|
||||||
|
**What's NOT enforced**:
|
||||||
|
- Number of devices the same blob is used on (no concurrent-use detection).
|
||||||
|
- Reverse-engineered re-signing of expired blobs (would require RSA / online check).
|
||||||
|
|
||||||
|
**Future SKUs**: the ``FEATURES_BY_TIER`` table in ``src/license/features.py`` is the single source of truth for "which tools each tier unlocks". Adding a PRO SKU that excludes Automated Workflows is a 1-line edit there + a 1-line edit at the gate site. No consumer-code churn.
|
||||||
|
|
||||||
|
**v1.6 SKU lineup**:
|
||||||
|
|
||||||
|
| Tier | Tools unlocked | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| LITE | Find Duplicates, Clean Text, Standardize Formats | Entry SKU. Three universal tools that handle the most common bookkeeping / RevOps / Klaviyo prep workflows. |
|
||||||
|
| CORE | All 9 tools | Full v1 suite. |
|
||||||
|
| PRO | All 9 tools (scaffolded) | Reserved for future per-feature carve-outs (e.g., scheduled pipelines, API access). |
|
||||||
|
| ENTERPRISE | All 9 tools (scaffolded) | Reserved for future bulk / multi-seat SKUs. |
|
||||||
|
| TRIAL | Same as LITE | Deprecated — no longer issuable. Mapping kept for any legacy on-disk trial licenses to load without error. |
|
||||||
|
|
||||||
|
**Trial removed (v1.6)**: a 1-year free trial that unlocked every tool would undercut the paid Lite SKU (why pay for Lite when trial gives more for longer?). Paid-only keeps the funnel clean. The internal ``LicenseManager._mint`` API still exists for tests and for the seller's ``scripts/generate_license.py`` key generator; there's no user-facing way to self-issue a license.
|
||||||
|
|
||||||
## 8. Re-lock triggers
|
## 8. Re-lock triggers
|
||||||
|
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ CLI (src/cli*.py) GUI (src/gui/app.py + pages/)
|
|||||||
| `core.errors` | `DataToolsError` hierarchy, `ensure_dataframe()`, `ensure_choice()`, `wrap_file_read/write()`, `format_for_user()` |
|
| `core.errors` | `DataToolsError` hierarchy, `ensure_dataframe()`, `ensure_choice()`, `wrap_file_read/write()`, `format_for_user()` |
|
||||||
| `core._constants` | `US_STATE_NAMES`, `US_STATE_CODES`, `USPS_EXPANSIONS`, `USPS_COMPRESSIONS` |
|
| `core._constants` | `US_STATE_NAMES`, `US_STATE_CODES`, `USPS_EXPANSIONS`, `USPS_COMPRESSIONS` |
|
||||||
|
|
||||||
## Data flow — Deduplicator
|
## Data flow — Find Duplicates
|
||||||
|
|
||||||
```
|
```
|
||||||
read_file() # auto-detect encoding, delimiter, header
|
read_file() # auto-detect encoding, delimiter, header
|
||||||
@@ -96,6 +96,36 @@ DeduplicationResult # deduplicated_df, removed_df, match_groups, l
|
|||||||
|
|
||||||
No other call sites change. Gate auto-discovers it via the registry.
|
No other call sites change. Gate auto-discovers it via the registry.
|
||||||
|
|
||||||
|
### Tool page header — `render_tool_header(tool_id)`
|
||||||
|
|
||||||
|
Every tool page renders its title block via `render_tool_header(tool_id)` in `src/gui/components/_legacy.py` — do not call `st.title()` + `st.caption()` directly. The helper renders:
|
||||||
|
|
||||||
|
- `tools.<id>.page_title` as the page title (left column).
|
||||||
|
- A **Help** popover button right of the title (icon `:material/help_outline:`, label from `help.button_label`). Clicking opens an `st.popover` containing the markdown body.
|
||||||
|
- `tools.<id>.page_caption` as the caption below.
|
||||||
|
|
||||||
|
All copy is i18n-driven; editors can tweak help text without touching Python. If a tool is missing its `help_md` key, the popover falls back to `help.missing_body`.
|
||||||
|
|
||||||
|
**`help_md` structure** (markdown, stored as a single string with `\n` line breaks in JSON):
|
||||||
|
|
||||||
|
```
|
||||||
|
**When to use**
|
||||||
|
- bullet 1
|
||||||
|
- bullet 2
|
||||||
|
|
||||||
|
**Steps**
|
||||||
|
1. numbered step
|
||||||
|
2. numbered step
|
||||||
|
|
||||||
|
**Examples**
|
||||||
|
- example 1
|
||||||
|
- example 2
|
||||||
|
|
||||||
|
**Tip** one-sentence pro tip.
|
||||||
|
```
|
||||||
|
|
||||||
|
Keep it short — the popover is intentionally compact. Mirror the structure across every tool so the muscle memory transfers.
|
||||||
|
|
||||||
### i18n — language packs
|
### i18n — language packs
|
||||||
|
|
||||||
The GUI's user-facing strings live in `src/i18n/packs/<code>.json`, keyed by ISO-639-1 code. English (`en.json`) is canonical; missing keys in other packs fall back to English, and missing keys in English fall back to the literal dotted key so a typo is visible rather than silent.
|
The GUI's user-facing strings live in `src/i18n/packs/<code>.json`, keyed by ISO-639-1 code. English (`en.json`) is canonical; missing keys in other packs fall back to English, and missing keys in English fall back to the literal dotted key so a typo is visible rather than silent.
|
||||||
@@ -120,12 +150,123 @@ st.warning(t("gate.warning", name=filename)) # {name} interpolated via str.for
|
|||||||
3. Use the dotted key at the call site: `t("section.subsection.key")` or `t("section.key", name=value)` for placeholder interpolation.
|
3. Use the dotted key at the call site: `t("section.subsection.key")` or `t("section.key", name=value)` for placeholder interpolation.
|
||||||
|
|
||||||
**Authoring rules:**
|
**Authoring rules:**
|
||||||
- Keys live under semantic sections (`home.*`, `upload.*`, `findings.*`, `tools.<id>.name`). Don't nest by language or by tool unless the string is genuinely tool-specific.
|
- Keys live under semantic sections (`home.*`, `upload.*`, `findings.*`, `help.*`, `tools.<id>.name`). Don't nest by language or by tool unless the string is genuinely tool-specific.
|
||||||
|
- Per-tool header copy lives under `tools.<id>.{page_title, page_caption, help_md}`. `page_caption` is the one-line subtitle under the title; `help_md` is the popover body (see *Tool page header* above). Top-level `help.button_label` / `help.missing_body` are shared across every tool.
|
||||||
- Use `{named}` placeholders (not positional `{0}`) so translators see what's being interpolated.
|
- Use `{named}` placeholders (not positional `{0}`) so translators see what's being interpolated.
|
||||||
- Strings can contain Streamlit markdown (`**bold**`) — pass through `st.markdown` / `st.caption` as usual.
|
- Strings can contain Streamlit markdown (`**bold**`) — pass through `st.markdown` / `st.caption` as usual.
|
||||||
- Do **not** put strings inside the farewell-overlay JS payload without going through `_js_html_safe()` in `src/gui/components/_legacy.py`; the helper escapes both the JS string terminator and HTML special chars. The test `TestFarewellEscape` pins that contract.
|
- Do **not** put strings inside the farewell-overlay JS payload without going through `_js_html_safe()` in `src/gui/components/_legacy.py`; the helper escapes both the JS string terminator and HTML special chars. The test `TestFarewellEscape` pins that contract.
|
||||||
- The sidebar picker is mounted by `hide_streamlit_chrome()`, so every page that calls that helper automatically gets the picker. Pages that don't call it (rare) can call `render_language_selector()` directly.
|
- The sidebar picker is mounted by `hide_streamlit_chrome()`, so every page that calls that helper automatically gets the picker. Pages that don't call it (rare) can call `render_language_selector()` directly.
|
||||||
|
|
||||||
|
### Licensing
|
||||||
|
|
||||||
|
The license layer lives at ``src/license/``. The public API:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from src.license import (
|
||||||
|
get_manager, require_feature, current_state,
|
||||||
|
FeatureFlag, Tier, License,
|
||||||
|
)
|
||||||
|
|
||||||
|
mgr = get_manager()
|
||||||
|
if not mgr.is_valid():
|
||||||
|
raise RuntimeError("Not licensed")
|
||||||
|
require_feature(FeatureFlag.DEDUPLICATOR)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Storage**: ``~/.datatools/license.json`` (override via
|
||||||
|
``DATATOOLS_LICENSE_PATH``). Signed with Ed25519 (asymmetric) — the
|
||||||
|
seller's private key signs; the buyer's binary verifies with the
|
||||||
|
embedded public key.
|
||||||
|
|
||||||
|
**Key material**:
|
||||||
|
|
||||||
|
| Variable | Who has it | Where it's used |
|
||||||
|
|---|---|---|
|
||||||
|
| ``DATATOOLS_LICENSE_PRIVKEY`` | Seller only | ``scripts/generate_license.py`` (mints a buyer's blob); ``scripts/generate_keypair.py`` generates a fresh one |
|
||||||
|
| ``DATATOOLS_LICENSE_PUBKEY`` | Every shipped binary | Verification at activation time; set at build time via PyInstaller env |
|
||||||
|
|
||||||
|
If neither env var is set, ``src.license.crypto`` falls back to the
|
||||||
|
deterministic dev keypair in ``src/license/_dev_keypair.py``. The
|
||||||
|
dev key is in source on purpose (so tests work without secrets),
|
||||||
|
but a frozen build that's still using it is a build-config bug —
|
||||||
|
``assert_production_safe`` refuses to start such a binary.
|
||||||
|
|
||||||
|
**First-time setup for shipped builds**:
|
||||||
|
|
||||||
|
1. ``python scripts/generate_keypair.py --output prod-keys.env`` —
|
||||||
|
creates a fresh keypair.
|
||||||
|
2. Stash ``DATATOOLS_LICENSE_PRIVKEY`` somewhere safe (password
|
||||||
|
manager / KMS). Lose it and you can't issue renewals without
|
||||||
|
reshipping a new build with a new public key.
|
||||||
|
3. Configure the PyInstaller build env with
|
||||||
|
``DATATOOLS_LICENSE_PUBKEY=<hex>`` so the shipped binary
|
||||||
|
verifies against the production key.
|
||||||
|
4. Mint buyer licenses with
|
||||||
|
``DATATOOLS_LICENSE_PRIVKEY=<hex> python scripts/generate_license.py ...``.
|
||||||
|
|
||||||
|
**Dev bypass**: ``DATATOOLS_DEV_MODE=1`` short-circuits every check.
|
||||||
|
The test suite's autouse fixture sets this so existing tests don't
|
||||||
|
need their own license fixtures. Tests that need the real check
|
||||||
|
explicitly use ``isolated_license_path`` /
|
||||||
|
``activated_license_manager`` / ``unactivated_license_manager``.
|
||||||
|
|
||||||
|
**Adding a feature flag**:
|
||||||
|
|
||||||
|
1. Add the enum value to ``FeatureFlag`` in ``src/license/schema.py``.
|
||||||
|
2. Add it to the relevant tier's set in
|
||||||
|
``FEATURES_BY_TIER`` in ``src/license/features.py``.
|
||||||
|
3. Gate at the call site: ``require_feature(FeatureFlag.YOUR_FLAG)``.
|
||||||
|
|
||||||
|
**Adding a new tier**:
|
||||||
|
|
||||||
|
1. Add the enum value to ``Tier``.
|
||||||
|
2. Add a row to ``FEATURES_BY_TIER`` listing the unlocked flags.
|
||||||
|
3. Add ``license.tier_<name>`` translation keys to every i18n pack.
|
||||||
|
4. The activation flow, sidebar status badge, feature gate, and home
|
||||||
|
grid lock badge all pick up the new tier automatically.
|
||||||
|
|
||||||
|
**Worked example — the Lite tier**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# src/license/schema.py
|
||||||
|
class Tier(str, Enum):
|
||||||
|
LITE = "lite" # new
|
||||||
|
CORE = "core"
|
||||||
|
...
|
||||||
|
|
||||||
|
# src/license/features.py
|
||||||
|
FEATURES_BY_TIER = {
|
||||||
|
...
|
||||||
|
Tier.LITE: frozenset({
|
||||||
|
FeatureFlag.DEDUPLICATOR,
|
||||||
|
FeatureFlag.TEXT_CLEANER,
|
||||||
|
FeatureFlag.FORMAT_STANDARDIZER,
|
||||||
|
}),
|
||||||
|
Tier.CORE: _all(),
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then in en.json/es.json add ``license.tier_lite``. That's it — the
|
||||||
|
existing ``require_feature_or_render_upgrade`` (GUI) and
|
||||||
|
``guard(feature=...)`` (CLI) calls in every tool page/CLI route a
|
||||||
|
Lite user into the upgrade prompt for any tool the tier doesn't
|
||||||
|
unlock. The home grid's lock badge fires off the same feature
|
||||||
|
lookup.
|
||||||
|
|
||||||
|
**Minting a license** (creator-only):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
DATATOOLS_LICENSE_PRIVKEY=<hex> \
|
||||||
|
python scripts/generate_license.py \
|
||||||
|
--name "Jane Doe" --email jane@example.com \
|
||||||
|
--tier core --years 1
|
||||||
|
```
|
||||||
|
|
||||||
|
The script prints a ``DTLIC2:`` blob to stdout — deliver this in the
|
||||||
|
Gumroad / purchase email. The buyer pastes it into the activation
|
||||||
|
page or runs ``python -m src.license_cli activate <blob> --name ...``.
|
||||||
|
|
||||||
### Add a format-standardizer field type
|
### Add a format-standardizer field type
|
||||||
|
|
||||||
1. Add value to `FieldType` enum in `core/format_standardize.py`.
|
1. Add value to `FieldType` enum in `core/format_standardize.py`.
|
||||||
@@ -155,11 +296,46 @@ GUI / CLI handlers: use `format_for_user(exc, context="...")` to render.
|
|||||||
|
|
||||||
All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so existing handlers still catch them.
|
All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so existing handlers still catch them.
|
||||||
|
|
||||||
|
## PDF Extractor — bundled Tesseract
|
||||||
|
|
||||||
|
Frozen builds (installer / portable .zip / AppImage) ship Tesseract OCR inside the bundle so scanned PDFs work without a separate system install. Source / `pip` developer environments still resolve Tesseract from `PATH`.
|
||||||
|
|
||||||
|
**Runtime layout (frozen bundles)**:
|
||||||
|
|
||||||
|
| Resource | Path |
|
||||||
|
|---|---|
|
||||||
|
| Tesseract binary | `Path(sys._MEIPASS) / "tesseract" / "tesseract"` (Linux/macOS), `…/tesseract/tesseract.exe` (Windows) |
|
||||||
|
| Tessdata directory | `Path(sys._MEIPASS) / "tesseract" / "tessdata"` |
|
||||||
|
| English model | `Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"` |
|
||||||
|
|
||||||
|
**Discovery order** (PDF Extractor runtime):
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var (override — explicit path to a `tesseract` binary).
|
||||||
|
2. Bundled path under `sys._MEIPASS` (frozen bundles only — falls through to step 3 otherwise).
|
||||||
|
3. `tesseract` on `PATH` (developer setups, source checkouts).
|
||||||
|
4. Windows well-known locations (`C:\Program Files\Tesseract-OCR\tesseract.exe`, etc.).
|
||||||
|
|
||||||
|
**Where the bytes come from**:
|
||||||
|
|
||||||
|
- **Tessdata** is vendored at `build/vendor/tessdata/eng.traineddata` — the "best" English model from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best). PyInstaller's spec copies it into `tesseract/tessdata/` inside the bundle.
|
||||||
|
- **Tesseract binary** is fetched at build time by `build/make_release.py` — per-platform download URLs are pinned in that script. The current pin is **Tesseract 5.5.0**.
|
||||||
|
|
||||||
|
**To update Tesseract**:
|
||||||
|
|
||||||
|
1. Bump the version pin + the per-platform fetch URLs in `build/make_release.py`.
|
||||||
|
2. If upstream changed the `eng.traineddata` schema, refresh `build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the matching tag.
|
||||||
|
3. Rebuild on each platform (`python build/make_release.py`) and smoke-test a scanned-PDF run through the PDF Extractor before tagging the release.
|
||||||
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if the upstream license terms change (Tesseract is Apache-2.0 today).
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# All
|
# All (core + CLI + GUI)
|
||||||
pytest -q
|
pytest -q
|
||||||
|
# Quick loop — skip the GUI layer
|
||||||
|
pytest -q -m 'not gui'
|
||||||
|
# Only the GUI tests
|
||||||
|
pytest -q -m gui
|
||||||
# By module
|
# By module
|
||||||
pytest tests/test_dedup.py
|
pytest tests/test_dedup.py
|
||||||
# Include slow / integration
|
# Include slow / integration
|
||||||
@@ -171,16 +347,42 @@ pytest tests/test_dedup.py::TestExactMatch::test_basic
|
|||||||
Test layout:
|
Test layout:
|
||||||
```
|
```
|
||||||
tests/
|
tests/
|
||||||
├── conftest.py # fixtures
|
├── conftest.py # core/CLI fixtures
|
||||||
├── test_dedup.py · test_normalizers.py · test_io.py · test_config.py
|
├── test_dedup.py · test_normalizers.py · test_io.py · test_config.py
|
||||||
├── test_analyze.py · test_normalize.py · test_text_clean.py
|
├── test_analyze.py · test_normalize.py · test_text_clean.py
|
||||||
├── test_format_standardize.py
|
├── test_format_standardize.py
|
||||||
├── test_format_standardize_corpus.py # 199-row buyer corpus
|
├── test_format_standardize_corpus.py # 199-row buyer corpus
|
||||||
├── test_audit_fixes.py · test_errors.py · test_fixes_unit.py
|
├── test_audit_fixes.py · test_errors.py · test_fixes_unit.py
|
||||||
├── test_corpus.py · test_encodings_corpus.py · test_fixtures_sweep.py
|
├── test_corpus.py · test_encodings_corpus.py · test_fixtures_sweep.py
|
||||||
└── test_cli.py · test_cli_*.py · test_e2e.py · test_install.py
|
├── test_cli.py · test_cli_*.py · test_e2e.py · test_install.py
|
||||||
|
├── test_perf_regressions.py # shape pins for the perf wins
|
||||||
|
└── gui/ # Streamlit AppTest-driven tests
|
||||||
|
├── conftest.py # AppTest fixtures + helpers
|
||||||
|
├── _findings_panel_harness.py # isolated component test page
|
||||||
|
├── test_smoke.py # every page renders in EN + ES
|
||||||
|
├── test_chrome.py # language selector, hide_chrome
|
||||||
|
├── test_gate.py # require_normalization_gate
|
||||||
|
├── test_workflows.py # happy path per Ready tool
|
||||||
|
├── test_dedup_review.py # match-group card interactions
|
||||||
|
├── test_advanced_panels.py # config_panel widgets
|
||||||
|
├── test_errors.py # malformed-upload error paths
|
||||||
|
└── test_findings_panel.py # analyzer findings rendering
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### GUI test layer
|
||||||
|
|
||||||
|
GUI tests drive pages with `streamlit.testing.v1.AppTest` —
|
||||||
|
in-process, no browser, no display. They pre-populate
|
||||||
|
`st.session_state` with stashed-upload bytes (via the
|
||||||
|
`stash_upload()` helper in `tests/gui/conftest.py`) and either click
|
||||||
|
buttons via `app.button[i].click().run()` or assert on the
|
||||||
|
`session_state` after the run.
|
||||||
|
|
||||||
|
Marker registered in `pytest.ini`. Default `pytest` runs everything;
|
||||||
|
`pytest -m 'not gui'` skips them for a faster core-only loop.
|
||||||
|
Coming-Soon stubs are pinned by the smoke tests so a regression
|
||||||
|
("import error", "missing widget") shows up immediately.
|
||||||
|
|
||||||
Fixture corpora: `test-cases/text-cleaner-corpus/` (21 files) · `test-cases/encodings-corpus/` (31 files) · `test-cases/format-cleaner-corpus/` (7 files + spec).
|
Fixture corpora: `test-cases/text-cleaner-corpus/` (21 files) · `test-cases/encodings-corpus/` (31 files) · `test-cases/format-cleaner-corpus/` (7 files + spec).
|
||||||
|
|
||||||
## Known limitations
|
## Known limitations
|
||||||
|
|||||||
244
docs/FUTURE-TOOLS.md
Normal file
244
docs/FUTURE-TOOLS.md
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# Future tools — design notes
|
||||||
|
|
||||||
|
> Creator-only. Specs for tools the strategic plan refuses to build right now
|
||||||
|
> but that surface repeatedly enough to be worth documenting once instead of
|
||||||
|
> re-thinking from scratch every time a customer asks.
|
||||||
|
> **Status of these tools**: post-launch, post-revenue. See `PLAN.md` §2.1 —
|
||||||
|
> new-tool development is frozen until DataTools has a paying customer and a
|
||||||
|
> repeated demand signal for the same idea. This file is the resting place
|
||||||
|
> for those ideas in the meantime; nothing here ships unless a future
|
||||||
|
> decision says it does.
|
||||||
|
|
||||||
|
Each entry follows the same shape: **What it does**, **Why someone would
|
||||||
|
want it**, **Can we ship it now?**, **Approach**, **GUI sketch**, **Effort**,
|
||||||
|
**Risks/unknowns**, **Ship criteria** (the signal that overrides the freeze).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. PDF → CSV extractor (bank statements + similar)
|
||||||
|
|
||||||
|
### What it does
|
||||||
|
|
||||||
|
Takes a PDF (typically a bank statement, expense report, paystub, invoice,
|
||||||
|
or any document where humans-but-not-computers can read a table) and turns
|
||||||
|
the tabular content into a CSV that the rest of DataTools can consume.
|
||||||
|
|
||||||
|
The user shows the tool **where** the data lives by drawing rectangles on
|
||||||
|
a rendered preview of the first page; the tool then applies those region
|
||||||
|
templates to every page of the document (and remembers the template so the
|
||||||
|
same template can be re-applied to next month's statement without
|
||||||
|
re-clicking).
|
||||||
|
|
||||||
|
### Why someone would want it
|
||||||
|
|
||||||
|
Bookkeepers, accountants, and any small-business operator who:
|
||||||
|
|
||||||
|
- Gets bank/credit-card statements only as PDFs (most US banks; many
|
||||||
|
European ones).
|
||||||
|
- Wants to import transactions into QuickBooks / Xero / a spreadsheet
|
||||||
|
without paying $10–$30/month for a SaaS converter (Docparser,
|
||||||
|
Rossum, Hubdoc) or relying on a Python script they can't maintain.
|
||||||
|
- Has 12 months × N accounts of statements to back-fill into a
|
||||||
|
ledger.
|
||||||
|
|
||||||
|
This is the most-requested DataTools adjacency in the casual feedback we
|
||||||
|
have so far. It maps tightly onto the **bookkeeper niche** identified in
|
||||||
|
`PLAN.md` §2.3 — that persona is exactly who needs PDF extraction and is
|
||||||
|
exactly the kind of operator who'd pay for a one-time desktop tool over a
|
||||||
|
recurring SaaS subscription.
|
||||||
|
|
||||||
|
### Can we ship it now?
|
||||||
|
|
||||||
|
**No.** Current state, verified 2026-05-17:
|
||||||
|
|
||||||
|
- No PDF dependency in `requirements.txt` or `requirements-dev.txt`.
|
||||||
|
- No PDF-touching code anywhere under `src/`. The single
|
||||||
|
string-mention of "PDF" in the codebase is in the **output** copy for
|
||||||
|
the Quality Check tool ("generate PDF/Excel quality reports"),
|
||||||
|
unrelated to extraction.
|
||||||
|
- No region-selection / canvas component in the Streamlit GUI today.
|
||||||
|
|
||||||
|
Building this requires net-new infrastructure on three axes (libraries,
|
||||||
|
extraction core, region-picker UI). Estimates below.
|
||||||
|
|
||||||
|
### Approach (technical)
|
||||||
|
|
||||||
|
PDFs split cleanly into two populations and the strategy differs:
|
||||||
|
|
||||||
|
1. **Native / text-layer PDFs** — text is stored as text, just laid out
|
||||||
|
visually. Most modern bank statements are this. Solvable with
|
||||||
|
coordinate-aware text extraction:
|
||||||
|
|
||||||
|
- **`pdfplumber`** (MIT, on top of `pdfminer.six`) — gives `(x0, y0,
|
||||||
|
x1, y1, text)` per character/word/line for each page. Mature, well
|
||||||
|
tested, single dependency, no native compiler. **First-choice.**
|
||||||
|
- **`pypdf`** (BSD-3) — text-only, no positions. Too coarse for
|
||||||
|
statement parsing; useful only for "the whole document as one big
|
||||||
|
string."
|
||||||
|
- **`camelot-py`** (MIT) — purpose-built for table extraction.
|
||||||
|
Heavier (needs `ghostscript` and `tk`/`opencv` for some modes),
|
||||||
|
and assumes the table grid is already visible. Worth evaluating
|
||||||
|
as a fallback for documents with explicit ruled tables.
|
||||||
|
|
||||||
|
2. **Scanned / image-only PDFs** — pixels of a scanner; no text layer.
|
||||||
|
Less common from major banks today but still happens with old PDFs
|
||||||
|
and receipts. Needs OCR:
|
||||||
|
|
||||||
|
- **`pytesseract`** wrapping the **Tesseract** binary (Apache-2). The
|
||||||
|
OCR is good for English on clean scans, mediocre on receipts.
|
||||||
|
Detect with `pdfplumber`: a page with no extractable characters but
|
||||||
|
at least one image object is image-only → OCR fallback.
|
||||||
|
|
||||||
|
The extraction core would be a state machine:
|
||||||
|
|
||||||
|
1. Render page to an image (`pdfplumber.Page.to_image()` returns a PIL
|
||||||
|
image at a chosen DPI).
|
||||||
|
2. User draws a header region and per-row regions (or marks a single
|
||||||
|
table bounding box + column dividers) on the preview.
|
||||||
|
3. For each PDF page, crop the corresponding pixel region (or pdf
|
||||||
|
coordinate region), pull the text in that crop, and apply per-region
|
||||||
|
parsing (date, amount, description).
|
||||||
|
4. Emit one CSV row per detected statement row.
|
||||||
|
|
||||||
|
Bank-statement-specific niceties — implementable as templates on top of
|
||||||
|
the generic engine:
|
||||||
|
|
||||||
|
- Recurring-template store: save "Chase visa October layout" once, the
|
||||||
|
next month's PDF lands on the same template automatically. JSON file
|
||||||
|
in `~/.datatools/templates/` keyed by a layout fingerprint (page
|
||||||
|
size + header text hash).
|
||||||
|
- Multi-page row stitching: a row that wraps across pages gets merged
|
||||||
|
back together based on date-column continuity.
|
||||||
|
- Currency / sign inference: a column that mixes `$1,234.56` and
|
||||||
|
`($45.00)` — already handled by the (now-existing) Standardize
|
||||||
|
Formats analyzer rules.
|
||||||
|
|
||||||
|
### GUI sketch
|
||||||
|
|
||||||
|
The hardest part of the whole project. Streamlit doesn't ship a native
|
||||||
|
"draw rectangles on an image" widget. Options:
|
||||||
|
|
||||||
|
- **`streamlit-drawable-canvas`** — community component (MIT-licensed).
|
||||||
|
Lets the user draw freehand rectangles on top of a background image.
|
||||||
|
Returns the rectangle coordinates as JSON. Active maintenance.
|
||||||
|
**First-choice for the region picker.**
|
||||||
|
- **`streamlit-cropper`** — single-rectangle crop tool. Good if we only
|
||||||
|
needed the table bbox; too limited for "header region + column
|
||||||
|
dividers + repeating-row template."
|
||||||
|
- **Custom React component** — fully tailored UX but adds a build
|
||||||
|
toolchain DataTools doesn't have today. Last resort.
|
||||||
|
|
||||||
|
Sketch of the proposed page (under "Transformations" in the sidebar
|
||||||
|
section):
|
||||||
|
|
||||||
|
```
|
||||||
|
🧾 PDF → CSV (Beta)
|
||||||
|
─────────────────────────────────────────────────────────────────────
|
||||||
|
Upload a PDF [ Browse… ]
|
||||||
|
(statement / invoice / form — text-based PDFs work best)
|
||||||
|
|
||||||
|
[ ▸ Preview: October-statement.pdf · 3 pages ]
|
||||||
|
┌────────────────────────────────────────────────┐
|
||||||
|
│ CHASE BANK │
|
||||||
|
│ Statement period Oct 1–31, 2025 │
|
||||||
|
│ ┌─[1: header strip — drawn in red]──────────┐ │
|
||||||
|
│ │ Date Description Amount │ │
|
||||||
|
│ └────────────────────────────────────────────┘ │
|
||||||
|
│ ┌─[2: row template — drawn in green]────────┐ │
|
||||||
|
│ │ 10/03 AMAZON.COM #42… -45.67 │ │
|
||||||
|
│ └────────────────────────────────────────────┘ │
|
||||||
|
│ ⋮ (more transactions) │
|
||||||
|
└────────────────────────────────────────────────┘
|
||||||
|
|
||||||
|
Columns: [Date] [Description] [Amount] [+ Add column]
|
||||||
|
|
||||||
|
Apply template to: ( ) Only this page
|
||||||
|
(•) All pages with this layout
|
||||||
|
( ) All pages (force)
|
||||||
|
|
||||||
|
[ Save template as… Chase Visa Oct 2025 ]
|
||||||
|
|
||||||
|
[ Run extraction → CSV ]
|
||||||
|
```
|
||||||
|
|
||||||
|
After "Run extraction": the standard tool-page result layout (preview
|
||||||
|
table, "Saved to `~/Downloads/<name>_extracted.csv`", "Open Downloads
|
||||||
|
folder" — matching the other Ready tools).
|
||||||
|
|
||||||
|
The **template save/recall** is what makes this a one-time setup
|
||||||
|
instead of a per-document chore — bookkeepers don't want to re-draw
|
||||||
|
rectangles every month.
|
||||||
|
|
||||||
|
### Effort estimate
|
||||||
|
|
||||||
|
| Phase | Scope | Estimate | Risk |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **A. Backend, native PDFs only** | pdfplumber-based extraction, hard-coded region passed via a JSON config (no GUI) | **1–2 weeks** | Low — straightforward use of pdfplumber. |
|
||||||
|
| **B. Region-picker GUI** | streamlit-drawable-canvas, multi-region drawing, per-region role assignment (date / amount / description) | **2–3 weeks** | Medium — the canvas component has quirks; persisting region state across reruns is non-trivial. |
|
||||||
|
| **C. Multi-page application + template persistence** | Apply one page's template to N pages, save/load templates, layout fingerprint | **1–2 weeks** | Medium — "is the next page the same layout?" is a real perception problem; we'll need a heuristic. |
|
||||||
|
| **D. Scanned-PDF OCR fallback** | Detect image-only pages, run Tesseract, merge OCR text into the extraction path | **2–3 weeks** | High — OCR accuracy is variable; we'd want a quality threshold + a "fail this page noisily" path. Bundling Tesseract with the PyInstaller build is its own packaging headache. |
|
||||||
|
| **E. Bank-statement specifics** | Cross-page row stitching, currency-sign inference, multi-account splits | **1–2 weeks** | Medium — every bank's idea of a "statement" differs. Templates absorb most of the variance. |
|
||||||
|
|
||||||
|
**Realistic total for a polished v1**: 6–10 calendar weeks of focused work
|
||||||
|
(text-PDFs + GUI + templates + statement-specific niceties). Add another
|
||||||
|
2–3 weeks if scanned PDFs are required at launch.
|
||||||
|
|
||||||
|
**Minimum viable extract** (just text PDFs, single-region drawing, no
|
||||||
|
template recall, no OCR): **3–4 weeks**. Worth scoping a beta at that
|
||||||
|
level before committing to the full surface.
|
||||||
|
|
||||||
|
### Difficulty rating
|
||||||
|
|
||||||
|
**Medium-hard.** Not because any single piece is novel — pdfplumber +
|
||||||
|
streamlit-drawable-canvas are well-trodden libraries — but because the
|
||||||
|
*combination* (point-and-click region selection that persists across
|
||||||
|
multiple PDF pages and across documents with similar layouts) is where
|
||||||
|
most of the engineering goes. The "every bank does it slightly
|
||||||
|
differently" reality makes templates a hard requirement rather than a
|
||||||
|
nice-to-have, and templates raise the design effort.
|
||||||
|
|
||||||
|
### Risks / unknowns
|
||||||
|
|
||||||
|
- **Scanned-PDF coverage**: if a meaningful slice of the addressable
|
||||||
|
market sends image-only PDFs (older statements, scanned receipts),
|
||||||
|
shipping text-only extraction limits the audience. Decide via the
|
||||||
|
first 10–20 user requests.
|
||||||
|
- **PyInstaller packaging of Tesseract**: bundling the OCR binary into
|
||||||
|
the desktop build is non-trivial. May force a "Tesseract not found —
|
||||||
|
install it separately" path on first launch, which hurts the
|
||||||
|
"one-click install" story.
|
||||||
|
- **Bank layout drift**: a template captured today can stop working
|
||||||
|
next month if the bank redesigns its statement. Layout-fingerprint
|
||||||
|
detection has to fail loudly rather than silently produce garbage.
|
||||||
|
- **PII surface**: bank statements are some of the most sensitive
|
||||||
|
documents the user might touch. The "runs locally — your data never
|
||||||
|
leaves this computer" guarantee is even more load-bearing here than
|
||||||
|
for CSVs. No telemetry, no cloud OCR services, hard line.
|
||||||
|
|
||||||
|
### Ship criteria
|
||||||
|
|
||||||
|
Before this tool re-enters active development, all of these need to be
|
||||||
|
true:
|
||||||
|
|
||||||
|
- DataTools has shipped to **≥1 paying customer** (the `PLAN.md` §2.1
|
||||||
|
freeze condition).
|
||||||
|
- **At least 3 paying customers OR 5 demo-traffic emails** have
|
||||||
|
explicitly asked for PDF extraction. Below that signal, build
|
||||||
|
something else.
|
||||||
|
- The bookkeeper niche (per `PLAN.md` §2.3) has at least one converted
|
||||||
|
customer — that's the persona who actually needs this tool, and
|
||||||
|
confirming they pay before building a tool aimed squarely at them
|
||||||
|
is the discipline the freeze exists to enforce.
|
||||||
|
|
||||||
|
If those three trip, the **minimum-viable beta (3–4 weeks)**
|
||||||
|
goes first — text PDFs + single-region drawing — so we can see real
|
||||||
|
user behaviour before committing to the full template surface.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## (placeholder for additional future-tool entries)
|
||||||
|
|
||||||
|
Add new entries above this line. Keep the same shape:
|
||||||
|
What / Why / Can we ship now / Approach / GUI / Effort / Risks /
|
||||||
|
Ship criteria. The shape is what makes "is this idea ready" a
|
||||||
|
factual question instead of an opinion.
|
||||||
259
docs/LICENSE-SERVER.md
Normal file
259
docs/LICENSE-SERVER.md
Normal file
@@ -0,0 +1,259 @@
|
|||||||
|
# LICENSE-SERVER — online issuance & record-keeping
|
||||||
|
|
||||||
|
**Status:** **deployed (PR 1 + PR 2 code merged)**. Live at
|
||||||
|
`licenses.datatools.unalogix.com`. See `ADMIN.md §"Live deployment"`
|
||||||
|
for day-2 operations, and `ARCHITECTURE.md` for the end-to-end
|
||||||
|
diagram including the desktop and storefronts.
|
||||||
|
|
||||||
|
This doc describes the smallest useful server we could build to
|
||||||
|
replace the manual mint-and-paste workflow, without compromising the
|
||||||
|
"your data never leaves your computer" promise to buyers (see
|
||||||
|
`DECISIONS.md §9b`).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Goals
|
||||||
|
|
||||||
|
1. **Automate fulfillment.** Gumroad sale → buyer gets a blob in
|
||||||
|
their inbox within seconds. No creator intervention.
|
||||||
|
2. **Authoritative customer list.** A queryable record of who has
|
||||||
|
what tier, when it expires, what they paid. Replaces the JSONL
|
||||||
|
log as the system of record.
|
||||||
|
3. **Self-service renewal & re-delivery.** Buyer enters their email
|
||||||
|
→ gets a fresh blob or a copy of their existing one. Cuts support
|
||||||
|
load.
|
||||||
|
4. **Move the private key off the founder's laptop.** Today the prod
|
||||||
|
private key has to be loaded as an env var to mint anything;
|
||||||
|
that's a security hazard. Server-side, it lives in a KMS and the
|
||||||
|
laptop never touches it.
|
||||||
|
|
||||||
|
## Non-goals
|
||||||
|
|
||||||
|
- **No phone-home from the desktop app.** Activation stays offline.
|
||||||
|
The shipped binary still verifies blobs against the embedded
|
||||||
|
pubkey with no network call. `DECISIONS.md §9b` stands.
|
||||||
|
- **No per-machine activation limits enforced server-side.** v1
|
||||||
|
treats one license = one buyer, used on as many of their machines
|
||||||
|
as they want. Revisit only if abuse appears.
|
||||||
|
- **No telemetry.** The server only knows what the buyer or Gumroad
|
||||||
|
tells it (purchase events, renewal requests). It does not learn
|
||||||
|
anything from desktop installations.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────────┐
|
||||||
|
│ Gumroad │
|
||||||
|
└────────┬────────┘
|
||||||
|
│ webhook (sale, refund)
|
||||||
|
▼
|
||||||
|
┌──────────────┐ ┌───────────────┐ ┌──────────────┐
|
||||||
|
│ Buyer email │◄──────│ Mint API │──────►│ licenses │
|
||||||
|
│ (SMTP send) │ │ (Python web) │ │ (Postgres) │
|
||||||
|
└──────────────┘ └───────┬───────┘ └──────────────┘
|
||||||
|
│ sign() via
|
||||||
|
▼
|
||||||
|
┌─────────────────┐
|
||||||
|
│ KMS / HSM │
|
||||||
|
│ (private key) │
|
||||||
|
└─────────────────┘
|
||||||
|
|
||||||
|
┌─────────────────────────────────────────┐
|
||||||
|
│ Renewal / re-delivery portal │
|
||||||
|
│ - buyer enters email │
|
||||||
|
│ - signed magic link │
|
||||||
|
│ - sees current license + "resend" │
|
||||||
|
└─────────────────────────────────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Components
|
||||||
|
|
||||||
|
### 1. Mint API
|
||||||
|
|
||||||
|
Thin Python web service (FastAPI or Flask — Streamlit isn't appropriate
|
||||||
|
here). Two internal endpoints:
|
||||||
|
|
||||||
|
- `POST /internal/mint` — name, email, tier, years → blob + DB row.
|
||||||
|
Auth: shared HMAC header, accepted only from the fronting services (webhook receiver, renewal portal).
|
||||||
|
- `POST /internal/revoke` — license_key → sets `revoked_at`. Auth: same.
|
||||||
|
|
||||||
|
The mint endpoint is the **only** place that calls `crypto.sign()`.
|
||||||
|
It asks the KMS to perform the signature at request time; the key
|
||||||
|
material never lives in the API process's environment.
|
||||||
|
|
||||||
|
### 2. Webhook receiver
|
||||||
|
|
||||||
|
Public endpoint `POST /webhooks/gumroad`. Verifies Gumroad's
|
||||||
|
signature, maps the payload to a `mint` call, returns 200. Stores
|
||||||
|
the raw payload to a `gumroad_events` table for audit.
|
||||||
|
|
||||||
|
Refunds: webhook looks up the license by `gumroad_order_id` and calls
|
||||||
|
`POST /internal/revoke` with its `license_key`. The desktop app doesn't currently honor
|
||||||
|
revocations (no online check), but the refunded buyer won't be able to
|
||||||
|
renew a revoked license, and the row remains as evidence if a
|
||||||
|
dispute escalates.
|
||||||
|
|
||||||
|
### 3. Renewal portal
|
||||||
|
|
||||||
|
Single-page form, public. Buyer enters email → server emails a
|
||||||
|
signed magic link → click → page shows their license (tier, expiry,
|
||||||
|
"resend blob" button, "renew" button).
|
||||||
|
|
||||||
|
Renew flow: button → `POST /internal/mint` with the same name/email
|
||||||
|
and a fresh expiry → buyer gets the new blob → pastes into desktop
|
||||||
|
app via existing `license_cli.py renew`. No code change in the
|
||||||
|
desktop app.
|
||||||
|
|
||||||
|
### 4. Database
|
||||||
|
|
||||||
|
Postgres (small — a few thousand rows for the foreseeable future).
|
||||||
|
Single source of truth for the customer list.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Schema
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE licenses (
|
||||||
|
license_key text PRIMARY KEY, -- DT1-{TIER}-xxxx-xxxx
|
||||||
|
name text NOT NULL,
|
||||||
|
email text NOT NULL,
|
||||||
|
tier text NOT NULL, -- lite | core | pro | enterprise
|
||||||
|
issued_at timestamptz NOT NULL,
|
||||||
|
expires_at timestamptz NOT NULL,
|
||||||
|
blob text NOT NULL, -- DTLIC2:...
|
||||||
|
gumroad_order_id text UNIQUE, -- null for manual mints
|
||||||
|
revoked_at timestamptz, -- null = active
|
||||||
|
notes text -- free-form support notes
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_licenses_email ON licenses (lower(email));
|
||||||
|
CREATE INDEX idx_licenses_expires ON licenses (expires_at) WHERE revoked_at IS NULL;
|
||||||
|
CREATE INDEX idx_licenses_gumroad ON licenses (gumroad_order_id);
|
||||||
|
|
||||||
|
CREATE TABLE gumroad_events (
|
||||||
|
id bigserial PRIMARY KEY,
|
||||||
|
received_at timestamptz NOT NULL DEFAULT now(),
|
||||||
|
event_type text NOT NULL, -- sale | refund | dispute | ...
|
||||||
|
order_id text,
|
||||||
|
raw_payload jsonb NOT NULL,
|
||||||
|
processed boolean NOT NULL DEFAULT false,
|
||||||
|
error text -- non-null if processing failed
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
The `licenses` schema is the JSONL log fields plus
|
||||||
|
`gumroad_order_id`, `revoked_at`, `notes`. The migration script from
|
||||||
|
JSONL → Postgres is therefore a flat insert.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Security
|
||||||
|
|
||||||
|
- **Private key**: AWS KMS, GCP KMS, or HashiCorp Vault. Mint API
|
||||||
|
has IAM permission to *use* the key (sign operation), not to
|
||||||
|
*export* it. Rotating to a new key still requires a new desktop
|
||||||
|
build (the pubkey is embedded); plan a 90-day overlap window where
|
||||||
|
both keys are accepted.
|
||||||
|
- **Webhook secret**: Gumroad's HMAC signature, verified before
|
||||||
|
touching the body.
|
||||||
|
- **Internal endpoints**: not reachable from the public internet —
|
||||||
|
bind to localhost or a private subnet, fronted by the webhook
|
||||||
|
receiver and the renewal portal.
|
||||||
|
- **PII**: name + email + Gumroad order ID. Standard customer-data
|
||||||
|
hygiene — DB backups encrypted at rest, no PII in application
|
||||||
|
logs, GDPR delete-on-request supported via a `DELETE FROM
|
||||||
|
licenses WHERE lower(email) = lower(?)` (the desktop activation still works
|
||||||
|
until the license expires; the buyer just won't appear in our
|
||||||
|
records anymore).
|
||||||
|
- **Mint API access**: short-lived signed tokens for any creator
|
||||||
|
CLI that talks to it. The CLI is a thin wrapper around the same
|
||||||
|
`POST /internal/mint`; the days of running
|
||||||
|
`scripts/generate_license.py` against the prod private key on a
|
||||||
|
laptop are over once the server exists.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Migration plan
|
||||||
|
|
||||||
|
Three phases, each independently revertable.
|
||||||
|
|
||||||
|
### Phase 0 (done)
|
||||||
|
|
||||||
|
- Ed25519 signing with prod key on creator's laptop.
|
||||||
|
- Local JSONL issuance log at `~/.datatools-creator/issued.jsonl`.
|
||||||
|
|
||||||
|
### Phase 1 — server stands up, no behavior change
|
||||||
|
|
||||||
|
1. Stand up Postgres + Mint API in a small VPS / Fly.io / Render box.
|
||||||
|
2. Provision a KMS-held keypair; **the public key must match the one
|
||||||
|
already embedded in the shipped binary** — i.e., import the
|
||||||
|
existing prod private key into KMS, do not generate a new one. If
|
||||||
|
the existing key is laptop-only and can't be imported, plan a
|
||||||
|
build-with-new-pubkey + buyer-side rotation cycle (see
|
||||||
|
`ADMIN.md` Recovery).
|
||||||
|
3. Run a one-shot script: read `~/.datatools-creator/issued.jsonl`,
|
||||||
|
`INSERT … ON CONFLICT (license_key) DO NOTHING` each row.
|
||||||
|
4. Add a creator-only CLI command `datatools-admin mint` that calls
|
||||||
|
`POST /internal/mint` instead of running the local script. Local
|
||||||
|
script stays as a fallback.
|
||||||
|
|
||||||
|
At this point: nothing buyer-facing has changed. The creator now has
|
||||||
|
two ways to mint (server or local) and a real DB.
|
||||||
|
|
||||||
|
### Phase 2 — automation
|
||||||
|
|
||||||
|
5. Wire the Gumroad webhook. New buyers get automated fulfillment.
|
||||||
|
6. Manual mints (friends, comps, support replacements) still go
|
||||||
|
through `datatools-admin mint`, which writes to the same DB.
|
||||||
|
7. Old local script is deprecated but kept (read-only) as a break-glass
|
||||||
|
tool if the server is down.
|
||||||
|
|
||||||
|
### Phase 3 — self-service
|
||||||
|
|
||||||
|
8. Ship the renewal portal.
|
||||||
|
9. Replace the "email support because I lost my blob" flow with a self-service form.
|
||||||
|
|
||||||
|
Each phase ships independently. The desktop app sees no change
|
||||||
|
across any of them — that's the whole point.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Open questions
|
||||||
|
|
||||||
|
- **Hosting choice.** *Decided: self-hosted* on the existing
|
||||||
|
`46.225.166.142` box alongside the `*.invixiom.com` services.
|
||||||
|
Runbook in `SETUP-LICENSE-SERVER.md`. Operator owns uptime,
|
||||||
|
backups, TLS renewal, and key custody — see that doc's
|
||||||
|
"Operational concerns" section.
|
||||||
|
- **Per-seat or per-device limits?** v1 says no. Revisit if/when
|
||||||
|
abuse is observable.
|
||||||
|
- **Email delivery.** Postmark or SES — both fine. Pick whichever the
|
||||||
|
rest of the stack uses. Avoid Gmail SMTP for transactional mail.
|
||||||
|
- **Audit log retention.** `gumroad_events` rows are unbounded growth
|
||||||
|
but trivially small. Default to forever; partition by year if it
|
||||||
|
ever exceeds a few GB.
|
||||||
|
- **Existing Gumroad customers.** Before any of this lands, every
|
||||||
|
buyer is already in Gumroad's records. A one-shot import from
|
||||||
|
Gumroad's CSV export → `licenses` table would catch anyone whose
|
||||||
|
blob the JSONL log doesn't have (e.g., if the creator's laptop
|
||||||
|
was lost before this design lands).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Code pointers (current state, for the future implementer)
|
||||||
|
|
||||||
|
| File | What it does now | What changes |
|
||||||
|
|------|------------------|--------------|
|
||||||
|
| `scripts/generate_license.py` | Sign locally, append JSONL | Becomes a CLI client of the Mint API |
|
||||||
|
| `src/license/crypto.py` | `sign()` reads `$DATATOOLS_LICENSE_PRIVKEY` | `sign()` calls KMS; the env var stays as a fallback for local dev |
|
||||||
|
| `src/license_cli.py` | Activate / status / renew — already buyer-facing | **No change.** Still verifies offline against embedded pubkey |
|
||||||
|
| `src/license/manager.py` | Verify, persist | **No change.** |
|
||||||
|
|
||||||
|
The desktop app is deliberately decoupled from any of this. The
|
||||||
|
server is a fulfillment + record-keeping layer wrapped around the
|
||||||
|
existing, frozen, offline activation flow.
|
||||||
@@ -30,7 +30,7 @@ Status legend:
|
|||||||
| ✓ | Item | Where it lives |
|
| ✓ | Item | Where it lives |
|
||||||
|---|------|----------------|
|
|---|------|----------------|
|
||||||
| 🟢 | 6 of 9 tools shipped (Dedup, Text, Format, Missing, Column-Map, Pipeline) | `src/core/`, `src/cli_*.py`, `src/gui/pages/` |
|
| 🟢 | 6 of 9 tools shipped (Dedup, Text, Format, Missing, Column-Map, Pipeline) | `src/core/`, `src/cli_*.py`, `src/gui/pages/` |
|
||||||
| 🟢 | Pipeline Runner (the retention multiplier per `PLAN.md` §2.6) | `src/core/pipeline.py`, `src/cli_pipeline.py`, `src/gui/pages/9_Pipeline_Runner.py` |
|
| 🟢 | Automated Workflows (the retention multiplier per `PLAN.md` §2.6) | `src/core/pipeline.py`, `src/cli_pipeline.py`, `src/gui/pages/9_Pipeline_Runner.py` |
|
||||||
| 🟢 | 1,729 passing tests · 0 skipped · 0 xfailed | `tests/` |
|
| 🟢 | 1,729 passing tests · 0 skipped · 0 xfailed | `tests/` |
|
||||||
| 🟢 | 3 niche demo datasets + pre-tuned pipeline JSONs | `samples/demo/` |
|
| 🟢 | 3 niche demo datasets + pre-tuned pipeline JSONs | `samples/demo/` |
|
||||||
| 🟢 | Streamlit demo app + Cloud entry shim | `streamlit_app.py`, `src/gui/app_demo.py` |
|
| 🟢 | Streamlit demo app + Cloud entry shim | `streamlit_app.py`, `src/gui/app_demo.py` |
|
||||||
@@ -269,6 +269,7 @@ moves until $5k/mo MRR:
|
|||||||
| | Why locked |
|
| | Why locked |
|
||||||
|---|---|
|
|---|---|
|
||||||
| ❌ More tools (06–08) | `PLAN.md` §2.1 distribution-gate. Tool 09 was the exception; no others until first paid customer + one external review. |
|
| ❌ More tools (06–08) | `PLAN.md` §2.1 distribution-gate. Tool 09 was the exception; no others until first paid customer + one external review. |
|
||||||
|
| ❌ Tool #10 PDF → CSV (the most-asked-for adjacency) | Parked in `docs/FUTURE-TOOLS.md` with full design + 3–4 wk MVP / 6–10 wk polished estimate. Ship trigger: paying customer + ≥3 paid or ≥5 demo emails asking for PDF + the bookkeeper niche converting first. None have fired yet. |
|
||||||
| ❌ SaaS pivot | `DECISIONS.md` §4 — recurring infra conflicts with the lifestyle constraint. |
|
| ❌ SaaS pivot | `DECISIONS.md` §4 — recurring infra conflicts with the lifestyle constraint. |
|
||||||
| ❌ Live chat / sales calls | `DECISIONS.md` §1 #8 — no-touch is locked until $5k/mo. |
|
| ❌ Live chat / sales calls | `DECISIONS.md` §1 #8 — no-touch is locked until $5k/mo. |
|
||||||
| ❌ Custom integrations / one-off consulting | Breaks "build once, sell many." |
|
| ❌ Custom integrations / one-off consulting | Breaks "build once, sell many." |
|
||||||
|
|||||||
32
docs/PLAN.md
32
docs/PLAN.md
@@ -29,8 +29,8 @@ win.
|
|||||||
|
|
||||||
| Asset | State |
|
| Asset | State |
|
||||||
|---|---|
|
|---|---|
|
||||||
| Tools 1–5 (Dedup, Text Clean, Format Standardize, Missing, Column Mapper) | Ready · 1,691 tests passing · 0 xfailed |
|
| Tools 1–5 (Find Duplicates, Clean Text, Standardize Formats, Fix Missing Values, Map Columns) | Ready · 1,691 tests passing · 0 xfailed |
|
||||||
| Tools 6–9 (Outlier, Multi-File Merge, Validator, Pipeline) | Coming Soon |
|
| Tools 6–9 (Find Unusual Values, Combine Files, Quality Check, Automated Workflows) | Coming Soon |
|
||||||
| PyInstaller installer pipeline | Not started |
|
| PyInstaller installer pipeline | Not started |
|
||||||
| macOS code signing (Apple Dev Program) | Not started |
|
| macOS code signing (Apple Dev Program) | Not started |
|
||||||
| Hosted browser demo (Streamlit Cloud) | Not deployed |
|
| Hosted browser demo (Streamlit Cloud) | Not deployed |
|
||||||
@@ -52,12 +52,20 @@ Tools 6–8 are blocked behind a **distribution gate**: no work on them
|
|||||||
until the existing 5 tools have a paying customer + one external review
|
until the existing 5 tools have a paying customer + one external review
|
||||||
(BUSINESS.md §4 sequence rule, applied recursively inside the bundle).
|
(BUSINESS.md §4 sequence rule, applied recursively inside the bundle).
|
||||||
|
|
||||||
**Exception granted 2026-05-01**: Tool 09 Pipeline Runner is built
|
**Exception granted 2026-05-01**: Tool 09 Automated Workflows is built
|
||||||
*now*. Rationale: the pipeline transforms the bundle from "5 tools you
|
*now*. Rationale: the pipeline transforms the bundle from "5 tools you
|
||||||
buy" into "an automatable workflow you depend on." That conversion is
|
buy" into "an automatable workflow you depend on." That conversion is
|
||||||
what produces retention and word-of-mouth — the only marketing channel
|
what produces retention and word-of-mouth — the only marketing channel
|
||||||
that scales under the no-network/no-touch constraint.
|
that scales under the no-network/no-touch constraint.
|
||||||
|
|
||||||
|
**Parked behind the freeze**: post-launch tool ideas are captured in
|
||||||
|
`docs/FUTURE-TOOLS.md` with feasibility, GUI sketch, effort estimate,
|
||||||
|
and ship criteria for each. Currently parked: **#10 PDF → CSV
|
||||||
|
extractor** (bank statements et al.) — gated on a paying customer +
|
||||||
|
≥3 paying customers or ≥5 demo emails explicitly asking for PDF
|
||||||
|
extraction, with the bookkeeper niche converting at least one customer
|
||||||
|
first. None of those triggers have fired yet.
|
||||||
|
|
||||||
### 2.2 The demo *is* the product. Make it embarrassingly good.
|
### 2.2 The demo *is* the product. Make it embarrassingly good.
|
||||||
|
|
||||||
- Three persona-tagged sample datasets, not one generic CSV: Shopify
|
- Three persona-tagged sample datasets, not one generic CSV: Shopify
|
||||||
@@ -104,10 +112,10 @@ demo dataset.
|
|||||||
| # | Pain | $ / time impact | Tools that fix it |
|
| # | Pain | $ / time impact | Tools that fix it |
|
||||||
|---|------|-----------------|---|
|
|---|------|-----------------|---|
|
||||||
| S1 | **Klaviyo / Mailchimp / Omnisend per-contact billing.** Subscriber list with 10–18 % duplicate rate (case drift, plus signs in Gmail addresses, multiple devices) → recurring overpay forever. | $30–300/mo per percent of dupes on a 50 k list — recurring | Dedup + Format Standardize (email canonicalization) + Pipeline (re-run weekly) |
|
| S1 | **Klaviyo / Mailchimp / Omnisend per-contact billing.** Subscriber list with 10–18 % duplicate rate (case drift, plus signs in Gmail addresses, multiple devices) → recurring overpay forever. | $30–300/mo per percent of dupes on a 50 k list — recurring | Dedup + Format Standardize (email canonicalization) + Pipeline (re-run weekly) |
|
||||||
| S2 | **Product feed rejected by Google Merchant Center / Meta Catalog.** Smart quotes in titles, NBSP in SKU, inconsistent attributes; campaign launch delayed 24–72 h while feed gets fixed. | 1–3 days delayed launch × campaign value | Text Cleaner + Format Standardize |
|
| S2 | **Product feed rejected by Google Merchant Center / Meta Catalog.** Smart quotes in titles, NBSP in SKU, inconsistent attributes; campaign launch delayed 24–72 h while feed gets fixed. | 1–3 days delayed launch × campaign value | Clean Text + Standardize Formats |
|
||||||
| S3 | **Multi-channel order consolidation.** Shopify + Etsy + Amazon + Faire + wholesale spreadsheet, each with a different column for "customer email" / "order total" / "ship country". | 4–8 hr / month manually merging | Column Mapper + Dedup + Pipeline |
|
| S3 | **Multi-channel order consolidation.** Shopify + Etsy + Amazon + Faire + wholesale spreadsheet, each with a different column for "customer email" / "order total" / "ship country". | 4–8 hr / month manually merging | Map Columns + Find Duplicates + Automated Workflows |
|
||||||
| S4 | **Subscription identity fragmentation.** Pet-box subscribers cancel and re-sub under a different email; cohort analysis says churn is 20 % when it's actually 12 % — pricing decisions wrong. | Mis-priced LTV → over- or under-paid acquisition | Dedup with `merge=true` survivor |
|
| S4 | **Subscription identity fragmentation.** Pet-box subscribers cancel and re-sub under a different email; cohort analysis says churn is 20 % when it's actually 12 % — pricing decisions wrong. | Mis-priced LTV → over- or under-paid acquisition | Dedup with `merge=true` survivor |
|
||||||
| S5 | **International tax / VAT MOSS compliance.** Country column is `UK` / `U.K.` / `United Kingdom` / `GB` in the same export; VAT report breaks. Phone formats per region break call-center routing. | Compliance penalty risk + ops friction | Format Standardize (per-row country) + Column Mapper |
|
| S5 | **International tax / VAT MOSS compliance.** Country column is `UK` / `U.K.` / `United Kingdom` / `GB` in the same export; VAT report breaks. Phone formats per region break call-center routing. | Compliance penalty risk + ops friction | Standardize Formats (per-row country) + Map Columns |
|
||||||
|
|
||||||
#### Bookkeeper / freelance accountant
|
#### Bookkeeper / freelance accountant
|
||||||
|
|
||||||
@@ -126,7 +134,7 @@ demo dataset.
|
|||||||
| R1 | **HubSpot / Marketo / Iterable per-contact tier pricing.** 10 k contacts → enterprise tier at $4–8 k/mo. Every duplicate is a recurring tax. | $200–800 / month per 1 k duplicate contacts — recurring | Dedup with cross-source merge + Pipeline |
|
| R1 | **HubSpot / Marketo / Iterable per-contact tier pricing.** 10 k contacts → enterprise tier at $4–8 k/mo. Every duplicate is a recurring tax. | $200–800 / month per 1 k duplicate contacts — recurring | Dedup with cross-source merge + Pipeline |
|
||||||
| R2 | **Email-deliverability / sender reputation.** Sending to invalid or duplicate addresses tanks reputation; recovery takes weeks. | Catastrophic — entire email programme degraded | Format Standardize (email canonicalization) + Missing (sentinel detection) |
|
| R2 | **Email-deliverability / sender reputation.** Sending to invalid or duplicate addresses tanks reputation; recovery takes weeks. | Catastrophic — entire email programme degraded | Format Standardize (email canonicalization) + Missing (sentinel detection) |
|
||||||
| R3 | **GDPR / contact-data privacy.** Uploading lead data to a third-party cleaning SaaS is itself a GDPR concern; legal review blocks adoption. | Compliance risk + 4–8 wk legal-review delay | Local-only desktop app, zero outbound calls |
|
| R3 | **GDPR / contact-data privacy.** Uploading lead data to a third-party cleaning SaaS is itself a GDPR concern; legal review blocks adoption. | Compliance risk + 4–8 wk legal-review delay | Local-only desktop app, zero outbound calls |
|
||||||
| R4 | **Multi-vendor lead-source unification.** Apollo, ZoomInfo, LinkedIn Sales Nav, manual scrapes — each export has different headers, scoring, country format. | 1–3 days per campaign of manual unification | Column Mapper (alias matching) + Format Standardize (per-row country) + Dedup |
|
| R4 | **Multi-vendor lead-source unification.** Apollo, ZoomInfo, LinkedIn Sales Nav, manual scrapes — each export has different headers, scoring, country format. | 1–3 days per campaign of manual unification | Map Columns (alias matching) + Standardize Formats (per-row country) + Find Duplicates |
|
||||||
| R5 | **Suppression-list management across 5+ platforms.** Each platform has its own format; un-deduped suppression lists let opt-outs slip through, triggering CAN-SPAM / GDPR exposure. | Compliance risk + churn-back cost | Pipeline saved as JSON, re-run on each new suppression batch |
|
| R5 | **Suppression-list management across 5+ platforms.** Each platform has its own format; un-deduped suppression lists let opt-outs slip through, triggering CAN-SPAM / GDPR exposure. | Compliance risk + churn-back cost | Pipeline saved as JSON, re-run on each new suppression batch |
|
||||||
|
|
||||||
### 2.4 Operationalize the moat the docs already name.
|
### 2.4 Operationalize the moat the docs already name.
|
||||||
@@ -154,7 +162,7 @@ right after "runs locally."
|
|||||||
Copy seed: *"Every change auditable. Hand the audit CSV to your client
|
Copy seed: *"Every change auditable. Hand the audit CSV to your client
|
||||||
with the cleaned file."*
|
with the cleaned file."*
|
||||||
|
|
||||||
### 2.6 The Pipeline Runner is the retention multiplier.
|
### 2.6 Automated Workflows is the retention multiplier.
|
||||||
|
|
||||||
A buyer with a saved pipeline isn't a one-off purchase — they're a
|
A buyer with a saved pipeline isn't a one-off purchase — they're a
|
||||||
recurring user who recommends the product. This is exactly the
|
recurring user who recommends the product. This is exactly the
|
||||||
@@ -172,8 +180,8 @@ trigger DECISIONS.md §8 already names).
|
|||||||
### 2.8 Dependency-aware pipeline UX.
|
### 2.8 Dependency-aware pipeline UX.
|
||||||
|
|
||||||
Tools have soft execution-order preferences (Text Clean before Format
|
Tools have soft execution-order preferences (Text Clean before Format
|
||||||
Standardize, Format before Dedup, Missing before Dedup). The Pipeline
|
Standardize, Format before Dedup, Missing before Dedup). Automated
|
||||||
Runner *recommends* the order, *warns* on reversals, and **never
|
Workflows *recommends* the order, *warns* on reversals, and **never
|
||||||
forces** — the user owns their workflow. Implementation: see
|
forces** — the user owns their workflow. Implementation: see
|
||||||
`src/core/pipeline.py` `SOFT_DEPENDENCIES`.
|
`src/core/pipeline.py` `SOFT_DEPENDENCIES`.
|
||||||
|
|
||||||
@@ -184,7 +192,7 @@ forces** — the user owns their workflow. Implementation: see
|
|||||||
| 1 | PyInstaller pipeline · Mac/Win unsigned installers · Apple Dev Program enrollment (1–2 wk lead) | `dist/datatools-mac.dmg` and `dist/datatools-win.exe` install on a clean machine |
|
| 1 | PyInstaller pipeline · Mac/Win unsigned installers · Apple Dev Program enrollment (1–2 wk lead) | `dist/datatools-mac.dmg` and `dist/datatools-win.exe` install on a clean machine |
|
||||||
| 2 | Demo deployed to Streamlit Cloud · landing page v1 with embedded demo · 3 persona datasets in the demo | Public URL serves a working pipeline run on a sample dataset in < 30 s |
|
| 2 | Demo deployed to Streamlit Cloud · landing page v1 with embedded demo · 3 persona datasets in the demo | Public URL serves a working pipeline run on a sample dataset in < 30 s |
|
||||||
| 3 | Gumroad listing live · share value-first in 3 niche communities (no pitch) · 1 long-tail SEO post for the lead persona | First listing impression captured · post not removed for self-promotion |
|
| 3 | Gumroad listing live · share value-first in 3 niche communities (no pitch) · 1 long-tail SEO post for the lead persona | First listing impression captured · post not removed for self-promotion |
|
||||||
| 4 | Pipeline Runner v1.0 shipped (this week, 2026-05-01 — exception per §2.1) · v1.1 patch announced with Tool 09 + intl improvements | Pipeline saves/loads JSON · 3 demo pipelines preloaded |
|
| 4 | Automated Workflows v1.0 shipped (this week, 2026-05-01 — exception per §2.1) · v1.1 patch announced with Tool 09 + intl improvements | Pipeline saves/loads JSON · 3 demo pipelines preloaded |
|
||||||
| 5–8 | Bookkeeper landing page · agency landing page · second tool's promo cycle · priority-support tier added (defer purchase until §2.7 trigger) | Three live landing pages with distinct H1, demo dataset, conversion target |
|
| 5–8 | Bookkeeper landing page · agency landing page · second tool's promo cycle · priority-support tier added (defer purchase until §2.7 trigger) | Three live landing pages with distinct H1, demo dataset, conversion target |
|
||||||
| 9–13 | Tool 06–08 only **if** revenue trajectory supports continued investment · otherwise more market work on the existing 5 + 09 | Decision made on 13 Aug 2026 with revenue data, not feature ambition |
|
| 9–13 | Tool 06–08 only **if** revenue trajectory supports continued investment · otherwise more market work on the existing 5 + 09 | Decision made on 13 Aug 2026 with revenue data, not feature ambition |
|
||||||
|
|
||||||
@@ -202,7 +210,7 @@ These flip the plan, not the underlying criteria:
|
|||||||
|
|
||||||
## 5. Anti-temptations (things the plan refuses)
|
## 5. Anti-temptations (things the plan refuses)
|
||||||
|
|
||||||
- **More tools before more buyers.** Locked. Exception only for Pipeline Runner per §2.1.
|
- **More tools before more buyers.** Locked. Exception only for Automated Workflows per §2.1.
|
||||||
- **SaaS pivot.** Recurring infra conflicts with the lifestyle constraint (DECISIONS.md §4).
|
- **SaaS pivot.** Recurring infra conflicts with the lifestyle constraint (DECISIONS.md §4).
|
||||||
- **Live chat / sales calls.** Conflicts with no-touch (DECISIONS.md §1 #8).
|
- **Live chat / sales calls.** Conflicts with no-touch (DECISIONS.md §1 #8).
|
||||||
- **Custom integrations / one-off consulting.** $300/hr looks tempting; breaks the "build once, sell many" model that justifies the entire strategy.
|
- **Custom integrations / one-off consulting.** $300/hr looks tempting; breaks the "build once, sell many" model that justifies the entire strategy.
|
||||||
|
|||||||
@@ -144,7 +144,7 @@ Reading PLAN.md §3 + this doc together, the rough script:
|
|||||||
| **M1** (June) | Installers · demo · 3 landing pages · Gumroad live | Whether the funnel mechanically works. Numbers will be noisy; just look for one purchase. |
|
| **M1** (June) | Installers · demo · 3 landing pages · Gumroad live | Whether the funnel mechanically works. Numbers will be noisy; just look for one purchase. |
|
||||||
| **M2** (July) | M1 + community posts in 3 niches + 1 SEO post | Which persona converts. Re-allocate effort to the highest-converting niche. |
|
| **M2** (July) | M1 + community posts in 3 niches + 1 SEO post | Which persona converts. Re-allocate effort to the highest-converting niche. |
|
||||||
| **M3** (August) | M2 + landing-page changes from M2 review | Whether intent-rate moved on the change. Decide tools 06–08 go/no-go. |
|
| **M3** (August) | M2 + landing-page changes from M2 review | Whether intent-rate moved on the change. Decide tools 06–08 go/no-go. |
|
||||||
| **M4** (September) | M3 + first repeat-buyer signals | Whether the Pipeline Runner is producing retention as designed. |
|
| **M4** (September) | M3 + first repeat-buyer signals | Whether Automated Workflows is producing retention as designed. |
|
||||||
|
|
||||||
By end of M4, the data tells you whether the plan is producing
|
By end of M4, the data tells you whether the plan is producing
|
||||||
$1k–3k/mo (BUSINESS.md §6 6-month target) — extrapolated from the
|
$1k–3k/mo (BUSINESS.md §6 6-month target) — extrapolated from the
|
||||||
|
|||||||
@@ -6,11 +6,13 @@
|
|||||||
|
|
||||||
## Inicio rápido
|
## Inicio rápido
|
||||||
|
|
||||||
1. Descarga el instalador para tu sistema operativo desde tu correo de compra.
|
1. Descarga desde tu correo de compra. Dos formatos por sistema operativo — elige uno:
|
||||||
2. Ejecútalo (no se requieren conocimientos de Python).
|
- **Instalador** (`.dmg` en macOS, `.exe` en Windows) — crea acceso directo en el escritorio + entrada en el menú Inicio / Launchpad.
|
||||||
3. Lánzalo desde el acceso directo del escritorio → tu navegador predeterminado se abrirá en una página local.
|
- **.zip portable** — descomprime y haz doble clic. Sin instalación, sin permisos de administrador, se ejecuta desde cualquier lugar.
|
||||||
|
2. Ábrelo (no necesitas Python; todo viene incluido).
|
||||||
|
3. La app arranca un servidor local y abre tu navegador. Nada sale de tu equipo.
|
||||||
|
|
||||||
Instrucciones completas: [USER-GUIDE.es.md](USER-GUIDE.es.md).
|
Paso a paso completo incluyendo SmartScreen / Gatekeeper: [USER-GUIDE.es.md §1](USER-GUIDE.es.md#1-instalaci%C3%B3n).
|
||||||
|
|
||||||
## Documentación
|
## Documentación
|
||||||
|
|
||||||
|
|||||||
@@ -6,11 +6,13 @@
|
|||||||
|
|
||||||
## Quick Start
|
## Quick Start
|
||||||
|
|
||||||
1. Download the installer for your OS from your purchase email.
|
1. Download from your purchase email. Two flavors per OS — pick one:
|
||||||
2. Run it (no Python knowledge required).
|
- **Installer** (`.dmg` on macOS, `.exe` on Windows) — wires up Desktop + Start Menu / Launchpad shortcuts.
|
||||||
3. Launch via the desktop shortcut → your default browser opens to a local page.
|
- **Portable .zip** — unzip and double-click. No install, no admin rights, runs from anywhere.
|
||||||
|
2. Open it (no Python needed; everything is bundled inside).
|
||||||
|
3. The app starts a local server and opens your browser. Nothing leaves your machine.
|
||||||
|
|
||||||
Full instructions: [USER-GUIDE.md](USER-GUIDE.md).
|
Full step-by-step including SmartScreen / Gatekeeper workarounds: [USER-GUIDE.md §1](USER-GUIDE.md#1-install).
|
||||||
|
|
||||||
## Docs
|
## Docs
|
||||||
|
|
||||||
|
|||||||
@@ -21,8 +21,8 @@ project-root/
|
|||||||
│ └── CLI-REFERENCE.md
|
│ └── CLI-REFERENCE.md
|
||||||
├── src/
|
├── src/
|
||||||
│ ├── core/ # shared logic — both CLI + GUI call into this
|
│ ├── core/ # shared logic — both CLI + GUI call into this
|
||||||
│ ├── cli.py # Deduplicator CLI
|
│ ├── cli.py # Find Duplicates CLI
|
||||||
│ ├── cli_text_clean.py # Text Cleaner CLI
|
│ ├── cli_text_clean.py # Clean Text CLI
|
||||||
│ ├── cli_analyze.py # Analyzer CLI
|
│ ├── cli_analyze.py # Analyzer CLI
|
||||||
│ └── gui/
|
│ └── gui/
|
||||||
│ ├── app.py # Streamlit entry
|
│ ├── app.py # Streamlit entry
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ Sample size: 1,000 rows (configurable).
|
|||||||
- Full-DataFrame `auto_fix`: ~5 min (~30 µs/cell).
|
- Full-DataFrame `auto_fix`: ~5 min (~30 µs/cell).
|
||||||
- Output write: ~10 s.
|
- Output write: ~10 s.
|
||||||
- Recommended RAM: 3–4× input size for the full-Apply path.
|
- Recommended RAM: 3–4× input size for the full-Apply path.
|
||||||
- **Format standardizer** (`standardize_dataframe`): ~2.7M rows/sec on
|
- **Standardize Formats** (`standardize_dataframe`): ~2.7M rows/sec on
|
||||||
cache-warm repetition-heavy columns (synthetic 1M-row in-memory
|
cache-warm repetition-heavy columns (synthetic 1M-row in-memory
|
||||||
benchmark, 2 typed columns); the fused single-pass loop replaced a
|
benchmark, 2 typed columns); the fused single-pass loop replaced a
|
||||||
3-pass ``.tolist()`` cycle, so per-call overhead is now dominated by
|
3-pass ``.tolist()`` cycle, so per-call overhead is now dominated by
|
||||||
@@ -87,20 +87,20 @@ Sample size: 1,000 rows (configurable).
|
|||||||
thread-pool scaffolding; on CPython 3.12 with the GIL it's
|
thread-pool scaffolding; on CPython 3.12 with the GIL it's
|
||||||
roughly neutral, but the API is ready for the free-threaded
|
roughly neutral, but the API is ready for the free-threaded
|
||||||
(PEP 703) Python 3.13+ build where it will help.
|
(PEP 703) Python 3.13+ build where it will help.
|
||||||
- **Text cleaner** (`clean_dataframe`): ~1M rows/sec on
|
- **Clean Text** (`clean_dataframe`): ~1M rows/sec on
|
||||||
repetition-heavy columns (per-call string cache: the pipeline runs
|
repetition-heavy columns (per-call string cache: the pipeline runs
|
||||||
once per *unique* cell value, not once per row).
|
once per *unique* cell value, not once per row).
|
||||||
- **Missing handler** (`handle_missing`): lazy-copy — when sentinel
|
- **Fix Missing Values** (`handle_missing`): lazy-copy — when sentinel
|
||||||
standardization runs but finds nothing, AND no drops AND no fills
|
standardization runs but finds nothing, AND no drops AND no fills
|
||||||
apply, the input frame is returned as-is. On a clean 1 GB file this
|
apply, the input frame is returned as-is. On a clean 1 GB file this
|
||||||
saves the 1 GB allocation that the unconditional upfront copy used
|
saves the 1 GB allocation that the unconditional upfront copy used
|
||||||
to take.
|
to take.
|
||||||
- **Column mapper** (`map_columns`): rename + drop both already
|
- **Map Columns** (`map_columns`): rename + drop both already
|
||||||
return fresh frames; the explicit upfront `df.copy()` is now
|
return fresh frames; the explicit upfront `df.copy()` is now
|
||||||
removed and downstream mutating steps (schema-add, coerce) copy on
|
removed and downstream mutating steps (schema-add, coerce) copy on
|
||||||
demand via `_ensure_owned()`. Rename-only and identity-mapping
|
demand via `_ensure_owned()`. Rename-only and identity-mapping
|
||||||
paths run with zero explicit copies.
|
paths run with zero explicit copies.
|
||||||
- **Deduplicator**:
|
- **Find Duplicates**:
|
||||||
- **Exact-only strategies** (every column uses `Algorithm.EXACT` at
|
- **Exact-only strategies** (every column uses `Algorithm.EXACT` at
|
||||||
threshold 100 — covers strong-key dedup like email/phone, the
|
threshold 100 — covers strong-key dedup like email/phone, the
|
||||||
fallback drop-duplicates path, and explicit "match on this exact
|
fallback drop-duplicates path, and explicit "match on this exact
|
||||||
@@ -117,19 +117,30 @@ Sample size: 1,000 rows (configurable).
|
|||||||
(the common dedup workload) skip re-parsing.
|
(the common dedup workload) skip re-parsing.
|
||||||
|
|
||||||
## 11. Tools
|
## 11. Tools
|
||||||
1. Deduplicator — Ready
|
1. Find Duplicates — Ready
|
||||||
2. Text Cleaner — Ready
|
2. Clean Text — Ready
|
||||||
3. Format Standardizer — Ready
|
3. Standardize Formats — Ready
|
||||||
4. Missing Value Handler — Ready
|
4. Fix Missing Values — Ready
|
||||||
5. Column Mapper — Ready
|
5. Map Columns — Ready
|
||||||
6. Outlier Detector — Coming Soon
|
6. Find Unusual Values — Coming Soon
|
||||||
7. Multi-File Merger — Coming Soon
|
7. Combine Files — Coming Soon
|
||||||
8. Validator & Reporter — Coming Soon
|
8. Quality Check — Coming Soon
|
||||||
9. Pipeline Runner — Ready
|
9. Automated Workflows — Ready
|
||||||
|
|
||||||
|
**Future / not in v1.** Tool ideas captured for after-launch consideration
|
||||||
|
live in `docs/FUTURE-TOOLS.md` — entries there are gated by the new-tool
|
||||||
|
freeze in `PLAN.md` §2.1 and don't ship without a paying-customer +
|
||||||
|
repeated-demand signal. Currently parked there:
|
||||||
|
|
||||||
|
- **#10. PDF → CSV extractor** (bank statements + similar). No PDF
|
||||||
|
dependency exists in the repo today; this tool would need pdfplumber,
|
||||||
|
streamlit-drawable-canvas, and a templates store. Estimated 3–4 weeks
|
||||||
|
for a text-only MVP, 6–10 weeks for the polished version with
|
||||||
|
multi-page template recall.
|
||||||
|
|
||||||
### 11.a Recommended pipeline order (soft, not enforced)
|
### 11.a Recommended pipeline order (soft, not enforced)
|
||||||
|
|
||||||
The Pipeline Runner ships with a `SOFT_DEPENDENCIES` table; the
|
Automated Workflows ships with a `SOFT_DEPENDENCIES` table; the
|
||||||
following ordering is the default and the basis of the warning
|
following ordering is the default and the basis of the warning
|
||||||
surface. Re-ordering is allowed; the runner emits a warning string
|
surface. Re-ordering is allowed; the runner emits a warning string
|
||||||
and proceeds.
|
and proceeds.
|
||||||
@@ -174,7 +185,16 @@ and proceeds.
|
|||||||
- **Dev**: pytest, tox.
|
- **Dev**: pytest, tox.
|
||||||
|
|
||||||
## 16. Test coverage
|
## 16. Test coverage
|
||||||
- 1,777 tests passing, 0 skipped, 0 xfailed (incl. 15 perf-shape regression tests).
|
- 2,033 tests passing, 0 skipped, 0 xfailed.
|
||||||
|
- 1,868 core + CLI tests (run with `pytest -m 'not gui'` for a quick loop).
|
||||||
|
Includes 49 license-layer unit tests (Ed25519 sign/verify, dev-key
|
||||||
|
derivation, production-safe tripwire, schema), 25 license-CLI
|
||||||
|
tests, and 17 Lite-tier feature-map + guard tests.
|
||||||
|
- 165 GUI tests under `tests/gui/` driving Streamlit pages via `AppTest`
|
||||||
|
(smoke + EN/ES localization, chrome, gate, workflows, dedup review,
|
||||||
|
advanced panels, error paths, findings panel, activation +
|
||||||
|
license gate, Lite-tier per-page lock behaviour). Marked `gui`.
|
||||||
|
- Includes 15 perf-shape regression tests.
|
||||||
- Fixture corpora: text-cleaner (21), encodings (31), reference UTF-8 (9), format-cleaner (199 buyer cases + 20-row international stress fixture), missing-handler (3 use cases + 16 edge cases), column-mapper (3 use cases + 5 edge cases).
|
- Fixture corpora: text-cleaner (21), encodings (31), reference UTF-8 (9), format-cleaner (199 buyer cases + 20-row international stress fixture), missing-handler (3 use cases + 16 edge cases), column-mapper (3 use cases + 5 edge cases).
|
||||||
- Run: `python run_tests.py [--tool …] [--fixtures] [--coverage]`.
|
- Run: `python run_tests.py [--tool …] [--fixtures] [--coverage]`.
|
||||||
|
|
||||||
@@ -184,6 +204,58 @@ and proceeds.
|
|||||||
- Original input never modified.
|
- Original input never modified.
|
||||||
- Audit logs: `logs/` next to each run (timestamped).
|
- Audit logs: `logs/` next to each run (timestamped).
|
||||||
|
|
||||||
|
## 17a. Licensing
|
||||||
|
- **Storage**: ``~/.datatools/license.json`` (or
|
||||||
|
``$DATATOOLS_LICENSE_PATH`` override). Signed with Ed25519
|
||||||
|
(asymmetric).
|
||||||
|
- **Crypto**: Ed25519. The seller holds the private key; every
|
||||||
|
shipped binary embeds only the public key. A motivated reverse
|
||||||
|
engineer who pulls everything out of the binary still can't sign
|
||||||
|
new licenses. Keys are 32 bytes raw, exposed as hex via
|
||||||
|
``DATATOOLS_LICENSE_PRIVKEY`` (seller-side) and
|
||||||
|
``DATATOOLS_LICENSE_PUBKEY`` (build-time bake-in).
|
||||||
|
- **Activation**: buyer pastes a base64-encoded license blob
|
||||||
|
(``DTLIC1:...``) on first launch; app verifies the signature
|
||||||
|
offline + matches the buyer-entered name/email to the embedded
|
||||||
|
values.
|
||||||
|
- **No free trial**: every license requires a paid blob from the
|
||||||
|
seller. The user-facing trial flow (button + ``license_cli trial``
|
||||||
|
subcommand) was removed in v1.6 to keep paid-tier economics clean.
|
||||||
|
- **Lifetime**: every license is 1 year by default. Renewal applies a
|
||||||
|
fresh blob without losing the embedded buyer identity. Tier may
|
||||||
|
change during renewal (Lite → Core upgrade path).
|
||||||
|
- **Tiers**:
|
||||||
|
- ``lite`` — Find Duplicates + Clean Text + Standardize Formats.
|
||||||
|
Buyer pays once, gets the three universally-useful tools.
|
||||||
|
- ``core`` — every Ready tool (all 9 in v1.6).
|
||||||
|
- ``pro``, ``enterprise`` — scaffolded for future SKUs; currently
|
||||||
|
mirror Core. Add per-SKU restrictions by editing
|
||||||
|
``FEATURES_BY_TIER`` in ``src/license/features.py``.
|
||||||
|
- ``trial`` — kept in the enum for backwards compat with any
|
||||||
|
field-tested trial licenses but no longer issuable.
|
||||||
|
- **Feature flags**: every tool has a stable feature id matching its
|
||||||
|
``tool_id`` in the ``src.gui.tools_registry`` module. Adding a future per-
|
||||||
|
tool SKU is a one-line change to ``FEATURES_BY_TIER`` — no consumer
|
||||||
|
code edits.
|
||||||
|
- **Per-tool gating**: each tool page (GUI) and tool CLI calls
|
||||||
|
``require_feature(FeatureFlag.<TOOL>)`` at entry. GUI shows an
|
||||||
|
upgrade prompt + button to the Activate page; CLI prints a
|
||||||
|
message naming the locked feature and exits with code 2.
|
||||||
|
- **Lock badge**: the home grid shows a red 🔒 Locked pill on tool
|
||||||
|
cards the current tier doesn't unlock.
|
||||||
|
- **Dev bypass**: ``DATATOOLS_DEV_MODE=1`` skips every check (used by
|
||||||
|
the test suite and during development). **Refused in shipped
|
||||||
|
builds** by the production-safe tripwire.
|
||||||
|
- **Production-safe tripwire**: ``assert_production_safe()`` runs at
|
||||||
|
startup in every frozen build. Refuses to boot when ``DEV_MODE``
|
||||||
|
is set or the verification key is still the embedded dev key
|
||||||
|
(i.e., the build pipeline forgot to override
|
||||||
|
``DATATOOLS_LICENSE_PUBKEY``). No-op in source / pytest runs.
|
||||||
|
- **No internet**: signature verification is fully offline. The
|
||||||
|
shipped binary embeds only the public key; the private key never
|
||||||
|
leaves the seller. See ``docs/DECISIONS.md`` for the threat-model
|
||||||
|
discussion.
|
||||||
|
|
||||||
## 18. Error handling
|
## 18. Error handling
|
||||||
- Structured hierarchy: `DataToolsError` → `InputValidationError`, `ConfigError`, `FileFormatError`, `FileAccessError`.
|
- Structured hierarchy: `DataToolsError` → `InputValidationError`, `ConfigError`, `FileFormatError`, `FileAccessError`.
|
||||||
- Subclasses extend stdlib `ValueError` / `OSError` so existing handlers still catch them.
|
- Subclasses extend stdlib `ValueError` / `OSError` so existing handlers still catch them.
|
||||||
|
|||||||
593
docs/SETUP-LICENSE-SERVER.md
Normal file
593
docs/SETUP-LICENSE-SERVER.md
Normal file
@@ -0,0 +1,593 @@
|
|||||||
|
# SETUP — Self-hosted license server runbook
|
||||||
|
|
||||||
|
End-to-end build instructions for `licenses.datatools.unalogix.com` on
|
||||||
|
the existing invixiom box (Ubuntu 24.04, public IP `46.225.166.142`).
|
||||||
|
|
||||||
|
Audience: creator/operator. Read top to bottom on first install; use as
|
||||||
|
a reference thereafter.
|
||||||
|
|
||||||
|
Companions:
|
||||||
|
- `LICENSE-SERVER.md` — the architecture / design rationale
|
||||||
|
- `ADMIN.md` — day-2 ops (minting comps, looking at the issuance log)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 0. Multi-tenancy: where this lands among existing services
|
||||||
|
|
||||||
|
This box already hosts the `*.invixiom.com` family (kasm, files, lifeos,
|
||||||
|
code, gitea) via one shared nginx + one shared Let's Encrypt cert.
|
||||||
|
DataTools is intentionally separated from that stack at every layer:
|
||||||
|
|
||||||
|
| Layer | Existing | New |
|
||||||
|
|---|---|---|
|
||||||
|
| **DNS zone** | `invixiom.com` | `unalogix.com` (different domain) |
|
||||||
|
| **nginx file** | `/etc/nginx/sites-available/invixiom` | `/etc/nginx/sites-available/unalogix` |
|
||||||
|
| **nginx symlink** | `sites-enabled/invixiom` | `sites-enabled/unalogix` |
|
||||||
|
| **TLS cert** | `letsencrypt/live/kasm.invixiom.com[-0001]` | `letsencrypt/live/datatools.unalogix.com` |
|
||||||
|
| **Backend port** | 8000, 8002, 8003, 8080, 8081, 8443 | **8090** (mint API), **5433** (Postgres, localhost-only) |
|
||||||
|
| **Docker compose project** | per-service (kasm, lifeos, gitea) | `datatools-license` |
|
||||||
|
| **Docker volume** | per service | `datatools_pg_data` |
|
||||||
|
| **Filesystem root** | various | `/srv/datatools-license/` |
|
||||||
|
| **System user** | various | `datatools-api` (UID auto-assigned, no shell) |
|
||||||
|
|
||||||
|
Nothing in the invixiom stack is read, modified, or referenced by the
|
||||||
|
datatools stack. Restart, upgrade, or remove either without affecting
|
||||||
|
the other.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Pre-flight checklist (off-box, before any commands run)
|
||||||
|
|
||||||
|
These have to be done by the operator outside this box. The build
|
||||||
|
won't proceed without them.
|
||||||
|
|
||||||
|
### 1a. DNS records
|
||||||
|
|
||||||
|
In your `unalogix.com` registrar / DNS panel, add:
|
||||||
|
|
||||||
|
```
|
||||||
|
A datatools.unalogix.com 46.225.166.142
|
||||||
|
A licenses.datatools.unalogix.com 46.225.166.142
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify before continuing:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
dig +short datatools.unalogix.com
|
||||||
|
dig +short licenses.datatools.unalogix.com
|
||||||
|
# Both should print: 46.225.166.142
|
||||||
|
```
|
||||||
|
|
||||||
|
DNS propagation can take 1–60 minutes. Let's Encrypt won't issue
|
||||||
|
certs until DNS resolves correctly.
|
||||||
|
|
||||||
|
### 1b. Postmark account (transactional email)
|
||||||
|
|
||||||
|
1. Sign up at https://postmarkapp.com (free 100 emails/mo, $15/mo for
|
||||||
|
the volume range we'll be in).
|
||||||
|
2. Verify the `unalogix.com` domain (DNS TXT/CNAME records — Postmark
|
||||||
|
will tell you exactly what to add).
|
||||||
|
3. Create a Server, copy the **Server API Token**. Stash it; we'll put
|
||||||
|
it in `secrets/postmark_token` (see §3).
|
||||||
|
4. Configure the sender address: `licenses@datatools.unalogix.com`.
|
||||||
|
|
||||||
|
If you prefer SES, Mailgun, Resend, etc. — fine, just swap the
|
||||||
|
adapter (see §6). Postmark is the recommended default.
|
||||||
|
|
||||||
|
### 1c. Cloudflare in front (recommended)
|
||||||
|
|
||||||
|
Move `unalogix.com` DNS hosting to Cloudflare and enable proxy ("orange
|
||||||
|
cloud") on both subdomains. Gets you free DDoS protection, WAF, and rate
|
||||||
|
limiting. **Origin TLS still goes through Let's Encrypt on this box**;
|
||||||
|
Cloudflare adds a second TLS hop in front. Cert renewal still works
|
||||||
|
because we use the HTTP-01 challenge on the origin, which Cloudflare
|
||||||
|
proxies transparently.
|
||||||
|
|
||||||
|
If you skip this, the public webhook endpoint is directly hammerable.
|
||||||
|
Not catastrophic at low scale, but the free protection is worth taking.
|
||||||
|
|
||||||
|
### 1d. Gumroad webhook secret
|
||||||
|
|
||||||
|
In Gumroad's seller dashboard → Settings → Advanced → "Ping URL":
|
||||||
|
|
||||||
|
```
|
||||||
|
URL: https://licenses.datatools.unalogix.com/webhooks/gumroad
|
||||||
|
Secret: <random hex string, e.g. from `openssl rand -hex 32`; save it for secrets/gumroad_secret (§3)>
|
||||||
|
```
|
||||||
|
|
||||||
|
Don't enter this until §10 ("PR 2 cutover") — the endpoint won't exist
|
||||||
|
yet during the Mint API build.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. One-time host setup
|
||||||
|
|
||||||
|
Run as `root` (or via `sudo`).
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Update apt cache and pull in the bits the rest of the doc needs.
|
||||||
|
apt-get update
|
||||||
|
apt-get install -y \
|
||||||
|
docker-compose-plugin \
|
||||||
|
certbot \
|
||||||
|
python3-certbot-nginx \
|
||||||
|
postgresql-client-16 # for psql to reach the containerized DB
|
||||||
|
|
||||||
|
# Sanity check: docker + compose v2 are already installed via Docker CE.
|
||||||
|
docker --version
|
||||||
|
docker compose version
|
||||||
|
|
||||||
|
# Create the system user the app process will run as (no shell, no home).
|
||||||
|
adduser --system --group --no-create-home --shell /usr/sbin/nologin datatools-api
|
||||||
|
|
||||||
|
# Filesystem layout under /srv (separate from /opt to make the
|
||||||
|
# multi-tenant boundary obvious on disk).
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license/app
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license/secrets
|
||||||
|
install -d -o datatools-api -g datatools-api -m 750 /srv/datatools-license/backups
|
||||||
|
```
|
||||||
|
|
||||||
|
The `secrets/` dir is mode 750 owned by `datatools-api`. The private
|
||||||
|
signing key and Postmark token live there as mode-400 files — never
|
||||||
|
in a systemd `EnvironmentFile`, never in the
|
||||||
|
docker-compose file, never anywhere `root` doesn't need to look.
|
||||||
|
|
||||||
|
> **Gotcha — secret file ownership UID.** Docker compose's
|
||||||
|
> `uid:`/`gid:`/`mode:` long-form on `secrets:` is silently ignored
|
||||||
|
> for **file-based** secrets (it's a swarm-mode-only feature). The
|
||||||
|
> file inside the container appears with whatever ownership it has
|
||||||
|
> on the host, and the API runs as UID 10001 (the `app` user from
|
||||||
|
> the Dockerfile). So chown the actual files to **10001** (a numeric
|
||||||
|
> UID that doesn't exist on the host — that's fine, chown accepts
|
||||||
|
> it) and rely on the parent dir's mode 750 + ownership for host-side
|
||||||
|
> access control. See §3 below for the corrected `chown` step.
|
||||||
|
|
||||||
|
### Firewall recommendation (separate decision)
|
||||||
|
|
||||||
|
The box currently runs without UFW. Enabling it now would affect all
|
||||||
|
existing services. Two options:
|
||||||
|
|
||||||
|
- **(A) Don't enable UFW.** Leave the cloud provider's network firewall
|
||||||
|
as the perimeter. This is the current state.
|
||||||
|
- **(B) Enable UFW with `allow 22, 80, 443` only.** Forces every Docker
|
||||||
|
service to bind to `127.0.0.1` (some currently bind `0.0.0.0`). Will
|
||||||
|
break any direct-port access until those binds are updated.
|
||||||
|
|
||||||
|
Default for this runbook: **(A)**. Revisit independently of the
|
||||||
|
DataTools rollout. The DataTools containers always bind to `127.0.0.1`
|
||||||
|
regardless.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Database (Postgres in Docker)
|
||||||
|
|
||||||
|
Postgres lives inside the datatools compose project — separate from
|
||||||
|
every other service on the box, separate volume, separate port,
|
||||||
|
localhost-only binding.
|
||||||
|
|
||||||
|
`/srv/datatools-license/compose.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
container_name: datatools-postgres
|
||||||
|
restart: unless-stopped
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: datatools_licenses
|
||||||
|
POSTGRES_USER: datatools_api
|
||||||
|
POSTGRES_PASSWORD_FILE: /run/secrets/pg_password
|
||||||
|
secrets:
|
||||||
|
- pg_password
|
||||||
|
volumes:
|
||||||
|
- datatools_pg_data:/var/lib/postgresql/data
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:5433:5432" # localhost-only, non-default port
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U datatools_api -d datatools_licenses"]
|
||||||
|
interval: 10s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 5
|
||||||
|
|
||||||
|
api:
|
||||||
|
build:
|
||||||
|
context: ./app
|
||||||
|
dockerfile: server/Dockerfile
|
||||||
|
image: datatools-license-api:latest
|
||||||
|
container_name: datatools-api
|
||||||
|
restart: unless-stopped
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+psycopg://datatools_api@postgres:5432/datatools_licenses
|
||||||
|
PG_PASSWORD_FILE: /run/secrets/pg_password
|
||||||
|
DATATOOLS_ADMIN_TOKEN_FILE: /run/secrets/admin_token
|
||||||
|
# PR 2 — uncomment when Postmark + Gumroad are provisioned.
|
||||||
|
# POSTMARK_TOKEN_FILE: /run/secrets/postmark_token
|
||||||
|
# GUMROAD_WEBHOOK_SECRET_FILE: /run/secrets/gumroad_secret
|
||||||
|
# Production keypair (replaces in-tree dev key): set
|
||||||
|
# DATATOOLS_LICENSE_PRIVKEY_FILE: /run/secrets/license_privkey
|
||||||
|
# and DATATOOLS_LICENSE_PUBKEY: <hex> before shipping v1.0.
|
||||||
|
secrets:
|
||||||
|
- pg_password
|
||||||
|
- admin_token
|
||||||
|
# PR 2:
|
||||||
|
# - postmark_token
|
||||||
|
# - gumroad_secret
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:8090:8000" # localhost-only; nginx is the only path in
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
|
secrets:
|
||||||
|
pg_password: { file: ./secrets/pg_password }
|
||||||
|
admin_token: { file: ./secrets/admin_token }
|
||||||
|
# PR 2:
|
||||||
|
# postmark_token: { file: ./secrets/postmark_token }
|
||||||
|
# gumroad_secret: { file: ./secrets/gumroad_secret }
|
||||||
|
# Production keypair rotation adds:
|
||||||
|
# license_privkey: { file: ./secrets/license_privkey }
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
datatools_pg_data:
|
||||||
|
name: datatools_pg_data
|
||||||
|
```
|
||||||
|
|
||||||
|
Populate the secrets (each file should contain the value with no
|
||||||
|
trailing newline). For PR 1, only `pg_password` and `admin_token`
|
||||||
|
are required; the rest land in PR 2 / production key rotation.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
|
||||||
|
# Random DB password: 32 random bytes, hex-encoded (64 hex chars)
|
||||||
|
openssl rand -hex 32 > secrets/pg_password
|
||||||
|
|
||||||
|
# Random admin Bearer token (CLI auth). Save this — you'll need it
|
||||||
|
# on your laptop to talk to /internal/* via the SSH tunnel.
|
||||||
|
openssl rand -hex 32 > secrets/admin_token
|
||||||
|
|
||||||
|
# --- PR 2 secrets ---
|
||||||
|
# echo -n "<postmark-server-token>" > secrets/postmark_token # from postmarkapp.com
|
||||||
|
# openssl rand -hex 32 > secrets/gumroad_secret # paste into Gumroad's Ping URL: ?secret=<this>
|
||||||
|
#
|
||||||
|
# --- production-key follow-up (defer until v1.0 cutover) ---
|
||||||
|
# echo -n "<ed25519-private-hex>" > secrets/license_privkey
|
||||||
|
|
||||||
|
# Lock everything down. The numeric 10001 matches the in-container
|
||||||
|
# `app` user (Dockerfile-defined), letting the API read the file
|
||||||
|
# while keeping host-side access gated by the parent dir's mode 750.
|
||||||
|
chmod 400 secrets/*
|
||||||
|
chown 10001:10001 secrets/*
|
||||||
|
```
|
||||||
|
|
||||||
|
The corresponding **public** key for `DATATOOLS_LICENSE_PUBKEY` goes
|
||||||
|
in `/srv/datatools-license/.env` (it's not secret — it's already in
|
||||||
|
every shipped binary):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "DATATOOLS_LICENSE_PUBKEY=<hex-pubkey>" > /srv/datatools-license/.env
|
||||||
|
chmod 640 /srv/datatools-license/.env
|
||||||
|
chown datatools-api:datatools-api /srv/datatools-license/.env
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. App image build
|
||||||
|
|
||||||
|
The Mint API source lives in this repo under `server/` (new directory
|
||||||
|
introduced by PR 1). Build the Docker image:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license/app
|
||||||
|
git clone https://git.invixiom.com/giteadmin/datatools-dev.git .
|
||||||
|
docker build -t datatools-license-api:latest -f server/Dockerfile server/
|
||||||
|
```
|
||||||
|
|
||||||
|
Schema bootstrap (one-time, after first `docker compose up`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec api alembic upgrade head
|
||||||
|
```
|
||||||
|
|
||||||
|
Smoke test:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s http://127.0.0.1:8090/health
|
||||||
|
# expects: {"status":"ok","db":"ok"}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. nginx config
|
||||||
|
|
||||||
|
> **Gotcha — nginx version syntax.** Ubuntu 24.04 ships nginx 1.24,
|
||||||
|
> which uses the legacy `listen 443 ssl http2;` form. The standalone
|
||||||
|
> `http2 on;` directive arrived in nginx 1.25 and will error on 1.24
|
||||||
|
> with `unknown directive "http2"`. The config below uses the 1.24
|
||||||
|
> form.
|
||||||
|
>
|
||||||
|
> **Bring-up sequence.** This config references a TLS cert at
|
||||||
|
> `/etc/letsencrypt/live/datatools.unalogix.com/`, which doesn't
|
||||||
|
> exist on a fresh install — nginx would refuse to start. The
|
||||||
|
> working sequence is: (a) install a temporary HTTP-only config
|
||||||
|
> that serves `.well-known/acme-challenge/` and returns 503 for
|
||||||
|
> everything else, (b) `nginx -s reload`, (c) run `certbot
|
||||||
|
> certonly --webroot`, (d) replace with the HTTPS config below,
|
||||||
|
> (e) `nginx -s reload` again. See §6.
|
||||||
|
|
||||||
|
`/etc/nginx/sites-available/unalogix` — **new file**, do not merge
|
||||||
|
into `invixiom`:
|
||||||
|
|
||||||
|
```nginx
|
||||||
|
# Marketing / product site (datatools.unalogix.com) — static for now.
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name datatools.unalogix.com licenses.datatools.unalogix.com;
|
||||||
|
return 301 https://$host$request_uri;
|
||||||
|
}
|
||||||
|
|
||||||
|
server {
|
||||||
|
listen 443 ssl http2; # nginx 1.24 syntax (Ubuntu 24.04)
|
||||||
|
server_name datatools.unalogix.com;
|
||||||
|
|
||||||
|
ssl_certificate /etc/letsencrypt/live/datatools.unalogix.com/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/datatools.unalogix.com/privkey.pem;
|
||||||
|
|
||||||
|
root /srv/datatools-license/site; # static landing page; create later
|
||||||
|
index index.html;
|
||||||
|
|
||||||
|
location / {
|
||||||
|
try_files $uri $uri/ =404;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# License operations subdomain.
|
||||||
|
server {
|
||||||
|
listen 443 ssl http2; # nginx 1.24 syntax (Ubuntu 24.04)
|
||||||
|
server_name licenses.datatools.unalogix.com;
|
||||||
|
|
||||||
|
ssl_certificate /etc/letsencrypt/live/datatools.unalogix.com/fullchain.pem;
|
||||||
|
ssl_certificate_key /etc/letsencrypt/live/datatools.unalogix.com/privkey.pem;
|
||||||
|
|
||||||
|
# Block /internal/* from the public side as defense-in-depth.
|
||||||
|
# (The app also enforces this server-side; this is layered.)
|
||||||
|
location /internal/ {
|
||||||
|
return 404;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
proxy_pass http://127.0.0.1:8090;
|
||||||
|
proxy_http_version 1.1;
|
||||||
|
proxy_set_header Host $host;
|
||||||
|
proxy_set_header X-Real-IP $remote_addr;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $scheme;
|
||||||
|
|
||||||
|
# Gumroad webhook payloads are tiny but tighten anyway.
|
||||||
|
client_max_body_size 1m;
|
||||||
|
|
||||||
|
# Basic rate limiting: 30 req/min/IP on /webhooks/* and /portal/*.
|
||||||
|
# Tune in nginx.conf with a `limit_req_zone` directive.
|
||||||
|
# limit_req zone=licenses burst=10 nodelay;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Enable + reload:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ln -s /etc/nginx/sites-available/unalogix /etc/nginx/sites-enabled/unalogix
|
||||||
|
nginx -t # validate
|
||||||
|
systemctl reload nginx
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. TLS cert
|
||||||
|
|
||||||
|
Use the webroot http-01 challenge (nginx-plugin works too; this is
|
||||||
|
slightly more explicit):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
certbot certonly \
|
||||||
|
--webroot -w /var/www/html \
|
||||||
|
-d datatools.unalogix.com \
|
||||||
|
-d licenses.datatools.unalogix.com \
|
||||||
|
--agree-tos \
|
||||||
|
--email michael.dombaugh@gmail.com \
|
||||||
|
--non-interactive
|
||||||
|
```
|
||||||
|
|
||||||
|
Cert lands at `/etc/letsencrypt/live/datatools.unalogix.com/`.
|
||||||
|
Auto-renewal is already configured by the certbot package (systemd
|
||||||
|
timer `certbot.timer`). Confirm:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
systemctl list-timers certbot.timer
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Bring it up
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd /srv/datatools-license
|
||||||
|
docker compose up -d
|
||||||
|
docker compose ps # both services should be 'running (healthy)'
|
||||||
|
docker compose logs -f api
|
||||||
|
```
|
||||||
|
|
||||||
|
Public smoke test:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -s https://licenses.datatools.unalogix.com/health
|
||||||
|
# expects: {"status":"ok","db":"ok"}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Verification — end-to-end internal mint
|
||||||
|
|
||||||
|
From your laptop (NOT the server), open an SSH tunnel for the internal
|
||||||
|
endpoint:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
ssh -L 8090:127.0.0.1:8090 michael@46.225.166.142 -N
|
||||||
|
# Leave running; in another terminal:
|
||||||
|
|
||||||
|
curl -X POST http://127.0.0.1:8090/internal/mint \
|
||||||
|
-H "Authorization: Bearer $DATATOOLS_ADMIN_TOKEN" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"name":"Test Buyer",
|
||||||
|
"email":"test@example.com",
|
||||||
|
"tier":"core",
|
||||||
|
"years":1,
|
||||||
|
"source":"manual",
|
||||||
|
"notes":"smoke test"
|
||||||
|
}'
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: 200 + a `DTLIC2:...` blob + a row inserted in the `licenses`
|
||||||
|
table. Confirm with:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses \
|
||||||
|
-c "SELECT license_key, email, tier, source FROM licenses;"
|
||||||
|
```
|
||||||
|
|
||||||
|
Then **delete the test row** before going further:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose exec postgres \
|
||||||
|
psql -U datatools_api -d datatools_licenses \
|
||||||
|
-c "DELETE FROM licenses WHERE email = 'test@example.com';"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Operational concerns
|
||||||
|
|
||||||
|
### Backups (Postgres → off-site)
|
||||||
|
|
||||||
|
`/etc/cron.daily/datatools-license-backup`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
set -euo pipefail
|
||||||
|
TS=$(date -u +%Y%m%dT%H%M%SZ)
|
||||||
|
OUT=/srv/datatools-license/backups/db-${TS}.sql.gz
|
||||||
|
docker compose -f /srv/datatools-license/compose.yml exec -T postgres \
|
||||||
|
pg_dump -U datatools_api datatools_licenses | gzip > "$OUT"
|
||||||
|
chmod 600 "$OUT"
|
||||||
|
# Off-site copy — pick one:
|
||||||
|
# rclone copy "$OUT" remote:datatools-license-backups/
|
||||||
|
# aws s3 cp "$OUT" s3://datatools-backups/db/ --sse AES256
|
||||||
|
find /srv/datatools-license/backups -name 'db-*.sql.gz' -mtime +30 -delete
|
||||||
|
```
|
||||||
|
|
||||||
|
Pick an off-site target. Without one, a disk failure loses every
|
||||||
|
customer record. Test the restore at least once on a staging copy.
|
||||||
|
|
||||||
|
### Monitoring
|
||||||
|
|
||||||
|
External uptime probe (free):
|
||||||
|
1. UptimeRobot account → add monitor for `https://licenses.datatools.unalogix.com/health`.
|
||||||
|
2. 5-minute interval, alert to email/SMS.
|
||||||
|
|
||||||
|
Container crashes are handled by `restart: unless-stopped`; the
|
||||||
|
healthcheck only reports status (Docker does not restart a container just because it is unhealthy). To see recent failures:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose ps # last health-check status
|
||||||
|
docker compose logs api --tail 200
|
||||||
|
journalctl -u docker --since '1 hour ago' | grep datatools
|
||||||
|
```
|
||||||
|
|
||||||
|
### Log rotation
|
||||||
|
|
||||||
|
Docker handles container logs; cap their size in
|
||||||
|
`/etc/docker/daemon.json`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"log-driver": "json-file",
|
||||||
|
"log-opts": {
|
||||||
|
"max-size": "10m",
|
||||||
|
"max-file": "3"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Then `systemctl restart docker` (this restarts all containers — schedule
|
||||||
|
during a quiet window).
|
||||||
|
|
||||||
|
### Key rotation (future)
|
||||||
|
|
||||||
|
If the private signing key is ever compromised:
|
||||||
|
|
||||||
|
1. Generate a new keypair (`scripts/generate_keypair.py`).
|
||||||
|
2. Build and ship a desktop release with the new pubkey embedded.
|
||||||
|
3. Update `/srv/datatools-license/secrets/license_privkey` and
|
||||||
|
`/srv/datatools-license/.env`'s pubkey.
|
||||||
|
4. `docker compose restart api`.
|
||||||
|
5. Re-issue every active license (script that queries the DB, calls
|
||||||
|
`/internal/mint`, emails buyers). Old blobs will fail verification
|
||||||
|
in the new desktop build.
|
||||||
|
|
||||||
|
Plan a 90-day overlap window where the desktop verifies against
|
||||||
|
*both* keys before retiring the old pubkey. (Verification logic
|
||||||
|
change to the desktop app — not in scope for PR 1.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. PR cutover sequence
|
||||||
|
|
||||||
|
This runbook covers the box-level scaffolding. Application code lands
|
||||||
|
in three independently shippable PRs:
|
||||||
|
|
||||||
|
| PR | Adds | Ship gate | Webhook live? |
|
||||||
|
|---|---|---|---|
|
||||||
|
| **1** | Source-agnostic Mint API + Postgres + `datatools-admin mint` CLI | Operator can mint a comp license through the server | No |
|
||||||
|
| **2** | Gumroad adapter + webhook receiver + email send | Real Gumroad sale auto-mints + emails buyer | **Yes** (enable in Gumroad dashboard at this PR's deploy) |
|
||||||
|
| **3** | Renewal / re-delivery portal | Buyer self-services renewals and lost-blob re-delivery | (unchanged) |
|
||||||
|
|
||||||
|
§1d (Gumroad webhook URL) is **filled in during PR 2's deploy**, not
|
||||||
|
before. Until then the endpoint returns 404.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Rollback
|
||||||
|
|
||||||
|
Each component is independently reversible.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop and remove containers (DB volume persists)
|
||||||
|
docker compose -f /srv/datatools-license/compose.yml down
|
||||||
|
|
||||||
|
# Full teardown including DB (DESTRUCTIVE — backup first)
|
||||||
|
docker compose -f /srv/datatools-license/compose.yml down -v
|
||||||
|
|
||||||
|
# Remove nginx site
|
||||||
|
rm /etc/nginx/sites-enabled/unalogix
|
||||||
|
nginx -t && systemctl reload nginx
|
||||||
|
|
||||||
|
# Delete TLS cert (`certbot delete` does not revoke; run `certbot revoke` first if the key may be compromised)
|
||||||
|
certbot delete --cert-name datatools.unalogix.com
|
||||||
|
|
||||||
|
# Remove filesystem
|
||||||
|
rm -rf /srv/datatools-license # NOTE: includes secrets dir; backup first
|
||||||
|
|
||||||
|
# Remove system user
|
||||||
|
deluser datatools-api
|
||||||
|
delgroup datatools-api
|
||||||
|
```
|
||||||
|
|
||||||
|
DNS records can stay or be removed — they're not on this host.
|
||||||
@@ -3,6 +3,9 @@
|
|||||||
> Creator-only. Do not ship to buyers.
|
> Creator-only. Do not ship to buyers.
|
||||||
> **Version**: 1.6 · **Updated**: 2026-05-01
|
> **Version**: 1.6 · **Updated**: 2026-05-01
|
||||||
|
|
||||||
|
For the end-to-end picture (desktop app + license server + storefronts
|
||||||
|
+ email), see `ARCHITECTURE.md`. This doc focuses on desktop internals.
|
||||||
|
|
||||||
## 1. Architecture
|
## 1. Architecture
|
||||||
|
|
||||||
- **Dual interface**: CLI + GUI, both wrapping the same `src/core/` library.
|
- **Dual interface**: CLI + GUI, both wrapping the same `src/core/` library.
|
||||||
@@ -31,8 +34,8 @@ src/
|
|||||||
normalizers.py # Per-column normalizers for dedup matching
|
normalizers.py # Per-column normalizers for dedup matching
|
||||||
text_clean.py # clean_dataframe + smart_title_case
|
text_clean.py # clean_dataframe + smart_title_case
|
||||||
_constants.py # Shared USPS abbrevs + state names
|
_constants.py # Shared USPS abbrevs + state names
|
||||||
cli.py # Deduplicator CLI (Typer)
|
cli.py # Find Duplicates CLI (Typer)
|
||||||
cli_text_clean.py # Text Cleaner CLI
|
cli_text_clean.py # Clean Text CLI
|
||||||
cli_analyze.py # Analyzer CLI (--json)
|
cli_analyze.py # Analyzer CLI (--json)
|
||||||
gui/
|
gui/
|
||||||
app.py # Streamlit entry point
|
app.py # Streamlit entry point
|
||||||
@@ -119,6 +122,17 @@ Tag a release → 3 platform artifacts upload to GitHub Releases. Manual: copy t
|
|||||||
|
|
||||||
`demo/streamlit_app.py` → Streamlit Community Cloud. Configure deployment in Streamlit UI. Custom domain via CNAME (verify policy at deploy time). Fall back to $5/mo VPS if rate limits / branding constraints hit.
|
`demo/streamlit_app.py` → Streamlit Community Cloud. Configure deployment in Streamlit UI. Custom domain via CNAME (verify policy at deploy time). Fall back to $5/mo VPS if rate limits / branding constraints hit.
|
||||||
|
|
||||||
|
### 3.10 Bundled Tesseract (PDF Extractor OCR)
|
||||||
|
|
||||||
|
Frozen builds ship Tesseract 5.5 + `eng.traineddata` inside the PyInstaller bundle so scanned PDFs work without a separate install. Per-platform binary URLs pinned in `build/make_release.py`; tessdata vendored at `build/vendor/tessdata/eng.traineddata`. License attribution in `LICENSE_TESSERACT.txt` at the repo root.
|
||||||
|
|
||||||
|
**Discovery order at runtime** (see `docs/DEVELOPER.md` for the full Path layout):
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||||
|
2. Bundled path under `sys._MEIPASS / "tesseract" /` (frozen bundles only).
|
||||||
|
3. `tesseract` on `PATH` (source / pip developer environments).
|
||||||
|
4. Windows well-known locations.
|
||||||
|
|
||||||
## 4. Libraries
|
## 4. Libraries
|
||||||
|
|
||||||
| Purpose | Library |
|
| Purpose | Library |
|
||||||
@@ -189,7 +203,7 @@ GUI / CLI handlers use `format_for_user()` so the user always sees: file path, o
|
|||||||
|
|
||||||
| Bundle | Status |
|
| Bundle | Status |
|
||||||
|--------|--------|
|
|--------|--------|
|
||||||
| Data Cleaning Mastery | 3/9 tools Ready (Dedup, Text Cleaner, Format Standardizer); 6 stubs |
|
| Data Cleaning Mastery | 3/9 tools Ready (Find Duplicates, Clean Text, Standardize Formats); 6 stubs |
|
||||||
| Automated Business Reporting | Not started |
|
| Automated Business Reporting | Not started |
|
||||||
| Ecommerce Data Pipeline | Not started |
|
| Ecommerce Data Pipeline | Not started |
|
||||||
| Small Business Finance | Not started |
|
| Small Business Finance | Not started |
|
||||||
@@ -211,12 +225,12 @@ Deliberately separate. Confluent original spec was wrong.
|
|||||||
|
|
||||||
| Script | Owns |
|
| Script | Owns |
|
||||||
|--------|------|
|
|--------|------|
|
||||||
| 04 Missing Value Handler | "What's not there." Disguised nulls (`N/A`, `-`, sentinel codes), missingness patterns, imputation, drop-by-threshold. |
|
| 04 Fix Missing Values | "What's not there." Disguised nulls (`N/A`, `-`, sentinel codes), missingness patterns, imputation, drop-by-threshold. |
|
||||||
| 06 Outlier Detector | "What shouldn't be there." z-score / IQR / modified-z, multivariate (Isolation Forest, Mahalanobis), domain rules, winsorization. |
|
| 06 Find Unusual Values | "What shouldn't be there." z-score / IQR / modified-z, multivariate (Isolation Forest, Mahalanobis), domain rules, winsorization. |
|
||||||
|
|
||||||
**Run order**: 04 before 06. Outlier stats on data with `NaN` / sentinels are mathematically poisoned (means dragged, IQR widens, false negatives).
|
**Run order**: 04 before 06. Outlier stats on data with `NaN` / sentinels are mathematically poisoned (means dragged, IQR widens, false negatives).
|
||||||
|
|
||||||
**Pipeline order** (Pipeline Runner enforces): 02 → 03 → 04 → 05 → 06 → 07 → 08. 01 is order-flexible.
|
**Pipeline order** (Automated Workflows enforces): 02 → 03 → 04 → 05 → 06 → 07 → 08. 01 is order-flexible.
|
||||||
|
|
||||||
**Contested cases**:
|
**Contested cases**:
|
||||||
- Whitespace-only cell — 02 trims to empty; 04 then flags empty as null.
|
- Whitespace-only cell — 02 trims to empty; 04 then flags empty as null.
|
||||||
@@ -239,6 +253,15 @@ The GUI uses an in-house, JSON-backed translation layer at `src/i18n/`. **No** `
|
|||||||
|
|
||||||
**Why not gettext**: zero compiled artifacts in the PyInstaller bundle, no build step before tests run, no `.po`/`.mo` round-trip for translators (anyone can edit JSON), and the same lookup works in unit tests without process state. Locked in because the surface won't grow large enough to need the alternative, and the alternative breaks the "drop a file, run pytest, ship" loop.
|
**Why not gettext**: zero compiled artifacts in the PyInstaller bundle, no build step before tests run, no `.po`/`.mo` round-trip for translators (anyone can edit JSON), and the same lookup works in unit tests without process state. Locked in because the surface won't grow large enough to need the alternative, and the alternative breaks the "drop a file, run pytest, ship" loop.
|
||||||
|
|
||||||
|
## 10c. GUI chrome — sidebar nav indicator swap
|
||||||
|
|
||||||
|
Streamlit's `st.Page`-driven sidebar renders section headers with a Material Symbols ligature (`expand_more` / `expand_less`). The header element is not a button and carries no `aria-expanded`, so a pure-CSS swap can't follow open/closed state. We replace the glyph with plain typographic `+` / `−` (U+2212) via JS:
|
||||||
|
|
||||||
|
- **CSS** (`components/_legacy.py`, `_HIDE_CHROME_CSS`) drops the Material Symbols font on `[data-testid="stIconMaterial"]` inside `[data-testid="stNavSectionHeader"]` so the rewritten character renders as normal text rather than re-resolving as an icon name.
|
||||||
|
- **JS** (`_SWAP_NAV_SECTION_INDICATOR_JS`) walks each section header, reads the icon's text node, and rewrites `expand_more` → `+` / `expand_less` → `−`. A MutationObserver re-runs the swap when Streamlit re-renders the sidebar (RAF-throttled so a burst of mutations is one swap).
|
||||||
|
|
||||||
|
The script ships through the same component-iframe bundle as the brand injector and upload-button rename inside `hide_streamlit_chrome()` — one iframe per page, three DOM mutations.
|
||||||
|
|
||||||
## 11. Per-script functional specs
|
## 11. Per-script functional specs
|
||||||
|
|
||||||
Specs live in this section as scripts enter active build. Each follows the Tier 1/2/3 structure with explicit strategic framing (what's the market gap given some of this is free elsewhere).
|
Specs live in this section as scripts enter active build. Each follows the Tier 1/2/3 structure with explicit strategic framing (what's the market gap given some of this is free elsewhere).
|
||||||
|
|||||||
@@ -4,29 +4,108 @@
|
|||||||
|
|
||||||
**Versión**: 1.6 · **Actualizado**: 2026-05-13
|
**Versión**: 1.6 · **Actualizado**: 2026-05-13
|
||||||
|
|
||||||
|
## 0. Primer arranque — activación
|
||||||
|
|
||||||
|
DataTools debe activarse antes de desbloquear cualquier herramienta. En el primer arranque verás la pantalla **Activar**.
|
||||||
|
|
||||||
|
Introduce tu nombre completo y correo, pega el código de licencia del correo de compra (empieza con `DTLIC1:`) y pulsa **Activar**. La renovación funciona igual: pega el código de renovación y pulsa **Aplicar renovación**.
|
||||||
|
|
||||||
|
**Niveles**:
|
||||||
|
|
||||||
|
| Nivel | Herramientas |
|
||||||
|
|---|---|
|
||||||
|
| **Lite** | Buscar duplicados · Limpiar texto · Estandarizar formatos |
|
||||||
|
| **Core** | Las 9 herramientas |
|
||||||
|
|
||||||
|
Un usuario Lite que abra una herramienta exclusiva de Core verá un mensaje "Actualiza tu licencia". La página de inicio también muestra una marca 🔒 Bloqueado en las tarjetas de las herramientas que tu nivel no incluye. Para actualizar, pega un código Core en la página Activar.
|
||||||
|
|
||||||
|
Cada licencia dura 1 año. La barra lateral muestra tu nivel y los días restantes en todo momento; aparece un aviso de renovación 30 días antes de la caducidad. El archivo de licencia vive en `~/.datatools/license.json` (Windows: `C:\Users\<tú>\.datatools\license.json`).
|
||||||
|
|
||||||
|
Para usar la misma licencia en otro equipo: desactiva éste (página Activar → **Desactivar este dispositivo**) y vuelve a pegar tu código en el nuevo.
|
||||||
|
|
||||||
## 1. Instalación
|
## 1. Instalación
|
||||||
|
|
||||||
No necesitas tener Python instalado — el paquete es autocontenido.
|
No necesitas tener Python ni permisos de administrador — el paquete trae su propio intérprete y todas las dependencias. Dos formatos por sistema operativo, elige el que tu política de TI permita:
|
||||||
|
|
||||||
| Sistema operativo | Archivo | Cómo |
|
- **Instalador** — crea automáticamente acceso directo en el escritorio + entrada en el menú Inicio / Launchpad. Recomendado para la mayoría.
|
||||||
|----|------|-----|
|
- **.zip portable** — descomprime y haz doble clic. No toca el registro, se ejecuta desde cualquier lugar (escritorio, USB, recurso de red). Úsalo si no puedes ejecutar instaladores, quieres una instalación de una sola carpeta que puedas copiar entre equipos, o estás evaluando antes de instalar.
|
||||||
| Windows | `BundleName-Setup-1.0.exe` | Doble clic en el instalador → acceso directo en el escritorio. |
|
|
||||||
| macOS | `BundleName-1.0.dmg` | Monta el DMG y arrástralo a Aplicaciones. Firmado y notarizado. |
|
|
||||||
| Linux | `BundleName-1.0.AppImage` | `chmod +x`, doble clic. (También hay un `.tar.gz` de respaldo.) |
|
|
||||||
|
|
||||||
Al iniciar la app, se abre tu navegador predeterminado en una página local (`http://localhost:8501`).
|
Ambos formatos son idénticos por dentro: mismo Python, mismas dependencias, mismo comportamiento de arranque.
|
||||||
|
|
||||||
### Cómo funciona la interfaz gráfica (GUI)
|
### 1.1 Windows
|
||||||
|
|
||||||
|
**Opción A — Instalador (`DataTools-<ver>-win-setup.exe`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-win-setup.exe` desde tu correo de licencia o GitHub Releases.
|
||||||
|
2. Doble clic en el instalador. La primera vez, Windows SmartScreen mostrará **"Windows protegió tu PC"** — pulsa **Más información** → **Ejecutar de todas formas**. (Este aviso solo aparece una vez por compilación hasta que tengamos un certificado EV de firma de código.)
|
||||||
|
3. Acepta la ruta de instalación por usuario (`%LOCALAPPDATA%\Programs\DataTools` por defecto — no pide UAC). Marca **Crear acceso directo en el escritorio** si lo quieres (activado por defecto).
|
||||||
|
4. Pulsa **Instalar** y luego **Finalizar**. El instalador te ofrece lanzar DataTools al terminar.
|
||||||
|
5. A partir de ahora ejecútalo desde: **Menú Inicio → DataTools**, el **acceso directo del escritorio**, o escribiendo `DataTools` en Ejecutar (Win+R) / cmd.
|
||||||
|
|
||||||
|
Para anclarlo a la barra de tareas, lanza la app una vez, clic derecho en su icono de la barra de tareas, y **Anclar a la barra de tareas**. Windows requiere este paso manual — ningún instalador puede anclar por programa.
|
||||||
|
|
||||||
|
**Opción B — Portable (`DataTools-<ver>-win-portable.zip`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-win-portable.zip`.
|
||||||
|
2. Clic derecho en el .zip → **Extraer todo…** → elige una carpeta (p. ej. `C:\Tools\DataTools`).
|
||||||
|
3. Abre la carpeta `DataTools\` extraída, doble clic en `DataTools.exe`. El aviso de SmartScreen aparece solo la primera vez.
|
||||||
|
4. Para crear tu propio acceso directo en el escritorio: clic derecho en `DataTools.exe` → **Enviar a → Escritorio (crear acceso directo)**.
|
||||||
|
|
||||||
|
**Desinstalar** (solo instalador): Configuración → Aplicaciones → DataTools → Desinstalar. Portable: borra la carpeta.
|
||||||
|
|
||||||
|
### 1.2 macOS
|
||||||
|
|
||||||
|
**Opción A — DMG instalador (`DataTools-<ver>-mac.dmg`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-mac.dmg`.
|
||||||
|
2. Doble clic en el .dmg. Se abre una ventana de Finder con el icono **DataTools** y un alias **Aplicaciones**.
|
||||||
|
3. Arrastra **DataTools** sobre **Aplicaciones**. Espera a que termine la copia y expulsa el DMG.
|
||||||
|
4. En compilaciones sin firma, el primer arranque muestra **"No se puede abrir 'DataTools' porque no se puede verificar al desarrollador"**. Solución: clic derecho en DataTools en /Aplicaciones → **Abrir** → confirma **Abrir** en el diálogo. macOS recuerda la elección — los siguientes arranques no muestran nada.
|
||||||
|
5. Ejecútalo desde **Launchpad**, **Spotlight** (`⌘ Espacio` → escribe "DataTools"), o **Aplicaciones** en Finder.
|
||||||
|
|
||||||
|
Para mantener DataTools en el Dock: lanza la app, clic derecho en su icono del Dock → **Opciones → Mantener en el Dock**. macOS no permite que los instaladores fijen al Dock automáticamente.
|
||||||
|
|
||||||
|
**Opción B — Portable (`DataTools-<ver>-mac-portable.zip`)**
|
||||||
|
|
||||||
|
1. Descarga `DataTools-<ver>-mac-portable.zip`. Safari descomprime al descargar por defecto; en Finder verás `DataTools.app` directamente.
|
||||||
|
2. Mueve `DataTools.app` a **Aplicaciones** si quieres que aparezca en Launchpad — o déjalo en el escritorio, un USB o un recurso de red. La .app portable se ejecuta desde cualquier sitio.
|
||||||
|
3. Doble clic en `DataTools.app`. Clic derecho → **Abrir** la primera vez (misma rutina que con el DMG).
|
||||||
|
|
||||||
|
**Desinstalar**: arrastra `DataTools.app` a la Papelera. Tus archivos de datos siguen donde estén — la app no instala nada más.
|
||||||
|
|
||||||
|
### 1.3 Linux
|
||||||
|
|
||||||
|
`DataTools-<ver>-linux-x86_64.AppImage` ya es portable — no hay .zip aparte.
|
||||||
|
|
||||||
|
1. Descarga el .AppImage.
|
||||||
|
2. `chmod +x DataTools-*.AppImage`.
|
||||||
|
3. Doble clic, o ejecútalo desde la terminal.
|
||||||
|
|
||||||
|
Si tu distro no incluye FUSE 2: `sudo apt install libfuse2` (Debian/Ubuntu) o equivalente.
|
||||||
|
|
||||||
|
### 1.4 Qué pasa al arrancar por primera vez
|
||||||
|
|
||||||
|
El lanzador (llamado `DataTools.exe` / `DataTools.app` / `DataTools.AppImage`) hace tres cosas, en orden:
|
||||||
|
|
||||||
|
1. Elige un puerto TCP libre en `127.0.0.1` — normalmente el 8501; si está ocupado prueba 8502, 8503, …
|
||||||
|
2. Arranca un servidor Streamlit local en ese puerto. El servidor solo está enlazado a localhost, nunca a tu red.
|
||||||
|
3. Abre tu navegador predeterminado en `http://127.0.0.1:<puerto>/`. Si el navegador no se abre en 5 segundos, pega esa URL manualmente.
|
||||||
|
|
||||||
|
La ventana del lanzador queda abierta en segundo plano. Cerrarla detiene el servidor — la pestaña del navegador dirá "no se puede acceder a este sitio" la próxima vez.
|
||||||
|
|
||||||
|
### 1.5 Cómo funciona la GUI
|
||||||
|
|
||||||
- Se ejecuta localmente en tu equipo. **Sin internet, sin subidas.**
|
- Se ejecuta localmente en tu equipo. **Sin internet, sin subidas.**
|
||||||
- El navegador es solo la capa de visualización. Cerrarlo detiene el programa subyacente.
|
- El navegador es solo la capa de visualización. Cerrarlo NO detiene la app — cierra la ventana del lanzador (o sal de la .app de macOS desde el Dock) para terminar del todo.
|
||||||
- ¿Prefieres la terminal? Cada herramienta incluye también una interfaz de línea de comandos (CLI) — ver Sección 3.
|
- ¿Prefieres la terminal? Cada herramienta incluye también una CLI — ver Sección 3.
|
||||||
|
|
||||||
### Requisitos del sistema
|
### 1.6 Requisitos del sistema
|
||||||
|
|
||||||
- Windows 10/11 (64 bits), macOS 11+, Linux moderno (2020+).
|
- Windows 10/11 (64 bits), macOS 11+, Linux moderno (2020+).
|
||||||
- Navegador moderno (Chrome, Edge, Firefox, Safari, últimos 3 años).
|
- Navegador moderno (Chrome, Edge, Firefox, Safari, últimos 3 años).
|
||||||
- ~400-500 MB de espacio libre en disco.
|
- ~500 MB de espacio libre en disco (el paquete ocupa ~300 MB; el resto es espacio de trabajo para CSV grandes).
|
||||||
|
|
||||||
|
**OCR para PDFs escaneados viene incluido** — Tesseract 5.5 y el modelo en inglés `eng.traineddata` vienen dentro de cada instalador / portable / AppImage. La ruta de extracción de PDFs escaneados del Extractor de PDF funciona sin configuración adicional; no hace falta instalar nada por separado. (Quien ejecute desde un checkout con `pip install -r requirements.txt` sigue necesitando Tesseract del sistema en el `PATH` — ver [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract) (solo en inglés).)
|
||||||
|
|
||||||
Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés).
|
Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés).
|
||||||
|
|
||||||
@@ -34,15 +113,15 @@ Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés)
|
|||||||
|
|
||||||
| # | Herramienta | Propósito | Estado |
|
| # | Herramienta | Propósito | Estado |
|
||||||
|---|------|---------|--------|
|
|---|------|---------|--------|
|
||||||
| 01 | Eliminador de duplicados | Coincidencia exacta + difusa, 5 normalizadores, auditoría | Listo |
|
| 01 | Buscar duplicados | Coincidencia exacta + difusa, 5 normalizadores, auditoría | Listo |
|
||||||
| 02 | Limpiador de texto | Espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo |
|
| 02 | Limpiar texto | Espacios, caracteres tipográficos, BOM, finales de línea, mayúsculas/minúsculas | Listo |
|
||||||
| 03 | Estandarizador de formatos | Fechas / teléfonos / correos / direcciones / nombres / monedas / booleanos | Listo |
|
| 03 | Estandarizar formatos | Fechas / teléfonos / correos / direcciones / nombres / monedas / booleanos | Listo |
|
||||||
| 04 | Gestor de valores faltantes | Nulos disfrazados, imputación, descarte por umbral | Próximamente |
|
| 04 | Corregir valores faltantes | Nulos disfrazados, imputación, descarte por umbral | Próximamente |
|
||||||
| 05 | Mapeador de columnas | Renombrar + aplicar esquema | Próximamente |
|
| 05 | Mapear columnas | Renombrar + aplicar esquema | Próximamente |
|
||||||
| 06 | Detector de valores atípicos | z-score, IQR, multivariante | Próximamente |
|
| 06 | Detectar valores atípicos | z-score, IQR, multivariante | Próximamente |
|
||||||
| 07 | Combinador de varios archivos | Combina varios archivos | Próximamente |
|
| 07 | Combinar archivos | Combina varios archivos | Próximamente |
|
||||||
| 08 | Validador e informes | Reglas + informe PDF/Excel | Próximamente |
|
| 08 | Verificación de calidad | Reglas + informe PDF/Excel | Próximamente |
|
||||||
| 09 | Ejecutor de canalizaciones | Lanzador multi-herramienta de un clic | Próximamente |
|
| 09 | Flujos automatizados | Lanzador multi-herramienta de un clic | Próximamente |
|
||||||
|
|
||||||
**Datos de muestra** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`.
|
**Datos de muestra** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`.
|
||||||
|
|
||||||
@@ -58,6 +137,10 @@ Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés)
|
|||||||
|
|
||||||
Las opciones avanzadas se encuentran en paneles desplegables. El archivo original nunca se modifica.
|
Las opciones avanzadas se encuentran en paneles desplegables. El archivo original nunca se modifica.
|
||||||
|
|
||||||
|
**Ayuda en la herramienta**: cada página tiene un botón **Help** a la derecha del título. Al pulsarlo se abre una ventana emergente con una guía compacta (Cuándo usarla · Pasos · Ejemplos · Consejo). Úsala como recordatorio a media tarea — la ventana se cierra al hacer clic fuera y tus datos no se ven afectados.
|
||||||
|
|
||||||
|
**Navegación lateral**: la barra lateral agrupa las herramientas en secciones (Análisis, Limpiadores de datos, Transformaciones, Automatizaciones). Cada cabecera muestra `+` cuando está plegada y `−` cuando está desplegada — pulsa la cabecera para alternar.
|
||||||
|
|
||||||
### 3.2 CLI
|
### 3.2 CLI
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -70,17 +153,17 @@ Ayuda: `deduplicator --help`. Referencia completa: [CLI-REFERENCE.es.md](CLI-REF
|
|||||||
|
|
||||||
### 3.3 Orden de ejecución (cuando uses las herramientas manualmente)
|
### 3.3 Orden de ejecución (cuando uses las herramientas manualmente)
|
||||||
|
|
||||||
Si no usas el Ejecutor de canalizaciones, sigue este orden:
|
Si no usas Flujos automatizados, sigue este orden:
|
||||||
|
|
||||||
1. **02 Limpiador de texto** primero — normaliza espacios y caracteres especiales.
|
1. **02 Limpiar texto** primero — normaliza espacios y caracteres especiales.
|
||||||
2. **03 Estandarizador de formatos** — fechas, teléfonos, etc. necesitan texto limpio.
|
2. **03 Estandarizar formatos** — fechas, teléfonos, etc. necesitan texto limpio.
|
||||||
3. **04 Gestor de valores faltantes** — códigos centinela se ocultan como números.
|
3. **04 Corregir valores faltantes** — códigos centinela se ocultan como números.
|
||||||
4. **05 Mapeador de columnas** — esquema antes que estadísticas de atípicos.
|
4. **05 Mapear columnas** — esquema antes que estadísticas de atípicos.
|
||||||
5. **06 Detector de valores atípicos** — necesita datos numéricos limpios. Calcular estadísticas con `NaN` o `-999` envenena los resultados.
|
5. **06 Detectar valores atípicos** — necesita datos numéricos limpios. Calcular estadísticas con `NaN` o `-999` envenena los resultados.
|
||||||
6. **07 Combinador de varios archivos**, **08 Validador** según sea necesario.
|
6. **07 Combinar archivos**, **08 Verificación de calidad** según sea necesario.
|
||||||
7. **01 Eliminador de duplicados** es flexible en cuanto al orden (normaliza internamente para la coincidencia).
|
7. **01 Buscar duplicados** es flexible en cuanto al orden (normaliza internamente para la coincidencia).
|
||||||
|
|
||||||
El Ejecutor de canalizaciones aplica este orden automáticamente.
|
Flujos automatizados aplica este orden automáticamente.
|
||||||
|
|
||||||
### 3.4 Idioma
|
### 3.4 Idioma
|
||||||
|
|
||||||
@@ -118,12 +201,15 @@ El archivo original nunca se modifica.
|
|||||||
|
|
||||||
## 6. Solución de problemas
|
## 6. Solución de problemas
|
||||||
|
|
||||||
- **La GUI no se abre / el navegador no se inicia** — espera 10-15 s; visita manualmente `http://localhost:8501`. Error de puerto ocupado → cierra otras instancias.
|
- **La GUI no se abre / el navegador no se inicia** — espera 10-15 s; visita manualmente `http://127.0.0.1:8501` (o el puerto que muestre la ventana del lanzador). Error de puerto ocupado → cierra otras instancias. El lanzador recorre los puertos 8501–8550 buscando uno libre, así que una instancia colgada puede desplazar la URL.
|
||||||
- **¿Por qué se abre el navegador?** — patrón de aplicación web local (igual que Jupyter o RStudio). Nada sale de tu equipo.
|
- **¿Por qué se abre el navegador?** — patrón de aplicación web local (igual que Jupyter o RStudio). Nada sale de tu equipo.
|
||||||
- **Windows SmartScreen** — pulsa "Más información" → "Ejecutar de todas formas". Estándar para software sin firma EV.
|
- **Windows SmartScreen** — pulsa "Más información" → "Ejecutar de todas formas". Una sola vez por compilación hasta que tengamos un certificado EV.
|
||||||
- **macOS "La aplicación está dañada"** — descárgala de nuevo (probablemente se corrompió en tránsito).
|
- **macOS "La aplicación está dañada" / "no se puede verificar al desarrollador"** — clic derecho en la app → **Abrir** → confirma. Si el mensaje persiste, el archivo se corrompió en tránsito — vuelve a descargarlo. Último recurso: `xattr -cr /Applications/DataTools.app` limpia el atributo de cuarentena.
|
||||||
- **El AppImage de Linux no se ejecuta** — `chmod +x archivo.AppImage`. Si falta FUSE → `sudo apt install libfuse2` o usa el `.tar.gz`.
|
- **macOS — el .zip portable extraído no abre** — Safari descomprime al descargar; si ves una carpeta `__MACOSX/` o archivos `._DataTools.app`, usaste otro descompresor. Vuelve a extraer con la Utilidad de Archivo integrada (clic derecho en el .zip → **Abrir con → Utilidad de Archivo**) para preservar los metadatos de la .app.
|
||||||
|
- **Windows — el antivirus pone en cuarentena `DataTools.exe` del portable** — tu antivirus no reconoce el paquete. Añade la carpeta extraída a la lista blanca. El instalador .exe activa menos antivirus porque es un envoltorio Inno Setup conocido.
|
||||||
|
- **El AppImage de Linux no se ejecuta** — `chmod +x archivo.AppImage`. Si falta FUSE → `sudo apt install libfuse2`.
|
||||||
- **Lento con archivos grandes** — por encima de ~100k filas tarda más; la barra de progreso lo indica. Para millones de filas → usa la CLI directamente.
|
- **Lento con archivos grandes** — por encima de ~100k filas tarda más; la barra de progreso lo indica. Para millones de filas → usa la CLI directamente.
|
||||||
|
- **¿Dónde guarda la app mi licencia / configuración?** — `~/.datatools/` en macOS y Linux, `C:\Users\<tú>\.datatools\` en Windows. Tus archivos de entrada y salida siguen donde los dejes; la app nunca los copia a otro sitio.
|
||||||
- **Necesito ayuda** — escribe al correo que aparece en tu recibo de compra.
|
- **Necesito ayuda** — escribe al correo que aparece en tu recibo de compra.
|
||||||
|
|
||||||
## 7. Licencia
|
## 7. Licencia
|
||||||
|
|||||||
@@ -4,29 +4,108 @@
|
|||||||
|
|
||||||
**Version**: 1.6 · **Updated**: 2026-05-01
|
**Version**: 1.6 · **Updated**: 2026-05-01
|
||||||
|
|
||||||
|
## 0. First launch — activation
|
||||||
|
|
||||||
|
DataTools must be activated before any tools unlock. On first launch you'll see the **Activate** screen.
|
||||||
|
|
||||||
|
Enter your full name + email, paste the license blob from your purchase email (starts with `DTLIC1:`), and click **Activate**. Renewal works the same way — paste the renewal blob, click **Apply renewal**.
|
||||||
|
|
||||||
|
**Tiers**:
|
||||||
|
|
||||||
|
| Tier | Tools |
|
||||||
|
|---|---|
|
||||||
|
| **Lite** | Find Duplicates · Clean Text · Standardize Formats |
|
||||||
|
| **Core** | All 9 tools |
|
||||||
|
|
||||||
|
A Lite user opening a Core-only tool sees an "Upgrade your license" prompt. The home page also shows a 🔒 Locked badge on tool cards your tier doesn't unlock. To upgrade, paste a Core blob on the Activate page.
|
||||||
|
|
||||||
|
Every license lasts 1 year. The sidebar shows your tier and days remaining at all times; a renewal warning appears 30 days before expiry. The license file lives at `~/.datatools/license.json` (Windows: `C:\Users\<you>\.datatools\license.json`).
|
||||||
|
|
||||||
|
To use the same license on a different machine: deactivate this one (Activate page → **Deactivate this device**) and re-paste your blob on the new machine.
|
||||||
|
|
||||||
## 1. Install
|
## 1. Install
|
||||||
|
|
||||||
You don't need Python — the bundle is self-contained.
|
You don't need Python and you don't need admin rights — the bundle ships its own interpreter and every dependency. Two flavors per OS, pick whichever your IT policy allows:
|
||||||
|
|
||||||
| OS | File | How |
|
- **Installer** — wires up Desktop shortcut + Start Menu / Launchpad entry automatically. Recommended for most users.
|
||||||
|----|------|-----|
|
- **Portable .zip** — unzip and double-click. No registry writes, runs from anywhere (Desktop, USB stick, network share). Use this if you can't run installers, want a single-folder install you can copy between machines, or are evaluating before committing to install.
|
||||||
| Windows | `BundleName-Setup-1.0.exe` | Double-click installer → desktop shortcut. |
|
|
||||||
| macOS | `BundleName-1.0.dmg` | Mount, drag to Applications. Signed + notarized. |
|
|
||||||
| Linux | `BundleName-1.0.AppImage` | `chmod +x`, double-click. (`.tar.gz` fallback available.) |
|
|
||||||
|
|
||||||
Launching opens your default browser to a local page (`http://localhost:8501`).
|
Both flavors are byte-identical inside: same Python, same dependencies, same launch behavior.
|
||||||
|
|
||||||
### How the GUI works
|
### 1.1 Windows
|
||||||
|
|
||||||
|
**Option A — Installer (`DataTools-<ver>-win-setup.exe`)**
|
||||||
|
|
||||||
|
1. Download `DataTools-<ver>-win-setup.exe` from your release email or GitHub Releases.
|
||||||
|
2. Double-click the installer. On the first run Windows SmartScreen will say **"Windows protected your PC"** — click **More info** → **Run anyway**. (This warning only appears once per build until we have an EV code-signing cert.)
|
||||||
|
3. Accept the per-user install location (`%LOCALAPPDATA%\Programs\DataTools` by default — no admin prompt). Check **Create a desktop shortcut** if you want one (on by default).
|
||||||
|
4. Click **Install**, then **Finish**. The installer offers to launch DataTools immediately.
|
||||||
|
5. From now on launch from: **Start Menu → DataTools**, the **Desktop shortcut**, or just type `DataTools` into Windows Run (Win+R) / cmd.
|
||||||
|
|
||||||
|
To pin to the taskbar, launch the app once, right-click its icon in the taskbar, then **Pin to taskbar**. Windows requires this manual step — no installer is allowed to pin programmatically.
|
||||||
|
|
||||||
|
**Option B — Portable (`DataTools-<ver>-win-portable.zip`)**
|
||||||
|
|
||||||
|
1. Download `DataTools-<ver>-win-portable.zip`.
|
||||||
|
2. Right-click the .zip → **Extract All…** → pick a folder (e.g. `C:\Tools\DataTools`).
|
||||||
|
3. Open the extracted `DataTools\` folder, double-click `DataTools.exe`. SmartScreen warning fires the first time only.
|
||||||
|
4. To create your own desktop shortcut later: right-click `DataTools.exe` → **Send to → Desktop (create shortcut)**.
|
||||||
|
|
||||||
|
**Uninstall** (installer only): Settings → Apps → DataTools → Uninstall. Portable: delete the folder.
|
||||||
|
|
||||||
|
### 1.2 macOS
|
||||||
|
|
||||||
|
**Option A — Installer DMG (`DataTools-<ver>-mac.dmg`)**
|
||||||
|
|
||||||
|
1. Download `DataTools-<ver>-mac.dmg`.
|
||||||
|
2. Double-click the .dmg. A Finder window opens showing the **DataTools** icon and an **Applications** alias.
|
||||||
|
3. Drag **DataTools** onto **Applications**. Wait for the copy to finish, then eject the DMG.
|
||||||
|
4. On unsigned builds the first launch shows **"DataTools" cannot be opened because the developer cannot be verified**. Fix: right-click DataTools in /Applications → **Open** → confirm **Open** in the dialog. macOS remembers this choice — subsequent launches are clean.
|
||||||
|
5. Launch from **Launchpad**, **Spotlight** (`⌘ Space` → type "DataTools"), or **Applications** in Finder.
|
||||||
|
|
||||||
|
To keep DataTools in the Dock: launch the app, right-click its Dock icon → **Options → Keep in Dock**. macOS doesn't allow installers to pin to the Dock automatically.
|
||||||
|
|
||||||
|
**Option B — Portable (`DataTools-<ver>-mac-portable.zip`)**
|
||||||
|
|
||||||
|
1. Download `DataTools-<ver>-mac-portable.zip`. Safari auto-unzips on download; in Finder you'll see `DataTools.app` directly.
|
||||||
|
2. Move `DataTools.app` to **Applications** if you want it discoverable via Launchpad — or keep it on your Desktop, a USB stick, or a network share. The portable .app runs from anywhere.
|
||||||
|
3. Double-click `DataTools.app`. Right-click → **Open** the first time (same unsigned-build dance as the DMG).
|
||||||
|
|
||||||
|
**Uninstall**: drag `DataTools.app` to the Trash. Your data files stay where you put them — nothing else is installed.
|
||||||
|
|
||||||
|
### 1.3 Linux
|
||||||
|
|
||||||
|
`DataTools-<ver>-linux-x86_64.AppImage` is already portable — no separate zip needed.
|
||||||
|
|
||||||
|
1. Download the .AppImage.
|
||||||
|
2. `chmod +x DataTools-*.AppImage`.
|
||||||
|
3. Double-click, or run it from a terminal.
|
||||||
|
|
||||||
|
If your distro doesn't ship FUSE 2: `sudo apt install libfuse2` (Debian/Ubuntu) or equivalent.
|
||||||
|
|
||||||
|
### 1.4 What happens on first launch
|
||||||
|
|
||||||
|
The launcher (called `DataTools.exe` / `DataTools.app` / `DataTools.AppImage`) does three things, in order:
|
||||||
|
|
||||||
|
1. Picks a free TCP port on `127.0.0.1` — usually 8501, falls back through 8502, 8503, … if another app is using 8501.
|
||||||
|
2. Starts a local Streamlit server on that port. The server is **bound to localhost only**, never to your LAN.
|
||||||
|
3. Opens your default browser at `http://127.0.0.1:<port>/`. If the browser doesn't open within 5 seconds, paste that URL into your browser manually.
|
||||||
|
|
||||||
|
The launcher window stays open in the background. Closing it stops the server — the browser tab will say "this site can't be reached" the next time you click it.
|
||||||
|
|
||||||
|
### 1.5 How the GUI works
|
||||||
|
|
||||||
- Runs locally on your machine. **No internet, no upload.**
|
- Runs locally on your machine. **No internet, no upload.**
|
||||||
- Browser is just the display surface. Closing it stops the underlying program.
|
- The browser is just the display surface. Closing it does NOT stop the app — close the launcher window (or quit the macOS .app from the Dock) to fully exit.
|
||||||
- Prefer the terminal? Every tool ships with a CLI too (Section 3).
|
- Prefer the terminal? Every tool ships with a CLI too (Section 3).
|
||||||
|
|
||||||
### System requirements
|
### 1.6 System requirements
|
||||||
|
|
||||||
- Windows 10/11 (64-bit), macOS 11+, modern Linux (2020+).
|
- Windows 10/11 (64-bit), macOS 11+, modern Linux (2020+).
|
||||||
- Modern browser (Chrome, Edge, Firefox, Safari, last 3 years).
|
- Modern browser (Chrome, Edge, Firefox, Safari, last 3 years).
|
||||||
- ~400-500 MB free disk space.
|
- ~500 MB free disk space (the bundle itself is ~300 MB; the rest is working scratch space for large CSVs).
|
||||||
|
|
||||||
|
**OCR for scanned PDFs is bundled** — Tesseract 5.5 + the English `eng.traineddata` model ship inside every installer / portable / AppImage. The PDF Extractor's scanned-statement path works out of the box; no separate install required. (Developers running from a `pip install -r requirements.txt` checkout still need system Tesseract on `PATH` — see [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract).)
|
||||||
|
|
||||||
Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
||||||
|
|
||||||
@@ -34,15 +113,15 @@ Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
|||||||
|
|
||||||
| # | Tool | Purpose | Status |
|
| # | Tool | Purpose | Status |
|
||||||
|---|------|---------|--------|
|
|---|------|---------|--------|
|
||||||
| 01 | Deduplicator | Exact + fuzzy match, 5 normalizers, audit | Ready |
|
| 01 | Find Duplicates | Exact + fuzzy match, 5 normalizers, audit | Ready |
|
||||||
| 02 | Text Cleaner | Whitespace, smart chars, BOM, line endings, case ops | Ready |
|
| 02 | Clean Text | Whitespace, smart chars, BOM, line endings, case ops | Ready |
|
||||||
| 03 | Format Standardizer | Dates / phones / emails / addresses / names / currencies / booleans | Ready |
|
| 03 | Standardize Formats | Dates / phones / emails / addresses / names / currencies / booleans | Ready |
|
||||||
| 04 | Missing Value Handler | Disguised nulls, imputation, drop-by-threshold | Coming Soon |
|
| 04 | Fix Missing Values | Disguised nulls, imputation, drop-by-threshold | Coming Soon |
|
||||||
| 05 | Column Mapper | Rename + enforce schema | Coming Soon |
|
| 05 | Map Columns | Rename + enforce schema | Coming Soon |
|
||||||
| 06 | Outlier Detector | z-score, IQR, multivariate | Coming Soon |
|
| 06 | Find Unusual Values | z-score, IQR, multivariate | Coming Soon |
|
||||||
| 07 | Multi-File Merger | Combine multiple files | Coming Soon |
|
| 07 | Combine Files | Combine multiple files | Coming Soon |
|
||||||
| 08 | Validator & Reporter | Rules + PDF/Excel report | Coming Soon |
|
| 08 | Quality Check | Rules + PDF/Excel report | Coming Soon |
|
||||||
| 09 | Pipeline Runner | One-click multi-tool launcher | Coming Soon |
|
| 09 | Automated Workflows | One-click multi-tool launcher | Coming Soon |
|
||||||
|
|
||||||
**Sample data** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`.
|
**Sample data** (`samples/`): `messy_sales.csv`, `bank_export.xlsx`.
|
||||||
|
|
||||||
@@ -58,6 +137,10 @@ Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
|||||||
|
|
||||||
Advanced options are tucked in expander panes. The original file is never modified.
|
Advanced options are tucked in expander panes. The original file is never modified.
|
||||||
|
|
||||||
|
**In-tool Help**: every tool page has a **Help** button to the right of the title. Click it to open a popover with a compact how-to (When to use · Steps · Examples · Tip). Use it as a refresher mid-task — the popover closes when you click outside, and your inputs are untouched.
|
||||||
|
|
||||||
|
**Sidebar nav**: the sidebar groups tools into sections (Analysis, Data Cleaners, Transformations, Automations). Each section header shows `+` when collapsed and `−` when expanded — click the header to toggle.
|
||||||
|
|
||||||
### 3.2 CLI
|
### 3.2 CLI
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -70,17 +153,17 @@ Get help: `deduplicator --help`. Full reference: [CLI-REFERENCE.md](CLI-REFERENC
|
|||||||
|
|
||||||
### 3.3 Run order (when running tools manually)
|
### 3.3 Run order (when running tools manually)
|
||||||
|
|
||||||
If you skip the Pipeline Runner, follow this order:
|
If you skip Automated Workflows, follow this order:
|
||||||
|
|
||||||
1. **02 Text Cleaner** first — normalizes whitespace + special chars.
|
1. **02 Clean Text** first — normalizes whitespace + special chars.
|
||||||
2. **03 Format Standardizer** — dates, phones, etc. need cleaned text.
|
2. **03 Standardize Formats** — dates, phones, etc. need cleaned text.
|
||||||
3. **04 Missing Value Handler** — sentinel codes hide as numbers.
|
3. **04 Fix Missing Values** — sentinel codes hide as numbers.
|
||||||
4. **05 Column Mapper** — schema before outlier stats.
|
4. **05 Map Columns** — schema before outlier stats.
|
||||||
5. **06 Outlier Detector** — needs clean numerics. Stats on data with `NaN` or `-999` are mathematically poisoned.
|
5. **06 Find Unusual Values** — needs clean numerics. Stats on data with `NaN` or `-999` are mathematically poisoned.
|
||||||
6. **07 Multi-File Merger**, **08 Validator** as needed.
|
6. **07 Combine Files**, **08 Quality Check** as needed.
|
||||||
7. **01 Deduplicator** is order-flexible (normalizes internally for matching).
|
7. **01 Find Duplicates** is order-flexible (normalizes internally for matching).
|
||||||
|
|
||||||
The Pipeline Runner enforces this automatically.
|
Automated Workflows enforces this automatically.
|
||||||
|
|
||||||
### 3.4 Language
|
### 3.4 Language
|
||||||
|
|
||||||
@@ -118,12 +201,15 @@ Original input is never modified.
|
|||||||
|
|
||||||
## 6. Troubleshooting
|
## 6. Troubleshooting
|
||||||
|
|
||||||
- **GUI won't launch / browser doesn't open** — wait 10-15 s; manually visit `http://localhost:8501`. Port-in-use error → close other instances.
|
- **GUI won't launch / browser doesn't open** — wait 10-15 s; manually visit `http://127.0.0.1:8501` (or whichever port the launcher window prints). Port-in-use error → close other instances. The launcher walks ports 8501–8550 looking for a free one, so a stale instance can shift the URL.
|
||||||
- **Why does my browser open?** — local web app pattern (same as Jupyter, RStudio). Nothing leaves your machine.
|
- **Why does my browser open?** — local web app pattern (same as Jupyter, RStudio). Nothing leaves your machine.
|
||||||
- **Windows SmartScreen** — click "More info" → "Run anyway". Standard for non-EV-signed software.
|
- **Windows SmartScreen** — click "More info" → "Run anyway". One-time per build until we have an EV code-signing certificate.
|
||||||
- **macOS "App is damaged"** — re-download (file likely corrupted in transit).
|
- **macOS "App is damaged" / "developer cannot be verified"** — right-click the app → **Open** → confirm. If the message persists, the file was likely corrupted in transit — re-download. As a last resort: `xattr -cr /Applications/DataTools.app` clears the quarantine attribute.
|
||||||
- **Linux AppImage won't run** — `chmod +x file.AppImage`. Missing FUSE → `sudo apt install libfuse2` or use `.tar.gz`.
|
- **macOS portable .zip — extracted but won't open** — Safari unzips on download by default; if you see a `__MACOSX/` folder or a `._DataTools.app` file, you used a different unarchiver. Re-extract with the built-in Archive Utility (right-click the .zip → **Open With → Archive Utility**) so the .app's metadata is preserved.
|
||||||
|
- **Windows portable .zip — antivirus quarantines DataTools.exe** — your AV doesn't recognize the bundle. Allowlist the extracted folder. The installer .exe trips fewer AV products because it's a known Inno Setup wrapper.
|
||||||
|
- **Linux AppImage won't run** — `chmod +x file.AppImage`. Missing FUSE → `sudo apt install libfuse2`.
|
||||||
- **Slow on large file** — over ~100k rows takes longer; progress bar shows. Multi-million rows → use the CLI directly.
|
- **Slow on large file** — over ~100k rows takes longer; progress bar shows. Multi-million rows → use the CLI directly.
|
||||||
|
- **Where does the app store my license / settings?** — `~/.datatools/` on macOS + Linux, `C:\Users\<you>\.datatools\` on Windows. Your input/output files stay where you put them; the app never copies them anywhere else.
|
||||||
- **Need help** — email the address on your purchase receipt.
|
- **Need help** — email the address on your purchase receipt.
|
||||||
|
|
||||||
## 7. License
|
## 7. License
|
||||||
|
|||||||
@@ -251,12 +251,12 @@ row,column,field_type,old,new
|
|||||||
<div class="eyebrow">In the bundle</div>
|
<div class="eyebrow">In the bundle</div>
|
||||||
<h2>Six tools. One pipeline. One $49 download.</h2>
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<div class="card"><h3>1 · Deduplicator</h3><p>Fuzzy match (Jaro-Winkler), explicit strategies for Date+Amount+Vendor, survivor rules.</p></div>
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match (Jaro-Winkler), explicit strategies for Date+Amount+Vendor, survivor rules.</p></div>
|
||||||
<div class="card"><h3>2 · Text Cleaner</h3><p>Header whitespace, smart quotes from copy-paste, em-dash sentinels.</p></div>
|
<div class="card"><h3>2 · Clean Text</h3><p>Header whitespace, smart quotes from copy-paste, em-dash sentinels.</p></div>
|
||||||
<div class="card"><h3>3 · Format Standardizer</h3><p>ISO dates, numeric amounts (parens-negative), vendor casing, multi-currency.</p></div>
|
<div class="card"><h3>3 · Standardize Formats</h3><p>ISO dates, numeric amounts (parens-negative), vendor casing, multi-currency.</p></div>
|
||||||
<div class="card"><h3>4 · Missing Value Handler</h3><p>Disguised-null detection: <code>—</code>, <code>N/A</code>, <code>(blank)</code>, <code>?</code>.</p></div>
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Disguised-null detection: <code>—</code>, <code>N/A</code>, <code>(blank)</code>, <code>?</code>.</p></div>
|
||||||
<div class="card"><h3>5 · Column Mapper</h3><p>Project to your accounting tool's required schema, coerce types, drop extras.</p></div>
|
<div class="card"><h3>5 · Map Columns</h3><p>Project to your accounting tool's required schema, coerce types, drop extras.</p></div>
|
||||||
<div class="card"><h3>6 · Pipeline Runner</h3><p>Save the cleanup. Run it on next month's export with one command. Same audit, automated.</p></div>
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Save the cleanup. Run it on next month's export with one command. Same audit, automated.</p></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|||||||
@@ -168,9 +168,9 @@
|
|||||||
<h2>One engine. Same six tools. Same $49.</h2>
|
<h2>One engine. Same six tools. Same $49.</h2>
|
||||||
<p>
|
<p>
|
||||||
The persona pages above are positioning, not different products.
|
The persona pages above are positioning, not different products.
|
||||||
Whichever you buy, you get the full bundle: Deduplicator, Text
|
Whichever you buy, you get the full bundle: Find Duplicates, Clean
|
||||||
Cleaner, Format Standardizer, Missing-Value Handler, Column
|
Text, Standardize Formats, Fix Missing Values, Map Columns,
|
||||||
Mapper, and Pipeline Runner — pre-tuned with a saved pipeline
|
and Automated Workflows — pre-tuned with a saved pipeline
|
||||||
that matches your workflow.
|
that matches your workflow.
|
||||||
</p>
|
</p>
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
|
|||||||
@@ -165,7 +165,7 @@
|
|||||||
<div class="card">
|
<div class="card">
|
||||||
<span class="icon">🌍</span>
|
<span class="icon">🌍</span>
|
||||||
<h3>Multi-platform audience reconciliation</h3>
|
<h3>Multi-platform audience reconciliation</h3>
|
||||||
<p>Build one canonical audience from Meta, Google Ads, LinkedIn, and your CRM. Each platform exports a different shape; column-mapper aligns them all, dedup merges the survivors with their most-complete fields.</p>
|
<p>Build one canonical audience from Meta, Google Ads, LinkedIn, and your CRM. Each platform exports a different shape; Map Columns aligns them all, dedup merges the survivors with their most-complete fields.</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<span class="icon">🛡️</span>
|
<span class="icon">🛡️</span>
|
||||||
@@ -192,7 +192,7 @@
|
|||||||
<li><strong>Per-row country column</strong> drives the parser — no global default that buckets UK numbers as malformed US.</li>
|
<li><strong>Per-row country column</strong> drives the parser — no global default that buckets UK numbers as malformed US.</li>
|
||||||
<li><strong>Country-name normalization</strong>: <code>USA</code> / <code>US</code> / <code>United States</code> all resolve to the same ISO-2 code.</li>
|
<li><strong>Country-name normalization</strong>: <code>USA</code> / <code>US</code> / <code>United States</code> all resolve to the same ISO-2 code.</li>
|
||||||
<li><strong>50+ country support</strong> via Google's libphonenumber, including KR, CN, IN, MX, BR, IL, TR, PL, DK, SE.</li>
|
<li><strong>50+ country support</strong> via Google's libphonenumber, including KR, CN, IN, MX, BR, IL, TR, PL, DK, SE.</li>
|
||||||
<li><strong>Schema enforcement</strong> via the column-mapper: project to your CRM's required shape, coerce score columns to integers, reorder fields to match the import contract.</li>
|
<li><strong>Schema enforcement</strong> via Map Columns: project to your CRM's required shape, coerce score columns to integers, reorder fields to match the import contract.</li>
|
||||||
</ul>
|
</ul>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
@@ -249,12 +249,12 @@ Total elapsed: 6.7 s
|
|||||||
<div class="eyebrow">In the bundle</div>
|
<div class="eyebrow">In the bundle</div>
|
||||||
<h2>Six tools. One pipeline. One $49 download.</h2>
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<div class="card"><h3>1 · Deduplicator</h3><p>Fuzzy match across email + phone + name + company; merge survivors with most-complete fields.</p></div>
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match across email + phone + name + company; merge survivors with most-complete fields.</p></div>
|
||||||
<div class="card"><h3>2 · Text Cleaner</h3><p>Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.</p></div>
|
<div class="card"><h3>2 · Clean Text</h3><p>Smart quotes from copy-paste, NBSP from spreadsheet exports, BOM from Excel.</p></div>
|
||||||
<div class="card"><h3>3 · Format Standardizer</h3><p>E.164 phones with per-row country, canonical emails, name casing, ISO dates.</p></div>
|
<div class="card"><h3>3 · Standardize Formats</h3><p>E.164 phones with per-row country, canonical emails, name casing, ISO dates.</p></div>
|
||||||
<div class="card"><h3>4 · Missing Value Handler</h3><p>Detect <code>TBD</code>, <code>(unknown)</code>, <code>—</code> across vendor exports.</p></div>
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Detect <code>TBD</code>, <code>(unknown)</code>, <code>—</code> across vendor exports.</p></div>
|
||||||
<div class="card"><h3>5 · Column Mapper</h3><p>Project to your CRM's required schema, coerce score to integer, reorder for import.</p></div>
|
<div class="card"><h3>5 · Map Columns</h3><p>Project to your CRM's required schema, coerce score to integer, reorder for import.</p></div>
|
||||||
<div class="card"><h3>6 · Pipeline Runner</h3><p>Save the cleanup as JSON. Drop next campaign's combined export on it. Same dedup, automated.</p></div>
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Save the cleanup as JSON. Drop next campaign's combined export on it. Same dedup, automated.</p></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|||||||
@@ -178,7 +178,7 @@
|
|||||||
<div class="card">
|
<div class="card">
|
||||||
<span class="icon">🔗</span>
|
<span class="icon">🔗</span>
|
||||||
<h3>Multi-channel order consolidation</h3>
|
<h3>Multi-channel order consolidation</h3>
|
||||||
<p>Orders from Shopify + Etsy + a wholesale spreadsheet, each with a different column for "customer email." Column-mapper aligns them; dedup merges across channels.</p>
|
<p>Orders from Shopify + Etsy + a wholesale spreadsheet, each with a different column for "customer email." Map Columns aligns them; dedup merges across channels.</p>
|
||||||
</div>
|
</div>
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<span class="icon">⚙️</span>
|
<span class="icon">⚙️</span>
|
||||||
@@ -270,12 +270,12 @@ Total elapsed: 4.2 s
|
|||||||
<div class="eyebrow">In the bundle</div>
|
<div class="eyebrow">In the bundle</div>
|
||||||
<h2>Six tools. One pipeline. One $49 download.</h2>
|
<h2>Six tools. One pipeline. One $49 download.</h2>
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<div class="card"><h3>1 · Deduplicator</h3><p>Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, interactive review.</p></div>
|
<div class="card"><h3>1 · Find Duplicates</h3><p>Fuzzy match (Jaro-Winkler), 5 normalizers, survivor rules, interactive review.</p></div>
|
||||||
<div class="card"><h3>2 · Text Cleaner</h3><p>Whitespace, smart chars, NBSP, BOM, line endings, case ops.</p></div>
|
<div class="card"><h3>2 · Clean Text</h3><p>Whitespace, smart chars, NBSP, BOM, line endings, case ops.</p></div>
|
||||||
<div class="card"><h3>3 · Format Standardizer</h3><p>Dates, phones, emails, addresses, names, currencies, booleans.</p></div>
|
<div class="card"><h3>3 · Standardize Formats</h3><p>Dates, phones, emails, addresses, names, currencies, booleans.</p></div>
|
||||||
<div class="card"><h3>4 · Missing Value Handler</h3><p>Disguised-null detection, profile, mean/median/mode/ffill, drop strategies.</p></div>
|
<div class="card"><h3>4 · Fix Missing Values</h3><p>Disguised-null detection, profile, mean/median/mode/ffill, drop strategies.</p></div>
|
||||||
<div class="card"><h3>5 · Column Mapper</h3><p>Fuzzy auto-rename, target schema, type coercion, required-field defaults.</p></div>
|
<div class="card"><h3>5 · Map Columns</h3><p>Fuzzy auto-rename, target schema, type coercion, required-field defaults.</p></div>
|
||||||
<div class="card"><h3>6 · Pipeline Runner</h3><p>Chain tools in recommended order, save/load JSON, automate weekly cleanups.</p></div>
|
<div class="card"><h3>6 · Automated Workflows</h3><p>Chain tools in recommended order, save/load JSON, automate weekly cleanups.</p></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|||||||
187
layout-review/01_deduplicator.html
Normal file
187
layout-review/01_deduplicator.html
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Find Duplicates</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="01_deduplicator">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Find Duplicates</strong>, shown with a file imported and a completed run (results + match-group review). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Find Duplicates</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Find rows that repeat, then keep one and remove the extras.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding &amp; delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">customers_export.csv</span>
|
||||||
|
<span class="size">2.1 MB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Delimiter selector (CSV) -->
|
||||||
|
<div class="dt-field" style="max-width:320px">
|
||||||
|
<label class="dt-label">Delimiter</label>
|
||||||
|
<div class="dt-select">Comma (,)</div>
|
||||||
|
<div class="dt-help-text">Auto-detected on upload. Change if the preview looks wrong.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>jane doe</td><td>JANE@ACME.IO</td><td>austin</td><td>(512) 555-0190</td><td>01/04/2024</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R. Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Options expander -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<details class="dt-expander" style="margin-top:0">
|
||||||
|
<summary>Advanced Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Match on columns</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Leave empty to auto-detect</span></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Strong keys</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">email <span class="x">✕</span></span></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Fuzzy columns</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">name <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Fuzzy algorithm</label><div class="dt-select">jaro_winkler</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Similarity threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:70%"></div><div class="knob" style="left:70%"></div></div><div class="val">85</div></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Survivor rule</label><div class="dt-select">most-complete</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on" style="margin-top:6px"><span class="box"><span class="dt-mi">check</span></span> Merge mode — fill missing fields in the surviving row</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Find Duplicates</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Original rows</div><div class="value">18,442</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Duplicate rows</div><div class="value">312</div><div class="delta down">−312 removed</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Match groups</div><div class="value">147</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Rows kept</div><div class="value">18,130</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-btn-row" style="max-width:560px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download deduplicated CSV</button>
|
||||||
|
<button class="dt-btn">Download removed rows</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Match groups -->
|
||||||
|
<h2>Match Groups</h2>
|
||||||
|
<div class="dt-cols-3" style="max-width:520px">
|
||||||
|
<button class="dt-btn">Accept All</button>
|
||||||
|
<button class="dt-btn">Reject All</button>
|
||||||
|
<button class="dt-btn">Clear Decisions</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Match group card 1 -->
|
||||||
|
<div class="dt-match-card">
|
||||||
|
<div class="dt-match-head">
|
||||||
|
<span class="title">Group 1 · 2 rows</span>
|
||||||
|
<span class="conf"><span class="dt-count-pill success">98% match</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-match-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>keep</th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="dt-keep-row"><td><span class="dt-keep-tag">keep</span></td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td><span class="dt-caption">remove</span></td><td class="dt-cell-flag">jane doe</td><td class="dt-cell-flag">JANE@ACME.IO</td><td class="dt-cell-flag">austin</td><td>(512) 555-0190</td><td class="dt-cell-flag">01/04/2024</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Differing columns highlighted. The survivor row is kept; uncheck rows to split the group.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Match group card 2 -->
|
||||||
|
<div class="dt-match-card">
|
||||||
|
<div class="dt-match-head">
|
||||||
|
<span class="title">Group 2 · 2 rows</span>
|
||||||
|
<span class="conf"><span class="dt-count-pill warn">87% match</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-match-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>keep</th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="dt-keep-row"><td><span class="dt-keep-tag">keep</span></td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td><span class="dt-caption">remove</span></td><td class="dt-cell-flag">R. Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption" style="margin-top:14px">Decisions: 1 merged, 1 pending</p>
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block" style="margin-top:8px">Apply Review Decisions &amp; Download</button>
|
||||||
|
|
||||||
|
<!-- Processing log -->
|
||||||
|
<details class="dt-expander" style="margin-top:18px">
|
||||||
|
<summary>Processing Log</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-code">[00:00.01] Loaded 18,442 rows from customers_export.csv
|
||||||
|
[00:00.04] Strategy: exact(email) + fuzzy(name, jaro_winkler ≥ 85)
|
||||||
|
[00:00.91] Compared 18,442 rows → 147 match groups
|
||||||
|
[00:01.02] Survivor rule: most-complete · merge=on
|
||||||
|
[00:01.05] 312 rows flagged for removal</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
208
layout-review/02_text_cleaner.html
Normal file
208
layout-review/02_text_cleaner.html
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Clean Text</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
<style>
|
||||||
|
/* Hidden-character badges — mirrors src/core/text_clean.py:hidden_char_css(),
|
||||||
|
not part of app.css so reproduced inline against the same palette. */
|
||||||
|
.hidden-char { display: inline-block; padding: 0 2px; margin: 0 1px; border-radius: 3px; font-family: var(--font-mono); font-size: 0.85em; cursor: help; }
|
||||||
|
.hidden-char.hidden-whitespace { background: #fff3cd; color: #856404; border: 1px solid #ffeaa7; }
|
||||||
|
.hidden-char.hidden-special { background: #d1ecf1; color: #0c5460; border: 1px solid #bee5eb; }
|
||||||
|
.hidden-char.hidden-control { background: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body data-page="02_text_cleaner">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Clean Text</strong>, shown with a file imported and a completed run (results metrics, changes-by-column, before/after examples, cleaned preview, downloads). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Clean Text</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Trim extra spaces and strip out odd characters.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">contacts_messy.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: contacts_messy.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">4,120 rows, 4 columns</p>
|
||||||
|
<div class="dt-check on" style="margin-top:2px"><span class="box"><span class="dt-mi">check</span></span> Show hidden characters in preview</div>
|
||||||
|
<div class="dt-table-wrap" style="margin-top:8px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>company</th><th>notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td><span class="hidden-char hidden-whitespace" title="U+0020 SP LEAD">·</span>Jane Doe<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>jane@acme.io</td><td>Acme<span class="hidden-char hidden-whitespace" title="U+00A0 NBSP">·</span>Inc.</td><td>VIP<span class="hidden-char hidden-special" title="U+201D RIGHT DOUBLE QUOTE">”</span></td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com<span class="hidden-char hidden-special" title="U+200B ZWSP">∅</span></td><td>Globex</td><td>—<span class="hidden-char hidden-control" title="U+0007 CTRL">␣</span></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Ana López</td><td>ana@initech.com</td><td>Initech<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>follow up</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td><span class="hidden-char hidden-whitespace" title="U+0009 TAB">→</span>Wei Chen</td><td>WEI@umbrella.co</td><td>Umbrella</td><td>“key<span class="hidden-char hidden-special" title="U+2014 EM DASH">—</span>account”</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> excel-hygiene (recommended)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> minimal</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> paranoid</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">excel-hygiene: trim, collapse whitespace, fold smart quotes, strip invisible chars, normalize line endings, NFC.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Trim leading/trailing whitespace</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Collapse internal whitespace</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Normalize line endings (\r\n → \n)</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip control characters</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip BOM</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Fold smart characters (curly quotes, em-dash, NBSP)</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip zero-width / invisible characters</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Unicode NFC normalization</div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Unicode NFKC compat fold (lossy: ① → 1, ﬁ → fi)</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Scope</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to clean (default: all string columns)</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">name <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">email <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">company <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">notes <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to skip even if they look like text</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Choose columns to leave untouched</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Case conversion</h4>
|
||||||
|
<div class="dt-field" style="max-width:360px">
|
||||||
|
<label class="dt-label">Apply case conversion to selected columns</label>
|
||||||
|
<div class="dt-select">None</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Clean Text</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Cells scanned</div><div class="value">16,480</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells changed</div><div class="value">3,947</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% changed</div><div class="value">24.0%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Columns processed</div><div class="value">4</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)</div>
|
||||||
|
|
||||||
|
<h4>Changes by column</h4>
|
||||||
|
<div class="dt-table-wrap" style="max-width:360px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>cells_changed</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">company</td><td>1,604</td></tr>
|
||||||
|
<tr><td class="idx">name</td><td>1,210</td></tr>
|
||||||
|
<tr><td class="idx">notes</td><td>982</td></tr>
|
||||||
|
<tr><td class="idx">email</td><td>151</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Examples (first 25 changes)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Row</th><th>Column</th><th>Before</th><th>After</th><th>Ops applied</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>1</td><td>name</td><td><span class="hidden-char hidden-whitespace" title="U+0020 SP LEAD">·</span>Jane Doe<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>Jane Doe</td><td>trim</td></tr>
|
||||||
|
<tr><td>1</td><td>company</td><td>Acme<span class="hidden-char hidden-whitespace" title="U+00A0 NBSP">·</span>Inc.</td><td>Acme Inc.</td><td>fold_smart</td></tr>
|
||||||
|
<tr><td>1</td><td>notes</td><td>VIP<span class="hidden-char hidden-special" title="U+201D RIGHT DOUBLE QUOTE">”</span></td><td>VIP"</td><td>fold_smart</td></tr>
|
||||||
|
<tr><td>2</td><td>name</td><td>Bob<span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span><span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span>Smith</td><td>Bob Smith</td><td>collapse_ws</td></tr>
|
||||||
|
<tr><td>2</td><td>email</td><td>bob@globex.com<span class="hidden-char hidden-special" title="U+200B ZWSP">∅</span></td><td>bob@globex.com</td><td>strip_zero_width</td></tr>
|
||||||
|
<tr><td>2</td><td>notes</td><td>—<span class="hidden-char hidden-control" title="U+0007 CTRL">␣</span></td><td>—</td><td>strip_control</td></tr>
|
||||||
|
<tr><td>3</td><td>company</td><td>Initech<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>Initech</td><td>trim</td></tr>
|
||||||
|
<tr><td>4</td><td>name</td><td><span class="hidden-char hidden-whitespace" title="U+0009 TAB">→</span>Wei Chen</td><td>Wei Chen</td><td>trim</td></tr>
|
||||||
|
<tr><td>4</td><td>notes</td><td>“key<span class="hidden-char hidden-special" title="U+2014 EM DASH">—</span>account”</td><td>"key-account"</td><td>fold_smart, nfc</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Cleaned preview (first 10 rows)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>company</th><th>notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td class="dt-cell-add">Jane Doe</td><td>jane@acme.io</td><td class="dt-cell-add">Acme Inc.</td><td class="dt-cell-add">VIP"</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td class="dt-cell-add">Bob Smith</td><td class="dt-cell-add">bob@globex.com</td><td>Globex</td><td class="dt-cell-add">—</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Ana López</td><td>ana@initech.com</td><td class="dt-cell-add">Initech</td><td>follow up</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td class="dt-cell-add">Wei Chen</td><td>WEI@umbrella.co</td><td>Umbrella</td><td class="dt-cell-add">"key-account"</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Changed cells highlighted. Toggle “Show hidden characters” to inspect the invisibles being removed.</p>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download cleaned CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
224
layout-review/03_format_standardizer.html
Normal file
224
layout-review/03_format_standardizer.html
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Standardize Formats</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="03_format_standardizer">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Standardize Formats</strong>, shown with a file imported from the upload screen and a completed run (results + changes audit + standardized preview). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Standardize Formats</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Make dates, phones, currency, and names look the same throughout.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- File pickup banner (using file from upload screen) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">description</span>
|
||||||
|
<span>Using <strong>customers_export.csv</strong> from the upload screen.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn" style="margin-bottom:4px">Use a different file</button>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>phone</th><th>amount</th><th>signup_date</th><th>active</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>jane DOE</td><td>(512) 555-0190</td><td>$1,234.5</td><td>01/04/2024</td><td>Y</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>bob smith</td><td>720.555.7781</td><td>$99</td><td>2024-2-11</td><td>yes</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>ALICIA REYES</td><td>+1 415 555 2233</td><td>$45,000</td><td>Mar 3, 2024</td><td>n</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>m. okafor</td><td>2125550148</td><td>$7.999</td><td>2024/04/22</td><td>true</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (collapsed after run; opened here to show the most informative content) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<h3 style="margin-top:0">Column types</h3>
|
||||||
|
<p class="dt-caption">Assign each column to a field type. Auto-detected suggestions are pre-filled; pick <strong>(skip)</strong> to leave a column untouched.</p>
|
||||||
|
|
||||||
|
<!-- Per-column type selectboxes, 3 per row -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">full_name</label><div class="dt-select">Name</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">phone</label><div class="dt-select">Phone</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">amount</label><div class="dt-select">Currency</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">signup_date</label><div class="dt-select">Date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">active</label><div class="dt-select">Boolean</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">notes</label><div class="dt-select">(skip)</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<h3>Format options</h3>
|
||||||
|
|
||||||
|
<!-- Standards preset radio (vertical) -->
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Standards preset</label>
|
||||||
|
<div style="display:flex;flex-direction:column;gap:8px;margin-top:4px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> US (default) — ISO 8601 dates · E.164 phones · USD</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> European — DMY input · INTL phones · EUR comma decimal</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> UK — DD/MM/YYYY · GB phones · Yes/No booleans</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> ISO Strict — ISO 8601 · bare-number currency · true/false</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Legacy US — MM/DD/YYYY · National phones · Yes/No</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Custom — keep current settings</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">Pick a published standard or regional convention as the baseline. Every option below is still individually overridable.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Two-column format options -->
|
||||||
|
<div class="dt-cols-2" style="margin-top:14px">
|
||||||
|
<!-- Left column: Dates + Phones -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0"><strong>Dates</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output format</label><div class="dt-select">YYYY-MM-DD (ISO)</div></div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Ambiguous input order (e.g. 01/02/2024)</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> MDY (US)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> DMY (EU)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4><strong>Phones</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output format</label><div class="dt-select">E.164 (+15551234567)</div></div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Default region (ISO-2)</label>
|
||||||
|
<div class="dt-input">US</div>
|
||||||
|
<div class="dt-help-text">Region used when the input has no country code. US, GB, DE, etc.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Right column: Currency + Names + Booleans -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0"><strong>Currency</strong></h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Decimal separator in input</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> dot (1,234.56)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> comma (1.234,56)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field" style="max-width:200px"><label class="dt-label">Round to decimals</label><div class="dt-input">2</div></div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Preserve original precision (don't round)</div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Preserve currency code (emit <code>USD 1234.56</code>, <code>EUR 99.00</code>, etc.)</div>
|
||||||
|
|
||||||
|
<h4><strong>Names</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Casing</label><div class="dt-select">Title Case</div></div>
|
||||||
|
|
||||||
|
<h4><strong>Booleans</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output style</label><div class="dt-select">True/False</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Standardize Formats</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Cells scanned</div><div class="value">92,210</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells changed</div><div class="value">61,838</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% changed</div><div class="value">67.1%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unparseable</div><div class="value">47</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>47 cell(s) in typed columns didn't match a recognizable shape and were left as-is. Check the changes audit below to find them, or re-classify the column to <strong>(skip)</strong>.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Changes by column -->
|
||||||
|
<p style="margin-bottom:6px"><strong>Changes by column</strong></p>
|
||||||
|
<div class="dt-table-wrap" style="max-width:520px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>field_type</th><th>cells_changed</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>amount</td><td>currency</td><td>17,902</td></tr>
|
||||||
|
<tr><td>full_name</td><td>name</td><td>16,041</td></tr>
|
||||||
|
<tr><td>phone</td><td>phone</td><td>14,388</td></tr>
|
||||||
|
<tr><td>signup_date</td><td>date</td><td>11,205</td></tr>
|
||||||
|
<tr><td>active</td><td>boolean</td><td>2,302</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Examples (first 25 changes) -->
|
||||||
|
<p style="margin:14px 0 6px"><strong>Examples (first 25 changes)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>row</th><th>column</th><th>field_type</th><th>before</th><th>after</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>1</td><td>full_name</td><td>name</td><td class="dt-cell-del">jane DOE</td><td class="dt-cell-add">Jane Doe</td></tr>
|
||||||
|
<tr><td>1</td><td>phone</td><td>phone</td><td class="dt-cell-del">(512) 555-0190</td><td class="dt-cell-add">+15125550190</td></tr>
|
||||||
|
<tr><td>1</td><td>amount</td><td>currency</td><td class="dt-cell-del">$1,234.5</td><td class="dt-cell-add">1234.50</td></tr>
|
||||||
|
<tr><td>1</td><td>signup_date</td><td>date</td><td class="dt-cell-del">01/04/2024</td><td class="dt-cell-add">2024-01-04</td></tr>
|
||||||
|
<tr><td>1</td><td>active</td><td>boolean</td><td class="dt-cell-del">Y</td><td class="dt-cell-add">True</td></tr>
|
||||||
|
<tr><td>2</td><td>full_name</td><td>name</td><td class="dt-cell-del">bob smith</td><td class="dt-cell-add">Bob Smith</td></tr>
|
||||||
|
<tr><td>2</td><td>phone</td><td>phone</td><td class="dt-cell-del">720.555.7781</td><td class="dt-cell-add">+17205557781</td></tr>
|
||||||
|
<tr><td>2</td><td>signup_date</td><td>date</td><td class="dt-cell-del">2024-2-11</td><td class="dt-cell-add">2024-02-11</td></tr>
|
||||||
|
<tr><td>3</td><td>signup_date</td><td>date</td><td class="dt-cell-del">Mar 3, 2024</td><td class="dt-cell-add">2024-03-03</td></tr>
|
||||||
|
<tr><td>4</td><td>amount</td><td>currency</td><td class="dt-cell-del">$7.999</td><td class="dt-cell-add">8.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Standardized preview -->
|
||||||
|
<p style="margin:14px 0 6px"><strong>Standardized preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>phone</th><th>amount</th><th>signup_date</th><th>active</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>+15125550190</td><td>1234.50</td><td>2024-01-04</td><td>True</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>+17205557781</td><td>99.00</td><td>2024-02-11</td><td>True</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Alicia Reyes</td><td>+14155552233</td><td>45000.00</td><td>2024-03-03</td><td>False</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>M. Okafor</td><td>+12125550148</td><td>8.00</td><td>2024-04-22</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download standardized CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
271
layout-review/04_missing_handler.html
Normal file
271
layout-review/04_missing_handler.html
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Fix Missing Values</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="04_missing_handler">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Fix Missing Values</strong>, shown with a file imported and a completed run (per-column missingness profile + before/after results). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Fix Missing Values</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Find blank cells (even hidden ones) and fill them in or remove them.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<p class="dt-caption">Tip: files imported on the Home screen are picked up here automatically.</p>
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">survey_responses.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: survey_responses.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">2,150 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>respondent_id</th><th>age</th><th>region</th><th>income</th><th>satisfaction</th><th>comments</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>R-1001</td><td>34</td><td>West</td><td>52000</td><td>4</td><td>great service</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>R-1002</td><td class="dt-cell-flag">N/A</td><td>East</td><td class="dt-cell-flag"></td><td>3</td><td class="dt-cell-flag">?</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>R-1003</td><td>41</td><td class="dt-cell-flag">-</td><td>61000</td><td class="dt-cell-flag">NULL</td><td>none</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R-1004</td><td>29</td><td>South</td><td class="dt-cell-flag">N/A</td><td>5</td><td>quick</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (Missingness profile + Strategy) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<h3>Missingness profile</h3>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Rows</div><div class="value">2,150</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells missing</div><div class="value">1,043</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% cells missing</div><div class="value">8.1%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Complete rows</div><div class="value">1,388</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>dtype</th><th>missing</th><th>missing_pct</th><th>disguised</th><th>has_missing</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>respondent_id</td><td>object</td><td>0</td><td>0.0%</td><td>0</td><td>False</td></tr>
|
||||||
|
<tr><td>age</td><td>float64</td><td>187</td><td>8.7%</td><td>61</td><td>True</td></tr>
|
||||||
|
<tr><td>region</td><td>object</td><td>142</td><td>6.6%</td><td>142</td><td>True</td></tr>
|
||||||
|
<tr><td>income</td><td>float64</td><td>329</td><td>15.3%</td><td>118</td><td>True</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td>float64</td><td>95</td><td>4.4%</td><td>40</td><td>True</td></tr>
|
||||||
|
<tr><td>comments</td><td>object</td><td>290</td><td>13.5%</td><td>290</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<h3>Strategy</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column;gap:10px">
|
||||||
|
<span class="dt-radio"><span class="dot"></span> detect-only (standardize sentinels to NaN, no fill or drop)</span>
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> safe-fill (numeric → median, categorical → mode)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> drop-incomplete (drop any row with missing)</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. safe-fill: also fill — numeric columns with median, others with mode. drop-incomplete: also drop every row that has any missing cell.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Advanced options expander (open — most informative) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<h4>Detection</h4>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Standardize disguised nulls to NaN</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Sentinel values (comma-separated)</label>
|
||||||
|
<div class="dt-input">N/A, n/a, NA, NULL, null, None, -, --, ?, #N/A</div>
|
||||||
|
<div class="dt-help-text">Matched case-insensitively after stripping whitespace.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<h4>Strategy override</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Global strategy</label>
|
||||||
|
<div class="dt-select">(use preset)</div>
|
||||||
|
<div class="dt-help-text">drop_row / drop_col use the thresholds below. mean / median / interpolate are numeric only — non-numeric columns fall back to the categorical strategy.</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Categorical fallback (for non-numeric columns)</label>
|
||||||
|
<div class="dt-select">mode</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Drop thresholds</h4>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Row drop threshold (drop rows with ≥ this fraction missing across selected cols)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:100%"></div><div class="knob" style="left:100%"></div></div><div class="val">1.00</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Column drop threshold (drop columns with ≥ this fraction missing)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:100%"></div><div class="knob" style="left:100%"></div></div><div class="val">1.00</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Scope</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to handle (default: all)</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">respondent_id <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">age <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">region <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">income <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">satisfaction <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">comments <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to skip</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Choose columns</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Per-column strategy overrides (optional)</h4>
|
||||||
|
<p class="dt-caption">Set a different strategy for specific columns. Leave any row blank to use the global strategy.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Column</th><th>Override</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>age</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">median</span></td></tr>
|
||||||
|
<tr><td>region</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">mode</span></td></tr>
|
||||||
|
<tr><td>income</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px"></span></td></tr>
|
||||||
|
<tr><td>satisfaction</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px"></span></td></tr>
|
||||||
|
<tr><td>comments</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">constant</span></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Handle Missing Values</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<div id="missing-results-anchor"></div>
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Sentinels → NaN</div><div class="value">651</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells filled</div><div class="value">1,043</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Rows dropped</div><div class="value">0</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Columns dropped</div><div class="value">0</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Missingness — before vs. after</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>before_missing</th><th>before_pct</th><th>after_missing</th><th>after_pct</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>respondent_id</td><td>0</td><td>0.0</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>age</td><td class="dt-cell-flag">187</td><td>8.7</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>region</td><td class="dt-cell-flag">142</td><td>6.6</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>income</td><td class="dt-cell-flag">329</td><td>15.3</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td class="dt-cell-flag">95</td><td>4.4</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>comments</td><td class="dt-cell-flag">290</td><td>13.5</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Strategy applied per column</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>strategy</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>age</td><td>median</td></tr>
|
||||||
|
<tr><td>region</td><td>mode</td></tr>
|
||||||
|
<tr><td>income</td><td>median</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td>median</td></tr>
|
||||||
|
<tr><td>comments</td><td>constant</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Audit (first 50 changes)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>row</th><th>column</th><th>old_value</th><th>new_value</th><th>reason</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2</td><td>age</td><td class="dt-cell-flag">N/A</td><td class="dt-cell-add">37.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>2</td><td>income</td><td class="dt-cell-flag">(blank)</td><td class="dt-cell-add">54000.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>2</td><td>comments</td><td class="dt-cell-flag">?</td><td class="dt-cell-add">(no comment)</td><td>fill: constant</td></tr>
|
||||||
|
<tr><td>3</td><td>region</td><td class="dt-cell-flag">-</td><td class="dt-cell-add">West</td><td>fill: mode</td></tr>
|
||||||
|
<tr><td>3</td><td>satisfaction</td><td class="dt-cell-flag">NULL</td><td class="dt-cell-add">4.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>4</td><td>income</td><td class="dt-cell-flag">N/A</td><td class="dt-cell-add">54000.0</td><td>fill: median</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">… and 1,037 more (download the full audit below).</p>
|
||||||
|
|
||||||
|
<p><strong>Handled preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>respondent_id</th><th>age</th><th>region</th><th>income</th><th>satisfaction</th><th>comments</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>R-1001</td><td>34.0</td><td>West</td><td>52000.0</td><td>4.0</td><td>great service</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>R-1002</td><td class="dt-cell-add">37.0</td><td>East</td><td class="dt-cell-add">54000.0</td><td>3.0</td><td class="dt-cell-add">(no comment)</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>R-1003</td><td>41.0</td><td class="dt-cell-add">West</td><td>61000.0</td><td class="dt-cell-add">4.0</td><td>none</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R-1004</td><td>29.0</td><td>South</td><td class="dt-cell-add">54000.0</td><td>5.0</td><td>quick</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (html_download_button anchors) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download handled CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
222
layout-review/05_column_mapper.html
Normal file
222
layout-review/05_column_mapper.html
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Map Columns</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="05_column_mapper">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Map Columns</strong>, shown with a file imported, an interactive target schema + mapping configured, and a completed run (results + mapped preview). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Map Columns</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Rename columns, change their order, and set each one as text, number, or date.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<p class="dt-caption">You can also import a file on the home screen and pick it up here.</p>
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding & delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">crm_contacts_raw.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: crm_contacts_raw.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">4,210 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>Full Name</th><th>EmailAddr</th><th>Phone #</th><th>Signup</th><th>Amount Spent</th><th>Notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>512-555-0190</td><td>01/04/2024</td><td>$1,204.50</td><td>VIP</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>720-555-7781</td><td>02/11/2024</td><td>$88.00</td><td></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.net</td><td>415-555-3322</td><td>03/02/2024</td><td>$612.10</td><td>renewal</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dev Patel</td><td>dev@umbrella.co</td><td>206-555-9043</td><td>03/19/2024</td><td>$0.00</td><td></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (open — heart of the tool) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<!-- ===== Target schema ===== -->
|
||||||
|
<h3 style="margin-top:0">Target schema</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">How would you like to define the target schema?</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column; gap:8px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> Build interactively (start from current columns)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Import schema JSON</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Skip (rename / coerce only — no schema)</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">An interactive build is fastest for one-off cleanup. Import a JSON when you have a fixed contract (a CRM import format, db schema). Skip when you only want to rename or coerce specific columns.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption">Edit the table to define your target schema. Add rows for fields the input doesn't have yet (with a default), or remove rows for columns you want to drop.</p>
|
||||||
|
|
||||||
|
<!-- Schema editor (st.data_editor, num_rows=dynamic) -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Target name</th><th>Type</th><th>Required</th><th>Default (for added cols)</th><th>Aliases (comma-sep, helps fuzzy-match)</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>full_name</td><td>string</td><td>✗</td><td></td><td>Full Name, name</td></tr>
|
||||||
|
<tr><td>email</td><td>string</td><td>✓</td><td></td><td>EmailAddr, email_address</td></tr>
|
||||||
|
<tr><td>phone</td><td>string</td><td>✗</td><td></td><td>Phone #, tel</td></tr>
|
||||||
|
<tr><td>signup_date</td><td>date</td><td>✗</td><td></td><td>Signup</td></tr>
|
||||||
|
<tr><td>amount_spent</td><td>float</td><td>✗</td><td>0.0</td><td>Amount Spent</td></tr>
|
||||||
|
<tr><td>source</td><td>string</td><td>✗</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx" style="color:var(--ink-tertiary)"><span class="dt-mi" style="font-size:16px;vertical-align:-3px">add</span> add row</td><td></td><td></td><td></td><td></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">6 target fields · 1 added field (<code>source</code>) not present in the input.</p>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- ===== Strategy ===== -->
|
||||||
|
<h3>Strategy</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column; gap:8px">
|
||||||
|
<span class="dt-radio"><span class="dot"></span> rename-only (just rename, leave types alone, keep extras)</span>
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> lenient-schema (rename + coerce + reorder, keep extras)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> strict-schema (rename + coerce + reorder, drop extras)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Advanced options expander -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Unmapped source columns</label>
|
||||||
|
<div class="dt-select">keep</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Coerce types per schema</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Reorder to schema order</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Auto-infer mapping (fuzzy match)</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Fuzzy match threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:80%"></div><div class="knob" style="left:80%"></div></div><div class="val">0.80</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Enforce required fields</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- ===== Mapping ===== -->
|
||||||
|
<h3>Mapping</h3>
|
||||||
|
<!-- schema is set → source→target selectbox editor with auto-suggested flag -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Source</th><th>Target</th><th>Auto-suggested</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Full Name</td><td>full_name</td><td>✓</td></tr>
|
||||||
|
<tr><td>EmailAddr</td><td>email</td><td>✓</td></tr>
|
||||||
|
<tr><td>Phone #</td><td>phone</td><td>✓</td></tr>
|
||||||
|
<tr><td>Signup</td><td>signup_date</td><td>✓</td></tr>
|
||||||
|
<tr><td>Amount Spent</td><td>amount_spent</td><td>✓</td></tr>
|
||||||
|
<tr><td>Notes</td><td>(unmapped)</td><td>✗</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Pick a target for each source column. <code>Notes</code> stays unmapped — with the lenient preset it is kept as-is. <code>source</code> is added from the schema default.</p>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Apply Column Mapping</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- ===== Results ===== -->
|
||||||
|
<div id="colmap-results-anchor" style="height:1px"></div>
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Renamed</div><div class="value">5</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Dropped</div><div class="value">0</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Added</div><div class="value">1</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Coerce fails</div><div class="value">3</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-alert info"><span class="dt-mi">info</span><span>Added (with defaults): <code>source</code></span></div>
|
||||||
|
<div class="dt-alert warn"><span class="dt-mi">warning</span><span>Some cells could not be coerced and were left as NaN: amount_spent (3)</span></div>
|
||||||
|
|
||||||
|
<p><strong>Resolved mapping</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>source</th><th>target</th><th>auto</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Full Name</td><td>full_name</td><td>True</td></tr>
|
||||||
|
<tr><td>EmailAddr</td><td>email</td><td>True</td></tr>
|
||||||
|
<tr><td>Phone #</td><td>phone</td><td>True</td></tr>
|
||||||
|
<tr><td>Signup</td><td>signup_date</td><td>True</td></tr>
|
||||||
|
<tr><td>Amount Spent</td><td>amount_spent</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Mapped preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>email</th><th>phone</th><th>signup_date</th><th>amount_spent</th><th class="dt-cell-add">source</th><th>Notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>512-555-0190</td><td>2024-01-04</td><td>1204.5</td><td>crm-import</td><td>VIP</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>720-555-7781</td><td>2024-02-11</td><td>88.0</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.net</td><td>415-555-3322</td><td>2024-03-02</td><td>612.1</td><td>crm-import</td><td>renewal</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dev Patel</td><td>dev@umbrella.co</td><td>206-555-9043</td><td>2024-03-19</td><td>0.0</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx">4</td><td>Mei Lin</td><td>mei@hooli.com</td><td>503-555-1188</td><td>2024-04-07</td><td class="dt-cell-flag">NaN</td><td>crm-import</td><td>trial</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download mapped CSV</button>
|
||||||
|
<button class="dt-btn">Download mapping audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
91
layout-review/06_outlier_detector.html
Normal file
91
layout-review/06_outlier_detector.html
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Find Unusual Values</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="06_outlier_detector">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Find Unusual Values</strong> — a <strong>Coming Soon</strong> tool. The page is a stub/teaser: an "under development" notice, a list of planned features, and disabled placeholder controls (only the file uploader is live). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Find Unusual Values</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Spot values that look wrong — way too high, too low, or breaking your rules.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- st.info: under development -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Planned features (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li>Z-score detection (configurable threshold)</li>
|
||||||
|
<li>IQR (interquartile range) detection</li>
|
||||||
|
<li>MAD (median absolute deviation) detection</li>
|
||||||
|
<li>Domain-rule violations (e.g., age &lt; 0, price &gt; $1M)</li>
|
||||||
|
<li>Visual outlier highlighting in data preview</li>
|
||||||
|
<li>Handling: flag only, remove, cap/winsorize to bounds</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- File upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS · Import a file to preview. Processing is not yet available.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options (all disabled) -->
|
||||||
|
<h3>Detection Method</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px">
|
||||||
|
<label class="dt-label">Method</label>
|
||||||
|
<div class="dt-select" style="opacity:.55;cursor:not-allowed">Z-Score</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px;opacity:.55">
|
||||||
|
<label class="dt-label">Z-Score threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:50%"></div><div class="knob" style="left:50%"></div></div><div class="val">3.0</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px;opacity:.55">
|
||||||
|
<label class="dt-label">IQR multiplier</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:25%"></div><div class="knob" style="left:25%"></div></div><div class="val">1.5</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Handling</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px">
|
||||||
|
<label class="dt-label">Action</label>
|
||||||
|
<div class="dt-select" style="opacity:.55;cursor:not-allowed">Flag only (add column)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Detect Outliers</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
83
layout-review/07_multi_file_merger.html
Normal file
83
layout-review/07_multi_file_merger.html
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Combine Files</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="07_multi_file_merger">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Combine Files</strong> — a Coming-Soon tool. The page is a stub: an "under development" notice, a planned-features list, a working multi-file uploader, and disabled placeholder options. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Combine Files</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Combine several CSV or Excel files into one — even if columns differ.</p>
|
||||||
|
|
||||||
|
<!-- Under-development notice (st.info) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Planned features (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul style="font-size:14px;line-height:1.55;color:var(--ink);margin:0 0 0.6rem;padding-left:22px">
|
||||||
|
<li>Import multiple CSV/Excel files at once</li>
|
||||||
|
<li>Automatic schema alignment (matching columns by name)</li>
|
||||||
|
<li>Append mode: stack files vertically (union)</li>
|
||||||
|
<li>Join mode: merge files on shared key columns</li>
|
||||||
|
<li>Handle mismatched columns (fill missing with nulls or drop)</li>
|
||||||
|
<li>Source file tracking column</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Multi-file upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel files</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop files here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS · multiple files allowed</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">Import multiple files to preview. Processing is not yet available.</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options (all disabled) -->
|
||||||
|
<h3>Merge Strategy</h3>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Mode</label>
|
||||||
|
<div class="dt-select" style="color:var(--ink-tertiary);background-color:var(--surface-hover)">Append (stack vertically)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Mismatched columns</label>
|
||||||
|
<div class="dt-select" style="color:var(--ink-tertiary);background-color:var(--surface-hover)">Fill with null</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-check on" style="opacity:0.6">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span> Add source filename column
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Merge Files</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
93
layout-review/08_validator_reporter.html
Normal file
93
layout-review/08_validator_reporter.html
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Quality Check</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="08_validator_reporter">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Quality Check</strong>, a Coming-Soon tool. The page is a stub: an "under development" notice, a feature list, a working file uploader, and disabled placeholder controls. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Quality Check</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Check your file against rules you set, and export a PDF or Excel report.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Under-development notice (st.info) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Features list (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li>Column-level validation rules (not null, unique, regex pattern, range, enum)</li>
|
||||||
|
<li>Cross-column validation (e.g., start_date &lt; end_date)</li>
|
||||||
|
<li>Data quality score per column and overall</li>
|
||||||
|
<li>Generate PDF quality report</li>
|
||||||
|
<li>Generate Excel report with flagged rows highlighted</li>
|
||||||
|
<li>Summary dashboard: pass/fail counts, severity breakdown</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- File upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Import a file to preview. Processing is not yet available.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options -->
|
||||||
|
<h3>Validation Rules</h3>
|
||||||
|
|
||||||
|
<label class="dt-label">Load rules file (JSON)</label>
|
||||||
|
<div class="dt-uploader" style="opacity:0.55">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">JSON</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn is-disabled" disabled>Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Quick checks</label>
|
||||||
|
<div class="dt-multiselect" style="opacity:0.55">
|
||||||
|
<span class="dt-ms-placeholder">Choose options</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Report Format</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:320px">
|
||||||
|
<label class="dt-label">Output format</label>
|
||||||
|
<div class="dt-select" style="opacity:0.55">Excel (flagged rows)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Validate & Generate Report</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
231
layout-review/09_pipeline_runner.html
Normal file
231
layout-review/09_pipeline_runner.html
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Automated Workflows</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="09_pipeline_runner">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Automated Workflows</strong> (Pipeline Runner), shown with a file imported, a four-step pipeline configured, and a completed run (results + per-step summary). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Automated Workflows</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Run several tools in a row — save the steps once, reuse them anytime.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding & delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">customers_export.csv</span>
|
||||||
|
<span class="size">2.1 MB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 5 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td> Jane Doe </td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>jane doe</td><td>JANE@ACME.IO</td><td>austin</td><td>(512) 555-0190</td><td>01/04/2024</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720.555.7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R. Smith</td><td>bob@globex.com</td><td>—</td><td>720-555-7781</td><td>Feb 11 2024</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options: pipeline builder (collapsed once a result exists; opened here to show structure) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<!-- Mode radio -->
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">How would you like to define the pipeline?</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column;gap:9px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> Use the recommended default (text-clean → format → missing → dedup)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Build interactively</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Import a saved pipeline JSON</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption" style="margin:10px 0">
|
||||||
|
Edit the table to add, remove, reorder (drag the row index), enable, or configure each step.
|
||||||
|
Tool order is recommended, not enforced — violations surface as warnings below the table.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<!-- Pipeline editor (st.data_editor: Tool selectbox · Enabled checkbox · Options JSON) -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="idx"></th>
|
||||||
|
<th>Tool</th>
|
||||||
|
<th>Enabled</th>
|
||||||
|
<th>Options (JSON)</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 0</td>
|
||||||
|
<td>text_clean <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"trim": true, "collapse_whitespace": true}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 1</td>
|
||||||
|
<td>format_standardize <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"column_types": {"phone": "phone", "signup_date": "date"}}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 2</td>
|
||||||
|
<td>missing <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"strategy": "flag", "sentinels": ["N/A", "—"]}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 3</td>
|
||||||
|
<td>dedup <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"survivor_rule": "most_complete", "merge": true}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx" style="color:var(--ink-tertiary)">+</td>
|
||||||
|
<td colspan="3" style="color:var(--ink-tertiary);font-family:var(--font-sans)">Add row</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Validation: pipeline is in recommended order, so no warning shown (warning block omitted) -->
|
||||||
|
|
||||||
|
<!-- Nested explainer expander -->
|
||||||
|
<details class="dt-expander" open style="margin-top:14px">
|
||||||
|
<summary>Recommended tool order — why each step belongs where it does</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p><strong>text_clean</strong> before <strong>format_standardize</strong> — format parsers (phone / currency / date) fail on smart-quote-contaminated or NBSP-padded input — clean text first</p>
|
||||||
|
<p><strong>text_clean</strong> before <strong>missing</strong> — sentinel detection misses cells padded with NBSP / zero-width characters — clean text first</p>
|
||||||
|
<p><strong>text_clean</strong> before <strong>dedup</strong> — fuzzy matching treats NBSP-padded values as different — clean text first</p>
|
||||||
|
<p><strong>format_standardize</strong> before <strong>missing</strong> — numeric imputation needs numeric dtypes; canonical phones / currencies improve sentinel detection</p>
|
||||||
|
<p><strong>format_standardize</strong> before <strong>dedup</strong> — canonical phones / lowercase emails enable cross-format duplicate matching</p>
|
||||||
|
<p style="margin-bottom:0"><strong>missing</strong> before <strong>dedup</strong> — deduping rows with mixed NaN sentinels produces brittle merges — resolve missing values first</p>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Run -->
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Run Pipeline</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Initial rows</div><div class="value">18,442</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Final rows</div><div class="value">18,130</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Steps run</div><div class="value">4</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Elapsed</div><div class="value">1.84 s</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Per-step summary</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr><th>step</th><th>status</th><th>elapsed_ms</th><th>summary</th><th>error</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>text_clean</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>214</td>
|
||||||
|
<td>{"cells_changed": 1204, "columns": ["name", "city"]}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>format_standardize</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>388</td>
|
||||||
|
<td>{"phone": 18301, "signup_date": 17996}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>missing</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>121</td>
|
||||||
|
<td>{"flagged_cells": 642, "sentinels_found": ["—"]}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>dedup</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>911</td>
|
||||||
|
<td>{"input_rows": 18442, "output_rows": 18130, "duplicates_removed": 312, "groups": 147}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Output preview (first 5 rows)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td class="dt-cell-add">+1 512-555-0190</td><td class="dt-cell-add">2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td class="dt-cell-add">+1 720-555-7781</td><td class="dt-cell-add">2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.co</td><td>Phoenix</td><td class="dt-cell-add">+1 480-555-3320</td><td class="dt-cell-add">2024-03-02</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dan Okafor</td><td>dan@umbrella.net</td><td><span class="dt-cell-flag">⚑ missing</span></td><td class="dt-cell-add">+1 206-555-7745</td><td class="dt-cell-add">2024-03-18</td></tr>
|
||||||
|
<tr><td class="idx">4</td><td>Emily Tran</td><td>emily@hooli.com</td><td>Seattle</td><td class="dt-cell-add">+1 206-555-1182</td><td class="dt-cell-add">2024-04-05</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary"><span class="dt-mi">download</span> Download cleaned CSV</button>
|
||||||
|
<button class="dt-btn"><span class="dt-mi">download</span> Download pipeline JSON</button>
|
||||||
|
<button class="dt-btn"><span class="dt-mi">download</span> Download run audit</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
189
layout-review/10_pdf_extractor.html
Normal file
189
layout-review/10_pdf_extractor.html
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — PDF to CSV</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="10_pdf_extractor">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>PDF to CSV</strong>, shown with two bank-statement PDFs imported and a completed scan (candidate transactions in the editable preview table). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>PDF to CSV</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Pull transactions out of bank-statement PDFs into a clean CSV file.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Scan options expander (collapsed by default) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Scan options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-check on">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span>
|
||||||
|
Treat (4.50) as negative
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span>
|
||||||
|
Use OCR for scanned pages
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-help-text" style="margin:0 0 10px">OCR status: ready (bundled Tesseract). Most modern bank PDFs are text-based and don't need OCR — only enable for image-based scans.</p>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Output date format</label>
|
||||||
|
<div class="dt-select">YYYY-MM-DD (2026-01-13)</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Override year for short dates (optional)</label>
|
||||||
|
<input class="dt-input" type="text" placeholder="" value="" disabled>
|
||||||
|
<div class="dt-help-text">Leave blank for automatic (statement period → filename year → this override).</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Files section head -->
|
||||||
|
<div class="dt-files-section-head">
|
||||||
|
<h2>Files</h2>
|
||||||
|
<span class="dt-section-meta">2 files · 318.4 KB total</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Files card (Home-style bordered list + Add more files) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:0">
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove statement-jan-2026.pdf">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">statement-jan-2026.pdf</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">171.2 KB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove statement-feb-2026.pdf">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">statement-feb-2026.pdf</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">147.2 KB</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-file-add">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M12 5v14M5 12h14"/></svg> Add more files
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Action buttons -->
|
||||||
|
<div class="dt-btn-row" style="margin-top:16px;max-width:340px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Scan</button>
|
||||||
|
<button class="dt-btn">Clear all files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Warnings expander (collapsed) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Warnings (1)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-alert warn">
|
||||||
|
<span class="dt-mi">warning</span>
|
||||||
|
<span>[statement-feb-2026.pdf] 2 lines matched a date but no amount — skipped (likely a wrapped description). Check the source if a transaction looks missing.</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h4>47 candidate transaction(s) from 2 file(s)</h4>
|
||||||
|
<p class="dt-caption">Uncheck rows to exclude. Edit any cell to fix a value the scanner got wrong. The <code>raw</code> column shows the original PDF text for that row.</p>
|
||||||
|
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Include</th>
|
||||||
|
<th>date</th>
|
||||||
|
<th>description</th>
|
||||||
|
<th>amount_debit</th>
|
||||||
|
<th>amount_credit</th>
|
||||||
|
<th>account_number</th>
|
||||||
|
<th>source_file</th>
|
||||||
|
<th>page</th>
|
||||||
|
<th>raw</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-03</td><td>OPENING BALANCE</td><td></td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/03 OPENING BALANCE 2,140.55</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-05</td><td>POS PURCHASE WHOLE FOODS MKT</td><td>84.12</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/05 POS PURCHASE WHOLE FOODS MKT (84.12)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-08</td><td>ACH DEPOSIT PAYROLL ACME CORP</td><td></td><td>3,250.00</td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/08 ACH DEPOSIT PAYROLL ACME CORP 3,250.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-11</td><td>ONLINE TRANSFER TO SAVINGS</td><td>500.00</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/11 ONLINE TRANSFER TO SAVINGS (500.00)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check" style="margin:0"><span class="box"></span></span></td>
|
||||||
|
<td class="dt-cell-flag">2026-01-12</td><td class="dt-cell-flag">INTEREST RATE 0.50% APY DETAIL</td><td></td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/12 INTEREST RATE 0.50% APY 0.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-14</td><td>DEBIT CARD SHELL OIL #2287</td><td>52.40</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/14 DEBIT CARD SHELL OIL #2287 (52.40)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-02</td><td>POS PURCHASE TRADER JOES #511</td><td>61.88</td><td></td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">1</td><td>02/02 POS PURCHASE TRADER JOES #511 (61.88)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-06</td><td>ACH DEPOSIT PAYROLL ACME CORP</td><td></td><td>3,250.00</td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">2</td><td>02/06 ACH DEPOSIT PAYROLL ACME CORP 3,250.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-09</td><td>CHECK #1043</td><td>1,200.00</td><td></td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">2</td><td>02/09 CHECK #1043 (1,200.00)</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Download row: download button (left) + columns multiselect (right) -->
|
||||||
|
<div class="dt-row" style="margin-top:14px;align-items:flex-start">
|
||||||
|
<div style="flex:2">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Download 46 rows as CSV</button>
|
||||||
|
<p class="dt-caption" style="margin-top:8px">46 of 47 rows selected.</p>
|
||||||
|
</div>
|
||||||
|
<div style="flex:3">
|
||||||
|
<div class="dt-field" style="margin:0">
|
||||||
|
<label class="dt-label">Columns to include in CSV</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">date <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">description <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">amount_debit <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">amount_credit <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">account_number <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">source_file <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text"><code>page</code> and <code>raw</code> are kept off by default; tick them if you want them in the file.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
251
layout-review/11_reconciler.html
Normal file
251
layout-review/11_reconciler.html
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Reconcile Two Files</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="11_reconciler">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Reconcile Two Files</strong>, shown with both files imported, key columns mapped, and a completed reconciliation (matched / review / unmatched results). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Reconcile Two Files</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Compare two lists of transactions (e.g. bank vs. ledger) and flag what doesn't match.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Side-by-side upload (st.columns(2) → two _side_panel) -->
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<!-- Left side -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Left (e.g. bank feed)</h4>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">bank_feed_may.csv</span>
|
||||||
|
<span class="size">214 KB</span>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption" style="margin-top:6px"><code>bank_feed_may.csv</code> — 1,204 rows, 4 columns</p>
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview left (e.g. bank feed)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>posted_date</th><th>description</th><th>amount</th><th>ref</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>ACME SUPPLIES</td><td>-1240.00</td><td>CHK1041</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>PAYROLL RUN</td><td>-8800.00</td><td>ACH5520</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>CLIENT GLOBEX</td><td>5200.00</td><td>DEP0090</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>UTILITY CO</td><td>-318.42</td><td>CHK1042</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
<!-- Right side -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Right (e.g. ledger)</h4>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">ledger_may.xlsx</span>
|
||||||
|
<span class="size">96 KB</span>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption" style="margin-top:6px"><code>ledger_may.xlsx</code> — 1,198 rows, 5 columns</p>
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview right (e.g. ledger)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>txn_date</th><th>memo</th><th>value</th><th>invoice_no</th><th>account</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>Acme Supplies Inc</td><td>-1240.00</td><td>INV-1041</td><td>5000</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>Monthly payroll</td><td>-8800.00</td><td>INV-5520</td><td>6000</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>Globex retainer</td><td>5200.00</td><td>INV-0090</td><td>4000</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>City Utilities</td><td>-318.40</td><td>INV-1042</td><td>6100</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Match settings -->
|
||||||
|
<h2>Match settings</h2>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<!-- Left pickers (file order: posted_date, description, amount → date, desc, amount) -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Left columns</h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date column (optional)</label><div class="dt-select">posted_date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description column (optional)</label><div class="dt-select">description</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount column</label><div class="dt-select">amount</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Reference columns (optional, e.g. check / invoice no.)</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">ref <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
<!-- Right pickers (file order: txn_date, memo, value → date, desc, amount) -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Right columns</h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date column (optional)</label><div class="dt-select">txn_date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description column (optional)</label><div class="dt-select">memo</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount column</label><div class="dt-select">value</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Reference columns (must match left count)</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">invoice_no <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Tolerances & options (expanded=True) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Tolerances & options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount tolerance</label>
|
||||||
|
<div class="dt-input">0.0200</div>
|
||||||
|
<div class="dt-help-text">Absolute tolerance on amount (e.g. 0.01 to absorb cent rounding).</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date tolerance (days)</label>
|
||||||
|
<div class="dt-input">1</div>
|
||||||
|
<div class="dt-help-text">Allow N calendar days of drift between posting dates.</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Invert right amount sign</label>
|
||||||
|
<div class="dt-check" style="margin-top:8px"><span class="box"></span> Invert right amount sign</div>
|
||||||
|
<div class="dt-help-text">Use when one side records debits as positive and the other as negative.</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description similarity boost (0 disables)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:80%"></div><div class="knob" style="left:80%"></div></div><div class="val">80</div></div>
|
||||||
|
<div class="dt-help-text">When both sides have a description column set, accept matches with this minimum fuzzy similarity even if amount/date are merely within tolerance. Lower = more permissive.</div></div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Reconcile</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Matched</div><div class="value">1,173</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Review</div><div class="value">9</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unmatched left</div><div class="value">22</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unmatched right</div><div class="value">16</div></div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Coverage: 97.4% of the larger side</p>
|
||||||
|
|
||||||
|
<!-- Tabs (st.tabs) — Matched active -->
|
||||||
|
<div class="dt-tabs">
|
||||||
|
<span class="dt-tab is-active">Matched (1,173)</span>
|
||||||
|
<span class="dt-tab">Review (9)</span>
|
||||||
|
<span class="dt-tab">Unmatched left (22)</span>
|
||||||
|
<span class="dt-tab">Unmatched right (16)</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Matched tab content -->
|
||||||
|
<p class="dt-caption">Preview of first 25 of 1,173 rows — download the CSV below for the full set.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr>
|
||||||
|
<th>left_posted_date</th><th>left_description</th><th>left_amount</th>
|
||||||
|
<th>right_txn_date</th><th>right_memo</th><th>right_value</th><th>amount_diff</th>
|
||||||
|
</tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>ACME SUPPLIES</td><td>-1240.00</td><td>2026-05-01</td><td>Acme Supplies Inc</td><td>-1240.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>PAYROLL RUN</td><td>-8800.00</td><td>2026-05-02</td><td>Monthly payroll</td><td>-8800.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>CLIENT GLOBEX</td><td>5200.00</td><td>2026-05-03</td><td>Globex retainer</td><td>5200.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>UTILITY CO</td><td>-318.42</td><td>2026-05-04</td><td>City Utilities</td><td>-318.40</td><td class="dt-cell-flag">0.02</td></tr>
|
||||||
|
<tr><td>2026-05-06</td><td>OFFICE DEPOT</td><td>-89.15</td><td>2026-05-07</td><td>Office supplies</td><td>-89.15</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Other tab previews shown as collapsed expanders for review context -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Review (9) — ambiguous candidates</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Pairs flagged because the algorithm couldn't pick a single best match (e.g. multiple equally-good candidates). Use the left/right indices to disambiguate manually.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>left_idx</th><th>left_amount</th><th>right_idx</th><th>right_value</th><th>candidates</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>118</td><td>-450.00</td><td>121, 209</td><td>-450.00</td><td class="dt-cell-flag">2 equal</td></tr>
|
||||||
|
<tr><td>203</td><td>1000.00</td><td>198, 244</td><td>1000.00</td><td class="dt-cell-flag">2 equal</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Unmatched left (22) — only in bank_feed_may.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Showing all 22 rows.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>posted_date</th><th>description</th><th>amount</th><th>ref</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-09</td><td class="dt-cell-del">BANK FEE</td><td class="dt-cell-del">-12.00</td><td class="dt-cell-del">FEE0001</td></tr>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-14</td><td class="dt-cell-del">ATM WITHDRAWAL</td><td class="dt-cell-del">-200.00</td><td class="dt-cell-del">ATM7781</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Unmatched right (16) — only in ledger_may.xlsx</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Showing all 16 rows.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>txn_date</th><th>memo</th><th>value</th><th>invoice_no</th><th>account</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-11</td><td class="dt-cell-del">Accrued interest</td><td class="dt-cell-del">37.50</td><td class="dt-cell-del">INV-9001</td><td class="dt-cell-del">7000</td></tr>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-22</td><td class="dt-cell-del">Depreciation</td><td class="dt-cell-del">-410.00</td><td class="dt-cell-del">INV-9044</td><td class="dt-cell-del">8000</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (st.columns(4) of html_download_button) -->
|
||||||
|
<div class="dt-btn-row">
|
||||||
|
<button class="dt-btn dt-btn-primary">Matched CSV</button>
|
||||||
|
<button class="dt-btn">Review CSV</button>
|
||||||
|
<button class="dt-btn">Unmatched left</button>
|
||||||
|
<button class="dt-btn">Unmatched right</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
473
layout-review/app.css
Normal file
473
layout-review/app.css
Normal file
@@ -0,0 +1,473 @@
|
|||||||
|
/* ===========================================================================
|
||||||
|
DataTools — static layout-review stylesheet
|
||||||
|
---------------------------------------------------------------------------
|
||||||
|
Faithful reproduction of the live Streamlit app's design system for human
|
||||||
|
review of page layouts. Tokens are copied verbatim from src/gui/theme.py
|
||||||
|
(§3 color + type scale) and the component values from
|
||||||
|
src/gui/components/_legacy.py:_DESIGN_TOKENS_CSS.
|
||||||
|
|
||||||
|
The live app applies these styles to Streamlit's data-testid DOM; here we
|
||||||
|
re-express the same look against clean semantic classes so the static HTML
|
||||||
|
stays readable. Where the app uses real .dt-* classes (page header, files
|
||||||
|
card, findings, stats) the class names are kept identical.
|
||||||
|
=========================================================================== */
|
||||||
|
|
||||||
|
@import url("https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600;700&family=Geist+Mono:wght@400;500&display=swap");
|
||||||
|
@import url("https://fonts.googleapis.com/css2?family=Material+Symbols+Outlined:opsz,wght,FILL,GRAD@20..48,400,0,0&display=block");
|
||||||
|
|
||||||
|
:root {
|
||||||
|
--font-sans: "Geist", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||||
|
--font-mono: "Geist Mono", ui-monospace, "SF Mono", Menlo, monospace;
|
||||||
|
|
||||||
|
--ink: #1c1917;
|
||||||
|
--ink-secondary: #57534e;
|
||||||
|
--ink-tertiary: #a8a29e;
|
||||||
|
--bg: #fafaf7;
|
||||||
|
--surface: #ffffff;
|
||||||
|
--surface-hover: #f8f7f3;
|
||||||
|
--border: #e7e5dc;
|
||||||
|
--border-strong: #d6d3c7;
|
||||||
|
--accent: #c2410c;
|
||||||
|
--accent-hover: #9a3412;
|
||||||
|
--accent-fill: #fef4ed;
|
||||||
|
--accent-fill-strong: #fde4d3;
|
||||||
|
|
||||||
|
--warn: #b45309;
|
||||||
|
--warn-fill: #fef3c7;
|
||||||
|
--info: #0369a1;
|
||||||
|
--info-fill: #e0f2fe;
|
||||||
|
--success: #15803d;
|
||||||
|
--success-fill: #dcfce7;
|
||||||
|
--danger: #b91c1c;
|
||||||
|
--danger-fill: #fee2e2;
|
||||||
|
|
||||||
|
--r-sm: 6px;
|
||||||
|
--r-md: 10px;
|
||||||
|
--r-lg: 14px;
|
||||||
|
|
||||||
|
--sidebar-w: 264px;
|
||||||
|
}
|
||||||
|
|
||||||
|
* { box-sizing: border-box; }
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
background: var(--bg);
|
||||||
|
color: var(--ink);
|
||||||
|
font-family: var(--font-sans);
|
||||||
|
font-feature-settings: "ss01", "cv01", "cv11";
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- Type scale (theme.py §4) ---------- */
|
||||||
|
h1 { font-size: 32px; font-weight: 600; letter-spacing: -0.035em; line-height: 1.1; margin: 0 0 4px; }
|
||||||
|
h2 { font-size: 22px; font-weight: 600; letter-spacing: -0.025em; line-height: 1.2; margin: 1.5rem 0 0.75rem; }
|
||||||
|
h3 { font-size: 18px; font-weight: 500; letter-spacing: -0.018em; line-height: 1.25; margin: 1.25rem 0 0.5rem; }
|
||||||
|
h4 { font-size: 15px; font-weight: 500; letter-spacing: -0.012em; line-height: 1.35; margin: 1rem 0 0.5rem; }
|
||||||
|
p { font-size: 14px; font-weight: 400; line-height: 1.55; color: var(--ink); margin: 0 0 0.6rem; }
|
||||||
|
strong { font-weight: 500; color: var(--ink); }
|
||||||
|
a { color: var(--accent); text-decoration: none; }
|
||||||
|
a:hover { color: var(--accent-hover); text-decoration: underline; }
|
||||||
|
code, .dt-mono { font-family: var(--font-mono); font-size: 0.92em; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
App frame — sidebar + main + sticky footer
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-app { display: flex; min-height: 100vh; }
|
||||||
|
|
||||||
|
/* ---------- Sidebar (cream paper) ---------- */
|
||||||
|
.dt-sidebar {
|
||||||
|
width: var(--sidebar-w);
|
||||||
|
flex-shrink: 0;
|
||||||
|
background: #f5f4ef;
|
||||||
|
border-right: 1px solid var(--border);
|
||||||
|
padding: 18px 14px 90px;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
align-self: flex-start;
|
||||||
|
height: 100vh;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.dt-brand { display: flex; align-items: center; gap: 10px; padding: 0 4px 18px; }
|
||||||
|
.dt-brand-mark {
|
||||||
|
width: 28px; height: 28px; border-radius: 7px;
|
||||||
|
background: var(--ink); color: var(--accent-fill);
|
||||||
|
display: inline-flex; align-items: center; justify-content: center;
|
||||||
|
font-weight: 700; font-size: 16px; letter-spacing: -0.04em; line-height: 1; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-brand-name { display: flex; flex-direction: column; gap: 1px; line-height: 1.05; }
|
||||||
|
.dt-brand-eyebrow {
|
||||||
|
font-size: 9.5px; font-weight: 600; letter-spacing: 0.14em;
|
||||||
|
text-transform: uppercase; color: var(--ink-tertiary); line-height: 1;
|
||||||
|
}
|
||||||
|
.dt-brand-word { font-weight: 600; font-size: 15px; letter-spacing: -0.02em; color: var(--ink); }
|
||||||
|
|
||||||
|
.dt-nav { display: flex; flex-direction: column; }
|
||||||
|
.dt-nav-section {
|
||||||
|
font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em;
|
||||||
|
color: var(--ink-tertiary); font-weight: 500;
|
||||||
|
padding: 14px 10px 4px; margin: 0;
|
||||||
|
display: flex; align-items: center; justify-content: space-between;
|
||||||
|
}
|
||||||
|
.dt-nav-section .dt-nav-indicator { font-size: 16px; color: var(--ink-tertiary); }
|
||||||
|
.dt-nav-link {
|
||||||
|
display: flex; align-items: center; gap: 8px;
|
||||||
|
color: var(--ink-secondary); font-size: 13px; font-weight: 500; line-height: 1.3;
|
||||||
|
padding: 5px 10px; border-radius: var(--r-sm); margin-bottom: 1px;
|
||||||
|
text-decoration: none; transition: background 0.12s ease, color 0.12s ease;
|
||||||
|
}
|
||||||
|
.dt-nav-link:hover { background: rgba(0,0,0,0.04); color: var(--ink); text-decoration: none; }
|
||||||
|
.dt-nav-link.is-active { background: rgba(0,0,0,0.04); color: var(--ink); font-weight: 600; }
|
||||||
|
.dt-nav-link .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; color: var(--ink-secondary); line-height: 1; }
|
||||||
|
.dt-nav-link.is-active .dt-mi { color: var(--ink); }
|
||||||
|
.dt-nav-link.is-soon { opacity: 0.55; }
|
||||||
|
.dt-nav-soon-tag {
|
||||||
|
margin-left: auto; font-size: 9px; font-weight: 600; letter-spacing: 0.06em;
|
||||||
|
text-transform: uppercase; color: var(--ink-tertiary);
|
||||||
|
border: 1px solid var(--border-strong); border-radius: 999px; padding: 1px 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dt-sidebar-foot { margin-top: 22px; padding-top: 16px; border-top: 1px solid var(--border); display: flex; flex-direction: column; gap: 10px; }
|
||||||
|
.dt-sidebar-label { font-size: 11.5px; font-weight: 500; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); margin-bottom: 4px; }
|
||||||
|
.dt-license-badge { font-size: 12.5px; color: var(--ink-secondary); }
|
||||||
|
|
||||||
|
/* ---------- Main column ---------- */
|
||||||
|
.dt-main { flex: 1; min-width: 0; padding: 40px 56px 96px; }
|
||||||
|
.dt-main-inner { max-width: 920px; margin: 0 auto; }
|
||||||
|
|
||||||
|
/* Review banner above every mockup */
|
||||||
|
.dt-review-banner {
|
||||||
|
max-width: 920px; margin: 0 auto 20px; display: flex; gap: 10px; align-items: center;
|
||||||
|
background: var(--info-fill); color: var(--info);
|
||||||
|
border: 1px solid transparent; border-radius: var(--r-md);
|
||||||
|
padding: 8px 14px; font-size: 12.5px; line-height: 1.4;
|
||||||
|
}
|
||||||
|
.dt-review-banner a { color: var(--info); text-decoration: underline; }
|
||||||
|
.dt-review-banner .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
|
||||||
|
/* ---------- Sticky footer ---------- */
|
||||||
|
.dt-footer {
|
||||||
|
position: fixed; bottom: 0; left: var(--sidebar-w); right: 0;
|
||||||
|
background: rgba(255,255,255,0.97); backdrop-filter: blur(8px);
|
||||||
|
border-top: 1px solid var(--border-strong);
|
||||||
|
padding: 8px 20px; z-index: 50;
|
||||||
|
display: flex; align-items: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-footer-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 8px;
|
||||||
|
color: var(--ink-secondary); font-size: 13px; font-weight: 500; line-height: 1.3;
|
||||||
|
padding: 5px 10px; border-radius: var(--r-sm);
|
||||||
|
background: transparent; border: none; cursor: pointer; text-decoration: none;
|
||||||
|
}
|
||||||
|
.dt-footer-btn:hover { background: rgba(0,0,0,0.04); color: var(--ink); text-decoration: none; }
|
||||||
|
.dt-footer-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 16px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Page header (brand + privacy pill) — .dt-page-* mirror the live app
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-page-header {
|
||||||
|
display: flex; align-items: center; justify-content: space-between; gap: 24px;
|
||||||
|
margin: 0 0 24px; padding-bottom: 22px; border-bottom: 1px solid var(--border);
|
||||||
|
}
|
||||||
|
.dt-page-brand { display: flex; flex-direction: column; gap: 8px; }
|
||||||
|
.dt-page-brand-row { display: flex; align-items: center; gap: 18px; }
|
||||||
|
.dt-page-brand-mark {
|
||||||
|
width: 56px; height: 56px; border-radius: 14px; background: var(--ink);
|
||||||
|
color: var(--accent-fill); display: inline-flex; align-items: center; justify-content: center;
|
||||||
|
font-weight: 700; font-size: 32px; letter-spacing: -0.04em; line-height: 1; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-page-brand-words { display: flex; flex-direction: column; gap: 2px; line-height: 1; }
|
||||||
|
.dt-page-eyebrow { font-size: 11.5px; font-weight: 600; letter-spacing: 0.14em; text-transform: uppercase; color: var(--ink-tertiary); line-height: 1.2; }
|
||||||
|
.dt-page-wordmark { margin: 0; font-weight: 600; font-size: 32px; letter-spacing: -0.035em; line-height: 1.1; color: var(--ink); }
|
||||||
|
.dt-page-subtitle { margin: 4px 0 0; color: var(--ink-secondary); font-size: 14px; line-height: 1.5; }
|
||||||
|
.dt-privacy-pill {
|
||||||
|
display: inline-flex; align-items: center; gap: 6px; padding: 6px 11px;
|
||||||
|
background: var(--success-fill); color: var(--success); border-radius: 999px;
|
||||||
|
font-size: 12px; font-weight: 500; white-space: nowrap; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-privacy-pill svg { width: 13px; height: 13px; stroke-width: 2; }
|
||||||
|
|
||||||
|
/* ---------- Tool header (title + Help popover) ---------- */
|
||||||
|
.dt-tool-header { display: flex; align-items: flex-start; justify-content: space-between; gap: 16px; }
|
||||||
|
.dt-tool-header h1 { margin: 0; }
|
||||||
|
.dt-help-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 6px; white-space: nowrap;
|
||||||
|
background: var(--surface); color: var(--ink); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-md); padding: 9px 16px; font-size: 13.5px; font-weight: 500;
|
||||||
|
cursor: pointer; flex-shrink: 0; margin-top: 6px;
|
||||||
|
}
|
||||||
|
.dt-help-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
.dt-tool-caption { font-size: 12.5px; color: var(--ink-tertiary); line-height: 1.5; margin: 2px 0 0; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Buttons
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-btn {
|
||||||
|
border-radius: var(--r-md); font-family: var(--font-sans); font-weight: 500;
|
||||||
|
font-size: 13.5px; letter-spacing: -0.005em; line-height: 1; padding: 9px 16px;
|
||||||
|
border: 1px solid var(--border-strong); background: var(--surface); color: var(--ink);
|
||||||
|
cursor: pointer; transition: background 0.12s ease, border-color 0.12s ease, color 0.12s ease;
|
||||||
|
display: inline-flex; align-items: center; justify-content: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-btn:hover { background: var(--surface-hover); border-color: var(--ink-tertiary); }
|
||||||
|
.dt-btn-primary { background: var(--ink); color: var(--bg); border-color: var(--ink); }
|
||||||
|
.dt-btn-primary:hover { background: #292524; border-color: #292524; color: var(--bg); }
|
||||||
|
.dt-btn-tertiary { background: transparent; border: none; color: var(--ink-tertiary); padding: 4px 8px; }
|
||||||
|
.dt-btn-tertiary:hover { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-btn:disabled, .dt-btn.is-disabled {
|
||||||
|
background: var(--surface-hover); color: var(--ink-tertiary);
|
||||||
|
border: 1px solid var(--border); cursor: not-allowed;
|
||||||
|
}
|
||||||
|
.dt-btn-block { width: 100%; }
|
||||||
|
/* Icon glyphs are rendered through font ligatures. .dt-btn sets a non-zero
   letter-spacing, which browsers may use as a reason to skip optional
   ligatures, so reset it on the icon span to keep the glyph from falling back
   to its plain-text name. */
.dt-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; letter-spacing: normal; }
|
||||||
|
|
||||||
|
.dt-btn-row { display: flex; gap: 10px; flex-wrap: wrap; }
|
||||||
|
.dt-btn-row > .dt-btn { flex: 1; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
File uploader (cream dropzone)
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-uploader {
|
||||||
|
background: var(--surface-hover); border: 1px dashed var(--border-strong);
|
||||||
|
border-radius: var(--r-md); padding: 22px 20px;
|
||||||
|
display: flex; align-items: center; justify-content: space-between; gap: 16px;
|
||||||
|
}
|
||||||
|
.dt-uploader-text { display: flex; flex-direction: column; gap: 2px; }
|
||||||
|
.dt-uploader-text .hint { font-size: 14px; color: var(--ink); }
|
||||||
|
.dt-uploader-text .sub { font-size: 12.5px; color: var(--ink-tertiary); }
|
||||||
|
.dt-uploader .dt-mi { font-family: "Material Symbols Outlined"; font-size: 24px; color: var(--ink-tertiary); }
|
||||||
|
|
||||||
|
/* Staged-file chip */
|
||||||
|
.dt-file-chip {
|
||||||
|
display: flex; align-items: center; gap: 12px;
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-sm);
|
||||||
|
padding: 10px 14px; margin-top: 10px;
|
||||||
|
}
|
||||||
|
.dt-file-chip .name { font-family: var(--font-mono); font-size: 13px; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-chip .size { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); margin-left: auto; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Expanders / bordered cards
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-expander {
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg);
|
||||||
|
overflow: hidden; box-shadow: 0 1px 2px rgba(28,25,23,0.03); margin: 10px 0;
|
||||||
|
}
|
||||||
|
.dt-expander > summary, .dt-expander-head {
|
||||||
|
background: var(--surface-hover); border-bottom: 1px solid var(--border);
|
||||||
|
padding: 12px 16px; font-weight: 500; color: var(--ink); font-size: 14px;
|
||||||
|
cursor: pointer; list-style: none; display: flex; align-items: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-expander > summary::-webkit-details-marker { display: none; }
|
||||||
|
.dt-expander > summary::before {
|
||||||
|
content: "expand_more"; font-family: "Material Symbols Outlined"; font-size: 20px;
|
||||||
|
color: var(--ink-tertiary); transition: transform 0.15s ease;
|
||||||
|
}
|
||||||
|
.dt-expander[open] > summary::before { transform: rotate(180deg); }
|
||||||
|
/* Padding for an expander's content area. The second selector matches a subset
   of the first (direct children of .dt-expander); it carries the same
   declarations and differs only in specificity. */
.dt-expander-body, .dt-expander > .dt-expander-body { padding: 14px 16px; }
|
||||||
|
.dt-expander:not([open]) > summary { border-bottom: none; }
|
||||||
|
|
||||||
|
.dt-card {
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg);
|
||||||
|
box-shadow: 0 1px 2px rgba(28,25,23,0.03); padding: 16px; margin: 10px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Alerts
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-alert {
|
||||||
|
border-radius: var(--r-md); border: 1px solid transparent;
|
||||||
|
padding: 10px 14px; font-size: 13.5px; line-height: 1.45; margin: 10px 0;
|
||||||
|
display: flex; gap: 10px; align-items: flex-start;
|
||||||
|
}
|
||||||
|
.dt-alert .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; flex-shrink: 0; margin-top: 1px; }
|
||||||
|
.dt-alert.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-alert.success { background: var(--success-fill); color: var(--success); }
|
||||||
|
.dt-alert.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-alert.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-alert code { background: rgba(0,0,0,0.05); padding: 1px 5px; border-radius: 4px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Inputs (static representations of Streamlit widgets)
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-field { margin: 10px 0; }
|
||||||
|
.dt-label { font-size: 13px; font-weight: 500; color: var(--ink); margin-bottom: 5px; display: block; }
|
||||||
|
.dt-label .req { color: var(--accent); }
|
||||||
|
.dt-input, .dt-select, .dt-textarea {
|
||||||
|
width: 100%; background: var(--surface); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-sm); padding: 8px 11px; font-family: var(--font-sans);
|
||||||
|
font-size: 13.5px; color: var(--ink);
|
||||||
|
}
|
||||||
|
/* Select look-alike: native appearance is turned off and the caret is drawn
   with two 5px diagonal gradients placed side by side near the right edge. */
.dt-select { appearance: none; background-image: linear-gradient(45deg, transparent 50%, var(--ink-tertiary) 50%), linear-gradient(135deg, var(--ink-tertiary) 50%, transparent 50%); background-position: calc(100% - 16px) 14px, calc(100% - 11px) 14px; background-size: 5px 5px, 5px 5px; background-repeat: no-repeat; }
|
||||||
|
.dt-textarea { min-height: 76px; resize: vertical; font-family: var(--font-mono); font-size: 13px; }
|
||||||
|
.dt-help-text { font-size: 12px; color: var(--ink-tertiary); margin-top: 4px; }
|
||||||
|
|
||||||
|
/* Multiselect — chips inside a box */
|
||||||
|
.dt-multiselect {
|
||||||
|
width: 100%; background: var(--surface); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-sm); padding: 6px 8px; min-height: 38px;
|
||||||
|
display: flex; flex-wrap: wrap; gap: 6px; align-items: center;
|
||||||
|
}
|
||||||
|
.dt-ms-chip {
|
||||||
|
display: inline-flex; align-items: center; gap: 5px; background: var(--accent-fill);
|
||||||
|
color: var(--accent-hover); border-radius: var(--r-sm); padding: 3px 8px;
|
||||||
|
font-size: 12.5px; font-weight: 500;
|
||||||
|
}
|
||||||
|
.dt-ms-chip .x { color: var(--accent); font-size: 13px; }
|
||||||
|
.dt-ms-placeholder { color: var(--ink-tertiary); font-size: 13px; padding: 2px 4px; }
|
||||||
|
|
||||||
|
/* Checkbox / radio */
|
||||||
|
.dt-check { display: flex; align-items: center; gap: 9px; margin: 8px 0; font-size: 13.5px; color: var(--ink); }
|
||||||
|
.dt-check .box {
|
||||||
|
width: 18px; height: 18px; border-radius: 5px; border: 1px solid var(--border-strong);
|
||||||
|
background: var(--surface); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-check.on .box { background: var(--ink); border-color: var(--ink); color: var(--bg); }
|
||||||
|
.dt-check.on .box .dt-mi { font-family: "Material Symbols Outlined"; font-size: 14px; }
|
||||||
|
.dt-radio-row { display: flex; gap: 18px; flex-wrap: wrap; margin: 8px 0; }
|
||||||
|
.dt-radio { display: inline-flex; align-items: center; gap: 7px; font-size: 13.5px; }
|
||||||
|
.dt-radio .dot { width: 16px; height: 16px; border-radius: 50%; border: 1px solid var(--border-strong); display: inline-block; flex-shrink: 0; }
|
||||||
|
.dt-radio.on .dot { border: 5px solid var(--ink); }
|
||||||
|
|
||||||
|
/* Slider */
|
||||||
|
.dt-slider { margin: 14px 0 6px; }
|
||||||
|
.dt-slider .track { position: relative; height: 4px; background: var(--border-strong); border-radius: 2px; }
|
||||||
|
.dt-slider .fill { position: absolute; left: 0; top: 0; height: 4px; background: var(--ink); border-radius: 2px; }
|
||||||
|
.dt-slider .knob { position: absolute; top: 50%; width: 16px; height: 16px; border-radius: 50%; background: var(--ink); transform: translate(-50%, -50%); }
|
||||||
|
.dt-slider .val { font-family: var(--font-mono); font-size: 12px; color: var(--ink-secondary); margin-top: 8px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Layout helpers
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-row { display: flex; gap: 16px; }
|
||||||
|
.dt-row > * { flex: 1; min-width: 0; }
|
||||||
|
.dt-cols-2 { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
|
.dt-cols-3 { display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; }
|
||||||
|
.dt-divider { border: none; border-top: 1px solid var(--border); margin: 22px 0; }
|
||||||
|
.dt-caption { font-size: 12.5px; color: var(--ink-tertiary); line-height: 1.5; }
|
||||||
|
.dt-spacer { height: 12px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
DataFrame / preview table
|
||||||
|
=========================================================================== */
|
||||||
|
/* Rounded frame around preview tables. Table cells are nowrap, so a table wider
   than the frame scrolls horizontally instead of being clipped at the right
   edge; vertical overflow stays hidden so the rounded corners still clip. */
.dt-table-wrap { border: 1px solid var(--border); border-radius: var(--r-md); overflow: hidden; overflow-x: auto; margin: 8px 0; }
|
||||||
|
table.dt-table { width: 100%; border-collapse: collapse; font-size: 13px; }
|
||||||
|
table.dt-table th {
|
||||||
|
background: var(--surface-hover); color: var(--ink-secondary); font-weight: 500;
|
||||||
|
text-align: left; padding: 8px 12px; border-bottom: 1px solid var(--border);
|
||||||
|
font-size: 12px; text-transform: none; white-space: nowrap;
|
||||||
|
}
|
||||||
|
table.dt-table td {
|
||||||
|
padding: 7px 12px; border-bottom: 1px solid var(--border);
|
||||||
|
font-family: var(--font-mono); font-size: 12.5px; color: var(--ink); font-feature-settings: "ss02"; white-space: nowrap;
|
||||||
|
}
|
||||||
|
table.dt-table tr:last-child td { border-bottom: none; }
|
||||||
|
table.dt-table tr:nth-child(even) td { background: #fcfbf8; }
|
||||||
|
/* Row-index cells keep their muted tint on every row. The extra `tr` raises
   this selector to the same specificity as the zebra-stripe rule, so source
   order lets it win on even rows as well. */
table.dt-table tr td.idx { color: var(--ink-tertiary); background: var(--surface-hover); }
|
||||||
|
.dt-cell-flag { color: var(--warn); }
|
||||||
|
.dt-cell-del { color: var(--danger); text-decoration: line-through; }
|
||||||
|
.dt-cell-add { color: var(--success); }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Stats overview (home) — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-stats { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin: 8px 0 20px; }
|
||||||
|
.dt-stat { background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); padding: 16px 18px; box-shadow: 0 1px 2px rgba(28,25,23,0.03); }
|
||||||
|
.dt-stat-label { font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); font-weight: 500; margin-bottom: 6px; line-height: 1.4; }
|
||||||
|
.dt-stat-value { font-size: 28px; font-weight: 600; letter-spacing: -0.03em; line-height: 1; color: var(--ink); display: flex; align-items: baseline; gap: 6px; }
|
||||||
|
.dt-stat-unit { font-size: 12px; font-weight: 400; color: var(--ink-tertiary); letter-spacing: 0; }
|
||||||
|
.dt-stat.is-warn .dt-stat-value { color: var(--warn); }
|
||||||
|
.dt-stat.is-info .dt-stat-value { color: var(--info); }
|
||||||
|
.dt-stat.is-success .dt-stat-value { color: var(--success); }
|
||||||
|
@media (max-width: 900px) { .dt-stats { grid-template-columns: repeat(2, 1fr); } }
|
||||||
|
|
||||||
|
/* Metric (st.metric) */
|
||||||
|
.dt-metrics { display: flex; gap: 28px; flex-wrap: wrap; margin: 6px 0 14px; }
|
||||||
|
.dt-metric .label { font-size: 12.5px; color: var(--ink-tertiary); margin-bottom: 4px; }
|
||||||
|
.dt-metric .value { font-size: 26px; font-weight: 600; letter-spacing: -0.03em; color: var(--ink); line-height: 1; }
|
||||||
|
.dt-metric .delta { font-size: 12.5px; margin-top: 3px; }
|
||||||
|
.dt-metric .delta.up { color: var(--success); }
|
||||||
|
.dt-metric .delta.down { color: var(--danger); }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Files card (home) — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-files-section-head { display: flex; align-items: baseline; justify-content: space-between; margin: 4px 0 10px; gap: 12px; }
|
||||||
|
.dt-files-section-head h2 { margin: 0; }
|
||||||
|
.dt-section-meta { font-size: 12.5px; color: var(--ink-tertiary); }
|
||||||
|
.dt-file-row { display: flex; align-items: center; gap: 12px; }
|
||||||
|
.dt-file-icon-chip { width: 28px; height: 28px; border-radius: var(--r-sm); background: var(--accent-fill); color: var(--accent); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.dt-file-icon-chip svg { width: 14px; height: 14px; stroke-width: 1.8; }
|
||||||
|
.dt-file-name { font-family: var(--font-mono); font-size: 13px; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-size { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-add {
|
||||||
|
display: flex; align-items: center; justify-content: center; gap: 8px;
|
||||||
|
width: 100%; padding: 12px 16px; background: var(--surface-hover);
|
||||||
|
border: none; border-top: 1px dashed var(--border-strong);
|
||||||
|
border-radius: 0 0 var(--r-lg) var(--r-lg); cursor: pointer;
|
||||||
|
font-size: 13px; font-weight: 500; color: var(--ink-secondary); margin-top: 14px;
|
||||||
|
}
|
||||||
|
.dt-file-add:hover { background: var(--accent-fill); color: var(--accent); }
|
||||||
|
.dt-file-add svg { width: 14px; height: 14px; stroke-width: 2; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Findings panel — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-finding-group-head {
|
||||||
|
display: flex; align-items: center; gap: 12px; padding: 16px 22px;
|
||||||
|
border-bottom: 1px solid var(--border); background: var(--surface-hover);
|
||||||
|
margin: -16px -16px 1.2rem; border-radius: var(--r-lg) var(--r-lg) 0 0;
|
||||||
|
cursor: pointer; user-select: none;
|
||||||
|
}
|
||||||
|
.dt-finding-group-chevron { color: var(--ink-tertiary); font-family: "Material Symbols Outlined"; font-size: 20px; line-height: 1; flex-shrink: 0; }
|
||||||
|
.dt-severity-dot { width: 8px; height: 8px; border-radius: 50%; flex-shrink: 0; display: inline-block; }
|
||||||
|
.dt-severity-dot.warn { background: var(--warn); }
|
||||||
|
.dt-severity-dot.info { background: var(--info); }
|
||||||
|
.dt-severity-dot.error { background: var(--danger); }
|
||||||
|
.dt-severity-dot.success { background: var(--success); }
|
||||||
|
.dt-group-filename { font-family: var(--font-mono); font-size: 13.5px; font-weight: 500; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-group-counts { margin-left: auto; display: flex; align-items: center; gap: 8px; }
|
||||||
|
.dt-count-pill { display: inline-flex; align-items: center; padding: 3px 9px; border-radius: 999px; font-size: 11.5px; font-weight: 500; line-height: 1.4; white-space: nowrap; }
|
||||||
|
.dt-count-pill.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-count-pill.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-count-pill.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-count-pill.success { background: var(--success-fill); color: var(--success); }
|
||||||
|
.dt-finding-row { display: flex; align-items: flex-start; gap: 12px; padding: 12px 0; border-top: 1px solid var(--border); }
|
||||||
|
/* Drop the separator on the first finding row. The group head is itself a
   <div> sibling, so :first-of-type alone never matches a row that follows
   it; the adjacent-sibling selector covers that layout. */
.dt-finding-row:first-of-type,
.dt-finding-group-head + .dt-finding-row { border-top: none; }
|
||||||
|
.dt-finding-icon { width: 24px; height: 24px; border-radius: var(--r-sm); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.dt-finding-icon.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-finding-icon.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-finding-icon.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-finding-icon .dt-mi { font-family: "Material Symbols Outlined"; font-size: 16px; line-height: 1; }
|
||||||
|
.dt-finding-body { flex: 1; min-width: 0; }
|
||||||
|
.dt-finding-title { font-size: 14px; color: var(--ink); margin: 0 0 2px; line-height: 1.4; letter-spacing: -0.005em; }
|
||||||
|
.dt-finding-title strong { font-weight: 500; }
|
||||||
|
.dt-finding-meta { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); line-height: 1.4; margin: 0; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* Match-group review card (dedup) */
|
||||||
|
.dt-match-card { background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); box-shadow: 0 1px 2px rgba(28,25,23,0.03); margin: 12px 0; overflow: hidden; }
|
||||||
|
.dt-match-head { background: var(--surface-hover); border-bottom: 1px solid var(--border); padding: 12px 16px; display: flex; align-items: center; gap: 12px; }
|
||||||
|
.dt-match-head .title { font-weight: 500; font-size: 14px; }
|
||||||
|
.dt-match-head .conf { margin-left: auto; }
|
||||||
|
.dt-match-body { padding: 14px 16px; }
|
||||||
|
.dt-keep-row { background: var(--success-fill); }
|
||||||
|
.dt-keep-tag { display: inline-flex; align-items: center; gap: 4px; background: var(--success-fill); color: var(--success); border-radius: 999px; padding: 2px 8px; font-size: 11px; font-weight: 500; }
|
||||||
|
|
||||||
|
/* Progress bar */
|
||||||
|
.dt-progress { height: 6px; background: var(--border); border-radius: 3px; overflow: hidden; margin: 10px 0; }
|
||||||
|
.dt-progress .bar { height: 100%; background: var(--ink); border-radius: 3px; }
|
||||||
|
|
||||||
|
/* Tabs */
|
||||||
|
.dt-tabs { display: flex; gap: 18px; border-bottom: 1px solid var(--border); margin: 10px 0 16px; }
|
||||||
|
.dt-tab { font-size: 13.5px; color: var(--ink-secondary); padding: 8px 2px; border-bottom: 2px solid transparent; cursor: pointer; }
|
||||||
|
.dt-tab.is-active { color: var(--ink); font-weight: 500; border-bottom-color: var(--accent); }
|
||||||
|
|
||||||
|
/* Code block */
|
||||||
|
.dt-code { background: var(--surface-hover); border: 1px solid var(--border); border-radius: var(--r-md); padding: 12px 14px; font-family: var(--font-mono); font-size: 12.5px; color: var(--ink); white-space: pre; overflow-x: auto; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
@media (max-width: 1100px) {
|
||||||
|
.dt-footer { left: 0; }
|
||||||
|
.dt-sidebar { display: none; }
|
||||||
|
.dt-main { padding: 28px 24px 96px; }
|
||||||
|
}
|
||||||
164
layout-review/home.html
Normal file
164
layout-review/home.html
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — File Analysis (Home)</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="home">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of the <strong>Home / File Analysis</strong> page, shown with three imported files in the post-analysis state. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Page header: brand block + privacy pill -->
|
||||||
|
<header class="dt-page-header">
|
||||||
|
<div class="dt-page-brand">
|
||||||
|
<div class="dt-page-brand-row">
|
||||||
|
<div class="dt-page-brand-mark">D</div>
|
||||||
|
<div class="dt-page-brand-words">
|
||||||
|
<span class="dt-page-eyebrow">UNALOGIX</span>
|
||||||
|
<h1 class="dt-page-wordmark">DataTools</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-page-subtitle">Clean. Normalize. Transform.</p>
|
||||||
|
</div>
|
||||||
|
<span class="dt-privacy-pill">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">
|
||||||
|
<rect x="4" y="11" width="16" height="10" rx="2"/>
|
||||||
|
<path d="M8 11V7a4 4 0 018 0v4"/>
|
||||||
|
</svg>
|
||||||
|
Runs 100% locally
|
||||||
|
</span>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- Files section head -->
|
||||||
|
<div class="dt-files-section-head">
|
||||||
|
<h2>Files</h2>
|
||||||
|
<span class="dt-section-meta">3 files · 4.7 MB total</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Files card -->
|
||||||
|
<div class="dt-card" style="padding-bottom:0">
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">customers_export.csv</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">2.1 MB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">q3_transactions.xlsx</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">1.8 MB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">vendor_list.csv</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">0.8 MB</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-file-add" style="margin-left:-16px;margin-right:-16px;width:calc(100% + 32px)">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M12 5v14M5 12h14"/></svg> Add more files
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Action bar -->
|
||||||
|
<div class="dt-btn-row" style="margin-top:16px;max-width:340px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Run analysis</button>
|
||||||
|
<button class="dt-btn">Clear results</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Stats overview -->
|
||||||
|
<div class="dt-stats">
|
||||||
|
<div class="dt-stat">
|
||||||
|
<div class="dt-stat-label">Files analyzed</div>
|
||||||
|
<div class="dt-stat-value">3</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat">
|
||||||
|
<div class="dt-stat-label">Total findings</div>
|
||||||
|
<div class="dt-stat-value">14</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat is-warn">
|
||||||
|
<div class="dt-stat-label">Warnings</div>
|
||||||
|
<div class="dt-stat-value">9 <span class="dt-stat-unit">to review</span></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat is-info">
|
||||||
|
<div class="dt-stat-label">Info</div>
|
||||||
|
<div class="dt-stat-value">5 <span class="dt-stat-unit">suggestions</span></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #1 -->
|
||||||
|
<div class="dt-card">
|
||||||
|
<div class="dt-finding-group-head">
|
||||||
|
<span class="dt-finding-group-chevron" style="transform:rotate(90deg)">chevron_right</span>
|
||||||
|
<span class="dt-severity-dot warn"></span>
|
||||||
|
<span class="dt-group-filename">customers_export.csv</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill warn">6 warnings</span>
|
||||||
|
<span class="dt-count-pill info">2 info</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-finding-row">
|
||||||
|
<span class="dt-finding-icon warn"><span class="dt-mi">priority_high</span></span>
|
||||||
|
<div class="dt-finding-body">
|
||||||
|
<p class="dt-finding-title"><strong>312 duplicate rows</strong> across exact + near matches</p>
|
||||||
|
<p class="dt-finding-meta">column: email · Find Duplicates →</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-finding-row">
|
||||||
|
<span class="dt-finding-icon warn"><span class="dt-mi">format_color_text</span></span>
|
||||||
|
<div class="dt-finding-body">
|
||||||
|
<p class="dt-finding-title"><strong>1,204 cells</strong> with leading / trailing whitespace</p>
|
||||||
|
<p class="dt-finding-meta">columns: name, city · Clean Text →</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-finding-row">
|
||||||
|
<span class="dt-finding-icon info"><span class="dt-mi">event</span></span>
|
||||||
|
<div class="dt-finding-body">
|
||||||
|
<p class="dt-finding-title">Mixed date formats in <strong>signup_date</strong></p>
|
||||||
|
<p class="dt-finding-meta">3 formats detected · Standardize Formats →</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #2 (collapsed) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:16px">
|
||||||
|
<div class="dt-finding-group-head" style="margin-bottom:-16px;border-radius:var(--r-lg);border-bottom:none">
|
||||||
|
<span class="dt-finding-group-chevron">chevron_right</span>
|
||||||
|
<span class="dt-severity-dot warn"></span>
|
||||||
|
<span class="dt-group-filename">q3_transactions.xlsx</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill warn">3 warnings</span>
|
||||||
|
<span class="dt-count-pill info">3 info</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #3 (clean) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:16px">
|
||||||
|
<div class="dt-finding-group-head" style="margin-bottom:-16px;border-radius:var(--r-lg);border-bottom:none">
|
||||||
|
<span class="dt-severity-dot success"></span>
|
||||||
|
<span class="dt-group-filename">vendor_list.csv</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill success">no issues</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
71
layout-review/index.html
Normal file
71
layout-review/index.html
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>DataTools — Layout Review</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
<style>
|
||||||
|
.lr-wrap { max-width: 960px; margin: 0 auto; padding: 48px 32px 80px; }
|
||||||
|
.lr-grid { display: grid; grid-template-columns: repeat(2, 1fr); gap: 14px; margin-top: 18px; }
|
||||||
|
.lr-card { display: flex; align-items: center; gap: 14px; background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); padding: 16px 18px; box-shadow: 0 1px 2px rgba(28,25,23,0.03); text-decoration: none; transition: border-color .12s ease, box-shadow .12s ease; }
|
||||||
|
.lr-card:hover { border-color: var(--border-strong); box-shadow: 0 2px 8px rgba(28,25,23,0.06); text-decoration: none; }
|
||||||
|
.lr-ico { width: 40px; height: 40px; border-radius: var(--r-md); background: var(--accent-fill); color: var(--accent); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.lr-ico .dt-mi { font-family: "Material Symbols Outlined"; font-size: 22px; }
|
||||||
|
.lr-body { min-width: 0; }
|
||||||
|
.lr-name { font-size: 15px; font-weight: 600; color: var(--ink); letter-spacing: -0.01em; display:flex; align-items:center; gap:8px; }
|
||||||
|
.lr-desc { font-size: 12.5px; color: var(--ink-secondary); margin-top: 2px; line-height: 1.45; }
|
||||||
|
.lr-sec { font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); font-weight: 600; margin: 26px 0 2px; }
|
||||||
|
.lr-soon { font-size: 9px; font-weight: 600; letter-spacing: .06em; text-transform: uppercase; color: var(--ink-tertiary); border: 1px solid var(--border-strong); border-radius: 999px; padding: 1px 6px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="lr-wrap">
|
||||||
|
<header class="dt-page-header">
|
||||||
|
<div class="dt-page-brand">
|
||||||
|
<div class="dt-page-brand-row">
|
||||||
|
<div class="dt-page-brand-mark">D</div>
|
||||||
|
<div class="dt-page-brand-words">
|
||||||
|
<span class="dt-page-eyebrow">UNALOGIX · LAYOUT REVIEW</span>
|
||||||
|
<h1 class="dt-page-wordmark">DataTools</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-page-subtitle">Static HTML reproductions of every tool page, built from the live app's design tokens for human review of layouts.</p>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>These are faithful static mockups — not the running Streamlit app. Colors, type scale, spacing, and components are copied verbatim from <code>theme.py</code> and <code>components/_legacy.py</code>. Each page is shown in a representative <strong>populated</strong> state so the layout can be reviewed end-to-end. Fonts load from Google Fonts (needs network); the chrome (sidebar + footer) is shared across every page.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Analysis</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="home.html"><span class="lr-ico"><span class="dt-mi">insert_chart_outlined</span></span><span class="lr-body"><span class="lr-name">File Analysis (Home)</span><span class="lr-desc">Import files, run the analyzer, browse per-file findings.</span></span></a>
|
||||||
|
<a class="lr-card" href="11_reconciler.html"><span class="lr-ico"><span class="dt-mi">compare_arrows</span></span><span class="lr-body"><span class="lr-name">Reconcile Two Files</span><span class="lr-desc">Compare two lists of transactions and flag what doesn't match.</span></span></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Data Cleaners</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="04_missing_handler.html"><span class="lr-ico"><span class="dt-mi">help_outline</span></span><span class="lr-body"><span class="lr-name">Fix Missing Values</span><span class="lr-desc">Find blank cells (even hidden ones) and fill them in or remove them.</span></span></a>
|
||||||
|
<a class="lr-card" href="06_outlier_detector.html"><span class="lr-ico"><span class="dt-mi">insights</span></span><span class="lr-body"><span class="lr-name">Find Unusual Values <span class="lr-soon">Soon</span></span><span class="lr-desc">Spot values that look wrong — too high, too low, or rule-breaking.</span></span></a>
|
||||||
|
<a class="lr-card" href="02_text_cleaner.html"><span class="lr-ico"><span class="dt-mi">text_format</span></span><span class="lr-body"><span class="lr-name">Clean Text</span><span class="lr-desc">Trim extra spaces and strip out odd characters.</span></span></a>
|
||||||
|
<a class="lr-card" href="03_format_standardizer.html"><span class="lr-ico"><span class="dt-mi">format_list_bulleted</span></span><span class="lr-body"><span class="lr-name">Standardize Formats</span><span class="lr-desc">Make dates, phones, currency, and names look the same throughout.</span></span></a>
|
||||||
|
<a class="lr-card" href="01_deduplicator.html"><span class="lr-ico"><span class="dt-mi">search</span></span><span class="lr-body"><span class="lr-name">Find Duplicates</span><span class="lr-desc">Find rows that repeat, then keep one and remove the extras.</span></span></a>
|
||||||
|
<a class="lr-card" href="08_validator_reporter.html"><span class="lr-ico"><span class="dt-mi">check_circle</span></span><span class="lr-body"><span class="lr-name">Quality Check <span class="lr-soon">Soon</span></span><span class="lr-desc">Check your file against rules and export a PDF or Excel report.</span></span></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Transformations</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="05_column_mapper.html"><span class="lr-ico"><span class="dt-mi">view_column</span></span><span class="lr-body"><span class="lr-name">Map Columns</span><span class="lr-desc">Rename columns, reorder, and set each one as text, number, or date.</span></span></a>
|
||||||
|
<a class="lr-card" href="07_multi_file_merger.html"><span class="lr-ico"><span class="dt-mi">account_tree</span></span><span class="lr-body"><span class="lr-name">Combine Files <span class="lr-soon">Soon</span></span><span class="lr-desc">Combine several CSV or Excel files into one — even if columns differ.</span></span></a>
|
||||||
|
<a class="lr-card" href="10_pdf_extractor.html"><span class="lr-ico"><span class="dt-mi">picture_as_pdf</span></span><span class="lr-body"><span class="lr-name">PDF to CSV</span><span class="lr-desc">Pull transactions out of bank-statement PDFs into a clean CSV file.</span></span></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Automations</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="09_pipeline_runner.html"><span class="lr-ico"><span class="dt-mi">auto_awesome</span></span><span class="lr-body"><span class="lr-name">Automated Workflows</span><span class="lr-desc">Run several tools in a row — save the steps and reuse them anytime.</span></span></a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
74
layout-review/shell.js
Normal file
74
layout-review/shell.js
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
/* Shared app chrome (sidebar nav + sticky footer) for the static layout
|
||||||
|
review pages. Mirrors src/gui/app.py:_build_navigation() ordering and
|
||||||
|
src/gui/components/_legacy.py:render_sticky_footer(). Each page sets
|
||||||
|
<body data-page="<tool_id|home>"> to mark the active nav item. */
|
||||||
|
(function () {
  // Sections + entries in the same order app.py registers them.
  // `soon: true` marks an entry that gets the "Soon" tag and the
  // `is-soon` modifier class.
  var NAV = [
    { label: "Analysis", items: [
      { id: "home", icon: "insert_chart_outlined", name: "File Analysis", href: "home.html" },
      { id: "11_reconciler", icon: "compare_arrows", name: "Reconcile Two Files", href: "11_reconciler.html" },
    ]},
    { label: "Data Cleaners", items: [
      { id: "04_missing_handler", icon: "help_outline", name: "Fix Missing Values", href: "04_missing_handler.html" },
      { id: "06_outlier_detector", icon: "insights", name: "Find Unusual Values", href: "06_outlier_detector.html", soon: true },
      { id: "02_text_cleaner", icon: "text_format", name: "Clean Text", href: "02_text_cleaner.html" },
      { id: "03_format_standardizer", icon: "format_list_bulleted", name: "Standardize Formats", href: "03_format_standardizer.html" },
      { id: "01_deduplicator", icon: "search", name: "Find Duplicates", href: "01_deduplicator.html" },
      { id: "08_validator_reporter", icon: "check_circle", name: "Quality Check", href: "08_validator_reporter.html", soon: true },
    ]},
    { label: "Transformations", items: [
      { id: "05_column_mapper", icon: "view_column", name: "Map Columns", href: "05_column_mapper.html" },
      { id: "07_multi_file_merger", icon: "account_tree", name: "Combine Files", href: "07_multi_file_merger.html", soon: true },
      { id: "10_pdf_extractor", icon: "picture_as_pdf", name: "PDF to CSV", href: "10_pdf_extractor.html" },
    ]},
    { label: "Automations", items: [
      { id: "09_pipeline_runner", icon: "auto_awesome", name: "Automated Workflows", href: "09_pipeline_runner.html" },
    ]},
  ];

  // Glyph shown next to every section label. The previous per-section
  // ternary returned this same character on both branches, so it is a
  // plain constant here.
  var SECTION_INDICATOR = "−";

  // Id of the current page, taken from <body data-page="...">; an empty
  // string means no nav entry is highlighted.
  var active = document.body.getAttribute("data-page") || "";

  // ---- Sidebar -----------------------------------------------------------
  var sb = document.getElementById("dt-sidebar");
  if (sb) {
    // Brand link, then one heading per section followed by its links.
    var html = '' +
      '<a class="dt-brand" href="index.html" style="text-decoration:none">' +
        '<span class="dt-brand-mark">D</span>' +
        '<span class="dt-brand-name">' +
          '<span class="dt-brand-eyebrow">UNALOGIX</span>' +
          '<span class="dt-brand-word">DataTools</span>' +
        '</span>' +
      '</a>' +
      '<nav class="dt-nav">';

    NAV.forEach(function (sec) {
      html += '<div class="dt-nav-section">' + sec.label +
        '<span class="dt-nav-indicator">' + SECTION_INDICATOR + '</span></div>';

      sec.items.forEach(function (it) {
        var cls = "dt-nav-link" + (it.id === active ? " is-active" : "") + (it.soon ? " is-soon" : "");
        html += '<a class="' + cls + '" href="' + it.href + '">' +
          '<span class="dt-mi">' + it.icon + '</span>' +
          '<span>' + it.name + '</span>' +
          (it.soon ? '<span class="dt-nav-soon-tag">Soon</span>' : '') +
          '</a>';
      });
    });

    // Static sidebar footer: language selector stub and license badge.
    html += '</nav>' +
      '<div class="dt-sidebar-foot">' +
        '<div><div class="dt-sidebar-label">Language</div>' +
        '<div class="dt-select" style="pointer-events:none">English</div></div>' +
        '<div class="dt-license-badge">Core · 1,820 days left</div>' +
      '</div>';

    sb.innerHTML = html;
  }

  // ---- Sticky footer -----------------------------------------------------
  var ft = document.getElementById("dt-footer");
  if (ft) {
    ft.innerHTML =
      '<a class="dt-footer-btn" href="index.html"><span class="dt-mi">close</span>Close</a>' +
      '<button class="dt-footer-btn" type="button"><span class="dt-mi">help_outline</span>Help</button>' +
      '<span style="margin-left:auto;font-size:11.5px;color:var(--ink-tertiary)">DataTools · local-first · static layout preview</span>';
  }
})();
|
||||||
@@ -12,9 +12,14 @@ markers =
|
|||||||
e2e: end-to-end CLI / integration tests
|
e2e: end-to-end CLI / integration tests
|
||||||
install: import / dependency sanity tests
|
install: import / dependency sanity tests
|
||||||
fixture_sweep: parametrized sweep over the test-cases/ folder
|
fixture_sweep: parametrized sweep over the test-cases/ folder
|
||||||
|
gui: Streamlit AppTest-driven tests (live in tests/gui/)
|
||||||
|
|
||||||
# Warnings discipline: fail on unexpected DeprecationWarning from our own
|
# Warnings discipline: fail on any DeprecationWarning *or* ResourceWarning
|
||||||
# code, but tolerate third-party deprecations that we can't fix.
|
# from our own ``src`` package so a leaked file handle or stale stdlib call
|
||||||
|
# can't slip in unnoticed. Tolerate third-party deprecations / resource
|
||||||
|
# warnings — we can't fix pandas / openpyxl / streamlit churn from here.
|
||||||
filterwarnings =
|
filterwarnings =
|
||||||
error::DeprecationWarning:src
|
error::DeprecationWarning:src
|
||||||
|
error::ResourceWarning:src
|
||||||
ignore::DeprecationWarning
|
ignore::DeprecationWarning
|
||||||
|
ignore::ResourceWarning
|
||||||
|
|||||||
@@ -1,2 +1,6 @@
|
|||||||
pytest>=8.0,<9
|
pytest>=8.0,<9
|
||||||
pytest-cov>=5.0,<6
|
pytest-cov>=5.0,<6
|
||||||
|
# Test-only: generate small fixture PDFs in
|
||||||
|
# tests/test_pdf_extract_smoke.py so we can exercise pdfplumber +
|
||||||
|
# pypdfium2 end-to-end without committing binary fixtures.
|
||||||
|
fpdf2==2.8.7
|
||||||
|
|||||||
@@ -8,3 +8,16 @@ tqdm>=4.66,<5
|
|||||||
typer>=0.12,<1
|
typer>=0.12,<1
|
||||||
phonenumbers>=8.13,<9
|
phonenumbers>=8.13,<9
|
||||||
streamlit>=1.35,<2
|
streamlit>=1.35,<2
|
||||||
|
cryptography>=41,<49
|
||||||
|
# PDF Extractor stack — pinned to exact tested versions so a future
|
||||||
|
# upstream release can't quietly change pdfplumber's word-position
|
||||||
|
# behavior or pypdfium2's OCR rendering mid-build. Bump these
|
||||||
|
# explicitly when re-testing against a new release.
|
||||||
|
#
|
||||||
|
# ``pypdfium2`` is here for the OCR fallback path only (rasterizing
|
||||||
|
# pages to images for Tesseract). The drawable-canvas dep was
|
||||||
|
# removed when the visual picker was ripped out — the scanner is
|
||||||
|
# pure heuristic now, no coordinate UI.
|
||||||
|
pdfplumber==0.11.9
|
||||||
|
pypdfium2==5.8.0
|
||||||
|
pytesseract==0.3.13
|
||||||
|
|||||||
106
scripts/generate_keypair.py
Normal file
106
scripts/generate_keypair.py
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Generate a fresh Ed25519 keypair for production license signing.
|
||||||
|
|
||||||
|
**Creator-only.** Run once, write the private key somewhere safe,
|
||||||
|
configure the build pipeline with the public key.
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
python scripts/generate_keypair.py
|
||||||
|
python scripts/generate_keypair.py --json
|
||||||
|
python scripts/generate_keypair.py --output keys.txt
|
||||||
|
|
||||||
|
The output looks like::
|
||||||
|
|
||||||
|
DATATOOLS_LICENSE_PRIVKEY=<64 hex chars> # KEEP SECRET
|
||||||
|
DATATOOLS_LICENSE_PUBKEY=<64 hex chars> # BAKE INTO BUILD
|
||||||
|
|
||||||
|
The private key never goes near the buyer-facing binary. Stash it in
|
||||||
|
a password manager / KMS / hardware token; the only places it gets
|
||||||
|
loaded are:
|
||||||
|
|
||||||
|
- ``scripts/generate_license.py`` when minting a buyer's blob
|
||||||
|
- Your CI's signing step, if you've automated blob minting
|
||||||
|
|
||||||
|
The public key gets set as ``DATATOOLS_LICENSE_PUBKEY`` in the
|
||||||
|
PyInstaller build env (so the shipped binary verifies against it),
|
||||||
|
and the production-safe runtime check refuses to start any frozen
|
||||||
|
build that's still using the in-source dev key.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from cryptography.hazmat.primitives import serialization
|
||||||
|
from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey
|
||||||
|
|
||||||
|
|
||||||
|
def generate() -> tuple[str, str]:
    """Create a brand-new Ed25519 keypair and hand it back hex-encoded.

    Returns:
        ``(private_hex, public_hex)`` — the hex form of each key's raw
        byte encoding. The private half is serialized unencrypted, so the
        caller is responsible for keeping it secret.
    """
    signing_key = Ed25519PrivateKey.generate()
    verify_key = signing_key.public_key()

    raw_private = signing_key.private_bytes(
        encoding=serialization.Encoding.Raw,
        format=serialization.PrivateFormat.Raw,
        encryption_algorithm=serialization.NoEncryption(),
    )
    raw_public = verify_key.public_bytes(
        encoding=serialization.Encoding.Raw,
        format=serialization.PublicFormat.Raw,
    )
    return raw_private.hex(), raw_public.hex()
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
    """Generate a keypair and emit it in env-file or JSON form.

    The keypair goes to stdout, or to ``--output`` when given. Human-facing
    status and the "next steps" hint always go to stderr so stdout stays
    machine-readable.

    Args:
        argv: Command-line arguments without the program name. ``None``
            lets :mod:`argparse` fall back to ``sys.argv``.

    Returns:
        Process exit code; ``0`` once the keypair has been emitted.

    Raises:
        OSError: If ``--output`` was given and the file cannot be created
            or written.
    """
    import os  # function-scope: this script does not import os at top level

    # ``__doc__`` is None under ``python -OO``; fall back to a fixed summary
    # instead of crashing before any key is generated.
    summary = (
        __doc__ or "Generate a fresh Ed25519 keypair for production license signing."
    ).splitlines()[0]
    p = argparse.ArgumentParser(description=summary)
    p.add_argument("--json", action="store_true", help="Emit JSON instead of env-file format.")
    p.add_argument("--output", "-o", type=Path, default=None, help="Write to this file instead of stdout.")
    args = p.parse_args(argv)

    priv_hex, pub_hex = generate()

    if args.json:
        payload = json.dumps(
            {"private_key": priv_hex, "public_key": pub_hex},
            indent=2,
        )
    else:
        payload = (
            f"# DataTools license keypair — generated by generate_keypair.py\n"
            f"# KEEP THE PRIVATE KEY SECRET. Lose it and your existing\n"
            f"# licenses can't be renewed (you'd have to ship a new build\n"
            f"# with a new public key and re-issue every active license).\n"
            f"\n"
            f"DATATOOLS_LICENSE_PRIVKEY={priv_hex}\n"
            f"DATATOOLS_LICENSE_PUBKEY={pub_hex}\n"
        )

    if args.output:
        # Create the file owner-only from the start. Writing first and
        # chmod-ing afterwards leaves a window in which the private key is
        # readable under the default umask.
        fd = os.open(args.output, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            # The creation mode is ignored for a pre-existing file, so also
            # tighten it explicitly — before the secret is written.
            # Best-effort: chmod has little effect on Windows.
            restricted = True
            try:
                args.output.chmod(0o600)
            except OSError:
                restricted = False
            f.write(payload + "\n")
        if restricted:
            print(f"Wrote {args.output} (mode 600)", file=sys.stderr)
        else:
            # Don't claim a mode we failed to set.
            print(
                f"Wrote {args.output} (WARNING: could not restrict permissions to 600)",
                file=sys.stderr,
            )
    else:
        print(payload)

    print(
        "\nNext steps:\n"
        "  1. Store the private key in your password manager.\n"
        "  2. Bake the public key into the PyInstaller build:\n"
        "       DATATOOLS_LICENSE_PUBKEY=<pubkey> pyinstaller ...\n"
        "  3. Mint buyer licenses by setting the private key:\n"
        "       DATATOOLS_LICENSE_PRIVKEY=<privkey> "
        "python scripts/generate_license.py --name 'Buyer' --email b@x.com\n",
        file=sys.stderr,
    )
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
215
scripts/generate_license.py
Normal file
215
scripts/generate_license.py
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Mint a signed license blob for a buyer (LOCAL, break-glass).
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
|
||||||
|
This script mints **locally**, without going through the license
|
||||||
|
server. Prefer :mod:`src.admin_cli` (``datatools-admin mint``)
|
||||||
|
for routine work — it writes to the authoritative ``licenses``
|
||||||
|
Postgres table and emits the same blob.
|
||||||
|
|
||||||
|
Reach for this script only when the server is unreachable and a
|
||||||
|
buyer needs a license *right now*. Mints from here land in the
|
||||||
|
local issuance JSONL log; you'll need to reconcile them into the
|
||||||
|
server's DB afterwards.
|
||||||
|
|
||||||
|
Creator-only tool. Signs with the Ed25519 private key from
|
||||||
|
``$DATATOOLS_LICENSE_PRIVKEY`` (production) or the in-tree dev key
|
||||||
|
(local development).
|
||||||
|
|
||||||
|
Every successful mint also appends a record to the issuance log at
|
||||||
|
``~/.datatools-creator/issued.jsonl`` (override with
|
||||||
|
``$DATATOOLS_ISSUANCE_LOG``). That log is the creator-side system of
|
||||||
|
record for "who has a license" — useful for re-delivery, support, and
|
||||||
|
as the source for reconciling these mints into the server-side ``licenses`` table.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
|
||||||
|
Mint a 1-year CORE license for Jane Doe::
|
||||||
|
|
||||||
|
python scripts/generate_license.py \\
|
||||||
|
--name "Jane Doe" --email jane@example.com --tier core
|
||||||
|
|
||||||
|
Mint a 2-year PRO license and write the blob to a file::
|
||||||
|
|
||||||
|
python scripts/generate_license.py \\
|
||||||
|
--name "Acme Corp" --email ops@acme.com --tier pro \\
|
||||||
|
--years 2 --output acme.dtlic
|
||||||
|
|
||||||
|
Mint with the production key (CI / manual fulfillment)::
|
||||||
|
|
||||||
|
DATATOOLS_LICENSE_PRIVKEY=<prod-private-hex> \\
|
||||||
|
python scripts/generate_license.py --name ... --email ...
|
||||||
|
|
||||||
|
The output is a single base64-encoded token starting with ``DTLIC2:``
|
||||||
|
— paste this whole string into the buyer's delivery email or
|
||||||
|
deliver as an attached ``.dtlic`` file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Make ``src.license`` importable when run from the repo root.
|
||||||
|
_PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
if str(_PROJECT_ROOT) not in sys.path:
|
||||||
|
sys.path.insert(0, str(_PROJECT_ROOT))
|
||||||
|
|
||||||
|
from src.license import Tier # noqa: E402
|
||||||
|
from src.license.crypto import encode_blob, sign # noqa: E402
|
||||||
|
from src.license.features import all_features_for_tier # noqa: E402
|
||||||
|
from src.license.schema import ( # noqa: E402
|
||||||
|
License,
|
||||||
|
_utcnow_iso,
|
||||||
|
default_expiry_iso,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def default_issuance_log() -> Path:
    """Resolve where the creator-side issuance ledger lives.

    A non-empty ``$DATATOOLS_ISSUANCE_LOG`` takes priority (handy for tests
    or for pointing at a shared / encrypted volume); it is user-expanded
    and resolved to an absolute path. Otherwise the ledger is
    ``~/.datatools-creator/issued.jsonl``, deliberately kept apart from the
    buyer-facing ``~/.datatools/`` directory so it never ends up bundled
    into a shipped install.

    Returns:
        The ledger path. Nothing is created on disk by this call.
    """
    configured = os.environ.get("DATATOOLS_ISSUANCE_LOG")
    if not configured:
        return Path.home().joinpath(".datatools-creator", "issued.jsonl")
    return Path(configured).expanduser().resolve()
|
||||||
|
|
||||||
|
|
||||||
|
def append_issuance_log(record: dict, *, path: Path | None = None) -> Path | None:
    """Append *record* as one JSON line to the issuance ledger, best-effort.

    This never raises for I/O problems: by the time it runs the blob has
    already been minted, and a missing ledger row is preferable to aborting
    after a successful mint and leaving the creator unsure whether to
    re-mint.

    Args:
        record: JSON-serializable mapping; written with sorted keys.
        path: Ledger file to append to. Defaults to
            :func:`default_issuance_log`.

    Returns:
        The ledger path on success, or ``None`` if creating the directory
        or writing the file failed (a warning is printed to stderr).
    """
    ledger = path or default_issuance_log()

    try:
        ledger.parent.mkdir(parents=True, exist_ok=True)
        with ledger.open("a", encoding="utf-8") as handle:
            handle.write(json.dumps(record, sort_keys=True) + "\n")
    except OSError as exc:
        print(
            f"WARNING: could not write issuance log at {ledger}: {exc}\n"
            "         The blob above is still valid — record the mint "
            "manually.",
            file=sys.stderr,
        )
        return None

    # Tightening permissions is a nicety; failing to do so must not turn a
    # successful append into a reported failure.
    try:
        ledger.chmod(0o600)
    except OSError:
        pass
    return ledger
|
||||||
|
|
||||||
|
|
||||||
|
def build_args() -> argparse.ArgumentParser:
    """Assemble the command-line parser for the local mint tool.

    Returns:
        A parser exposing ``--name`` and ``--email`` (both required),
        ``--tier``, ``--years``, ``--key``, ``--output``/``-o`` and
        ``--no-log``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Mint a signed DataTools license blob.",
    )

    # Declared as data so the option list reads top-to-bottom like the
    # --help output; registration order is preserved.
    option_specs = [
        (("--name",), {"required": True, "help": "Buyer's full name."}),
        (("--email",), {"required": True, "help": "Buyer's email."}),
        (
            ("--tier",),
            {
                "default": Tier.CORE.value,
                "choices": [member.value for member in Tier],
                "help": "License tier (default: %(default)s).",
            },
        ),
        (
            ("--years",),
            {
                "type": int,
                "default": 1,
                "help": "License lifetime in years (default: %(default)s).",
            },
        ),
        (
            ("--key",),
            {
                "default": None,
                "help": "Override the auto-generated license key (default: random).",
            },
        ),
        (
            ("--output", "-o"),
            {
                "type": Path,
                "default": None,
                "help": "Write the blob to this file (default: print to stdout).",
            },
        ),
        (
            ("--no-log",),
            {
                "action": "store_true",
                "help": (
                    "Skip writing to the issuance log. Use for one-off test "
                    "mints; do NOT use for real buyer fulfillment."
                ),
            },
        ),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv: list[str] | None = None) -> int:
    """Mint one license locally, record it, and emit the blob.

    Steps: parse options, build the unsigned :class:`License`, sign its
    canonical form, encode the signed payload as a blob, append a ledger
    row (unless ``--no-log``), then write the blob to ``--output`` or
    stdout. A human-readable summary goes to stderr.

    Args:
        argv: Command-line arguments without the program name. ``None``
            lets :mod:`argparse` fall back to ``sys.argv``.

    Returns:
        Process exit code; ``0`` on success.
    """
    opts = build_args().parse_args(argv)
    tier = Tier(opts.tier)

    # A random id is always drawn; it is only used when --key is absent.
    token = uuid.uuid4().hex
    license_key = opts.key or f"DT1-{tier.value.upper()}-{token[:8]}-{token[8:16]}"

    lic = License(
        name=opts.name,
        email=opts.email,
        license_key=license_key,
        tier=tier,
        features=all_features_for_tier(tier),
        issued_at=_utcnow_iso(),
        expires_at=default_expiry_iso(years=opts.years),
        signature="",
    )

    # Sign the canonical form, then attach the signature to a second
    # canonical dict to build the payload that gets encoded.
    signature = sign(lic.to_canonical_dict())
    payload = lic.to_canonical_dict()
    payload["signature"] = signature
    blob = encode_blob(payload)

    log_path = None
    if not opts.no_log:
        ledger_row = {
            "license_key": lic.license_key,
            "name": lic.name,
            "email": lic.email,
            "tier": lic.tier.value,
            "issued_at": lic.issued_at,
            "expires_at": lic.expires_at,
            "blob": blob,
        }
        log_path = append_issuance_log(ledger_row)

    if not opts.output:
        print(blob)
    else:
        opts.output.write_text(blob + "\n", encoding="utf-8")
        print(f"Wrote license to {opts.output}", file=sys.stderr)

    summary_lines = [
        f"  name:    {lic.name}",
        f"  email:   {lic.email}",
        f"  tier:    {lic.tier.value}",
        f"  key:     {lic.license_key}",
        f"  expires: {lic.expires_at}",
    ]
    print("\n".join(summary_lines), file=sys.stderr)
    if log_path:
        print(f"  logged:  {log_path}", file=sys.stderr)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
16
server/.dockerignore
Normal file
16
server/.dockerignore
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
**/__pycache__
|
||||||
|
**/*.pyc
|
||||||
|
**/.pytest_cache
|
||||||
|
**/.mypy_cache
|
||||||
|
**/.ruff_cache
|
||||||
|
.git
|
||||||
|
.venv
|
||||||
|
venv
|
||||||
|
docs
|
||||||
|
landing
|
||||||
|
marketing
|
||||||
|
samples
|
||||||
|
test-cases
|
||||||
|
tests
|
||||||
|
logs
|
||||||
|
build
|
||||||
38
server/Dockerfile
Normal file
38
server/Dockerfile
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
# syntax=docker/dockerfile:1.6
|
||||||
|
FROM python:3.12-slim AS base
|
||||||
|
|
||||||
|
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||||
|
PYTHONUNBUFFERED=1 \
|
||||||
|
PIP_NO_CACHE_DIR=1 \
|
||||||
|
PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||||
|
|
||||||
|
RUN apt-get update \
|
||||||
|
&& apt-get install -y --no-install-recommends \
|
||||||
|
curl \
|
||||||
|
libpq5 \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
RUN useradd --system --create-home --shell /usr/sbin/nologin --uid 10001 app
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY server/requirements.txt /app/requirements.txt
|
||||||
|
RUN pip install -r /app/requirements.txt
|
||||||
|
|
||||||
|
# Reused crypto / schema logic from the desktop app — single source of truth.
|
||||||
|
COPY src/license /app/datatools_license
|
||||||
|
|
||||||
|
COPY server/app /app/app
|
||||||
|
COPY server/config /app/config
|
||||||
|
COPY server/alembic /app/alembic
|
||||||
|
COPY server/alembic.ini /app/alembic.ini
|
||||||
|
|
||||||
|
RUN chown -R app:app /app
|
||||||
|
USER app
|
||||||
|
|
||||||
|
EXPOSE 8000
|
||||||
|
|
||||||
|
HEALTHCHECK --interval=30s --timeout=3s --start-period=15s --retries=3 \
|
||||||
|
CMD curl --fail --silent --show-error http://localhost:8000/health || exit 1
|
||||||
|
|
||||||
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--proxy-headers", "--forwarded-allow-ips", "*"]
|
||||||
38
server/alembic.ini
Normal file
38
server/alembic.ini
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
[alembic]
|
||||||
|
script_location = alembic
|
||||||
|
prepend_sys_path = .
|
||||||
|
sqlalchemy.url =
|
||||||
|
|
||||||
|
[loggers]
|
||||||
|
keys = root,sqlalchemy,alembic
|
||||||
|
|
||||||
|
[handlers]
|
||||||
|
keys = console
|
||||||
|
|
||||||
|
[formatters]
|
||||||
|
keys = generic
|
||||||
|
|
||||||
|
[logger_root]
|
||||||
|
level = WARN
|
||||||
|
handlers = console
|
||||||
|
qualname =
|
||||||
|
|
||||||
|
[logger_sqlalchemy]
|
||||||
|
level = WARN
|
||||||
|
handlers =
|
||||||
|
qualname = sqlalchemy.engine
|
||||||
|
|
||||||
|
[logger_alembic]
|
||||||
|
level = INFO
|
||||||
|
handlers =
|
||||||
|
qualname = alembic
|
||||||
|
|
||||||
|
[handler_console]
|
||||||
|
class = StreamHandler
|
||||||
|
args = (sys.stderr,)
|
||||||
|
level = NOTSET
|
||||||
|
formatter = generic
|
||||||
|
|
||||||
|
[formatter_generic]
|
||||||
|
format = %(levelname)-5.5s [%(name)s] %(message)s
|
||||||
|
datefmt = %H:%M:%S
|
||||||
46
server/alembic/env.py
Normal file
46
server/alembic/env.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""Alembic environment.
|
||||||
|
|
||||||
|
Reads the runtime database URL from ``app.db`` (which resolves the
|
||||||
|
password from the secrets file), so ``alembic upgrade head`` Just
|
||||||
|
Works inside the API container with no extra env wiring.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from logging.config import fileConfig
|
||||||
|
|
||||||
|
from alembic import context
|
||||||
|
|
||||||
|
from app.db import Base, engine
|
||||||
|
from app import models # noqa: F401 — imported for side-effect of registering models
|
||||||
|
|
||||||
|
config = context.config
|
||||||
|
|
||||||
|
if config.config_file_name is not None:
|
||||||
|
fileConfig(config.config_file_name)
|
||||||
|
|
||||||
|
target_metadata = Base.metadata
|
||||||
|
|
||||||
|
|
||||||
|
def run_migrations_offline() -> None:
    """Run migrations in Alembic's offline mode.

    Alembic is configured from the engine's URL string rather than a live
    connection, with bound values rendered inline (``literal_binds``) and
    the ``named`` paramstyle. The migrations then run inside a single
    Alembic transaction block.
    """
    offline_options = {
        "url": str(engine.url),
        "target_metadata": target_metadata,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
    }
    context.configure(**offline_options)

    with context.begin_transaction():
        context.run_migrations()
|
||||||
|
|
||||||
|
|
||||||
|
def run_migrations_online() -> None:
    """Run migrations against a live connection from the application engine.

    The connection is opened from ``engine`` and closed when the ``with``
    block exits; migrations run inside a single Alembic transaction block.
    """
    with engine.connect() as live_connection:
        online_options = {
            "connection": live_connection,
            "target_metadata": target_metadata,
        }
        context.configure(**online_options)

        with context.begin_transaction():
            context.run_migrations()
|
||||||
|
|
||||||
|
|
||||||
|
if context.is_offline_mode():
|
||||||
|
run_migrations_offline()
|
||||||
|
else:
|
||||||
|
run_migrations_online()
|
||||||
26
server/alembic/script.py.mako
Normal file
26
server/alembic/script.py.mako
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
"""${message}
|
||||||
|
|
||||||
|
Revision ID: ${up_revision}
|
||||||
|
Revises: ${down_revision | comma,n}
|
||||||
|
Create Date: ${create_date}
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
${imports if imports else ""}
|
||||||
|
|
||||||
|
revision: str = ${repr(up_revision)}
|
||||||
|
down_revision: Union[str, None] = ${repr(down_revision)}
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)}
|
||||||
|
depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)}
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
${upgrades if upgrades else "pass"}
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
|
||||||
|
${downgrades if downgrades else "pass"}
|
||||||
80
server/alembic/versions/0001_initial.py
Normal file
80
server/alembic/versions/0001_initial.py
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
"""Initial schema — licenses + gumroad_events.
|
||||||
|
|
||||||
|
Revision ID: 0001_initial
|
||||||
|
Revises:
|
||||||
|
Create Date: 2026-05-14
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
revision: str = "0001_initial"
|
||||||
|
down_revision: Union[str, None] = None
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Create the initial schema: ``licenses`` and ``gumroad_events``.

    ``licenses`` is keyed by ``license_key`` and carries a unique constraint
    on ``(source, source_order_id)``. It gets an expression index on
    ``lower(email)`` and a partial index on ``expires_at`` restricted to
    non-revoked rows.

    ``gumroad_events`` stores each received event with its raw JSONB
    payload. It gets an index on ``order_id`` and a partial index on
    ``received_at`` restricted to unprocessed rows.
    """

    def tz_timestamp():
        # A fresh type object per column.
        return sa.DateTime(timezone=True)

    def db_now():
        # Server-side default: the database clock at insert time.
        return sa.text("now()")

    # --- licenses ------------------------------------------------------------
    license_columns = [
        sa.Column("license_key", sa.String(), primary_key=True),
        sa.Column("name", sa.String(), nullable=False),
        sa.Column("email", sa.String(), nullable=False),
        sa.Column("tier", sa.String(), nullable=False),
        sa.Column("issued_at", tz_timestamp(), nullable=False),
        sa.Column("expires_at", tz_timestamp(), nullable=False),
        sa.Column("blob", sa.String(), nullable=False),
        sa.Column("source", sa.String(), nullable=False),
        sa.Column("source_order_id", sa.String(), nullable=True),
        sa.Column("promotion", sa.String(), nullable=True),
        sa.Column("amount_paid", sa.Numeric(10, 2), nullable=True),
        sa.Column("currency", sa.String(length=3), server_default=sa.text("'USD'"), nullable=True),
        sa.Column("revoked_at", tz_timestamp(), nullable=True),
        sa.Column("notes", sa.String(), nullable=True),
        sa.Column("created_at", tz_timestamp(), server_default=db_now(), nullable=False),
        sa.Column("updated_at", tz_timestamp(), server_default=db_now(), nullable=False),
    ]
    op.create_table(
        "licenses",
        *license_columns,
        sa.UniqueConstraint("source", "source_order_id", name="uq_licenses_source_order"),
    )
    op.create_index("ix_licenses_email_lower", "licenses", [sa.text("lower(email)")])
    op.create_index(
        "ix_licenses_expires_active",
        "licenses",
        ["expires_at"],
        postgresql_where=sa.text("revoked_at IS NULL"),
    )

    # --- gumroad_events ------------------------------------------------------
    event_columns = [
        sa.Column("id", sa.BigInteger(), primary_key=True, autoincrement=True),
        sa.Column("received_at", tz_timestamp(), server_default=db_now(), nullable=False),
        sa.Column("event_type", sa.String(), nullable=False),
        sa.Column("order_id", sa.String(), nullable=True),
        sa.Column("raw_payload", postgresql.JSONB(), nullable=False),
        sa.Column("processed", sa.Boolean(), server_default=sa.text("false"), nullable=False),
        sa.Column("error", sa.String(), nullable=True),
    ]
    op.create_table("gumroad_events", *event_columns)
    op.create_index("ix_gumroad_events_order_id", "gumroad_events", ["order_id"])
    op.create_index(
        "ix_gumroad_events_unprocessed",
        "gumroad_events",
        ["received_at"],
        postgresql_where=sa.text("processed = false"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop everything :func:`upgrade` created, in reverse order.

    For each table the indexes are dropped first, then the table itself;
    ``gumroad_events`` goes before ``licenses``.
    """
    teardown_plan = (
        ("gumroad_events", ("ix_gumroad_events_unprocessed", "ix_gumroad_events_order_id")),
        ("licenses", ("ix_licenses_expires_active", "ix_licenses_email_lower")),
    )
    for table, index_names in teardown_plan:
        for index_name in index_names:
            op.drop_index(index_name, table_name=table)
        op.drop_table(table)
|
||||||
0
server/app/__init__.py
Normal file
0
server/app/__init__.py
Normal file
0
server/app/adapters/__init__.py
Normal file
0
server/app/adapters/__init__.py
Normal file
71
server/app/adapters/base.py
Normal file
71
server/app/adapters/base.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
"""Source-adapter interface.
|
||||||
|
|
||||||
|
The Mint API speaks only the normalized event types defined here.
|
||||||
|
Each storefront has its own adapter that:
|
||||||
|
|
||||||
|
- Verifies the storefront's webhook signature in its native format.
|
||||||
|
- Parses the storefront's payload into a :class:`SaleEvent` or
|
||||||
|
:class:`RefundEvent`.
|
||||||
|
- Maps the storefront's product/variant IDs to a license tier via
|
||||||
|
the per-source config in :mod:`app.adapters.config`.
|
||||||
|
|
||||||
|
Adding a new source (Lemon Squeezy, Stripe, Paddle) is one new
|
||||||
|
module that implements :class:`SourceAdapter`. The Mint API and DB
|
||||||
|
do not change.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from decimal import Decimal
|
||||||
|
from typing import Any, Optional, Protocol
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class SaleEvent:
    """A storefront sale, normalized.

    The Mint API consumes this directly — it never reaches into the
    raw storefront payload. Anything storefront-specific that's worth
    keeping is preserved in :attr:`raw_payload` for audit.

    Instances are frozen: fields cannot be reassigned after construction.
    Only the first five fields are required; the rest have defaults.
    """

    source: str  # e.g. "gumroad", "manual"
    source_order_id: Optional[str]  # storefront's order ID; None for manual mints
    buyer_name: str
    buyer_email: str
    tier: str  # mapped from product/variant
    years: int = 1  # presumably the license lifetime in years — confirm against the Mint API
    promotion: Optional[str] = None  # promo / offer code, if one was used
    amount_paid: Optional[Decimal] = None  # None when the amount is unknown
    currency: Optional[str] = "USD"
    notes: Optional[str] = None  # free-form operator-facing annotation
    # Original storefront payload, kept for audit. Each instance gets its
    # own empty dict by default.
    raw_payload: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class RefundEvent:
    """A storefront refund — marks an existing license revoked.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    source: str  # storefront identifier, e.g. "gumroad"
    source_order_id: str  # order being refunded; required, unlike on SaleEvent
    reason: Optional[str] = None  # refund reason, when one is available
    # Original storefront payload, kept alongside the normalized fields.
    # Each instance gets its own empty dict by default.
    raw_payload: dict = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class SourceAdapter(Protocol):
    """Interface every storefront adapter implements.

    This is a structural :class:`typing.Protocol`: an adapter satisfies it
    by providing these members and does not need to inherit from it.
    """

    source_name: str  # storefront identifier, e.g. "gumroad" or "manual"

    def verify_webhook(self, *, body: bytes, headers: dict[str, str]) -> bool:
        """Return True iff the request came from the legitimate storefront.

        Args:
            body: Raw request body bytes.
            headers: Request headers.
        """
        ...

    def parse_sale(self, payload: dict[str, Any]) -> Optional[SaleEvent]:
        """Return a :class:`SaleEvent` if *payload* is a sale, else None."""
        ...

    def parse_refund(self, payload: dict[str, Any]) -> Optional[RefundEvent]:
        """Return a :class:`RefundEvent` if *payload* is a refund, else None."""
        ...
|
||||||
173
server/app/adapters/gumroad.py
Normal file
173
server/app/adapters/gumroad.py
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
"""Gumroad adapter.
|
||||||
|
|
||||||
|
Receives "Ping" notifications from Gumroad — form-encoded POSTs sent
|
||||||
|
when a sale occurs. Gumroad's Ping URL is configured in the seller
|
||||||
|
dashboard (Settings → Advanced → Ping URL).
|
||||||
|
|
||||||
|
Authentication
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Gumroad does not HMAC-sign the body. Their recommended pattern is
|
||||||
|
to put a secret in the URL itself::
|
||||||
|
|
||||||
|
https://licenses.datatools.unalogix.com/webhooks/gumroad?secret=...
|
||||||
|
|
||||||
|
The webhook receiver pulls the secret from the query string and
|
||||||
|
:meth:`GumroadAdapter.verify_webhook` constant-time-compares it
|
||||||
|
against the configured value. If they don't match, the request is
|
||||||
|
dropped with 404 (so a probing attacker can't tell whether the
|
||||||
|
endpoint exists, much less that it's the wrong secret).
|
||||||
|
|
||||||
|
The "test" field
|
||||||
|
----------------
|
||||||
|
|
||||||
|
Gumroad sends ``test=true`` on test pings fired from the dashboard.
|
||||||
|
We treat test pings as real sales (they create licenses just like
|
||||||
|
production sales), but tag them with ``notes='gumroad test ping'``
|
||||||
|
so the operator can filter / delete them later. Refusing test pings
|
||||||
|
would block the standard "Send Test Ping" verification flow.
|
||||||
|
|
||||||
|
Refunds, disputes, cancellations
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
Stubbed for now (``parse_refund`` returns None). Gumroad doesn't
|
||||||
|
include refund signals in the standard sale Ping — refunds arrive
|
||||||
|
via the separate "Resource subscriptions" mechanism. Wiring that
|
||||||
|
in is PR 2.1; until then, refunds are handled by the operator
|
||||||
|
running ``datatools-admin revoke``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hmac
|
||||||
|
from decimal import Decimal
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from app.adapters.base import RefundEvent, SaleEvent
|
||||||
|
from app.products import lookup as product_lookup
|
||||||
|
|
||||||
|
|
||||||
|
class GumroadAdapter:
    """Storefront adapter for Gumroad "Ping" sale notifications.

    Authentication is a shared secret carried in the webhook URL's query
    string, checked by :meth:`verify_secret`. Payload normalization is done
    by :meth:`parse_sale`. Refund parsing is not implemented yet.
    """

    source_name = "gumroad"

    def __init__(self, secret: Optional[str]) -> None:
        """Remember the configured shared secret.

        Args:
            secret: Expected webhook secret. ``None`` or ``""`` means no
                secret is configured, so every :meth:`verify_secret` call
                returns False.
        """
        self._secret = secret

    # --- Auth ----------------------------------------------------------------

    def verify_webhook(self, *, body: bytes, headers: dict[str, str]) -> bool:
        """Not used — Gumroad authentication is via URL query param,
        which only the route handler has direct access to. Call
        :meth:`verify_secret` instead.

        Returns:
            Always False.
        """
        return False

    def verify_secret(self, presented: Optional[str]) -> bool:
        """Constant-time compare against the configured secret.

        Returns False (not an exception) so the route handler can
        decide the response code — we return 404 to avoid signaling
        endpoint existence to an unauthenticated prober.

        Args:
            presented: Secret taken from the request, or ``None``/``""``
                when absent.

        Returns:
            True only when a secret is configured and *presented* equals it.
        """
        if not self._secret or not presented:
            return False
        # hmac.compare_digest() accepts ``str`` only when it is pure ASCII
        # and raises TypeError otherwise. Since *presented* is
        # attacker-controlled, a non-ASCII value would escape this method
        # as an exception instead of a plain False. Comparing the encoded
        # bytes keeps the check constant-time and total; ``surrogatepass``
        # keeps lone surrogates from raising during encoding.
        return hmac.compare_digest(
            presented.encode("utf-8", "surrogatepass"),
            self._secret.encode("utf-8", "surrogatepass"),
        )

    # --- Parsing -------------------------------------------------------------

    def parse_sale(self, payload: dict[str, Any]) -> Optional[SaleEvent]:
        """Parse a Gumroad Ping form-encoded payload into a SaleEvent.

        Args:
            payload: Decoded form fields of the Ping.

        Returns:
            The normalized :class:`SaleEvent`, or ``None`` if the payload
            isn't a sale we can handle — i.e. it lacks ``sale_id``, a
            non-blank ``email``, or any product identifier
            (``product_id`` / ``product_permalink`` / ``permalink``).

        Raises:
            UnmappedProductError: If the product has no mapping. The caller
                should record this in the audit row rather than silently
                drop the sale.
        """
        # Sale pings always include sale_id (the order ID) and email.
        sale_id = payload.get("sale_id")
        # Normalize once, up front, so a whitespace-only address counts as
        # missing and the same cleaned value feeds both the fallback name
        # and buyer_email.
        email = (payload.get("email") or "").strip()
        product_id = (
            payload.get("product_id")
            or payload.get("product_permalink")
            or payload.get("permalink")
        )
        if not (sale_id and email and product_id):
            return None

        mapping = product_lookup(self.source_name, str(product_id))
        if mapping is None:
            # Unmapped product: raise a dedicated error type so the caller
            # can catch it, log it to gumroad_events with error info, and
            # still return 200 (no Gumroad retry storm).
            raise UnmappedProductError(
                f"Gumroad product_id {product_id!r} has no entry in "
                "config/products.yaml. Add a mapping and replay this "
                f"sale (sale_id={sale_id})."
            )

        name = (payload.get("full_name") or "").strip() or _email_local(email)

        # ``price`` is treated as an integer count of cents.
        price_cents = _to_int(payload.get("price"))
        amount_paid = Decimal(price_cents) / Decimal(100) if price_cents is not None else None
        currency = (payload.get("currency") or "USD").upper()
        promotion = (payload.get("offer_code") or "").strip() or None

        # Test pings mint real licenses but are tagged so the operator can
        # filter / delete them later.
        notes = None
        if _is_truthy(payload.get("test")):
            notes = "gumroad test ping"

        return SaleEvent(
            source=self.source_name,
            source_order_id=str(sale_id),
            buyer_name=name,
            buyer_email=email,
            tier=mapping.tier,
            years=mapping.years,
            promotion=promotion,
            amount_paid=amount_paid,
            currency=currency,
            notes=notes,
            raw_payload=dict(payload),
        )

    def parse_refund(self, payload: dict[str, Any]) -> Optional[RefundEvent]:
        """Refund parsing is not implemented yet (planned for PR 2.1).

        Returns:
            Always ``None``.
        """
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class UnmappedProductError(ValueError):
    """Raised when a sale arrives for a product not in products.yaml.

    Raised by :meth:`GumroadAdapter.parse_sale` when the product lookup
    returns no mapping. The message names the offending product ID and the
    sale ID.

    Caller catches and logs into ``gumroad_events.error`` so the
    operator can fix the mapping and replay.
    """
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _email_local(email: str) -> str:
|
||||||
|
"""Fallback display name when ``full_name`` is missing — the part
|
||||||
|
of the email before the ``@``, capitalized. Better than 'Unknown'
|
||||||
|
for support tickets and the buyer's own delivery email."""
|
||||||
|
local = email.split("@", 1)[0]
|
||||||
|
return local.replace(".", " ").title()
|
||||||
|
|
||||||
|
|
||||||
|
def _to_int(v: Any) -> Optional[int]:
|
||||||
|
if v is None or v == "":
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
return int(v)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _is_truthy(v: Any) -> bool:
|
||||||
|
if isinstance(v, bool):
|
||||||
|
return v
|
||||||
|
if v is None:
|
||||||
|
return False
|
||||||
|
return str(v).strip().lower() in {"1", "true", "yes", "on"}
|
||||||
52
server/app/adapters/manual.py
Normal file
52
server/app/adapters/manual.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""Manual adapter — operator-initiated mints (comps, support replacements).
|
||||||
|
|
||||||
|
There is no webhook to verify and no payload to parse: the operator
|
||||||
|
hands us the buyer details directly via the CLI, and we construct a
|
||||||
|
:class:`SaleEvent` from them. ``source='manual'`` separates these
|
||||||
|
rows from storefront-driven mints in the DB.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from app.adapters.base import RefundEvent, SaleEvent
|
||||||
|
|
||||||
|
|
||||||
|
class ManualAdapter:
    """Adapter for operator-initiated mints.

    There is no webhook and no storefront payload: the caller supplies
    the buyer details and gets a :class:`SaleEvent` tagged with
    ``source='manual'`` back.
    """

    source_name = "manual"

    def verify_webhook(self, *, body: bytes, headers: dict[str, str]) -> bool:
        """Always refuse — manual flows never come through webhooks."""
        return False

    def parse_sale(self, payload: dict[str, Any]) -> Optional[SaleEvent]:
        """Treat *payload* as the keyword arguments of :meth:`build_sale`."""
        return self.build_sale(**payload)

    def parse_refund(self, payload: dict[str, Any]) -> Optional[RefundEvent]:
        """Manual mints have no refund events; always returns ``None``."""
        return None

    def build_sale(
        self,
        *,
        name: str,
        email: str,
        tier: str,
        years: int = 1,
        promotion: Optional[str] = None,
        amount_paid: Optional[Decimal] = None,
        currency: Optional[str] = "USD",
        notes: Optional[str] = None,
    ) -> SaleEvent:
        """Build the :class:`SaleEvent` for a manual mint.

        ``source_order_id`` is always ``None`` for manual sales.
        """
        event_fields = dict(
            source=self.source_name,
            source_order_id=None,
            buyer_name=name,
            buyer_email=email,
            tier=tier,
            years=years,
            promotion=promotion,
            amount_paid=amount_paid,
            currency=currency,
            notes=notes,
        )
        return SaleEvent(**event_fields)
|
||||||
65
server/app/auth.py
Normal file
65
server/app/auth.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""Auth guards for ``/internal/*``.
|
||||||
|
|
||||||
|
Active layer: Bearer token, presented by the operator's CLI and
|
||||||
|
matched against the value in the secrets dir. Token rotation =
|
||||||
|
update the file, restart the container.
|
||||||
|
|
||||||
|
:func:`require_localhost` is preserved but unused by default — it
|
||||||
|
fights the Docker bridge network model (the container sees the
|
||||||
|
gateway IP, not 127.0.0.1, regardless of where traffic originated).
|
||||||
|
Re-enable it only if the API runs in ``network_mode: host``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hmac
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import HTTPException, Request, status
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
|
||||||
|
|
||||||
|
def require_localhost(request: Request) -> None:
    """Reject the request unless the connecting peer is loopback.

    The check uses ``request.client.host`` — the raw TCP peer — and
    deliberately ignores ``X-Forwarded-For``. Anything other than
    ``127.0.0.1`` / ``::1`` (including a missing client) gets a plain
    404.

    Raises:
        HTTPException: 404 when the peer is not loopback.
    """
    client = request.client
    peer = client.host if client else None
    if peer in ("127.0.0.1", "::1"):
        return
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Not found",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def require_bearer_token(request: Request) -> None:
    """Verify ``Authorization: Bearer <admin_token>``.

    The presented token is compared to the configured admin token in
    constant time so timing leaks don't reveal token prefixes. The
    401 never echoes the supplied token.

    NOTE(review): the "not configured" detail below does tell the
    client that no token is set, although the original intent was for
    clients to treat that the same as "wrong token" — confirm whether
    the detail strings should be unified.

    Raises:
        HTTPException: 401 when no admin token is configured, the
            header is missing or not a Bearer credential, or the
            token does not match.
    """

    def _unauthorized(detail: str) -> HTTPException:
        # RFC 7235: a 401 response carries a WWW-Authenticate challenge.
        return HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
            headers={"WWW-Authenticate": "Bearer"},
        )

    settings = get_settings()
    expected: Optional[str] = settings.resolve_admin_token()
    if not expected:
        raise _unauthorized("Server not configured for internal access.")

    auth = request.headers.get("Authorization", "")
    if not auth.startswith("Bearer "):
        raise _unauthorized("Bearer token required.")

    presented = auth.removeprefix("Bearer ").strip()
    # compare_digest() raises TypeError on str arguments containing
    # non-ASCII characters; comparing bytes keeps a malformed header a
    # 401 instead of an unhandled 500.
    if not hmac.compare_digest(presented.encode("utf-8"), expected.encode("utf-8")):
        raise _unauthorized("Invalid token.")
|
||||||
64
server/app/config.py
Normal file
64
server/app/config.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
"""Runtime configuration loaded from environment + secret files.
|
||||||
|
|
||||||
|
Secrets are read from files (``*_FILE`` env vars pointing at
|
||||||
|
``/run/secrets/<name>``) so they never appear in ``docker inspect``
|
||||||
|
or process environment dumps. Plain ``*`` vars are the fallback for
|
||||||
|
local development where mounting secret files is overkill.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import Field
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
|
class Settings(BaseSettings):
    """Runtime settings populated from environment variables and ``.env``.

    Most secrets come in two flavours: an inline value and a ``*_file``
    path. The ``resolve_*`` methods hand both to :func:`_resolve` and
    return whichever one yields a value.
    """

    # Also read a local ``.env`` file; variables we don't declare are ignored.
    model_config = SettingsConfigDict(env_file=".env", extra="ignore")

    # The default URL carries no password.
    database_url: str = Field(
        default="postgresql+psycopg://datatools_api@localhost:5432/datatools_licenses",
        validation_alias="DATABASE_URL",
    )

    # Bearer token for ``/internal/*`` (inline value or secret-file path).
    admin_token: Optional[str] = Field(default=None, validation_alias="DATATOOLS_ADMIN_TOKEN")
    admin_token_file: Optional[Path] = Field(default=None, validation_alias="DATATOOLS_ADMIN_TOKEN_FILE")

    # License signing key (inline value or secret-file path).
    license_privkey_hex: Optional[str] = Field(default=None, validation_alias="DATATOOLS_LICENSE_PRIVKEY")
    license_privkey_file: Optional[Path] = Field(default=None, validation_alias="DATATOOLS_LICENSE_PRIVKEY_FILE")

    # Public key: inline only, there is no ``*_FILE`` variant.
    license_pubkey_hex: Optional[str] = Field(default=None, validation_alias="DATATOOLS_LICENSE_PUBKEY")

    # Postmark server API token (inline value or secret-file path).
    postmark_token: Optional[str] = Field(default=None, validation_alias="POSTMARK_TOKEN")
    postmark_token_file: Optional[Path] = Field(default=None, validation_alias="POSTMARK_TOKEN_FILE")

    # Gumroad webhook secret (inline value or secret-file path).
    gumroad_secret: Optional[str] = Field(default=None, validation_alias="GUMROAD_WEBHOOK_SECRET")
    gumroad_secret_file: Optional[Path] = Field(default=None, validation_alias="GUMROAD_WEBHOOK_SECRET_FILE")

    def resolve_admin_token(self) -> Optional[str]:
        """Return the admin token from the inline value or its file, else ``None``."""
        return _resolve(self.admin_token, self.admin_token_file)

    def resolve_license_privkey(self) -> Optional[str]:
        """Return the license private key from the inline value or its file, else ``None``."""
        return _resolve(self.license_privkey_hex, self.license_privkey_file)

    def resolve_postmark_token(self) -> Optional[str]:
        """Return the Postmark token from the inline value or its file, else ``None``."""
        return _resolve(self.postmark_token, self.postmark_token_file)

    def resolve_gumroad_secret(self) -> Optional[str]:
        """Return the Gumroad secret from the inline value or its file, else ``None``."""
        return _resolve(self.gumroad_secret, self.gumroad_secret_file)
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve(inline: Optional[str], path: Optional[Path]) -> Optional[str]:
|
||||||
|
if inline:
|
||||||
|
return inline.strip()
|
||||||
|
if path and path.exists():
|
||||||
|
return path.read_text(encoding="utf-8").strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Return the shared :class:`Settings` instance.

    The result is memoized by ``lru_cache``, so ``Settings()`` is
    constructed on the first call only and every later call gets the
    same object back.
    """
    return Settings()
|
||||||
65
server/app/db.py
Normal file
65
server/app/db.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
"""SQLAlchemy engine + session factory.
|
||||||
|
|
||||||
|
The DB password lives in ``/run/secrets/pg_password``; we read it
|
||||||
|
from there (or ``$PG_PASSWORD`` for local dev) and splice it into
|
||||||
|
``DATABASE_URL`` so the password never has to be in plaintext in
|
||||||
|
``compose.yml`` or process environment listings.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Generator
|
||||||
|
from urllib.parse import quote_plus, urlparse, urlunparse
|
||||||
|
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from sqlalchemy.orm import DeclarativeBase, Session, sessionmaker
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_password() -> str | None:
|
||||||
|
inline = os.environ.get("PG_PASSWORD")
|
||||||
|
if inline:
|
||||||
|
return inline.strip()
|
||||||
|
path = os.environ.get("PG_PASSWORD_FILE")
|
||||||
|
if path and Path(path).exists():
|
||||||
|
return Path(path).read_text(encoding="utf-8").strip()
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _build_url(base_url: str) -> str:
|
||||||
|
"""Inject the resolved password into ``base_url`` if absent."""
|
||||||
|
parsed = urlparse(base_url)
|
||||||
|
if parsed.password:
|
||||||
|
return base_url
|
||||||
|
pw = _resolve_password()
|
||||||
|
if pw is None:
|
||||||
|
return base_url
|
||||||
|
netloc = f"{parsed.username or ''}:{quote_plus(pw)}@{parsed.hostname}"
|
||||||
|
if parsed.port:
|
||||||
|
netloc += f":{parsed.port}"
|
||||||
|
return urlunparse(parsed._replace(netloc=netloc))
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singletons, created as a side effect of importing this module.
_settings = get_settings()
# The URL comes from settings, passed through _build_url() first.
engine = create_engine(_build_url(_settings.database_url), pool_pre_ping=True, future=True)
# Session factory bound to the engine above; get_session() draws from it.
SessionLocal = sessionmaker(bind=engine, autoflush=False, autocommit=False, expire_on_commit=False)
|
||||||
|
|
||||||
|
|
||||||
|
class Base(DeclarativeBase):
    """Declarative base for ORM models; every mapped class subclasses this."""
|
||||||
|
|
||||||
|
|
||||||
|
def get_session() -> Generator[Session, None, None]:
    """FastAPI dependency that yields one session per request.

    The session is committed once the consumer finishes without an
    exception, rolled back (and the exception re-raised) otherwise,
    and closed in every case.
    """
    # The ``with`` block closes the session on exit, whatever happens.
    with SessionLocal() as db:
        try:
            yield db
            db.commit()
        except Exception:
            db.rollback()
            raise
|
||||||
214
server/app/email.py
Normal file
214
server/app/email.py
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
"""Transactional email delivery.
|
||||||
|
|
||||||
|
Provider: Postmark. Picked for its transactional-deliverability
|
||||||
|
reputation and a tiny, no-SDK-needed HTTP API.
|
||||||
|
|
||||||
|
Configuration
|
||||||
|
-------------
|
||||||
|
|
||||||
|
- ``POSTMARK_TOKEN`` / ``POSTMARK_TOKEN_FILE`` — server API token.
|
||||||
|
- ``EMAIL_FROM`` — verified sender address (default
|
||||||
|
``licenses@datatools.unalogix.com``).
|
||||||
|
- ``EMAIL_REPLY_TO`` — optional Reply-To (default same as From).
|
||||||
|
|
||||||
|
When ``POSTMARK_TOKEN`` is unset the service falls back to
|
||||||
|
:class:`LoggingEmailService`, which prints the email to stdout
|
||||||
|
instead of sending. Lets the webhook handler exercise the full
|
||||||
|
flow before the Postmark account is provisioned.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional, Protocol
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from app.config import get_settings
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class LicenseEmail:
    """Inputs the renderer needs from the caller."""

    # Buyer's display name: used in the greeting and the "To" header.
    to_name: str
    # Recipient address.
    to_email: str
    # Tier label shown in the subject line and body.
    tier: str
    # License key shown in the body.
    license_key: str
    # Expiry timestamp; the renderers show only its first 10 characters.
    expires_at_iso: str
    # Full license blob the buyer pastes in to activate.
    blob: str
|
||||||
|
|
||||||
|
|
||||||
|
class EmailService(Protocol):
    """Provider-agnostic email surface — keeps Postmark out of the
    callers' import graph.

    Structural protocol: any object with a matching ``send_license``
    method satisfies it.
    """

    def send_license(self, msg: LicenseEmail) -> str:
        """Deliver the license-delivery email. Returns a provider
        message id (or ``"logged"`` for the dev fallback) so the
        caller can record it on the licenses row for audit."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class LoggingEmailService:
    """Email stand-in used when no real provider is configured.

    Instead of sending, it logs the rendered plain-text message at
    INFO so it shows up in ``docker compose logs api`` — handy for
    local dev and for the deploy window before Postmark is wired up.
    """

    def send_license(self, msg: LicenseEmail) -> str:
        """Log the message that would have been sent and return ``"logged"``."""
        rendered = _render_text(msg)
        log.info(
            "[email-stub] would send to=%s subject=%r\n%s",
            msg.to_email,
            _subject(msg),
            rendered,
        )
        return "logged"
|
||||||
|
|
||||||
|
|
||||||
|
class PostmarkEmailService:
    """Minimal client for Postmark's transactional send endpoint.

    No SDK: a single JSON POST per message. Each send opens its own
    short-lived ``httpx.Client`` with a tight timeout, because webhook
    handlers run on the request thread and must not hang on a flaky
    upstream.
    """

    API_URL = "https://api.postmarkapp.com/email"
    TIMEOUT_S = 8.0

    def __init__(
        self,
        token: str,
        *,
        sender: str,
        reply_to: Optional[str] = None,
        message_stream: str = "outbound",
    ) -> None:
        """Store the API token and envelope defaults.

        ``reply_to`` falls back to ``sender`` when not given.
        """
        self._token = token
        self._sender = sender
        self._reply_to = reply_to if reply_to else sender
        self._stream = message_stream

    def _build_payload(self, msg: LicenseEmail) -> dict:
        """Assemble the JSON body for one license email."""
        text_part = _render_text(msg)
        html_part = _render_html(msg)
        return {
            "From": self._sender,
            "To": _rfc_addr(msg.to_name, msg.to_email),
            "ReplyTo": self._reply_to,
            "Subject": _subject(msg),
            "TextBody": text_part,
            "HtmlBody": html_part,
            "MessageStream": self._stream,
        }

    def _request_headers(self) -> dict:
        """Headers for the send request, including the server token."""
        return {
            "Accept": "application/json",
            "Content-Type": "application/json",
            "X-Postmark-Server-Token": self._token,
        }

    def send_license(self, msg: LicenseEmail) -> str:
        """Send *msg* through Postmark and return its ``MessageID``.

        Raises:
            EmailDeliveryError: when Postmark answers with HTTP 400 or above.
        """
        payload = self._build_payload(msg)
        headers = self._request_headers()
        with httpx.Client(timeout=self.TIMEOUT_S) as client:
            response = client.post(self.API_URL, json=payload, headers=headers)
        if response.status_code >= 400:
            snippet = response.text[:300]
            raise EmailDeliveryError(
                f"Postmark rejected the request: HTTP {response.status_code} body={snippet!r}"
            )
        message_id = response.json().get("MessageID", "")
        return str(message_id)
|
||||||
|
|
||||||
|
|
||||||
|
class EmailDeliveryError(RuntimeError):
    """Provider rejected the send (HTTP status 400 or above).

    Caller should record this on the audit row so the operator can
    replay after fixing the provider config (verified sender domain,
    paid plan, etc.).
    """
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Factory
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def get_email_service() -> EmailService:
    """Choose the real provider if a token is configured, else the logger.

    ``EMAIL_FROM`` / ``EMAIL_REPLY_TO`` are read from ``os.environ`` on
    every call. The Postmark token, however, is resolved through
    :func:`app.config.get_settings`, which returns a cached
    ``Settings`` object — changing ``POSTMARK_TOKEN`` in the
    environment after the first call has no effect until that cache is
    cleared.

    Returns:
        A :class:`PostmarkEmailService` when a token resolves, otherwise
        a :class:`LoggingEmailService`.
    """
    settings = get_settings()
    token = settings.resolve_postmark_token()
    if not token:
        return LoggingEmailService()
    # A variable that is set but blank must not produce an empty
    # sender — fall back to the default exactly as if it were unset.
    sender = (os.environ.get("EMAIL_FROM") or "").strip() or "licenses@datatools.unalogix.com"
    reply_to = (os.environ.get("EMAIL_REPLY_TO") or "").strip() or None
    return PostmarkEmailService(token, sender=sender, reply_to=reply_to)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Rendering
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _subject(msg: LicenseEmail) -> str:
|
||||||
|
return f"Your DataTools license ({msg.tier})"
|
||||||
|
|
||||||
|
|
||||||
|
def _render_text(msg: LicenseEmail) -> str:
    """Render the plain-text body of the license-delivery email.

    The body greets the buyer, lists the license key, tier and expiry
    date (first 10 characters of ``expires_at_iso``), shows a CLI
    activation command, and ends with the full blob on its own line.
    Values are interpolated as-is, with no escaping.
    """
    return (
        f"Hi {msg.to_name},\n\n"
        f"Thanks for your DataTools purchase. Your license is below.\n\n"
        f"License key: {msg.license_key}\n"
        f"Tier: {msg.tier}\n"
        f"Expires: {msg.expires_at_iso[:10]}\n\n"
        f"To activate, paste the full blob (starting with DTLIC2:) into\n"
        f"the Activate screen, or run:\n\n"
        # The trailing backslash is a shell line continuation in the
        # command shown to the buyer.
        f" python -m src.license_cli activate \"{msg.blob}\" \\\n"
        f" --name \"{msg.to_name}\" --email {msg.to_email}\n\n"
        f"Your blob:\n\n"
        f"{msg.blob}\n\n"
        f"Keep this email — you'll need the blob if you move to a new\n"
        f"computer. Questions: reply to this email.\n\n"
        f"— DataTools\n"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _render_html(msg: LicenseEmail) -> str:
    """Render the HTML body of the license-delivery email.

    Same content as the plain-text body minus the CLI command: a
    greeting, a key/tier/expiry table and the blob in a ``<pre>``
    block. Every interpolated value goes through ``_html_escape``.
    """
    return (
        "<!doctype html><html><body style=\"font-family:system-ui,sans-serif;"
        "max-width:560px;margin:auto;padding:24px;color:#222;\">"
        f"<p>Hi {_html_escape(msg.to_name)},</p>"
        "<p>Thanks for your DataTools purchase. Your license is below.</p>"
        "<table cellpadding=\"4\" style=\"border-collapse:collapse;\">"
        f"<tr><td><b>License key</b></td><td><code>{_html_escape(msg.license_key)}</code></td></tr>"
        f"<tr><td><b>Tier</b></td><td>{_html_escape(msg.tier)}</td></tr>"
        # Only the date part (first 10 characters) of the expiry is shown.
        f"<tr><td><b>Expires</b></td><td>{_html_escape(msg.expires_at_iso[:10])}</td></tr>"
        "</table>"
        "<p>To activate, paste the blob below into the <em>Activate</em> "
        "screen on first launch.</p>"
        # pre-wrap + break-all let the long blob wrap inside the block.
        "<pre style=\"background:#f4f4f4;padding:12px;border-radius:6px;"
        "white-space:pre-wrap;word-break:break-all;font-size:11px;\">"
        f"{_html_escape(msg.blob)}</pre>"
        "<p style=\"color:#666;font-size:13px;\">Keep this email — you'll "
        "need the blob if you move to a new computer. Questions: just reply.</p>"
        "<p>— DataTools</p></body></html>"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _rfc_addr(name: str, email: str) -> str:
|
||||||
|
# Postmark accepts "Name <addr>" or just "addr". Quote names with
|
||||||
|
# special chars; otherwise keep it readable in the inbox.
|
||||||
|
if not name or "@" in name:
|
||||||
|
return email
|
||||||
|
if any(c in name for c in ',<>"'):
|
||||||
|
name = name.replace('"', "").replace(",", "")
|
||||||
|
return f"{name} <{email}>"
|
||||||
|
|
||||||
|
|
||||||
|
def _html_escape(s: str) -> str:
|
||||||
|
return (
|
||||||
|
s.replace("&", "&")
|
||||||
|
.replace("<", "<")
|
||||||
|
.replace(">", ">")
|
||||||
|
.replace('"', """)
|
||||||
|
)
|
||||||
19
server/app/main.py
Normal file
19
server/app/main.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
"""FastAPI entry point for the DataTools license server."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from app.routes import internal, public, webhooks
|
||||||
|
|
||||||
|
# The interactive docs, ReDoc and the OpenAPI schema endpoints are all
# switched off (``None``), so the server exposes only its own routes.
app = FastAPI(
    title="DataTools License Server",
    version="0.1.0",
    docs_url=None,
    redoc_url=None,
    openapi_url=None,
)

# Mount the three route groups defined under ``app.routes``.
app.include_router(public.router)
app.include_router(internal.router)
app.include_router(webhooks.router)
|
||||||
136
server/app/mint.py
Normal file
136
server/app/mint.py
Normal file
@@ -0,0 +1,136 @@
|
|||||||
|
"""Core mint + revoke logic.
|
||||||
|
|
||||||
|
Bridges the source-adapter layer (:mod:`app.adapters`) to the DB
|
||||||
|
layer (:mod:`app.models`), reusing the desktop app's signing /
|
||||||
|
encoding primitives from ``datatools_license.crypto`` so blobs minted
|
||||||
|
here verify against the same embedded pubkey on the buyer's machine.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.adapters.base import SaleEvent
|
||||||
|
from app.config import get_settings
|
||||||
|
from app.models import License
|
||||||
|
|
||||||
|
|
||||||
|
def _init_key_env() -> None:
    """Publish the license keys into ``os.environ`` before crypto is imported.

    ``datatools_license.crypto`` looks for ``DATATOOLS_LICENSE_PRIVKEY``
    / ``DATATOOLS_LICENSE_PUBKEY`` in ``os.environ``. When the values
    come from secret files (``*_FILE`` env vars) they are resolved
    here, once, at module import, so crypto can pick them up without
    changes. ``setdefault`` means a variable that is already set is
    left alone.
    """
    settings = get_settings()
    exports = (
        ("DATATOOLS_LICENSE_PRIVKEY", settings.resolve_license_privkey()),
        ("DATATOOLS_LICENSE_PUBKEY", settings.license_pubkey_hex),
    )
    for variable, value in exports:
        if value:
            os.environ.setdefault(variable, value)


_init_key_env()
|
||||||
|
|
||||||
|
# Imported after env init so the crypto module reads the correct key.
|
||||||
|
from datatools_license.crypto import encode_blob, sign # noqa: E402
|
||||||
|
from datatools_license.features import all_features_for_tier # noqa: E402
|
||||||
|
from datatools_license.schema import ( # noqa: E402
|
||||||
|
License as LicenseDataclass,
|
||||||
|
Tier,
|
||||||
|
_utcnow_iso,
|
||||||
|
default_expiry_iso,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _generate_license_key(tier: str) -> str:
|
||||||
|
rid = uuid.uuid4().hex
|
||||||
|
return f"DT1-{tier.upper()}-{rid[:8]}-{rid[8:16]}"
|
||||||
|
|
||||||
|
|
||||||
|
def _iso_to_dt(iso: str) -> datetime:
|
||||||
|
return datetime.fromisoformat(iso.replace("Z", "+00:00"))
|
||||||
|
|
||||||
|
|
||||||
|
def mint_from_sale(session: Session, sale: SaleEvent) -> License:
    """Idempotently mint a license for *sale*.

    When *sale* carries a ``source_order_id`` and a row with the same
    ``(source, source_order_id)`` already exists, that row is returned
    untouched — a storefront retrying a webhook does not produce a
    second blob with a different signature. Sales without an order id
    (manual mints) skip the dedup check and always produce a new row.

    Otherwise a key is generated, the license payload is signed and
    encoded into a blob, and a new row is added to *session* and
    flushed (not committed).
    """
    if sale.source_order_id is not None:
        dedup_query = select(License).where(
            License.source == sale.source,
            License.source_order_id == sale.source_order_id,
        )
        already_minted = session.execute(dedup_query).scalar_one_or_none()
        if already_minted is not None:
            return already_minted

    tier = Tier(sale.tier)
    key = _generate_license_key(sale.tier)
    issued_at = _utcnow_iso()
    expires_at = default_expiry_iso(years=sale.years)

    # Build the record with an empty signature, sign its canonical
    # form, then attach the signature to a fresh canonical dict.
    draft = LicenseDataclass(
        name=sale.buyer_name,
        email=sale.buyer_email,
        license_key=key,
        tier=tier,
        features=all_features_for_tier(tier),
        issued_at=issued_at,
        expires_at=expires_at,
        signature="",
    )
    signature = sign(draft.to_canonical_dict())
    signed_payload = draft.to_canonical_dict()
    signed_payload["signature"] = signature
    encoded = encode_blob(signed_payload)

    record = License(
        license_key=key,
        name=sale.buyer_name,
        email=sale.buyer_email,
        tier=sale.tier,
        issued_at=_iso_to_dt(issued_at),
        expires_at=_iso_to_dt(expires_at),
        blob=encoded,
        source=sale.source,
        source_order_id=sale.source_order_id,
        promotion=sale.promotion,
        amount_paid=sale.amount_paid,
        currency=sale.currency,
        notes=sale.notes,
    )
    session.add(record)
    session.flush()
    return record
|
||||||
|
|
||||||
|
|
||||||
|
def revoke_license(
    session: Session,
    *,
    license_key: str,
    reason: Optional[str] = None,
) -> Optional[License]:
    """Mark the license identified by *license_key* as revoked.

    Sets ``revoked_at`` to the current UTC time and, when *reason* is
    given, appends a ``Revoked: <reason>`` line to the row's notes.

    Returns:
        The updated row, or ``None`` when no such license exists.
    """
    target = session.get(License, license_key)
    if target is None:
        return None
    target.revoked_at = datetime.now(timezone.utc)
    if not reason:
        return target
    previous_notes = target.notes or ""
    target.notes = f"{previous_notes}\nRevoked: {reason}".strip()
    return target
|
||||||
97
server/app/models.py
Normal file
97
server/app/models.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
"""ORM models for the licenses + gumroad_events tables.
|
||||||
|
|
||||||
|
Schema mirrors ``docs/LICENSE-SERVER.md``, generalized so any
|
||||||
|
``source`` can populate it. The ``(source, source_order_id)``
|
||||||
|
composite uniqueness key gives idempotent webhook retries — a
|
||||||
|
storefront firing the same sale twice maps to the same row.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from sqlalchemy import (
|
||||||
|
JSON,
|
||||||
|
BigInteger,
|
||||||
|
DateTime,
|
||||||
|
Index,
|
||||||
|
Integer,
|
||||||
|
Numeric,
|
||||||
|
String,
|
||||||
|
UniqueConstraint,
|
||||||
|
func,
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
from sqlalchemy.dialects.postgresql import JSONB
|
||||||
|
from sqlalchemy.orm import Mapped, mapped_column
|
||||||
|
|
||||||
|
# JSONB on Postgres (indexable, queryable), plain JSON elsewhere
# (SQLite for tests). Same Python interface either way.
# ``with_variant`` swaps the column type per dialect at DDL time.
_JSON_TYPE = JSON().with_variant(JSONB(), "postgresql")

# SQLite only auto-increments INTEGER PRIMARY KEY (not BIGINT).
# Postgres can autoincrement either, so the variant keeps the
# production migration on BigInteger while tests use Integer.
_PK_TYPE = BigInteger().with_variant(Integer(), "sqlite")
|
||||||
|
|
||||||
|
from app.db import Base
|
||||||
|
|
||||||
|
|
||||||
|
class License(Base):
    """One minted license, keyed by its license key.

    Holds the buyer details, the encoded blob that was delivered, and
    where the sale came from.
    """

    __tablename__ = "licenses"

    # --- what was minted ---
    license_key: Mapped[str] = mapped_column(String, primary_key=True)
    name: Mapped[str] = mapped_column(String, nullable=False)
    email: Mapped[str] = mapped_column(String, nullable=False)
    tier: Mapped[str] = mapped_column(String, nullable=False)
    issued_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False)
    expires_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False)
    blob: Mapped[str] = mapped_column(String, nullable=False)

    # --- where the sale came from ---
    source: Mapped[str] = mapped_column(String, nullable=False)
    # NULL for sales that carry no storefront order id.
    source_order_id: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    promotion: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    # NOTE(review): Numeric columns normally come back as Decimal, so
    # the ``float`` annotation may be inaccurate — confirm.
    amount_paid: Mapped[Optional[float]] = mapped_column(Numeric(10, 2), nullable=True)
    currency: Mapped[Optional[str]] = mapped_column(String(3), nullable=True, server_default=text("'USD'"))

    # --- lifecycle ---
    # NULL while the license is active.
    revoked_at: Mapped[Optional[datetime]] = mapped_column(DateTime(timezone=True), nullable=True)
    notes: Mapped[Optional[str]] = mapped_column(String, nullable=True)

    # Both timestamps default to the database clock.
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now())
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
        onupdate=func.now(),
    )

    __table_args__ = (
        # One row per storefront order.
        UniqueConstraint("source", "source_order_id", name="uq_licenses_source_order"),
        # Functional index on lower(email).
        Index("ix_licenses_email_lower", func.lower(text("email"))),
        # Partial index (PostgreSQL only) over non-revoked licenses.
        Index("ix_licenses_expires_active", "expires_at", postgresql_where=text("revoked_at IS NULL")),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class GumroadEvent(Base):
    """Append-only audit log of every webhook delivery.

    Stored regardless of processing outcome so we can replay failed
    events, investigate disputes, and reconstruct the customer
    record if the ``licenses`` table is ever corrupted.
    """

    __tablename__ = "gumroad_events"

    # Auto-incrementing surrogate key (BigInteger, or Integer on SQLite).
    id: Mapped[int] = mapped_column(_PK_TYPE, primary_key=True, autoincrement=True)
    # Defaults to the database clock at insert time.
    received_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now())
    event_type: Mapped[str] = mapped_column(String, nullable=False)
    order_id: Mapped[Optional[str]] = mapped_column(String, nullable=True)
    # Stored as JSONB on PostgreSQL, plain JSON elsewhere.
    raw_payload: Mapped[dict] = mapped_column(_JSON_TYPE, nullable=False)
    # Server-side default is false.
    processed: Mapped[bool] = mapped_column(server_default=text("false"), nullable=False)
    # Optional error text for the event.
    error: Mapped[Optional[str]] = mapped_column(String, nullable=True)

    __table_args__ = (
        Index("ix_gumroad_events_order_id", "order_id"),
        # Partial index (PostgreSQL only) over unprocessed events.
        Index("ix_gumroad_events_unprocessed", "received_at", postgresql_where=text("processed = false")),
    )
|
||||||
71
server/app/products.py
Normal file
71
server/app/products.py
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
"""Storefront product → license tier mapping.
|
||||||
|
|
||||||
|
The mapping lives in ``server/config/products.yaml`` (not gitignored —
it is a routine catalog file, not a secret) so adding a
|
||||||
|
new SKU is one yaml edit plus a container restart. The lookup is
|
||||||
|
``(source, product_id) -> (tier, years)``.
|
||||||
|
|
||||||
|
Cached on first lookup (not at import). The runtime cost of reloading on every
|
||||||
|
webhook would be trivial, but caching keeps the hot path
|
||||||
|
allocation-free and makes the "edit yaml, restart api" idiom
|
||||||
|
explicit — operators always know exactly when their changes go
|
||||||
|
live.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class ProductMapping:
    """What a storefront product maps to: a license tier and a term."""

    # Tier name taken from the yaml entry's ``tier`` key.
    tier: str
    # Term from the entry's ``years`` key (1 when the key is absent).
    years: int
|
||||||
|
|
||||||
|
|
||||||
|
def _config_path() -> Path:
|
||||||
|
"""Resolve the products config.
|
||||||
|
|
||||||
|
Container layout puts the config at ``/app/config/products.yaml``
|
||||||
|
(the Dockerfile COPYs ``server/config`` to ``/app/config``).
|
||||||
|
For local pytest runs we walk up from this file to ``server/``.
|
||||||
|
"""
|
||||||
|
in_container = Path("/app/config/products.yaml")
|
||||||
|
if in_container.exists():
|
||||||
|
return in_container
|
||||||
|
return Path(__file__).resolve().parent.parent / "config" / "products.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _table() -> dict[tuple[str, str], ProductMapping]:
    """Load the product catalog into a ``(source, product_id)`` lookup table.

    The yaml maps each source name to a list of entries with
    ``product_id``, ``tier`` and an optional ``years`` (default 1).
    Product ids are stringified. The result is memoized, so the file
    is parsed once until the cache is cleared.
    """
    catalog = yaml.safe_load(_config_path().read_text(encoding="utf-8")) or {}
    return {
        (source, str(item["product_id"])): ProductMapping(
            tier=item["tier"],
            years=int(item.get("years", 1)),
        )
        for source, items in catalog.items()
        for item in items or []
    }
|
||||||
|
|
||||||
|
|
||||||
|
def lookup(source: str, product_id: str) -> Optional[ProductMapping]:
    """Look up the mapping for *(source, product_id)*.

    Returns ``None`` for an unmapped product instead of raising, so the
    webhook layer can choose between writing an audit row and showing a
    user-visible error — unmapped sales should be logged, not crash the
    handler and trigger Gumroad retry storms.
    """
    key = (source, product_id)
    return _table().get(key)
|
||||||
|
|
||||||
|
|
||||||
|
def reload_for_tests() -> None:
    """Drop the cache. Tests that mutate the yaml call this.

    Clears the ``lru_cache`` on ``_table`` so the next ``lookup`` re-reads
    the config file.
    """
    _table.cache_clear()
|
||||||
0
server/app/routes/__init__.py
Normal file
0
server/app/routes/__init__.py
Normal file
103
server/app/routes/internal.py
Normal file
103
server/app/routes/internal.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
"""Internal (operator-only) routes.
|
||||||
|
|
||||||
|
Two defense layers protect this path:
|
||||||
|
|
||||||
|
1. **nginx** blocks ``/internal/*`` at the public server-block level
|
||||||
|
(``location /internal/ { return 404; }`` in
|
||||||
|
``docs/SETUP-LICENSE-SERVER.md``).
|
||||||
|
2. **Bearer token** authenticates the operator's CLI.
|
||||||
|
|
||||||
|
An earlier draft also enforced a peer-IP loopback check here, but
|
||||||
|
that fights the Docker bridge network model: the container always
|
||||||
|
sees the gateway IP (172.x.0.1) regardless of whether traffic
|
||||||
|
originated from nginx on the host or from outside. The check is
|
||||||
|
preserved as :func:`app.auth.require_localhost` for future use
|
||||||
|
(e.g. if the API ever runs in ``network_mode: host``).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
||||||
|
from sqlalchemy import func, select
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.adapters.manual import ManualAdapter
|
||||||
|
from app.auth import require_bearer_token
|
||||||
|
from app.db import get_session
|
||||||
|
from app.mint import mint_from_sale, revoke_license
|
||||||
|
from app.models import License
|
||||||
|
from app.schemas import LicenseResponse, MintRequest, RevokeRequest
|
||||||
|
|
||||||
|
# Every route registered on this router is served under /internal and
# runs the require_bearer_token dependency first.
router = APIRouter(
    prefix="/internal",
    dependencies=[Depends(require_bearer_token)],
)

# One module-level adapter instance, reused by every /internal/mint call.
_MANUAL = ManualAdapter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/mint", response_model=LicenseResponse, status_code=status.HTTP_201_CREATED)
def mint(req: MintRequest, session: Session = Depends(get_session)) -> License:
    """Mint a license for an operator-entered (``manual``) sale.

    Only ``source == "manual"`` is accepted here; any other source gets
    a 400. Storefront sales (Gumroad et al.) come in through the
    per-source webhook handlers and never use this route.
    """
    if req.source == "manual":
        # NOTE(review): req.source_order_id is accepted by MintRequest but
        # is not forwarded to the adapter — confirm that is intentional.
        sale_fields = {
            "name": req.name,
            "email": req.email,
            "tier": req.tier.value,
            "years": req.years,
            "promotion": req.promotion,
            "amount_paid": req.amount_paid,
            "currency": req.currency,
            "notes": req.notes,
        }
        return mint_from_sale(session, _MANUAL.build_sale(**sale_fields))

    raise HTTPException(
        status_code=status.HTTP_400_BAD_REQUEST,
        detail=(
            f"Source {req.source!r} is not wired for direct mints. "
            "Storefront sales arrive via /webhooks/* (PR 2)."
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/revoke", response_model=LicenseResponse)
def revoke(req: RevokeRequest, session: Session = Depends(get_session)) -> License:
    """Revoke the license named by ``req.license_key``.

    Responds 404 when ``revoke_license`` returns ``None``; otherwise
    returns the row it handed back.
    """
    revoked = revoke_license(session, license_key=req.license_key, reason=req.reason)
    if revoked is not None:
        return revoked
    raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="License not found")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/licenses", response_model=list[LicenseResponse])
def list_licenses(
    email: Optional[str] = Query(default=None, description="Case-insensitive email substring."),
    tier: Optional[str] = Query(default=None),
    source: Optional[str] = Query(default=None),
    include_revoked: bool = Query(default=False),
    limit: int = Query(default=50, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
    session: Session = Depends(get_session),
) -> list[License]:
    """List licenses, newest first, with optional filters and paging.

    Args:
        email: Literal, case-insensitive substring of the license email.
        tier: Exact tier match.
        source: Exact source match.
        include_revoked: When false (default), rows with a non-null
            ``revoked_at`` are excluded.
        limit: Page size (1-500).
        offset: Number of rows to skip.
        session: Request-scoped database session.

    Returns:
        The matching ``License`` rows ordered by ``created_at`` descending.
    """
    stmt = select(License)
    if email:
        # autoescape=True makes "%" and "_" in the search text match
        # literally. Without it "_" (common in addresses) is a
        # single-character LIKE wildcard, so "a_b" would also match "axb".
        stmt = stmt.where(
            func.lower(License.email).contains(email.lower(), autoescape=True)
        )
    if tier:
        stmt = stmt.where(License.tier == tier)
    if source:
        stmt = stmt.where(License.source == source)
    if not include_revoked:
        stmt = stmt.where(License.revoked_at.is_(None))
    # Ordering and paging go last for readability; the generated SQL is
    # the same wherever they appear in the chain.
    stmt = stmt.order_by(License.created_at.desc()).limit(limit).offset(offset)
    return list(session.execute(stmt).scalars().all())
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/ping")
def ping() -> dict:
    """Return ``{"ok": True}``.

    A no-op endpoint for sanity-checking both guards from inside an SSH
    tunnel: a response means the request cleared the router-level
    bearer-token dependency.
    """
    body = {"ok": True}
    return body
|
||||||
27
server/app/routes/public.py
Normal file
27
server/app/routes/public.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
"""Public (internet-facing) routes.
|
||||||
|
|
||||||
|
For PR 1: only ``/health``. The webhook receiver and renewal portal
|
||||||
|
land in PR 2 and PR 3 respectively.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from sqlalchemy import text
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.db import get_session
|
||||||
|
|
||||||
|
# Router for the public routes; no prefix and no router-level dependencies.
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/health")
def health(session: Session = Depends(get_session)) -> dict:
    """Liveness plus database reachability.

    Runs ``SELECT 1`` through the request session. A ``SQLAlchemyError``
    is reported in the body as ``degraded`` / ``error`` rather than
    raised, so the handler still returns normally.
    """
    try:
        session.execute(text("SELECT 1"))
    except SQLAlchemyError:
        return {"status": "degraded", "db": "error"}
    return {"status": "ok", "db": "ok"}
|
||||||
121
server/app/routes/webhooks.py
Normal file
121
server/app/routes/webhooks.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
"""Storefront webhook receivers.
|
||||||
|
|
||||||
|
PR 2 wires Gumroad. Future storefronts each get their own route
|
||||||
|
(``/webhooks/lemonsqueezy``, ``/webhooks/stripe``, ...). All share
|
||||||
|
the same downstream flow: audit-log the raw payload, parse via
|
||||||
|
adapter, mint, send email, mark processed.
|
||||||
|
|
||||||
|
Handler contract
|
||||||
|
----------------
|
||||||
|
|
||||||
|
We **always** return 200 once a request authenticates, even on
|
||||||
|
downstream failures. Gumroad retries non-2xx for ~3 days, which
|
||||||
|
would turn a single broken sale into hours of duplicate webhook
|
||||||
|
storms. Our idempotency keys (``UNIQUE(source, source_order_id)``)
|
||||||
|
make at-least-once handling safe; the storefront retries on
|
||||||
|
network errors only.
|
||||||
|
|
||||||
|
When something downstream fails (unmapped product, DB error, email
|
||||||
|
failure), we record the cause in ``gumroad_events.error`` so the
|
||||||
|
operator can fix and replay.
|
||||||
|
|
||||||
|
Unauthenticated requests return 404 — we don't want to signal
|
||||||
|
endpoint existence or "wrong secret" to a prober.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.adapters.gumroad import GumroadAdapter, UnmappedProductError
|
||||||
|
from app.config import get_settings
|
||||||
|
from app.db import get_session
|
||||||
|
from app.email import EmailDeliveryError, LicenseEmail, get_email_service
|
||||||
|
from app.mint import mint_from_sale
|
||||||
|
from app.models import GumroadEvent
|
||||||
|
|
||||||
|
# All storefront webhook receivers are served under /webhooks.
router = APIRouter(prefix="/webhooks")
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _gumroad_adapter() -> GumroadAdapter:
    """Build a ``GumroadAdapter`` using the secret resolved from settings."""
    webhook_secret = get_settings().resolve_gumroad_secret()
    return GumroadAdapter(secret=webhook_secret)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/gumroad", status_code=200)
async def gumroad(
    request: Request,
    secret: Optional[str] = Query(default=None),
    session: Session = Depends(get_session),
) -> dict:
    """Receive a Gumroad sale webhook: audit, parse, mint, email.

    Flow:
      1. Reject with 404 unless the ``secret`` query parameter verifies.
      2. Record the raw form payload as a ``GumroadEvent`` audit row.
      3. Parse the payload into a sale via the adapter.
      4. Mint the license inside a SAVEPOINT.
      5. Email the license and mark the event processed.

    Once authenticated, every outcome returns normally (HTTP 200) with a
    ``status`` field; failures are written to ``event.error`` so the
    operator can fix and replay.

    Args:
        request: Incoming request; its form body is the Gumroad payload.
        secret: Shared secret passed as a query parameter.
        session: Request-scoped database session.

    Returns:
        A dict with ``status`` and either ``reason`` (nothing minted) or
        ``license_key`` (minted).

    Raises:
        HTTPException: 404 when the secret does not verify.
    """
    adapter = _gumroad_adapter()

    if not adapter.verify_secret(secret):
        # 404 — no information leak about endpoint existence.
        raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Not found")

    # Gumroad's Ping is form-encoded; FastAPI doesn't auto-parse
    # without a Form() dependency, and we want the raw map for the
    # audit log regardless of schema.
    raw_form = await request.form()
    payload = {k: str(v) for k, v in raw_form.items()}

    # Audit row FIRST — any later failure leaves us a replayable record.
    event = GumroadEvent(
        event_type="sale",
        order_id=payload.get("sale_id"),
        raw_payload=payload,
    )
    session.add(event)
    session.flush()

    try:
        sale = adapter.parse_sale(payload)
    except UnmappedProductError as e:
        event.error = str(e)
        log.warning("Gumroad sale with unmapped product: %s", e)
        return {"status": "logged-no-mint", "reason": "unmapped_product"}
    except Exception as e:  # pragma: no cover — defensive
        event.error = f"parse error: {e!r}"
        log.exception("Gumroad parse failure")
        return {"status": "logged-no-mint", "reason": "parse_error"}

    if sale is None:
        event.error = "payload did not parse as a sale"
        return {"status": "logged-no-mint", "reason": "not_a_sale"}

    try:
        # Mint inside a SAVEPOINT. The audit row above is flushed but not
        # yet committed; if a database error escaped the mint without a
        # savepoint, the whole transaction would be marked failed and the
        # audit row (and the error note set below) could not be persisted.
        # With the savepoint only the mint's own writes are rolled back
        # and the session stays usable.
        # NOTE(review): assumes mint_from_sale does not commit or roll
        # back the session itself — confirm against app.mint.
        with session.begin_nested():
            row = mint_from_sale(session, sale)
            session.flush()
    except Exception as e:  # pragma: no cover — defensive
        event.error = f"mint error: {e!r}"
        log.exception("mint_from_sale failed")
        return {"status": "logged-no-mint", "reason": "mint_error"}

    try:
        get_email_service().send_license(
            LicenseEmail(
                to_name=row.name,
                to_email=row.email,
                tier=row.tier,
                license_key=row.license_key,
                expires_at_iso=row.expires_at.isoformat(),
                blob=row.blob,
            )
        )
    except EmailDeliveryError as e:
        event.error = f"email error: {e}"
        log.warning("Email delivery failed (license already minted): %s", e)
        # The buyer can still be served from the DB via the renewal
        # portal in PR 3 / a manual resend, so we don't fail the
        # webhook over an email hiccup.
        event.processed = True
        return {"status": "minted-email-failed", "license_key": row.license_key}

    event.processed = True
    return {"status": "ok", "license_key": row.license_key}
|
||||||
54
server/app/schemas.py
Normal file
54
server/app/schemas.py
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
"""Pydantic request/response models for the HTTP layer."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from decimal import Decimal
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, EmailStr, Field
|
||||||
|
|
||||||
|
|
||||||
|
class TierName(str, Enum):
    """Closed set of tier names accepted on mint requests.

    Mixing in ``str`` makes each member compare equal to its plain
    string value.
    """

    lite = "lite"
    core = "core"
    pro = "pro"
    enterprise = "enterprise"
|
||||||
|
|
||||||
|
|
||||||
|
class MintRequest(BaseModel):
    """Request body for ``POST /internal/mint``."""

    # Buyer display name.
    name: str = Field(min_length=1, max_length=200)
    # Buyer email; validated as an address by EmailStr.
    email: EmailStr
    tier: TierName
    # License term in years, 1-10.
    years: int = Field(default=1, ge=1, le=10)
    # Sale source; the /internal/mint route rejects anything but "manual".
    source: str = Field(default="manual", min_length=1, max_length=40)
    # NOTE(review): accepted here but not read by the /internal/mint route — confirm.
    source_order_id: Optional[str] = Field(default=None, max_length=120)
    promotion: Optional[str] = Field(default=None, max_length=60)
    # Non-negative amount with at most two decimal places.
    amount_paid: Optional[Decimal] = Field(default=None, ge=0, decimal_places=2)
    # Exactly three characters when provided.
    currency: Optional[str] = Field(default="USD", min_length=3, max_length=3)
    notes: Optional[str] = Field(default=None, max_length=2000)
|
||||||
|
|
||||||
|
|
||||||
|
class RevokeRequest(BaseModel):
    """Request body for ``POST /internal/revoke``."""

    # Key of the license to revoke.
    license_key: str = Field(min_length=1, max_length=120)
    # Optional free-text reason, passed through to revoke_license.
    reason: Optional[str] = Field(default=None, max_length=500)
|
||||||
|
|
||||||
|
|
||||||
|
class LicenseResponse(BaseModel):
    """Response shape for a license row returned by the internal routes."""

    # from_attributes lets the model be populated from an object's
    # attributes (the routes return License rows) rather than only dicts.
    model_config = ConfigDict(from_attributes=True)

    license_key: str
    name: str
    email: str
    tier: str
    issued_at: datetime
    expires_at: datetime
    blob: str
    source: str
    source_order_id: Optional[str]
    promotion: Optional[str]
    amount_paid: Optional[Decimal]
    currency: Optional[str]
    # Populated only for revoked licenses.
    revoked_at: Optional[datetime]
    notes: Optional[str]
|
||||||
43
server/compose.test.yml
Normal file
43
server/compose.test.yml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Smoke-test compose. Stands the API + Postgres up in isolation,
|
||||||
|
# exercises a mint, tears everything down (volume included). Never
|
||||||
|
# meant for production — for that see docs/SETUP-LICENSE-SERVER.md.
|
||||||
|
#
|
||||||
|
# Ports are published on 127.0.0.1 only, and on non-default host ports
# (15432 / 18090), so the stack can run on a host that already binds
# 5432 / 8090 to something else.
|
||||||
|
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:16-alpine
|
||||||
|
environment:
|
||||||
|
POSTGRES_DB: dt_test
|
||||||
|
POSTGRES_USER: dt_test
|
||||||
|
POSTGRES_PASSWORD: test_pw
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:15432:5432"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD-SHELL", "pg_isready -U dt_test -d dt_test"]
|
||||||
|
interval: 2s
|
||||||
|
timeout: 2s
|
||||||
|
retries: 20
|
||||||
|
|
||||||
|
api:
|
||||||
|
build:
|
||||||
|
context: ..
|
||||||
|
dockerfile: server/Dockerfile
|
||||||
|
depends_on:
|
||||||
|
postgres:
|
||||||
|
condition: service_healthy
|
||||||
|
environment:
|
||||||
|
DATABASE_URL: postgresql+psycopg://dt_test:test_pw@postgres:5432/dt_test
|
||||||
|
DATATOOLS_ADMIN_TOKEN: test-admin-token
|
||||||
|
GUMROAD_WEBHOOK_SECRET: test-gumroad-secret
|
||||||
|
# No DATATOOLS_LICENSE_PRIVKEY — falls back to the in-tree
|
||||||
|
# dev keypair, matching what the desktop dev build expects.
|
||||||
|
# No POSTMARK_TOKEN — falls back to LoggingEmailService.
|
||||||
|
ports:
|
||||||
|
- "127.0.0.1:18090:8000"
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "curl", "--fail", "--silent", "http://localhost:8000/health"]
|
||||||
|
interval: 5s
|
||||||
|
timeout: 3s
|
||||||
|
retries: 10
|
||||||
34
server/config/products.yaml
Normal file
34
server/config/products.yaml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# Storefront product → license tier mapping.
|
||||||
|
#
|
||||||
|
# Each storefront has its own product/variant IDs. The webhook
|
||||||
|
# handler looks up (source, product_id) in this file to decide
|
||||||
|
# what to mint. Unknown product IDs are an error (audit row gets
|
||||||
|
# error="unmapped product", no license created — the operator
|
||||||
|
# fixes the mapping and replays).
|
||||||
|
#
|
||||||
|
# After editing this file, `docker compose restart api` to reload.
|
||||||
|
|
||||||
|
gumroad:
|
||||||
|
# Fill in real Gumroad product_ids once SKUs exist. Until then the
|
||||||
|
# examples below are placeholders that the test suite uses.
|
||||||
|
- product_id: "datatools-lite"
|
||||||
|
tier: lite
|
||||||
|
years: 1
|
||||||
|
- product_id: "datatools-core"
|
||||||
|
tier: core
|
||||||
|
years: 1
|
||||||
|
- product_id: "datatools-pro"
|
||||||
|
tier: pro
|
||||||
|
years: 1
|
||||||
|
|
||||||
|
# Future storefronts slot in as siblings:
|
||||||
|
#
|
||||||
|
# lemonsqueezy:
|
||||||
|
# - product_id: "12345"
|
||||||
|
# tier: core
|
||||||
|
# years: 1
|
||||||
|
#
|
||||||
|
# stripe:
|
||||||
|
# - product_id: "prod_xxx"
|
||||||
|
# tier: pro
|
||||||
|
# years: 1
|
||||||
3
server/requirements-dev.txt
Normal file
3
server/requirements-dev.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
-r requirements.txt
|
||||||
|
pytest>=8.3,<9
|
||||||
|
pytest-asyncio>=0.24,<1
|
||||||
12
server/requirements.txt
Normal file
12
server/requirements.txt
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
fastapi>=0.115,<0.120
|
||||||
|
uvicorn[standard]>=0.32,<0.40
|
||||||
|
python-multipart>=0.0.20,<1
|
||||||
|
sqlalchemy>=2.0,<3
|
||||||
|
psycopg[binary]>=3.2,<4
|
||||||
|
alembic>=1.14,<2
|
||||||
|
pydantic>=2.9,<3
|
||||||
|
pydantic-settings>=2.6,<3
|
||||||
|
email-validator>=2.2,<3
|
||||||
|
cryptography>=43,<46
|
||||||
|
httpx>=0.27,<1
|
||||||
|
pyyaml>=6,<7
|
||||||
148
server/scripts/smoke.sh
Executable file
148
server/scripts/smoke.sh
Executable file
@@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# End-to-end smoke test for the license server.
|
||||||
|
#
|
||||||
|
# Builds the API image, brings up Postgres + API, runs the Alembic
|
||||||
|
# migration, mints a license through /internal/mint, verifies the
|
||||||
|
# resulting blob's Ed25519 signature against the dev pubkey, and
|
||||||
|
# confirms the row landed in the DB. Tears everything down at exit.
|
||||||
|
#
|
||||||
|
# Run from the server/ directory: ./scripts/smoke.sh
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
cd "$(dirname "$0")/.."
|
||||||
|
|
||||||
|
PROJECT=dt-license-smoke
|
||||||
|
COMPOSE=(docker compose -p "$PROJECT" -f compose.test.yml)
|
||||||
|
|
||||||
|
# Best-effort teardown of the smoke stack, volumes included. Output is
# discarded and failures are ignored so the EXIT trap never masks the
# script's real exit status.
cleanup() {
  printf '%s\n' "--- Tearing down ---"
  "${COMPOSE[@]}" down --volumes --remove-orphans >/dev/null 2>&1 || :
}
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
|
echo "--- Building image ---"
|
||||||
|
"${COMPOSE[@]}" build
|
||||||
|
|
||||||
|
echo "--- Starting stack ---"
|
||||||
|
"${COMPOSE[@]}" up -d
|
||||||
|
|
||||||
|
echo "--- Waiting for API health (max 60s) ---"
# Poll /health once a second. Track success explicitly: without the flag
# the loop would fall through after 60 failed attempts and the script
# would carry on against an API that never came up.
api_up=0
for i in $(seq 1 60); do
  if curl -sf http://127.0.0.1:18090/health 2>/dev/null | grep -q '"status":"ok"'; then
    echo "API up after ${i}s"
    api_up=1
    break
  fi
  sleep 1
done
if [ "$api_up" != "1" ]; then
  echo "FAIL: API did not become healthy within 60s"
  "${COMPOSE[@]}" logs --tail 50 api
  exit 1
fi
|
||||||
|
|
||||||
|
echo "--- Running migrations ---"
|
||||||
|
"${COMPOSE[@]}" exec -T api alembic upgrade head
|
||||||
|
|
||||||
|
echo "--- Re-checking health post-migration ---"
|
||||||
|
curl -sf http://127.0.0.1:18090/health | tee /dev/stderr | grep -q '"db":"ok"'
|
||||||
|
|
||||||
|
echo "--- POST /internal/mint ---"
|
||||||
|
RESP=$(curl -s -w "\nHTTP=%{http_code}" -X POST http://127.0.0.1:18090/internal/mint \
|
||||||
|
-H "Authorization: Bearer test-admin-token" \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{"name":"Smoke Test","email":"smoke@example.com","tier":"core","source":"manual"}')
|
||||||
|
echo "$RESP"
|
||||||
|
HTTP_CODE=$(echo "$RESP" | tail -n1 | sed 's/HTTP=//')
|
||||||
|
RESP=$(echo "$RESP" | sed '$d')
|
||||||
|
if [ "$HTTP_CODE" != "201" ]; then
|
||||||
|
echo "MINT FAILED (HTTP $HTTP_CODE)"
|
||||||
|
"${COMPOSE[@]}" logs --tail 50 api
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "$RESP" | python3 -m json.tool | head -8
|
||||||
|
|
||||||
|
BLOB=$(echo "$RESP" | python3 -c 'import json,sys; print(json.load(sys.stdin)["blob"])')
|
||||||
|
|
||||||
|
echo "--- Verifying blob signature against host dev pubkey ---"
|
||||||
|
python3 - <<EOF
|
||||||
|
import sys
|
||||||
|
sys.path.insert(0, "..")
|
||||||
|
from src.license.crypto import decode_blob, verify
|
||||||
|
payload = decode_blob("$BLOB")
|
||||||
|
sig = payload.pop("signature")
|
||||||
|
assert verify(payload, sig), "signature must verify"
|
||||||
|
assert payload["name"] == "Smoke Test"
|
||||||
|
assert payload["email"] == "smoke@example.com"
|
||||||
|
assert payload["tier"] == "core"
|
||||||
|
print("OK: signature verifies, payload matches")
|
||||||
|
EOF
|
||||||
|
|
||||||
|
echo "--- Verifying DB row ---"
|
||||||
|
"${COMPOSE[@]}" exec -T postgres \
|
||||||
|
psql -U dt_test -d dt_test -t -c \
|
||||||
|
"SELECT license_key, email, tier, source FROM licenses;" \
|
||||||
|
| grep -q smoke@example.com
|
||||||
|
|
||||||
|
echo "--- POST /webhooks/gumroad (synthetic Ping payload) ---"
|
||||||
|
WEBHOOK_RESP=$(curl -s -w "\nHTTP=%{http_code}" -X POST \
|
||||||
|
"http://127.0.0.1:18090/webhooks/gumroad?secret=test-gumroad-secret" \
|
||||||
|
-H "Content-Type: application/x-www-form-urlencoded" \
|
||||||
|
--data-urlencode "sale_id=GUM-SMOKE-001" \
|
||||||
|
--data-urlencode "email=webhook@example.com" \
|
||||||
|
--data-urlencode "full_name=Webhook Tester" \
|
||||||
|
--data-urlencode "product_id=datatools-core" \
|
||||||
|
--data-urlencode "price=9900" \
|
||||||
|
--data-urlencode "currency=usd" \
|
||||||
|
--data-urlencode "test=true")
|
||||||
|
WEBHOOK_CODE=$(echo "$WEBHOOK_RESP" | tail -n1 | sed 's/HTTP=//')
|
||||||
|
WEBHOOK_BODY=$(echo "$WEBHOOK_RESP" | sed '$d')
|
||||||
|
echo "$WEBHOOK_BODY" | python3 -m json.tool
|
||||||
|
if [ "$WEBHOOK_CODE" != "200" ] || ! echo "$WEBHOOK_BODY" | grep -q '"status":"ok"'; then
|
||||||
|
echo "WEBHOOK FAILED (HTTP $WEBHOOK_CODE)"
|
||||||
|
"${COMPOSE[@]}" logs --tail 30 api
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "--- Verifying Gumroad mint landed in DB ---"
|
||||||
|
"${COMPOSE[@]}" exec -T postgres \
|
||||||
|
psql -U dt_test -d dt_test -t -c \
|
||||||
|
"SELECT license_key, email, tier, source, source_order_id FROM licenses WHERE source='gumroad';" \
|
||||||
|
| tee /dev/stderr | grep -q GUM-SMOKE-001
|
||||||
|
|
||||||
|
echo "--- Verifying gumroad_events audit row ---"
|
||||||
|
PROCESSED=$("${COMPOSE[@]}" exec -T postgres \
|
||||||
|
psql -U dt_test -d dt_test -At -c \
|
||||||
|
"SELECT processed FROM gumroad_events WHERE order_id='GUM-SMOKE-001' ORDER BY id LIMIT 1;")
|
||||||
|
if [ "$PROCESSED" != "t" ]; then
|
||||||
|
echo "FAIL: gumroad_events.processed=$PROCESSED (expected 't')"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "audit row processed=true"
|
||||||
|
|
||||||
|
echo "--- Verifying wrong-secret returns 404 ---"
|
||||||
|
WRONG_CODE=$(curl -s -o /dev/null -w "%{http_code}" -X POST \
|
||||||
|
"http://127.0.0.1:18090/webhooks/gumroad?secret=wrong-secret" \
|
||||||
|
--data-urlencode "sale_id=should-not-mint")
|
||||||
|
if [ "$WRONG_CODE" != "404" ]; then
|
||||||
|
echo "FAIL: wrong-secret should return 404, got $WRONG_CODE"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "wrong-secret correctly rejected"
|
||||||
|
|
||||||
|
echo "--- Verifying webhook idempotency on retry ---"
|
||||||
|
curl -s -o /dev/null -X POST \
|
||||||
|
"http://127.0.0.1:18090/webhooks/gumroad?secret=test-gumroad-secret" \
|
||||||
|
--data-urlencode "sale_id=GUM-SMOKE-001" \
|
||||||
|
--data-urlencode "email=webhook@example.com" \
|
||||||
|
--data-urlencode "full_name=Webhook Tester" \
|
||||||
|
--data-urlencode "product_id=datatools-core" \
|
||||||
|
--data-urlencode "price=9900" \
|
||||||
|
--data-urlencode "currency=usd"
|
||||||
|
ROW_COUNT=$("${COMPOSE[@]}" exec -T postgres psql -U dt_test -d dt_test -t -c \
|
||||||
|
"SELECT COUNT(*) FROM licenses WHERE source_order_id='GUM-SMOKE-001';" \
|
||||||
|
| tr -d ' ')
|
||||||
|
if [ "$ROW_COUNT" != "1" ]; then
|
||||||
|
echo "FAIL: duplicate webhook produced $ROW_COUNT rows (expected 1)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "idempotency OK: still 1 license row after retry"
|
||||||
|
|
||||||
|
echo
|
||||||
|
echo "===================================="
|
||||||
|
echo " SMOKE TEST PASSED"
|
||||||
|
echo "===================================="
|
||||||
0
server/tests/__init__.py
Normal file
0
server/tests/__init__.py
Normal file
108
server/tests/conftest.py
Normal file
108
server/tests/conftest.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
"""Shared pytest fixtures.
|
||||||
|
|
||||||
|
Tests run against in-memory SQLite — no docker, no Postgres install.
|
||||||
|
The cross-dialect type variants in :mod:`app.models` keep the schema
|
||||||
|
identical in behavior for everything PR 1 exercises (the JSONB
|
||||||
|
column on ``gumroad_events`` isn't touched until PR 2).
|
||||||
|
|
||||||
|
Auth: a fixed test token is placed in the environment before any
app modules import, so the cached settings pick it up. The ``client``
fixture is a plain TestClient with no dependency overrides — the
bearer-token check stays live.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Set required env BEFORE importing anything from app.* so pydantic
# Settings (lru_cache'd) picks up these values on first access.
os.environ["DATATOOLS_ADMIN_TOKEN"] = "test-admin-token"
os.environ["DATABASE_URL"] = "sqlite+pysqlite:///:memory:"

# Make the desktop license module importable as `datatools_license`.
# In the Docker image this happens via `COPY src/license /app/datatools_license`;
# during local tests we simulate it by aliasing src.license.
# parents[2] is the directory two levels above this file's own directory.
_REPO_ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(_REPO_ROOT))
import src.license as _dt_license_module
# setdefault: an already-registered `datatools_license` entry is left alone.
sys.modules.setdefault("datatools_license", _dt_license_module)
# Alias each submodule as well, so `datatools_license.<sub>` resolves to
# the same module object as `src.license.<sub>`.
for _sub in ("crypto", "schema", "features", "_dev_keypair"):
    sys.modules.setdefault(
        f"datatools_license.{_sub}",
        __import__(f"src.license.{_sub}", fromlist=[_sub]),
    )
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
from sqlalchemy import create_engine, event
|
||||||
|
from sqlalchemy.orm import sessionmaker
|
||||||
|
from sqlalchemy.pool import StaticPool
|
||||||
|
|
||||||
|
import app.db as app_db
|
||||||
|
from app.db import Base
|
||||||
|
from app.main import app
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="session")
def engine():
    """Session-wide in-memory SQLite engine with the full schema created.

    ``StaticPool`` together with ``check_same_thread=False`` is passed so
    the single in-memory database is reachable through one pooled
    connection. The engine is disposed when the test session ends.
    """
    test_engine = create_engine(
        "sqlite+pysqlite:///:memory:",
        connect_args={"check_same_thread": False},
        poolclass=StaticPool,
        future=True,
    )

    def _enable_foreign_keys(dbapi_conn, _connection_record):
        # Enforce foreign keys on SQLite (off by default).
        dbapi_conn.execute("PRAGMA foreign_keys=ON")

    event.listen(test_engine, "connect", _enable_foreign_keys)

    Base.metadata.create_all(test_engine)
    yield test_engine
    test_engine.dispose()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def _bind_app_engine(engine, monkeypatch):
    """Swap the app's engine and session factory for the test ones.

    Runs for every test. After the test body finishes, every table is
    emptied (in reverse of ``sorted_tables`` order) so that execution
    order cannot leak state between tests.
    """
    session_factory = sessionmaker(bind=engine, autoflush=False, autocommit=False, expire_on_commit=False)
    for attr_name, replacement in (("engine", engine), ("SessionLocal", session_factory)):
        monkeypatch.setattr(app_db, attr_name, replacement)

    yield

    with engine.begin() as conn:
        for table in reversed(Base.metadata.sorted_tables):
            conn.execute(table.delete())
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def db_session(engine):
    """Per-test session bound to the shared engine.

    Isolation comes from deleting rows on teardown, not from wrapping
    the test in a transaction: the mint code flushes mid-transaction and
    each test must start with a clean licenses table.
    """
    make_session = sessionmaker(bind=engine, autoflush=False, autocommit=False, expire_on_commit=False)
    session = make_session()
    try:
        yield session
    finally:
        # Discard whatever the test left pending, then empty every table.
        session.rollback()
        for table in reversed(Base.metadata.sorted_tables):
            session.execute(table.delete())
        session.commit()
        session.close()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client():
    """TestClient over the real app, with no overrides: bearer auth is live."""
    with TestClient(app) as test_client:
        yield test_client
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def admin_headers() -> dict[str, str]:
    """Authorization header carrying the fixed test admin token."""
    return dict(Authorization="Bearer test-admin-token")
|
||||||
52
server/tests/test_adapters.py
Normal file
52
server/tests/test_adapters.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
"""ManualAdapter — building a SaleEvent from CLI-style kwargs."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
from app.adapters.manual import ManualAdapter
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_sale_minimal_defaults():
    """Only name/email/tier given: every other field takes its default."""
    sale = ManualAdapter().build_sale(name="Jane Doe", email="jane@example.com", tier="core")

    # Fields echoed from the call.
    assert sale.buyer_name == "Jane Doe"
    assert sale.buyer_email == "jane@example.com"
    assert sale.tier == "core"

    # Defaults supplied by the adapter.
    assert sale.source == "manual"
    assert sale.years == 1
    assert sale.currency == "USD"

    # Optional metadata stays unset.
    assert sale.source_order_id is None
    assert sale.promotion is None
    assert sale.amount_paid is None
    assert sale.notes is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_sale_full_metadata():
    """Optional metadata passed to build_sale comes back unchanged."""
    metadata = {
        "years": 2,
        "promotion": "LAUNCH50",
        "amount_paid": Decimal("249.00"),
        "currency": "EUR",
        "notes": "comp for beta tester",
    }
    adapter = ManualAdapter()
    sale = adapter.build_sale(name="Acme", email="ops@acme.example", tier="pro", **metadata)

    assert sale.years == 2
    assert sale.promotion == "LAUNCH50"
    assert sale.amount_paid == Decimal("249.00")
    assert sale.currency == "EUR"
    assert sale.notes == "comp for beta tester"
|
||||||
|
|
||||||
|
|
||||||
|
def test_verify_webhook_always_false():
    """The manual flow never comes from a webhook, so verification is False."""
    verified = ManualAdapter().verify_webhook(body=b"{}", headers={})
    assert verified is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_refund_returns_none():
    """ManualAdapter.parse_refund returns None for an arbitrary payload."""
    refund = ManualAdapter().parse_refund({"any": "payload"})
    assert refund is None
|
||||||
103
server/tests/test_email.py
Normal file
103
server/tests/test_email.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
"""EmailService — Postmark client + dev-mode logging fallback."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.email import (
|
||||||
|
EmailDeliveryError,
|
||||||
|
LicenseEmail,
|
||||||
|
LoggingEmailService,
|
||||||
|
PostmarkEmailService,
|
||||||
|
_render_html,
|
||||||
|
_render_text,
|
||||||
|
get_email_service,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _msg() -> LicenseEmail:
|
||||||
|
return LicenseEmail(
|
||||||
|
to_name="Jane Doe",
|
||||||
|
to_email="jane@example.com",
|
||||||
|
tier="core",
|
||||||
|
license_key="DT1-CORE-aaaa-bbbb",
|
||||||
|
expires_at_iso="2027-05-14T01:00:00Z",
|
||||||
|
blob="DTLIC2:placeholder",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_render_text_contains_essentials():
|
||||||
|
body = _render_text(_msg())
|
||||||
|
assert "Jane Doe" in body
|
||||||
|
assert "DTLIC2:placeholder" in body
|
||||||
|
assert "DT1-CORE-aaaa-bbbb" in body
|
||||||
|
assert "core" in body
|
||||||
|
assert "2027-05-14" in body
|
||||||
|
|
||||||
|
|
||||||
|
def test_render_html_escapes_user_input():
|
||||||
|
msg = LicenseEmail(
|
||||||
|
to_name="<script>alert(1)</script>",
|
||||||
|
to_email="x@y.com",
|
||||||
|
tier="core",
|
||||||
|
license_key="K",
|
||||||
|
expires_at_iso="2030-01-01T00:00:00Z",
|
||||||
|
blob="DTLIC2:x",
|
||||||
|
)
|
||||||
|
html = _render_html(msg)
|
||||||
|
assert "<script>" not in html
|
||||||
|
assert "&lt;script&gt;" in html
|
||||||
|
|
||||||
|
|
||||||
|
def test_logging_service_writes_to_log(caplog):
|
||||||
|
caplog.set_level(logging.INFO)
|
||||||
|
svc = LoggingEmailService()
|
||||||
|
result = svc.send_license(_msg())
|
||||||
|
assert result == "logged"
|
||||||
|
log_text = "\n".join(r.message for r in caplog.records)
|
||||||
|
assert "would send" in log_text
|
||||||
|
assert "jane@example.com" in log_text
|
||||||
|
assert "DTLIC2:placeholder" in log_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_factory_returns_logging_when_no_token(monkeypatch):
|
||||||
|
monkeypatch.delenv("POSTMARK_TOKEN", raising=False)
|
||||||
|
monkeypatch.delenv("POSTMARK_TOKEN_FILE", raising=False)
|
||||||
|
from app.config import get_settings
|
||||||
|
get_settings.cache_clear()
|
||||||
|
svc = get_email_service()
|
||||||
|
assert isinstance(svc, LoggingEmailService)
|
||||||
|
|
||||||
|
|
||||||
|
def test_postmark_send_success():
|
||||||
|
svc = PostmarkEmailService("test-token", sender="from@example.com")
|
||||||
|
|
||||||
|
def handler(request: httpx.Request) -> httpx.Response:
|
||||||
|
body = request.read().decode()
|
||||||
|
assert "from@example.com" in body
|
||||||
|
assert "jane@example.com" in body
|
||||||
|
assert "DTLIC2:placeholder" in body
|
||||||
|
return httpx.Response(200, json={"MessageID": "pm-12345"})
|
||||||
|
|
||||||
|
transport = httpx.MockTransport(handler)
|
||||||
|
_real_client = httpx.Client
|
||||||
|
with patch("app.email.httpx.Client", lambda **kw: _real_client(transport=transport, **kw)):
|
||||||
|
msg_id = svc.send_license(_msg())
|
||||||
|
assert msg_id == "pm-12345"
|
||||||
|
|
||||||
|
|
||||||
|
def test_postmark_send_raises_on_failure():
|
||||||
|
svc = PostmarkEmailService("test-token", sender="from@example.com")
|
||||||
|
|
||||||
|
def handler(request: httpx.Request) -> httpx.Response:
|
||||||
|
return httpx.Response(422, json={"Message": "Invalid sender"})
|
||||||
|
|
||||||
|
transport = httpx.MockTransport(handler)
|
||||||
|
_real_client = httpx.Client
|
||||||
|
with patch("app.email.httpx.Client", lambda **kw: _real_client(transport=transport, **kw)):
|
||||||
|
with pytest.raises(EmailDeliveryError, match="422"):
|
||||||
|
svc.send_license(_msg())
|
||||||
97
server/tests/test_gumroad_adapter.py
Normal file
97
server/tests/test_gumroad_adapter.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
"""Gumroad adapter: secret check, Ping parsing, refunds stub."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.adapters.gumroad import GumroadAdapter, UnmappedProductError
|
||||||
|
|
||||||
|
|
||||||
|
def _sale_payload(**overrides) -> dict:
|
||||||
|
base = {
|
||||||
|
"sale_id": "GUM-1001",
|
||||||
|
"email": "jane@example.com",
|
||||||
|
"full_name": "Jane Doe",
|
||||||
|
"product_id": "datatools-core",
|
||||||
|
"price": "9900", # cents
|
||||||
|
"currency": "usd",
|
||||||
|
"offer_code": "",
|
||||||
|
"test": "false",
|
||||||
|
}
|
||||||
|
base.update(overrides)
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
def test_verify_secret_correct():
|
||||||
|
a = GumroadAdapter(secret="abc123")
|
||||||
|
assert a.verify_secret("abc123") is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_verify_secret_wrong():
|
||||||
|
a = GumroadAdapter(secret="abc123")
|
||||||
|
assert a.verify_secret("nope") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_verify_secret_unset_rejects_all():
|
||||||
|
a = GumroadAdapter(secret=None)
|
||||||
|
assert a.verify_secret("anything") is False
|
||||||
|
assert a.verify_secret(None) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_verify_secret_missing_presented_value():
|
||||||
|
a = GumroadAdapter(secret="abc123")
|
||||||
|
assert a.verify_secret(None) is False
|
||||||
|
assert a.verify_secret("") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_sale_happy_path():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
sale = a.parse_sale(_sale_payload())
|
||||||
|
assert sale is not None
|
||||||
|
assert sale.source == "gumroad"
|
||||||
|
assert sale.source_order_id == "GUM-1001"
|
||||||
|
assert sale.buyer_email == "jane@example.com"
|
||||||
|
assert sale.buyer_name == "Jane Doe"
|
||||||
|
assert sale.tier == "core"
|
||||||
|
assert sale.years == 1
|
||||||
|
assert sale.amount_paid == Decimal("99.00")
|
||||||
|
assert sale.currency == "USD"
|
||||||
|
assert sale.promotion is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_sale_with_offer_code():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
sale = a.parse_sale(_sale_payload(offer_code="LAUNCH50"))
|
||||||
|
assert sale.promotion == "LAUNCH50"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_sale_test_ping_tagged():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
sale = a.parse_sale(_sale_payload(test="true"))
|
||||||
|
assert sale.notes == "gumroad test ping"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_sale_name_fallback_from_email():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
sale = a.parse_sale(_sale_payload(full_name="", email="john.doe@example.com"))
|
||||||
|
assert sale.buyer_name == "John Doe"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_sale_missing_required_returns_none():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
assert a.parse_sale(_sale_payload(sale_id="")) is None
|
||||||
|
assert a.parse_sale(_sale_payload(email="")) is None
|
||||||
|
assert a.parse_sale(_sale_payload(product_id="")) is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_sale_unmapped_product_raises():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
with pytest.raises(UnmappedProductError):
|
||||||
|
a.parse_sale(_sale_payload(product_id="no-such-sku"))
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_refund_stub_returns_none():
|
||||||
|
a = GumroadAdapter(secret="x")
|
||||||
|
assert a.parse_refund({"any": "payload"}) is None
|
||||||
102
server/tests/test_mint.py
Normal file
102
server/tests/test_mint.py
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
"""Mint core — signing, persistence, idempotency, revoke."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decimal import Decimal
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.adapters.base import SaleEvent
|
||||||
|
from app.mint import mint_from_sale, revoke_license
|
||||||
|
from app.models import License
|
||||||
|
from datatools_license.crypto import decode_blob, verify
|
||||||
|
|
||||||
|
|
||||||
|
def _sale(**overrides) -> SaleEvent:
|
||||||
|
base = dict(
|
||||||
|
source="manual",
|
||||||
|
source_order_id=None,
|
||||||
|
buyer_name="Jane Doe",
|
||||||
|
buyer_email="jane@example.com",
|
||||||
|
tier="core",
|
||||||
|
years=1,
|
||||||
|
promotion=None,
|
||||||
|
amount_paid=None,
|
||||||
|
currency="USD",
|
||||||
|
notes=None,
|
||||||
|
)
|
||||||
|
base.update(overrides)
|
||||||
|
return SaleEvent(**base)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_persists_and_signs_verifiably(db_session):
|
||||||
|
row = mint_from_sale(db_session, _sale())
|
||||||
|
db_session.commit()
|
||||||
|
|
||||||
|
assert row.license_key.startswith("DT1-CORE-")
|
||||||
|
assert row.tier == "core"
|
||||||
|
assert row.source == "manual"
|
||||||
|
assert row.blob.startswith("DTLIC2:")
|
||||||
|
assert row.revoked_at is None
|
||||||
|
|
||||||
|
payload = decode_blob(row.blob)
|
||||||
|
sig = payload.pop("signature")
|
||||||
|
assert verify(payload, sig), "minted blob must verify against the dev pubkey"
|
||||||
|
assert payload["name"] == "Jane Doe"
|
||||||
|
assert payload["email"] == "jane@example.com"
|
||||||
|
assert payload["tier"] == "core"
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_idempotent_on_source_order_id(db_session):
|
||||||
|
"""A second mint with the same (source, source_order_id) returns
|
||||||
|
the existing row — webhook retries cannot double-mint."""
|
||||||
|
first = mint_from_sale(
|
||||||
|
db_session,
|
||||||
|
_sale(source="gumroad", source_order_id="GUM-1001"),
|
||||||
|
)
|
||||||
|
db_session.commit()
|
||||||
|
|
||||||
|
second = mint_from_sale(
|
||||||
|
db_session,
|
||||||
|
_sale(source="gumroad", source_order_id="GUM-1001", buyer_name="Different Name"),
|
||||||
|
)
|
||||||
|
db_session.commit()
|
||||||
|
|
||||||
|
assert first.license_key == second.license_key
|
||||||
|
assert second.name == "Jane Doe", "existing row is returned unchanged"
|
||||||
|
|
||||||
|
|
||||||
|
def test_manual_mints_never_dedup(db_session):
|
||||||
|
"""source_order_id=None means each manual mint creates a new row."""
|
||||||
|
a = mint_from_sale(db_session, _sale())
|
||||||
|
db_session.commit()
|
||||||
|
b = mint_from_sale(db_session, _sale())
|
||||||
|
db_session.commit()
|
||||||
|
assert a.license_key != b.license_key
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_records_commercial_metadata(db_session):
|
||||||
|
row = mint_from_sale(
|
||||||
|
db_session,
|
||||||
|
_sale(promotion="LAUNCH50", amount_paid=Decimal("79.00"), currency="USD"),
|
||||||
|
)
|
||||||
|
db_session.commit()
|
||||||
|
assert row.promotion == "LAUNCH50"
|
||||||
|
assert Decimal(str(row.amount_paid)) == Decimal("79.00")
|
||||||
|
assert row.currency == "USD"
|
||||||
|
|
||||||
|
|
||||||
|
def test_revoke_marks_row(db_session):
|
||||||
|
row = mint_from_sale(db_session, _sale())
|
||||||
|
db_session.commit()
|
||||||
|
|
||||||
|
revoked = revoke_license(db_session, license_key=row.license_key, reason="refund")
|
||||||
|
db_session.commit()
|
||||||
|
|
||||||
|
assert revoked is not None
|
||||||
|
assert revoked.revoked_at is not None
|
||||||
|
assert "refund" in (revoked.notes or "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_revoke_unknown_returns_none(db_session):
|
||||||
|
assert revoke_license(db_session, license_key="DT1-CORE-no-such-key") is None
|
||||||
32
server/tests/test_products.py
Normal file
32
server/tests/test_products.py
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
"""Product → tier mapping lookup."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from app.products import lookup, reload_for_tests
|
||||||
|
|
||||||
|
|
||||||
|
def setup_function(_):
|
||||||
|
# The yaml file is read once at import; reload to be safe if
|
||||||
|
# other tests mutate state in the future.
|
||||||
|
reload_for_tests()
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_known_gumroad_product():
|
||||||
|
m = lookup("gumroad", "datatools-core")
|
||||||
|
assert m is not None
|
||||||
|
assert m.tier == "core"
|
||||||
|
assert m.years == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_unknown_product_returns_none():
|
||||||
|
assert lookup("gumroad", "no-such-product") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_lookup_unknown_source_returns_none():
|
||||||
|
assert lookup("paddle", "datatools-core") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_all_three_tiers_mapped():
|
||||||
|
assert lookup("gumroad", "datatools-lite").tier == "lite"
|
||||||
|
assert lookup("gumroad", "datatools-core").tier == "core"
|
||||||
|
assert lookup("gumroad", "datatools-pro").tier == "pro"
|
||||||
130
server/tests/test_routes.py
Normal file
130
server/tests/test_routes.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
"""HTTP route tests — auth, mint, revoke, list, health."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
def test_health_is_public(client):
|
||||||
|
r = client.get("/health")
|
||||||
|
assert r.status_code == 200
|
||||||
|
assert r.json()["status"] == "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def test_internal_requires_bearer(client):
|
||||||
|
r = client.get("/internal/ping")
|
||||||
|
assert r.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
def test_internal_rejects_wrong_bearer(client):
|
||||||
|
r = client.get("/internal/ping", headers={"Authorization": "Bearer nope"})
|
||||||
|
assert r.status_code == 401
|
||||||
|
|
||||||
|
|
||||||
|
def test_internal_ping_ok_with_token(client, admin_headers):
|
||||||
|
r = client.get("/internal/ping", headers=admin_headers)
|
||||||
|
assert r.status_code == 200
|
||||||
|
assert r.json() == {"ok": True}
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_creates_license(client, admin_headers):
|
||||||
|
r = client.post(
|
||||||
|
"/internal/mint",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={
|
||||||
|
"name": "Jane Doe",
|
||||||
|
"email": "jane@example.com",
|
||||||
|
"tier": "core",
|
||||||
|
"years": 1,
|
||||||
|
"source": "manual",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert r.status_code == 201, r.text
|
||||||
|
body = r.json()
|
||||||
|
assert body["tier"] == "core"
|
||||||
|
assert body["source"] == "manual"
|
||||||
|
assert body["blob"].startswith("DTLIC2:")
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_rejects_non_manual_source(client, admin_headers):
|
||||||
|
r = client.post(
|
||||||
|
"/internal/mint",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={
|
||||||
|
"name": "x", "email": "x@example.com", "tier": "core",
|
||||||
|
"source": "gumroad",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert r.status_code == 400
|
||||||
|
assert "not wired" in r.json()["detail"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_rejects_bad_email(client, admin_headers):
|
||||||
|
r = client.post(
|
||||||
|
"/internal/mint",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={"name": "x", "email": "not-an-email", "tier": "core"},
|
||||||
|
)
|
||||||
|
assert r.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
def test_mint_rejects_unknown_tier(client, admin_headers):
|
||||||
|
r = client.post(
|
||||||
|
"/internal/mint",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={"name": "x", "email": "x@example.com", "tier": "platinum"},
|
||||||
|
)
|
||||||
|
assert r.status_code == 422
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_licenses_filters_email_case_insensitive(client, admin_headers):
|
||||||
|
# Pydantic EmailStr normalizes the domain to lowercase per RFC.
|
||||||
|
for email in ("alice@example.com", "Bob@Example.com", "carol@other.test"):
|
||||||
|
client.post(
|
||||||
|
"/internal/mint",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={"name": "User", "email": email, "tier": "core"},
|
||||||
|
)
|
||||||
|
|
||||||
|
r = client.get(
|
||||||
|
"/internal/licenses?email=example.com",
|
||||||
|
headers=admin_headers,
|
||||||
|
)
|
||||||
|
assert r.status_code == 200
|
||||||
|
emails = {row["email"].lower() for row in r.json()}
|
||||||
|
assert "alice@example.com" in emails
|
||||||
|
assert "bob@example.com" in emails
|
||||||
|
assert "carol@other.test" not in emails
|
||||||
|
|
||||||
|
|
||||||
|
def test_revoke_then_excluded_by_default(client, admin_headers):
|
||||||
|
r = client.post(
|
||||||
|
"/internal/mint",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={"name": "x", "email": "x@example.com", "tier": "lite"},
|
||||||
|
)
|
||||||
|
key = r.json()["license_key"]
|
||||||
|
|
||||||
|
r2 = client.post(
|
||||||
|
"/internal/revoke",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={"license_key": key, "reason": "refund"},
|
||||||
|
)
|
||||||
|
assert r2.status_code == 200
|
||||||
|
assert r2.json()["revoked_at"] is not None
|
||||||
|
|
||||||
|
listed = client.get("/internal/licenses", headers=admin_headers).json()
|
||||||
|
assert all(row["license_key"] != key for row in listed)
|
||||||
|
|
||||||
|
listed_all = client.get(
|
||||||
|
"/internal/licenses?include_revoked=true",
|
||||||
|
headers=admin_headers,
|
||||||
|
).json()
|
||||||
|
assert any(row["license_key"] == key for row in listed_all)
|
||||||
|
|
||||||
|
|
||||||
|
def test_revoke_unknown_returns_404(client, admin_headers):
|
||||||
|
r = client.post(
|
||||||
|
"/internal/revoke",
|
||||||
|
headers=admin_headers,
|
||||||
|
json={"license_key": "DT1-CORE-doesnot-exist"},
|
||||||
|
)
|
||||||
|
assert r.status_code == 404
|
||||||
143
server/tests/test_webhook.py
Normal file
143
server/tests/test_webhook.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
"""End-to-end webhook tests — secret check, audit log, mint, email,
|
||||||
|
idempotency."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.email import LicenseEmail
|
||||||
|
from app.models import GumroadEvent, License
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def with_gumroad_secret(monkeypatch):
|
||||||
|
monkeypatch.setenv("GUMROAD_WEBHOOK_SECRET", "test-gumroad-secret")
|
||||||
|
from app.config import get_settings
|
||||||
|
get_settings.cache_clear()
|
||||||
|
yield
|
||||||
|
get_settings.cache_clear()
|
||||||
|
|
||||||
|
|
||||||
|
def _form(**overrides) -> dict:
|
||||||
|
base = {
|
||||||
|
"sale_id": "GUM-2001",
|
||||||
|
"email": "jane@example.com",
|
||||||
|
"full_name": "Jane Doe",
|
||||||
|
"product_id": "datatools-core",
|
||||||
|
"price": "9900",
|
||||||
|
"currency": "usd",
|
||||||
|
"offer_code": "",
|
||||||
|
"test": "false",
|
||||||
|
}
|
||||||
|
base.update(overrides)
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
|
def test_webhook_rejects_missing_secret(client, with_gumroad_secret):
|
||||||
|
r = client.post("/webhooks/gumroad", data=_form())
|
||||||
|
assert r.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_webhook_rejects_wrong_secret(client, with_gumroad_secret):
|
||||||
|
r = client.post("/webhooks/gumroad?secret=wrong", data=_form())
|
||||||
|
assert r.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_webhook_mints_and_sends_email(client, with_gumroad_secret, db_session):
|
||||||
|
captured: list[LicenseEmail] = []
|
||||||
|
|
||||||
|
class CapturingEmail:
|
||||||
|
def send_license(self, msg):
|
||||||
|
captured.append(msg)
|
||||||
|
return "captured"
|
||||||
|
|
||||||
|
with patch("app.routes.webhooks.get_email_service", lambda: CapturingEmail()):
|
||||||
|
r = client.post(
|
||||||
|
"/webhooks/gumroad?secret=test-gumroad-secret",
|
||||||
|
data=_form(),
|
||||||
|
)
|
||||||
|
assert r.status_code == 200
|
||||||
|
body = r.json()
|
||||||
|
assert body["status"] == "ok"
|
||||||
|
assert body["license_key"].startswith("DT1-CORE-")
|
||||||
|
|
||||||
|
# DB row landed.
|
||||||
|
row = db_session.query(License).filter_by(source_order_id="GUM-2001").one()
|
||||||
|
assert row.source == "gumroad"
|
||||||
|
assert row.email == "jane@example.com"
|
||||||
|
assert row.tier == "core"
|
||||||
|
assert row.blob.startswith("DTLIC2:")
|
||||||
|
|
||||||
|
# Audit row processed.
|
||||||
|
event = db_session.query(GumroadEvent).filter_by(order_id="GUM-2001").one()
|
||||||
|
assert event.processed is True
|
||||||
|
assert event.error is None
|
||||||
|
|
||||||
|
# Email captured.
|
||||||
|
assert len(captured) == 1
|
||||||
|
assert captured[0].to_email == "jane@example.com"
|
||||||
|
assert captured[0].blob == row.blob
|
||||||
|
|
||||||
|
|
||||||
|
def test_webhook_idempotent_on_duplicate_sale_id(client, with_gumroad_secret, db_session):
|
||||||
|
"""Gumroad retries on transient failures. Same sale_id must
|
||||||
|
produce one license, not two."""
|
||||||
|
with patch("app.routes.webhooks.get_email_service", lambda: _NullEmail()):
|
||||||
|
client.post("/webhooks/gumroad?secret=test-gumroad-secret", data=_form())
|
||||||
|
client.post("/webhooks/gumroad?secret=test-gumroad-secret", data=_form())
|
||||||
|
|
||||||
|
rows = db_session.query(License).filter_by(source_order_id="GUM-2001").all()
|
||||||
|
assert len(rows) == 1
|
||||||
|
# But both webhook deliveries should be in the audit log.
|
||||||
|
events = db_session.query(GumroadEvent).filter_by(order_id="GUM-2001").all()
|
||||||
|
assert len(events) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_webhook_unmapped_product_audits_and_returns_200(client, with_gumroad_secret, db_session):
|
||||||
|
"""An unknown product_id must NOT crash and must NOT trigger a
|
||||||
|
retry storm. Audit row gets the error reason."""
|
||||||
|
with patch("app.routes.webhooks.get_email_service", lambda: _NullEmail()):
|
||||||
|
r = client.post(
|
||||||
|
"/webhooks/gumroad?secret=test-gumroad-secret",
|
||||||
|
data=_form(product_id="not-a-real-sku", sale_id="GUM-666"),
|
||||||
|
)
|
||||||
|
assert r.status_code == 200
|
||||||
|
assert r.json()["status"] == "logged-no-mint"
|
||||||
|
|
||||||
|
# No license, but audit row with error.
|
||||||
|
assert db_session.query(License).filter_by(source_order_id="GUM-666").count() == 0
|
||||||
|
event = db_session.query(GumroadEvent).filter_by(order_id="GUM-666").one()
|
||||||
|
assert event.processed is False
|
||||||
|
assert "no entry in config/products.yaml" in (event.error or "")
|
||||||
|
|
||||||
|
|
||||||
|
def test_webhook_email_failure_keeps_license(client, with_gumroad_secret, db_session):
|
||||||
|
"""If Postmark hiccups, the buyer's license is still minted and
|
||||||
|
persists in the DB. They can be served from the renewal portal
|
||||||
|
(PR 3) or a manual resend."""
|
||||||
|
from app.email import EmailDeliveryError
|
||||||
|
|
||||||
|
class FailingEmail:
|
||||||
|
def send_license(self, msg):
|
||||||
|
raise EmailDeliveryError("Postmark 503")
|
||||||
|
|
||||||
|
with patch("app.routes.webhooks.get_email_service", lambda: FailingEmail()):
|
||||||
|
r = client.post(
|
||||||
|
"/webhooks/gumroad?secret=test-gumroad-secret",
|
||||||
|
data=_form(sale_id="GUM-3001"),
|
||||||
|
)
|
||||||
|
assert r.status_code == 200
|
||||||
|
assert r.json()["status"] == "minted-email-failed"
|
||||||
|
|
||||||
|
assert db_session.query(License).filter_by(source_order_id="GUM-3001").count() == 1
|
||||||
|
event = db_session.query(GumroadEvent).filter_by(order_id="GUM-3001").one()
|
||||||
|
assert event.processed is True # we count it processed so Gumroad doesn't retry
|
||||||
|
assert "email error" in (event.error or "")
|
||||||
|
|
||||||
|
|
||||||
|
class _NullEmail:
|
||||||
|
def send_license(self, msg):
|
||||||
|
return "null"
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user