Compare commits
6 Commits
4d8513b1a3
...
rollback-2
| Author | SHA1 | Date | |
|---|---|---|---|
| 58d0009849 | |||
| b6c39d7a09 | |||
| b2fa8503e6 | |||
| b703911df3 | |||
| 93ccada974 | |||
| 17faf84aed |
49
.github/workflows/build.yml
vendored
49
.github/workflows/build.yml
vendored
@@ -65,6 +65,30 @@ jobs:
|
|||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
pip install pyinstaller pillow
|
pip install pyinstaller pillow
|
||||||
|
|
||||||
|
# ---- Tesseract bundling cache --------------------------------
|
||||||
|
# The fetch logic inside build/make_release.py downloads:
|
||||||
|
# * build/vendor/tessdata/eng.traineddata (~16 MB, shared)
|
||||||
|
# * build/_tesseract/<platform>/ (binary + libs, 30-120 MB)
|
||||||
|
# Cache both so iterative CI runs don't re-download. The
|
||||||
|
# cache key bakes in the pinned Tesseract version + tessdata
|
||||||
|
# source as hard-coded literals: bump the key by hand whenever the pin in build/make_release.py changes.
|
||||||
|
- name: Cache Tesseract bundle inputs
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
build/_tesseract
|
||||||
|
build/vendor/tessdata
|
||||||
|
key: tesseract-${{ runner.os }}-5.5.0-tessdata_best-v1
|
||||||
|
|
||||||
|
# ---- Linux: install patchelf so make_release.py can rewrite
|
||||||
|
# RPATH on the bundled tesseract binary. apt-get install
|
||||||
|
# tesseract-ocr is handled inside make_release.py itself. -----
|
||||||
|
- name: Install Linux build prereqs for Tesseract bundling
|
||||||
|
if: matrix.os == 'ubuntu-latest'
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install -y patchelf
|
||||||
|
|
||||||
- name: Read version
|
- name: Read version
|
||||||
id: version
|
id: version
|
||||||
shell: bash
|
shell: bash
|
||||||
@@ -75,7 +99,32 @@ jobs:
|
|||||||
- name: Generate platform icons
|
- name: Generate platform icons
|
||||||
run: python build/generate_icons.py
|
run: python build/generate_icons.py
|
||||||
|
|
||||||
|
# Stage Tesseract before PyInstaller. The make_release.py
|
||||||
|
# helpers handle the per-platform fetch (UB-Mannheim on Win,
|
||||||
|
# brew on Mac, apt on Linux) and stage the binary + libs into
|
||||||
|
# build/_tesseract/<platform>/ where the spec picks them up.
|
||||||
|
# We invoke a tiny inline Python so the workflow doesn't have
|
||||||
|
# to know the per-platform target string.
|
||||||
|
- name: Stage Tesseract binary + tessdata
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
DATATOOLS_PLATFORM: ${{ matrix.platform }}
|
||||||
|
run: |
|
||||||
|
python - <<'PY'
|
||||||
|
import os, sys
|
||||||
|
sys.path.insert(0, "build")
|
||||||
|
from make_release import fetch_tessdata, fetch_tesseract_for_platform
|
||||||
|
target = os.environ["DATATOOLS_PLATFORM"]
|
||||||
|
fetch_tessdata()
|
||||||
|
fetch_tesseract_for_platform(target)
|
||||||
|
PY
|
||||||
|
|
||||||
- name: Build PyInstaller bundle
|
- name: Build PyInstaller bundle
|
||||||
|
shell: bash
|
||||||
|
env:
|
||||||
|
# The spec reads this to find the per-platform staging dir;
|
||||||
|
# see build/datatools.spec for the contract.
|
||||||
|
DATATOOLS_TESS_STAGING: build/_tesseract/${{ matrix.platform }}
|
||||||
run: pyinstaller build/datatools.spec --clean --noconfirm
|
run: pyinstaller build/datatools.spec --clean --noconfirm
|
||||||
|
|
||||||
# ---- Per-platform installer packaging ------------------------
|
# ---- Per-platform installer packaging ------------------------
|
||||||
|
|||||||
8
.gitignore
vendored
8
.gitignore
vendored
@@ -16,6 +16,14 @@ build/dist/
|
|||||||
build/icon.ico
|
build/icon.ico
|
||||||
build/icon.icns
|
build/icon.icns
|
||||||
build/icon.png
|
build/icon.png
|
||||||
|
|
||||||
|
# Tesseract bundling — fetched at build time, not committed. See
|
||||||
|
# build/vendor/README.md for the canonical URLs and rationale.
|
||||||
|
# - build/_tesseract/ : per-platform binary + DLLs/dylibs staging dir
|
||||||
|
# - build/vendor/tessdata/eng.traineddata : ~16 MB language data
|
||||||
|
build/_tesseract/
|
||||||
|
build/vendor/tessdata/*.traineddata
|
||||||
|
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
|
|
||||||
# Claude Code agent worktrees + local settings
|
# Claude Code agent worktrees + local settings
|
||||||
|
|||||||
220
LICENSE_TESSERACT.txt
Normal file
220
LICENSE_TESSERACT.txt
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
This license applies to the bundled Tesseract OCR binary distributed
|
||||||
|
inside DataTools installer artifacts (Windows .exe, macOS .dmg, Linux
|
||||||
|
.AppImage) and the corresponding portable .zip downloads.
|
||||||
|
|
||||||
|
Tesseract OCR upstream: https://github.com/tesseract-ocr/tesseract
|
||||||
|
Copyright (C) 2006-2024 Google Inc. and the Tesseract OCR contributors
|
||||||
|
|
||||||
|
The Tesseract OCR binary is distributed under the Apache License,
|
||||||
|
Version 2.0, the full text of which is reproduced verbatim below.
|
||||||
|
|
||||||
|
The bundled `eng.traineddata` data file is the "best" English model
|
||||||
|
from https://github.com/tesseract-ocr/tessdata_best and is licensed
|
||||||
|
under the Apache License, Version 2.0 as well.
|
||||||
|
|
||||||
|
DataTools itself is proprietary and is NOT covered by this license;
|
||||||
|
see LICENSE.txt at the repository root for DataTools' own license.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Apache License
|
||||||
|
Version 2.0, January 2004
|
||||||
|
http://www.apache.org/licenses/
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||||
|
|
||||||
|
1. Definitions.
|
||||||
|
|
||||||
|
"License" shall mean the terms and conditions for use, reproduction,
|
||||||
|
and distribution as defined by Sections 1 through 9 of this document.
|
||||||
|
|
||||||
|
"Licensor" shall mean the copyright owner or entity authorized by
|
||||||
|
the copyright owner that is granting the License.
|
||||||
|
|
||||||
|
"Legal Entity" shall mean the union of the acting entity and all
|
||||||
|
other entities that control, are controlled by, or are under common
|
||||||
|
control with that entity. For the purposes of this definition,
|
||||||
|
"control" means (i) the power, direct or indirect, to cause the
|
||||||
|
direction or management of such entity, whether by contract or
|
||||||
|
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||||
|
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||||
|
|
||||||
|
"You" (or "Your") shall mean an individual or Legal Entity
|
||||||
|
exercising permissions granted by this License.
|
||||||
|
|
||||||
|
"Source" form shall mean the preferred form for making modifications,
|
||||||
|
including but not limited to software source code, documentation
|
||||||
|
source, and configuration files.
|
||||||
|
|
||||||
|
"Object" form shall mean any form resulting from mechanical
|
||||||
|
transformation or translation of a Source form, including but
|
||||||
|
not limited to compiled object code, generated documentation,
|
||||||
|
and conversions to other media types.
|
||||||
|
|
||||||
|
"Work" shall mean the work of authorship, whether in Source or
|
||||||
|
Object form, made available under the License, as indicated by a
|
||||||
|
copyright notice that is included in or attached to the work
|
||||||
|
(an example is provided in the Appendix below).
|
||||||
|
|
||||||
|
"Derivative Works" shall mean any work, whether in Source or Object
|
||||||
|
form, that is based on (or derived from) the Work and for which the
|
||||||
|
editorial revisions, annotations, elaborations, or other modifications
|
||||||
|
represent, as a whole, an original work of authorship. For the purposes
|
||||||
|
of this License, Derivative Works shall not include works that remain
|
||||||
|
separable from, or merely link (or bind by name) to the interfaces of,
|
||||||
|
the Work and Derivative Works thereof.
|
||||||
|
|
||||||
|
"Contribution" shall mean any work of authorship, including
|
||||||
|
the original version of the Work and any modifications or additions
|
||||||
|
to that Work or Derivative Works thereof, that is intentionally
|
||||||
|
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||||
|
or by an individual or Legal Entity authorized to submit on behalf of
|
||||||
|
the copyright owner. For the purposes of this definition, "submitted"
|
||||||
|
means any form of electronic, verbal, or written communication sent
|
||||||
|
to the Licensor or its representatives, including but not limited to
|
||||||
|
communication on electronic mailing lists, source code control systems,
|
||||||
|
and issue tracking systems that are managed by, or on behalf of, the
|
||||||
|
Licensor for the purpose of discussing and improving the Work, but
|
||||||
|
excluding communication that is conspicuously marked or otherwise
|
||||||
|
designated in writing by the copyright owner as "Not a Contribution."
|
||||||
|
|
||||||
|
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||||
|
on behalf of whom a Contribution has been received by Licensor and
|
||||||
|
subsequently incorporated within the Work.
|
||||||
|
|
||||||
|
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
copyright license to reproduce, prepare Derivative Works of,
|
||||||
|
publicly display, publicly perform, sublicense, and distribute the
|
||||||
|
Work and such Derivative Works in Source or Object form.
|
||||||
|
|
||||||
|
3. Grant of Patent License. Subject to the terms and conditions of
|
||||||
|
this License, each Contributor hereby grants to You a perpetual,
|
||||||
|
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||||
|
(except as stated in this section) patent license to make, have made,
|
||||||
|
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||||
|
where such license applies only to those patent claims licensable
|
||||||
|
by such Contributor that are necessarily infringed by their
|
||||||
|
Contribution(s) alone or by combination of their Contribution(s)
|
||||||
|
with the Work to which such Contribution(s) was submitted. If You
|
||||||
|
institute patent litigation against any entity (including a
|
||||||
|
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||||
|
or a Contribution incorporated within the Work constitutes direct
|
||||||
|
or contributory patent infringement, then any patent licenses
|
||||||
|
granted to You under this License for that Work shall terminate
|
||||||
|
as of the date such litigation is filed.
|
||||||
|
|
||||||
|
4. Redistribution. You may reproduce and distribute copies of the
|
||||||
|
Work or Derivative Works thereof in any medium, with or without
|
||||||
|
modifications, and in Source or Object form, provided that You
|
||||||
|
meet the following conditions:
|
||||||
|
|
||||||
|
(a) You must give any other recipients of the Work or
|
||||||
|
Derivative Works a copy of this License; and
|
||||||
|
|
||||||
|
(b) You must cause any modified files to carry prominent notices
|
||||||
|
stating that You changed the files; and
|
||||||
|
|
||||||
|
(c) You must retain, in the Source form of any Derivative Works
|
||||||
|
that You distribute, all copyright, patent, trademark, and
|
||||||
|
attribution notices from the Source form of the Work,
|
||||||
|
excluding those notices that do not pertain to any part of
|
||||||
|
the Derivative Works; and
|
||||||
|
|
||||||
|
(d) If the Work includes a "NOTICE" text file as part of its
|
||||||
|
distribution, then any Derivative Works that You distribute must
|
||||||
|
include a readable copy of the attribution notices contained
|
||||||
|
within such NOTICE file, excluding those notices that do not
|
||||||
|
pertain to any part of the Derivative Works, in at least one
|
||||||
|
of the following places: within a NOTICE text file distributed
|
||||||
|
as part of the Derivative Works; within the Source form or
|
||||||
|
documentation, if provided along with the Derivative Works; or,
|
||||||
|
within a display generated by the Derivative Works, if and
|
||||||
|
wherever such third-party notices normally appear. The contents
|
||||||
|
of the NOTICE file are for informational purposes only and
|
||||||
|
do not modify the License. You may add Your own attribution
|
||||||
|
notices within Derivative Works that You distribute, alongside
|
||||||
|
or as an addendum to the NOTICE text from the Work, provided
|
||||||
|
that such additional attribution notices cannot be construed
|
||||||
|
as modifying the License.
|
||||||
|
|
||||||
|
You may add Your own copyright statement to Your modifications and
|
||||||
|
may provide additional or different license terms and conditions
|
||||||
|
for use, reproduction, or distribution of Your modifications, or
|
||||||
|
for any such Derivative Works as a whole, provided Your use,
|
||||||
|
reproduction, and distribution of the Work otherwise complies with
|
||||||
|
the conditions stated in this License.
|
||||||
|
|
||||||
|
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||||
|
any Contribution intentionally submitted for inclusion in the Work
|
||||||
|
by You to the Licensor shall be under the terms and conditions of
|
||||||
|
this License, without any additional terms or conditions.
|
||||||
|
Notwithstanding the above, nothing herein shall supersede or modify
|
||||||
|
the terms of any separate license agreement you may have executed
|
||||||
|
with Licensor regarding such Contributions.
|
||||||
|
|
||||||
|
6. Trademarks. This License does not grant permission to use the trade
|
||||||
|
names, trademarks, service marks, or product names of the Licensor,
|
||||||
|
except as required for describing the origin of the Work and
|
||||||
|
reproducing the content of the NOTICE file.
|
||||||
|
|
||||||
|
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||||
|
agreed to in writing, Licensor provides the Work (and each
|
||||||
|
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied, including, without limitation, any warranties or conditions
|
||||||
|
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||||
|
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||||
|
appropriateness of using or redistributing the Work and assume any
|
||||||
|
risks associated with Your exercise of permissions under this License.
|
||||||
|
|
||||||
|
8. Limitation of Liability. In no event and under no legal theory,
|
||||||
|
whether in tort (including negligence), contract, or otherwise,
|
||||||
|
unless required by applicable law (such as deliberate and grossly
|
||||||
|
negligent acts) or agreed to in writing, shall any Contributor be
|
||||||
|
liable to You for damages, including any direct, indirect, special,
|
||||||
|
incidental, or consequential damages of any character arising as a
|
||||||
|
result of this License or out of the use or inability to use the
|
||||||
|
Work (including but not limited to damages for loss of goodwill,
|
||||||
|
work stoppage, computer failure or malfunction, or any and all
|
||||||
|
other commercial damages or losses), even if such Contributor
|
||||||
|
has been advised of the possibility of such damages.
|
||||||
|
|
||||||
|
9. Accepting Warranty or Additional Liability. While redistributing
|
||||||
|
the Work or Derivative Works thereof, You may choose to offer, and charge a
|
||||||
|
fee for, acceptance of support, warranty, indemnity, or other
|
||||||
|
liability obligations and/or rights consistent with this License.
|
||||||
|
However, in accepting such obligations, You may act only on Your
|
||||||
|
own behalf and on Your sole responsibility, not on behalf of any
|
||||||
|
other Contributor, and only if You agree to indemnify, defend,
|
||||||
|
and hold each Contributor harmless for any liability incurred by,
|
||||||
|
or claims asserted against, such Contributor by reason of your
|
||||||
|
accepting any such warranty or additional liability.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
APPENDIX: How to apply the Apache License to your work.
|
||||||
|
|
||||||
|
To apply the Apache License to your work, attach the following
|
||||||
|
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||||
|
replaced with your own identifying information. (Don't include
|
||||||
|
the brackets!) The text should be enclosed in the appropriate
|
||||||
|
comment syntax for the file format. We also recommend that a
|
||||||
|
file or class name and description of purpose be included on the
|
||||||
|
same "printed page" as the copyright notice for easier
|
||||||
|
identification within third-party archives.
|
||||||
|
|
||||||
|
Copyright [yyyy] [name of copyright owner]
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||||
|
implied. See the License for the specific language governing
|
||||||
|
permissions and limitations under the License.
|
||||||
@@ -30,7 +30,9 @@ Paquetes precompilados — sin instalar Python, sin permisos de administrador, s
|
|||||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. | `DataTools-X.Y.Z-win-portable.zip` — descomprime donde quieras, doble clic en `DataTools.exe`. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — ejecuta el instalador (por usuario, sin admin). Crea acceso directo en el escritorio + entrada en el menú Inicio. | `DataTools-X.Y.Z-win-portable.zip` — descomprime donde quieras, doble clic en `DataTools.exe`. |
|
||||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. | El AppImage ya es portable. |
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x` y doble clic. | El AppImage ya es portable. |
|
||||||
|
|
||||||
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~200 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
Última versión: consulta [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (o el listado de Gumroad). Cada paquete ocupa ~300 MB descomprimido; al primer arranque la app levanta un servidor local en http://127.0.0.1:8501 y abre tu navegador predeterminado. Nada sale de tu equipo — instalador y portable son idénticos por dentro.
|
||||||
|
|
||||||
|
**Tesseract OCR viene incluido.** El soporte para PDFs escaneados del Extractor de PDF funciona sin configuración adicional en las tres plataformas — no hace falta instalar Tesseract por separado. Atribución de licencia: ver [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
**Avisos del primer arranque (una sola vez):**
|
**Avisos del primer arranque (una sola vez):**
|
||||||
- **macOS** sin firma: clic derecho → **Abrir** → confirma. (Las compilaciones firmadas se lo saltan.)
|
- **macOS** sin firma: clic derecho → **Abrir** → confirma. (Las compilaciones firmadas se lo saltan.)
|
||||||
|
|||||||
@@ -30,7 +30,9 @@ Pre-built bundles — no Python install, no admin rights, no internet at runtime
|
|||||||
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. | `DataTools-X.Y.Z-win-portable.zip` — unzip anywhere, double-click `DataTools.exe`. |
|
| **Windows** | `DataTools-X.Y.Z-win-setup.exe` — run installer (per-user, no admin). Desktop shortcut + Start Menu entry created. | `DataTools-X.Y.Z-win-portable.zip` — unzip anywhere, double-click `DataTools.exe`. |
|
||||||
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. | The AppImage is already portable. |
|
| **Linux** | `DataTools-X.Y.Z-linux-x86_64.AppImage` — `chmod +x`, double-click. | The AppImage is already portable. |
|
||||||
|
|
||||||
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~200 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
Latest release: see [GitHub Releases](https://git.invixiom.com/giteadmin/datatools-dev/releases) (or the Gumroad listing). Each bundle is ~300 MB unpacked; on first launch the app starts a local server at http://127.0.0.1:8501 and opens your default browser. Nothing leaves your machine — installers and portables are byte-identical inside.
|
||||||
|
|
||||||
|
**Tesseract OCR is bundled.** Scanned-PDF support in the PDF Extractor works out of the box on all three platforms — no separate Tesseract install required. License attribution: see [`LICENSE_TESSERACT.txt`](LICENSE_TESSERACT.txt).
|
||||||
|
|
||||||
**First-launch warnings (one-time):**
|
**First-launch warnings (one-time):**
|
||||||
- **macOS** unsigned builds: right-click → **Open** → confirm. (Signed builds skip this.)
|
- **macOS** unsigned builds: right-click → **Open** → confirm. (Signed builds skip this.)
|
||||||
|
|||||||
@@ -54,8 +54,11 @@ for buyers (or IT-locked-down machines) that can't run installers:
|
|||||||
| Linux | `DataTools-<ver>-linux-x86_64.AppImage`| (the AppImage IS the portable) |
|
| Linux | `DataTools-<ver>-linux-x86_64.AppImage`| (the AppImage IS the portable) |
|
||||||
|
|
||||||
All six outputs are self-contained: every dependency (Python, pandas,
|
All six outputs are self-contained: every dependency (Python, pandas,
|
||||||
streamlit, pdfplumber, the lot) is frozen into the bundle. The buyer
|
streamlit, pdfplumber, **Tesseract OCR + `eng.traineddata`**, the lot)
|
||||||
does not need to install Python, pip, or anything else first.
|
is frozen into the bundle. The buyer does not need to install Python,
|
||||||
|
pip, Tesseract, or anything else first. With Tesseract bundled, each
|
||||||
|
artifact is roughly **250–300 MB** on disk (up from ~120 MB pre-OCR);
|
||||||
|
unpacked installs run ~300–400 MB once scratch space is counted.
|
||||||
|
|
||||||
## Easy-launch surface
|
## Easy-launch surface
|
||||||
|
|
||||||
@@ -287,6 +290,56 @@ Mac code-signing in CI requires the cert + private key as a GitHub
|
|||||||
secret (encoded with `base64`). Detailed walkthrough belongs in a
|
secret (encoded with `base64`). Detailed walkthrough belongs in a
|
||||||
later doc — for v1, sign locally and upload to GitHub Releases.
|
later doc — for v1, sign locally and upload to GitHub Releases.
|
||||||
|
|
||||||
|
## Tesseract bundling (PDF Extractor OCR)
|
||||||
|
|
||||||
|
Frozen artifacts ship a per-platform Tesseract binary plus the English
|
||||||
|
`eng.traineddata` model so scanned-PDF support in the PDF Extractor
|
||||||
|
works out of the box — no separate user install. Source / pip
|
||||||
|
developer setups still need system Tesseract on `PATH`.
|
||||||
|
|
||||||
|
**Layout inside the bundle**:
|
||||||
|
|
||||||
|
```
|
||||||
|
DataTools/ (or DataTools.app/Contents/Resources/ on macOS)
|
||||||
|
└── tesseract/
|
||||||
|
├── tesseract (Linux/macOS binary; tesseract.exe on Windows)
|
||||||
|
└── tessdata/
|
||||||
|
└── eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
The runtime resolver (in `src/pdf_extract.py`, owned by the runtime team) walks:
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||||
|
2. `Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"` — frozen
|
||||||
|
bundles only.
|
||||||
|
3. `tesseract` on `PATH`.
|
||||||
|
4. Windows well-known paths.
|
||||||
|
|
||||||
|
**Where the bytes come from**:
|
||||||
|
|
||||||
|
- **Tessdata** — fetched at build time (git-ignored, not committed) into `build/vendor/tessdata/eng.traineddata`
|
||||||
|
(sourced from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best)).
|
||||||
|
`datatools.spec` copies it into `tesseract/tessdata/`.
|
||||||
|
- **Binary** — fetched per-platform at build time by
|
||||||
|
`build/make_release.py` from pinned upstream URLs. Current pin:
|
||||||
|
**Tesseract 5.5.0**.
|
||||||
|
|
||||||
|
**Updating Tesseract**:
|
||||||
|
|
||||||
|
1. Bump the version pin and the per-platform fetch URLs in
|
||||||
|
`build/make_release.py`.
|
||||||
|
2. If the model schema changed upstream, refresh
|
||||||
|
`build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the
|
||||||
|
matching tag.
|
||||||
|
3. Rebuild on each platform (`python build/make_release.py`) and
|
||||||
|
smoke-test a scanned PDF through the PDF Extractor.
|
||||||
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if upstream license
|
||||||
|
terms change (Apache-2.0 today).
|
||||||
|
|
||||||
|
License attribution for the bundled binary lives at
|
||||||
|
`LICENSE_TESSERACT.txt` at the repo root — it must ship alongside any
|
||||||
|
binary that contains Tesseract.
|
||||||
|
|
||||||
## Common pitfalls
|
## Common pitfalls
|
||||||
|
|
||||||
| Symptom | Fix |
|
| Symptom | Fix |
|
||||||
|
|||||||
@@ -9,6 +9,11 @@
|
|||||||
# latest release from https://github.com/AppImage/AppImageKit/releases).
|
# latest release from https://github.com/AppImage/AppImageKit/releases).
|
||||||
#
|
#
|
||||||
# Output: dist/DataTools-<version>-linux-x86_64.AppImage
|
# Output: dist/DataTools-<version>-linux-x86_64.AppImage
|
||||||
|
#
|
||||||
|
# Tesseract bundling: no-op here. The PyInstaller bundle in
|
||||||
|
# dist/DataTools/ already contains tesseract/{tesseract, *.so,
|
||||||
|
# tessdata/eng.traineddata} from the spec's datas; ``cp -R``
|
||||||
|
# below carries it along into the AppDir.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
|||||||
@@ -24,6 +24,7 @@
|
|||||||
|
|
||||||
# -*- mode: python ; coding: utf-8 -*-
|
# -*- mode: python ; coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from PyInstaller.utils.hooks import (
|
from PyInstaller.utils.hooks import (
|
||||||
collect_all,
|
collect_all,
|
||||||
@@ -103,6 +104,78 @@ datas += [
|
|||||||
(str(REPO / ".streamlit" / "config.toml"),".streamlit"),
|
(str(REPO / ".streamlit" / "config.toml"),".streamlit"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# ----- Tesseract OCR bundle ----------------------------------------
|
||||||
|
# ``build/make_release.py`` stages the per-platform Tesseract binary
|
||||||
|
# + its runtime libs (DLLs/dylibs/sos) into
|
||||||
|
# ``build/_tesseract/<target>/`` and the shared eng.traineddata into
|
||||||
|
# ``build/vendor/tessdata/``. We add both to ``datas`` so PyInstaller
|
||||||
|
# drops them at the path the runtime expects:
|
||||||
|
#
|
||||||
|
# <bundle>/tesseract/tesseract[.exe]
|
||||||
|
# <bundle>/tesseract/<all dll/dylib/so deps>
|
||||||
|
# <bundle>/tesseract/tessdata/eng.traineddata
|
||||||
|
#
|
||||||
|
# The runtime discovery code in src/pdf_extract.py reads this layout
|
||||||
|
# from ``Path(sys._MEIPASS) / "tesseract" / ...``. Keep the two ends
|
||||||
|
# in sync — if you rename "tesseract" here, update pdf_extract.py too.
|
||||||
|
#
|
||||||
|
# The orchestrator (make_release.py) sets DATATOOLS_TESS_STAGING to
|
||||||
|
# the right per-platform dir before invoking PyInstaller. For ad-hoc
|
||||||
|
# `pyinstaller build/datatools.spec` runs without the orchestrator,
|
||||||
|
# fall back to the canonical staging path.
|
||||||
|
_tess_staging_env = os.environ.get("DATATOOLS_TESS_STAGING")
|
||||||
|
if _tess_staging_env:
|
||||||
|
_tess_staging = Path(_tess_staging_env)
|
||||||
|
else:
|
||||||
|
# Pick the obvious per-host staging dir as a fallback so spec-only
|
||||||
|
# builds (without the orchestrator) still work in dev.
|
||||||
|
import sys as _sys_for_target
|
||||||
|
_target_guess = (
|
||||||
|
"win" if _sys_for_target.platform.startswith("win")
|
||||||
|
else "mac" if _sys_for_target.platform == "darwin"
|
||||||
|
else "linux"
|
||||||
|
)
|
||||||
|
_tess_staging = REPO / "build" / "_tesseract" / _target_guess
|
||||||
|
|
||||||
|
_tessdata = REPO / "build" / "vendor" / "tessdata"
|
||||||
|
|
||||||
|
if _tess_staging.is_dir() and any(_tess_staging.iterdir()):
|
||||||
|
# Drop every file in the staging dir directly under
|
||||||
|
# ``<bundle>/tesseract/`` (binary + DLL/dylib/so siblings).
|
||||||
|
datas += [(str(_tess_staging), "tesseract")]
|
||||||
|
else:
|
||||||
|
# Don't hard-fail spec parse — useful for first-time devs running
|
||||||
|
# PyInstaller before fetching binaries. Surface a loud warning
|
||||||
|
# though, since the OCR feature will silently fail at runtime.
|
||||||
|
print(
|
||||||
|
f"WARNING: {_tess_staging} is empty or missing — OCR will be "
|
||||||
|
"disabled in the bundle. Run build/make_release.py (which "
|
||||||
|
"calls fetch_tesseract_for_platform) before pyinstaller, or "
|
||||||
|
"pre-stage the binary manually."
|
||||||
|
)
|
||||||
|
|
||||||
|
if (_tessdata / "eng.traineddata").exists():
|
||||||
|
datas += [(str(_tessdata), "tesseract/tessdata")]
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
f"WARNING: {_tessdata}/eng.traineddata is missing — OCR will "
|
||||||
|
"have no language data at runtime. Run build/make_release.py "
|
||||||
|
"or fetch manually per build/vendor/README.md."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Bundle the Apache-2.0 LICENSE text alongside the binary. The docs
|
||||||
|
# agent maintains LICENSE_TESSERACT.txt at the repo root; PyInstaller
|
||||||
|
# drops it at the bundle root next to DataTools[.exe].
|
||||||
|
_tess_license = REPO / "LICENSE_TESSERACT.txt"
|
||||||
|
if _tess_license.exists():
|
||||||
|
datas += [(str(_tess_license), ".")]
|
||||||
|
else:
|
||||||
|
print(
|
||||||
|
"WARNING: LICENSE_TESSERACT.txt missing at repo root. Required "
|
||||||
|
"by Apache-2.0 for redistribution; the docs agent should "
|
||||||
|
"create it. Continuing without it for now."
|
||||||
|
)
|
||||||
|
|
||||||
# ----- Analysis ------------------------------------------------------
|
# ----- Analysis ------------------------------------------------------
|
||||||
|
|
||||||
a = Analysis(
|
a = Analysis(
|
||||||
@@ -158,6 +231,13 @@ coll = COLLECT(
|
|||||||
|
|
||||||
# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
|
# macOS .app bundle wrapper. PyInstaller produces it only on Mac;
|
||||||
# this block is a no-op on Win/Linux.
|
# this block is a no-op on Win/Linux.
|
||||||
|
#
|
||||||
|
# Tesseract bundling note: ``BUNDLE(coll, ...)`` carries the entire
|
||||||
|
# COLLECT output (binaries + datas) into the .app's
|
||||||
|
# Contents/Resources tree, so the ``tesseract/`` subdir we built up
|
||||||
|
# in ``datas`` lands at ``DataTools.app/Contents/Resources/tesseract/``
|
||||||
|
# and the runtime ``sys._MEIPASS`` resolves there. No extra plumbing
|
||||||
|
# needed.
|
||||||
import sys as _sys
|
import sys as _sys
|
||||||
if _sys.platform == "darwin":
|
if _sys.platform == "darwin":
|
||||||
app = BUNDLE(
|
app = BUNDLE(
|
||||||
|
|||||||
@@ -63,6 +63,14 @@ Name: "desktopicon"; Description: "Create a &desktop shortcut"; GroupDescription
|
|||||||
Name: "quicklaunchicon"; Description: "Create a &Quick Launch shortcut"; GroupDescription: "Additional shortcuts:"; Flags: unchecked; OnlyBelowVersion: 6.1
|
Name: "quicklaunchicon"; Description: "Create a &Quick Launch shortcut"; GroupDescription: "Additional shortcuts:"; Flags: unchecked; OnlyBelowVersion: 6.1
|
||||||
|
|
||||||
[Files]
|
[Files]
|
||||||
|
; PyInstaller's dist/DataTools/ tree includes:
|
||||||
|
; * DataTools.exe + frozen Python runtime
|
||||||
|
; * tesseract/tesseract.exe + DLLs + tessdata/eng.traineddata
|
||||||
|
; (bundled via build/datatools.spec datas; runtime discovery in
|
||||||
|
; src/pdf_extract.py reads sys._MEIPASS / "tesseract" / ...).
|
||||||
|
; * LICENSE_TESSERACT.txt at the bundle root (Apache-2.0).
|
||||||
|
; The recursesubdirs flag below picks all of those up — no separate
|
||||||
|
; Files: entry needed for tesseract/.
|
||||||
Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion
|
Source: "..\dist\DataTools\*"; DestDir: "{app}"; Flags: recursesubdirs ignoreversion
|
||||||
|
|
||||||
[Icons]
|
[Icons]
|
||||||
|
|||||||
@@ -10,6 +10,11 @@
|
|||||||
#
|
#
|
||||||
# Code signing + notarization happen separately (see build/README.md
|
# Code signing + notarization happen separately (see build/README.md
|
||||||
# "Signing"). This script only handles the packaging step.
|
# "Signing"). This script only handles the packaging step.
|
||||||
|
#
|
||||||
|
# Tesseract bundling: no-op here. The .app already contains
|
||||||
|
# Contents/Resources/tesseract/{tesseract, *.dylib, tessdata/} thanks
|
||||||
|
# to PyInstaller's BUNDLE() carrying the spec's datas through. This
|
||||||
|
# script just wraps the finished .app — no extra steps for OCR.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,11 @@
|
|||||||
# Run after ``pyinstaller build/datatools.spec --clean --noconfirm``
|
# Run after ``pyinstaller build/datatools.spec --clean --noconfirm``
|
||||||
# has produced ``dist/DataTools.app``. Output goes to
|
# has produced ``dist/DataTools.app``. Output goes to
|
||||||
# ``dist/DataTools-<version>-mac-portable.zip``.
|
# ``dist/DataTools-<version>-mac-portable.zip``.
|
||||||
|
#
|
||||||
|
# Tesseract bundling: no-op here. The bundled Tesseract binary +
|
||||||
|
# dylibs + tessdata are already inside DataTools.app/Contents/Resources/tesseract/
|
||||||
|
# (placed by PyInstaller's BUNDLE/datas mechanism). ``ditto -c -k``
|
||||||
|
# preserves the whole .app tree.
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
|||||||
@@ -32,17 +32,33 @@ Run from the repo root or from build/ — either works.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
REPO = Path(__file__).resolve().parent.parent
|
REPO = Path(__file__).resolve().parent.parent
|
||||||
BUILD = REPO / "build"
|
BUILD = REPO / "build"
|
||||||
DIST = REPO / "dist"
|
DIST = REPO / "dist"
|
||||||
|
|
||||||
|
# Tesseract bundling. The runtime discovery code in
|
||||||
|
# ``src/pdf_extract.py`` looks for the binary at
|
||||||
|
# ``Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"`` and tessdata
|
||||||
|
# at ``... / "tesseract" / "tessdata" / "eng.traineddata"``. We stage
|
||||||
|
# everything under ``build/_tesseract/<platform>/`` (gitignored) and
|
||||||
|
# the PyInstaller spec adds that staging dir to ``datas=`` so it lands
|
||||||
|
# at the right place inside the frozen bundle.
|
||||||
|
TESSERACT_VERSION = "5.5.0"
|
||||||
|
TESSDATA_DIR = BUILD / "vendor" / "tessdata"
|
||||||
|
TESSDATA_URL = (
|
||||||
|
"https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata"
|
||||||
|
)
|
||||||
|
TESSERACT_STAGING = BUILD / "_tesseract"
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Output helpers — colourless so logs stay readable in any terminal/CI tail.
|
# Output helpers — colourless so logs stay readable in any terminal/CI tail.
|
||||||
@@ -192,6 +208,382 @@ def preflight(target: str) -> None:
|
|||||||
_ok("all prerequisites present")
|
_ok("all prerequisites present")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tesseract bundling — fetch the binary + tessdata at build time.
|
||||||
|
#
|
||||||
|
# We download (not vendor) because:
|
||||||
|
# * Binaries are large (5-40 MB per platform) and license-encumbered
|
||||||
|
# to keep current in git.
|
||||||
|
# * tessdata is Apache-2.0 and ~16 MB — fine to redistribute but
|
||||||
|
# bloats clones for contributors who don't touch OCR.
|
||||||
|
#
|
||||||
|
# Caching layout:
|
||||||
|
# build/_tesseract/win/tesseract.exe + DLLs
|
||||||
|
# build/_tesseract/mac/tesseract + dylibs
|
||||||
|
# build/_tesseract/linux/tesseract + libs
|
||||||
|
# build/vendor/tessdata/eng.traineddata (shared across platforms)
|
||||||
|
#
|
||||||
|
# The PyInstaller spec reads ``build/_tesseract/<platform>/`` and the
|
||||||
|
# tessdata dir, then bundles them under ``<bundle>/tesseract/``.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _download(url: str, dest: Path, *, expected_min_bytes: int = 1024) -> None:
    """Fetch *url* into *dest* by way of a temporary ``.part`` file.

    The payload is streamed to ``<dest>.part`` and only renamed onto
    *dest* after it passes the size check, so *dest* is never left
    half-written by this function.

    Args:
        url: Address to fetch.
        dest: Final file location; missing parent directories are created.
        expected_min_bytes: Smallest acceptable payload size in bytes.

    Raises:
        RuntimeError: The payload was smaller than *expected_min_bytes*.
        Exception: Any error raised during the transfer is logged and
            re-raised unchanged after the partial file is removed.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    partial = dest.with_suffix(dest.suffix + ".part")
    print(f" GET {url}", flush=True)
    try:
        with urllib.request.urlopen(url, timeout=120) as response:
            with open(partial, "wb") as sink:
                shutil.copyfileobj(response, sink)
    except Exception as e:  # noqa: BLE001 — bubble any network error up
        # Never leave a truncated .part file behind for the next run.
        if partial.exists():
            partial.unlink()
        _err(f"download failed: {url}\n {e}")
        raise

    size = partial.stat().st_size
    if size >= expected_min_bytes:
        partial.replace(dest)
        _ok(f"downloaded {dest.name} ({size / (1024 * 1024):.1f} MB)")
        return

    # Too small to be the real artifact: discard it and fail loudly.
    partial.unlink()
    raise RuntimeError(
        f"downloaded file too small ({size} bytes < {expected_min_bytes}); "
        f"the URL probably 404'd into an HTML error page."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tessdata() -> Path:
    """Ensure ``build/vendor/tessdata/eng.traineddata`` exists; return its path.

    Shared across platforms. Downloaded once and cached. The
    runtime expects this file at ``<bundle>/tesseract/tessdata/eng.traineddata``;
    the PyInstaller spec handles the placement.

    The same minimum size is applied to a cached file and to a fresh
    download, so a truncated file left in the cache is re-fetched
    instead of being bundled.

    Returns:
        Path to the cached (or freshly downloaded) ``eng.traineddata``.

    Raises:
        RuntimeError: Propagated from ``_download`` when the payload is
            smaller than the minimum size.
    """
    _step("fetch tessdata (eng.traineddata)")
    TESSDATA_DIR.mkdir(parents=True, exist_ok=True)
    target = TESSDATA_DIR / "eng.traineddata"
    # ~16 MB on disk for the "best" model. Allow some slack on the
    # min-bytes check (3 MB) so we still catch HTML 404 pages.
    min_bytes = 3 * 1024 * 1024
    if target.exists():
        cached_size = target.stat().st_size
        if cached_size >= min_bytes:
            _ok(f"already cached: {target.relative_to(REPO)} "
                f"({cached_size / (1024 * 1024):.1f} MB)")
            return target
        # A cached file below the download threshold cannot be the real
        # model; replace it rather than shipping it.
        _warn(
            f"cached {target.name} is only {cached_size} bytes "
            f"(< {min_bytes}); re-downloading"
        )
    _download(TESSDATA_URL, target, expected_min_bytes=min_bytes)
    return target
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_windows(staging: Path) -> None:
    """Stage tesseract.exe + DLLs into *staging*.

    Strategy (no easy stand-alone Windows tarball exists — UB-Mannheim
    ships the canonical Windows builds as Inno Setup installers):

    1. Download the installer .exe from the UB-Mannheim mirror
       (cached under ``build/_tesseract/`` between runs).
    2. Extract it with 7-Zip (which can read Inno Setup archives via
       the {app} group). 7-Zip is preinstalled on
       ``windows-latest`` GitHub Actions runners (`C:\\Program Files\\7-Zip\\7z.exe`).
    3. Copy every DLL, then tesseract.exe, from the extraction into
       ``staging/``. Tessdata is NOT taken from the installer; it is
       fetched separately by ``fetch_tessdata``.

    The DLL set tesseract.exe needs at runtime (per UB-Mannheim's
    Inno Setup script):
      libtesseract-5.dll, libleptonica-6.dll, libgomp-1.dll,
      libstdc++-6.dll, libwinpthread-1.dll, libgcc_s_seh-1.dll,
      liblz4.dll, libjpeg-8.dll, libpng16-16.dll, libtiff-6.dll,
      libwebp-7.dll, libwebpmux-3.dll, libopenjp2-7.dll, zlib1.dll
    The whole {app} tree from the installer is ~120 MB; we copy
    just the .exe + .dll files (~50 MB) since the runtime only
    needs the binary and its direct deps.

    Args:
        staging: Directory that receives tesseract.exe and the DLLs.

    Raises:
        FileNotFoundError: 7-Zip could not be located.
        RuntimeError: The extracted installer contained no tesseract.exe.
    """
    # UB-Mannheim posts builds under a versioned filename; the exact
    # build revision changes (5.5.0.20241111 at time of writing).
    # We pin a specific rev so reproducible builds don't drift.
    rev = "20241111"  # patch rev for tesseract 5.5.0 on the UB-Mannheim mirror
    fname = f"tesseract-ocr-w64-setup-{TESSERACT_VERSION}.{rev}.exe"
    url = f"https://digi.bib.uni-mannheim.de/tesseract/{fname}"

    cache = TESSERACT_STAGING / fname
    if not cache.exists():
        _download(url, cache, expected_min_bytes=20 * 1024 * 1024)

    # 7-Zip is preinstalled on windows-latest runners; on a dev box
    # the user installs it (choco install 7zip) or substitutes
    # innoextract. Prefer PATH, then fall back to the default install dir.
    default_7z = Path(r"C:\Program Files\7-Zip\7z.exe")
    sevenz = shutil.which("7z") or shutil.which("7z.exe")
    if not sevenz and default_7z.exists():
        sevenz = str(default_7z)
    if not sevenz:
        _err(
            "7-Zip not found. On Windows CI runners it's preinstalled; "
            "on a dev box install via ``choco install 7zip`` or extract "
            f"{cache} manually into {staging}/ and re-run with "
            "TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("7z")

    # Always extract into a fresh directory so stale files from an
    # earlier revision cannot leak into the staging dir.
    extract = TESSERACT_STAGING / "win_extract"
    if extract.exists():
        shutil.rmtree(extract)
    extract.mkdir(parents=True)
    _run([str(sevenz), "x", "-y", f"-o{extract}", str(cache)])

    staging.mkdir(parents=True, exist_ok=True)
    # The Inno Setup payload lands under ``{app}/`` inside the
    # extraction. Recursively grab the DLLs now and remember where
    # tesseract.exe is.
    exe_src: Path | None = None
    for root, _dirs, files in os.walk(extract):
        for name in files:
            src = Path(root) / name
            lowered = name.lower()
            if lowered == "tesseract.exe":
                exe_src = src
            elif lowered.endswith(".dll"):
                shutil.copy2(src, staging / name)
    if exe_src is None:
        raise RuntimeError(
            f"tesseract.exe not found inside extracted installer at {extract}"
        )
    # Copy the executable LAST: its presence is what the caller uses to
    # decide the staging dir is complete, so an interrupted run must
    # not leave an exe behind without its DLLs.
    shutil.copy2(exe_src, staging / "tesseract.exe")
    _ok(f"staged Windows tesseract into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_macos(staging: Path) -> None:
    """Stage tesseract + dylibs into *staging* on macOS.

    Strategy: use Homebrew. ``brew install tesseract`` is the
    sanctioned macOS path. We copy the binary plus every Homebrew
    dylib it links against (followed transitively) into the staging
    dir, then run ``install_name_tool`` on the binary AND on every
    copied dylib so all cross-references point at ``@loader_path/``
    and the set works after relocation into the .app bundle.

    Caveat: ``brew`` must be on PATH (it is on ``macos-latest``
    runners). If it isn't, we surface a helpful error rather than
    fail mysteriously.

    Args:
        staging: Directory that receives the binary and its dylibs.

    Raises:
        FileNotFoundError: Homebrew is not installed.
        RuntimeError: tesseract is not on PATH after install, or
            ``otool`` failed on the staged binary.
    """
    if not shutil.which("brew"):
        _err(
            "Homebrew not found. On macos-latest GitHub runners it's "
            "preinstalled; on a dev Mac install from https://brew.sh and "
            "re-run. Alternatively pre-stage tesseract into "
            f"{staging}/ and set TESSERACT_SKIP_FETCH=1."
        )
        raise FileNotFoundError("brew")

    # ``brew install`` is idempotent — fine to run on every build. We
    # don't pin the version through brew because brew tracks its own
    # taps. NOTE: the installed version is not currently checked
    # against TESSERACT_VERSION.
    _run(["brew", "install", "tesseract"])

    # Find the binary brew just installed.
    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("brew install tesseract succeeded but tesseract not on PATH")

    # Homebrew lives under /opt/homebrew/ (Apple Silicon) or
    # /usr/local/ (Intel). Anything else (/usr/lib/*, /System/*) is
    # Apple-shipped and present on every Mac, so it is left alone.
    brew_prefixes = ("/opt/homebrew/", "/usr/local/")

    def _brew_deps(macho: str) -> list[str]:
        """Return the Homebrew paths listed by ``otool -L`` for *macho*."""
        listing = subprocess.run(
            ["otool", "-L", macho], check=True, capture_output=True, text=True,
        )
        paths = []
        # The first output line is the file name header, not a dependency.
        for line in listing.stdout.splitlines()[1:]:
            path = line.strip().split(" ", 1)[0]
            if path.startswith(brew_prefixes):
                paths.append(path)
        return paths

    def _make_writable(path: Path) -> None:
        """Add the owner-write bit; copy2 preserves a read-only source mode."""
        path.chmod(path.stat().st_mode | 0o200)

    def _install_name_tool(args: list[str], target: Path) -> None:
        """Run install_name_tool on *target*, warning (not failing) on error."""
        try:
            subprocess.run(
                ["install_name_tool", *args, str(target)],
                check=True, capture_output=True, text=True,
            )
        except subprocess.CalledProcessError as exc:
            _warn(
                f"install_name_tool {args[0]} failed on {target.name}: "
                f"{exc.stderr.strip()}"
            )

    staging.mkdir(parents=True, exist_ok=True)
    bin_path = staging / "tesseract"
    shutil.copy2(tess_path, bin_path)
    _make_writable(bin_path)

    try:
        deps = _brew_deps(str(bin_path))
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"otool failed: {e.stderr}") from e

    # basename -> staged copy. Keyed by file name because everything is
    # flattened into one directory, so that is what must stay unique.
    staged: dict[str, Path] = {}

    def _copy_with_deps(libpath: str) -> None:
        """Stage *libpath* and, recursively, the Homebrew dylibs it needs."""
        name = Path(libpath).name
        if name in staged or not Path(libpath).exists():
            return
        dest = staging / name
        shutil.copy2(libpath, dest)
        _make_writable(dest)
        staged[name] = dest
        # Give the copy a relocatable install ID.
        _install_name_tool(["-id", f"@loader_path/{name}"], dest)
        try:
            sub_deps = _brew_deps(libpath)
        except subprocess.CalledProcessError as exc:
            _warn(f"otool failed on {libpath}: {exc.stderr.strip()}")
            return
        for sub_path in sub_deps:
            sub_name = Path(sub_path).name
            if sub_name == name:
                # otool lists a dylib's own install ID first; skip it.
                continue
            _copy_with_deps(sub_path)
            if sub_name in staged:
                # Point this dylib at the bundled copy of its dependency;
                # otherwise it would still load from the Homebrew prefix.
                _install_name_tool(
                    ["-change", sub_path, f"@loader_path/{sub_name}"], dest,
                )

    for dep in deps:
        _copy_with_deps(dep)

    # Rewrite the tesseract binary's references to point at
    # @loader_path/<dyname> so it can find its deps inside the bundle.
    for dep in deps:
        dep_name = Path(dep).name
        if dep_name not in staged:
            _warn(f"dependency {dep} was not staged; leaving reference untouched")
            continue
        _install_name_tool(["-change", dep, f"@loader_path/{dep_name}"], bin_path)

    # NOTE(review): install_name_tool edits invalidate existing code
    # signatures — confirm a later step re-signs these files (unsigned
    # or invalidly signed code will not run on Apple Silicon).
    _ok(f"staged macOS tesseract + {len(staged)} dylibs into {staging.relative_to(REPO)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_tesseract_linux(staging: Path) -> None:
    """Stage tesseract + .so files into *staging* on Linux.

    Strategy: use the ``tesseract`` already on PATH; when there is
    none and ``apt-get`` is available, run
    ``apt-get install tesseract-ocr libtesseract5`` first. Then copy
    the binary + every non-core .so that ``ldd`` resolves for it into
    staging. When ``patchelf`` is available, RPATH is set to
    ``$ORIGIN`` on the binary and on each staged library so the set
    is relocatable.

    Args:
        staging: Directory that receives the binary and its libraries.

    Raises:
        FileNotFoundError: Neither apt-get nor tesseract is available.
        RuntimeError: tesseract is still missing after install, or
            ``ldd`` failed on the staged binary.
    """
    if not shutil.which("apt-get") and not shutil.which("tesseract"):
        _err(
            "Neither apt-get nor a pre-installed tesseract found. On "
            "ubuntu-latest runners both are present. On other distros "
            "install tesseract-ocr via your package manager and re-run "
            "with TESSERACT_SKIP_FETCH=1 after pre-staging the binary."
        )
        raise FileNotFoundError("tesseract")

    if shutil.which("apt-get") and not shutil.which("tesseract"):
        _run(["sudo", "apt-get", "update"])
        _run(["sudo", "apt-get", "install", "-y", "tesseract-ocr", "libtesseract5"])

    tess_path = shutil.which("tesseract")
    if not tess_path:
        raise RuntimeError("apt-get install succeeded but tesseract not on PATH")

    staging.mkdir(parents=True, exist_ok=True)
    shutil.copy2(tess_path, staging / "tesseract")

    # Collect .so dependencies via ldd. Skip the dynamic linker and
    # libc/libpthread/libdl/libm/libstdc++/libgcc_s — those are
    # guaranteed to exist on every Linux target and shipping them can
    # cause GLIBC mismatch errors on older distros. The interesting
    # tesseract-specific deps are libtesseract, libleptonica, and the
    # image format libs (libpng, libjpeg, libtiff, libwebp, libgif).
    SKIP_PREFIXES = (
        "linux-vdso", "/lib64/ld-linux", "/lib/ld-linux",
        "libc.so", "libdl.so", "libpthread.so", "libm.so",
        "librt.so", "libnsl.so", "libutil.so",
        "libstdc++.so", "libgcc_s.so",
    )
    try:
        ldd = subprocess.run(
            ["ldd", str(staging / "tesseract")],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"ldd failed: {e.stderr}") from e

    staged_libs: list[Path] = []
    for line in ldd.stdout.splitlines():
        # Format: "    libfoo.so.N => /path/to/libfoo.so.N (0x...)"
        # Lines without "=>" (vdso, the dynamic linker) are ignored.
        parts = line.split("=>")
        if len(parts) != 2:
            continue
        soname = parts[0].strip()
        if soname.startswith(SKIP_PREFIXES):
            continue
        resolved = parts[1].strip()
        if resolved.startswith("not found"):
            # Surface unresolved deps instead of dropping them silently;
            # the bundle would be missing this library.
            _warn(f"ldd could not resolve {soname}; it will not be bundled")
            continue
        path_part = resolved.split(" ", 1)[0]
        if not path_part or not Path(path_part).exists():
            continue
        dest = staging / Path(path_part).name
        shutil.copy2(path_part, dest)
        staged_libs.append(dest)

    # patchelf is optional — if present, rewrite RPATH to $ORIGIN so
    # the binary finds its bundled .so files. The staged libraries get
    # the same treatment because a RUNPATH entry only applies to the
    # object that carries it, not to the dependencies of its
    # dependencies. If patchelf is absent, the bundle relies on
    # LD_LIBRARY_PATH being set at runtime.
    if shutil.which("patchelf"):
        for elf in [staging / "tesseract", *staged_libs]:
            try:
                _run(["patchelf", "--set-rpath", "$ORIGIN", str(elf)])
            except SystemExit:
                _warn(
                    f"patchelf rpath rewrite failed for {elf.name} — "
                    "relying on LD_LIBRARY_PATH at runtime"
                )

    _ok(
        f"staged Linux tesseract + {len(staged_libs)} .so files "
        f"into {staging.relative_to(REPO)}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_tesseract_for_platform(target: str) -> Path:
    """Populate ``build/_tesseract/<target>/`` with the Tesseract binary + libs.

    The returned staging directory is what the PyInstaller spec adds
    (together with tessdata) to ``datas=`` so that everything lands
    under ``<bundle>/tesseract/``, where the runtime discovery code
    looks for it.

    Setting ``TESSERACT_SKIP_FETCH=1`` bypasses every download/install
    step — useful when the binary was staged by hand (offline build,
    proxy, custom tesseract build, ...). The binary's presence is
    still verified and a missing one is reported before exiting.

    Args:
        target: One of ``"win"``, ``"mac"`` or ``"linux"``.

    Returns:
        The staging directory for *target*.
    """
    _step(f"fetch tesseract binary ({target})")
    staging = TESSERACT_STAGING / target
    exe_path = staging / ("tesseract.exe" if target == "win" else "tesseract")

    # Manual override: trust a pre-staged binary, but insist it exists.
    if os.environ.get("TESSERACT_SKIP_FETCH") == "1":
        if exe_path.exists():
            _ok(f"skipping fetch (TESSERACT_SKIP_FETCH=1); using {exe_path.relative_to(REPO)}")
            return staging
        _err(
            f"TESSERACT_SKIP_FETCH=1 but {exe_path} is missing. "
            "Pre-stage the binary + its libs into that dir, then re-run."
        )
        sys.exit(1)

    # An earlier run (or a restored CI cache) already staged the binary.
    if exe_path.exists():
        _ok(f"already staged: {exe_path.relative_to(REPO)}")
        return staging

    fetchers = {
        "win": _fetch_tesseract_windows,
        "mac": _fetch_tesseract_macos,
        "linux": _fetch_tesseract_linux,
    }
    fetcher = fetchers.get(target)
    if fetcher is None:
        _err(f"unknown target {target!r} for tesseract fetch")
        sys.exit(2)
    fetcher(staging)

    # The platform fetcher returned without raising; make sure it
    # actually produced the binary.
    if exe_path.exists():
        return staging
    _err(
        f"fetch step finished but {exe_path.relative_to(REPO)} is missing. "
        "Inspect the logs above; you may need to pre-stage the binary manually."
    )
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Build steps
|
# Build steps
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -202,7 +594,7 @@ def step_generate_icons() -> None:
|
|||||||
_run([sys.executable, str(BUILD / "generate_icons.py")])
|
_run([sys.executable, str(BUILD / "generate_icons.py")])
|
||||||
|
|
||||||
|
|
||||||
def step_pyinstaller(clean: bool) -> None:
|
def step_pyinstaller(clean: bool, *, target: str | None = None) -> None:
|
||||||
_step("pyinstaller bundle")
|
_step("pyinstaller bundle")
|
||||||
# Use ``python -m PyInstaller`` so we don't depend on the binary
|
# Use ``python -m PyInstaller`` so we don't depend on the binary
|
||||||
# being on PATH (Windows users frequently see this — pip's
|
# being on PATH (Windows users frequently see this — pip's
|
||||||
@@ -212,7 +604,14 @@ def step_pyinstaller(clean: bool) -> None:
|
|||||||
"--noconfirm"]
|
"--noconfirm"]
|
||||||
if clean:
|
if clean:
|
||||||
cmd.append("--clean")
|
cmd.append("--clean")
|
||||||
_run(cmd)
|
# The spec reads ``DATATOOLS_TESS_STAGING`` to find the per-platform
|
||||||
|
# tesseract staging dir. Passing it via env keeps the spec file
|
||||||
|
# platform-agnostic — the spec doesn't need to detect win/mac/linux
|
||||||
|
# itself; the orchestrator already did.
|
||||||
|
env = os.environ.copy()
|
||||||
|
if target:
|
||||||
|
env["DATATOOLS_TESS_STAGING"] = str(TESSERACT_STAGING / target)
|
||||||
|
_run(cmd, env=env)
|
||||||
|
|
||||||
|
|
||||||
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
|
def step_package_win(version: str, do_installer: bool, do_portable: bool) -> list[Path]:
|
||||||
@@ -331,7 +730,17 @@ def main() -> int:
|
|||||||
shutil.rmtree(DIST)
|
shutil.rmtree(DIST)
|
||||||
|
|
||||||
step_generate_icons()
|
step_generate_icons()
|
||||||
step_pyinstaller(clean=args.clean)
|
|
||||||
|
# Stage Tesseract OCR before PyInstaller runs. The spec reads
|
||||||
|
# ``build/_tesseract/<target>/`` + ``build/vendor/tessdata/`` and
|
||||||
|
# bundles them under ``<bundle>/tesseract/`` so the runtime
|
||||||
|
# discovery in src/pdf_extract.py finds them at:
|
||||||
|
# Path(sys._MEIPASS) / "tesseract" / "tesseract[.exe]"
|
||||||
|
# Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"
|
||||||
|
fetch_tessdata()
|
||||||
|
fetch_tesseract_for_platform(target)
|
||||||
|
|
||||||
|
step_pyinstaller(clean=args.clean, target=target)
|
||||||
|
|
||||||
if target == "win":
|
if target == "win":
|
||||||
outputs = step_package_win(version, do_installer, do_portable)
|
outputs = step_package_win(version, do_installer, do_portable)
|
||||||
|
|||||||
62
build/vendor/README.md
vendored
Normal file
62
build/vendor/README.md
vendored
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
# build/vendor/ — third-party bundle inputs (fetched at build time)
|
||||||
|
|
||||||
|
This tree holds the third-party assets that get bundled into the
|
||||||
|
PyInstaller artifacts but that we deliberately do **not** keep in git
|
||||||
|
(too large / license-encumbered / re-fetchable on demand).
|
||||||
|
|
||||||
|
The build pipeline (`build/make_release.py`) populates everything in
|
||||||
|
here before the PyInstaller step. The contents are git-ignored except
|
||||||
|
for this README.
|
||||||
|
|
||||||
|
## tessdata/
|
||||||
|
|
||||||
|
Holds the Tesseract language data file(s) used by the PDF Extractor
|
||||||
|
OCR fallback. Only English is bundled today.
|
||||||
|
|
||||||
|
### Canonical source
|
||||||
|
|
||||||
|
We use the **"best" model** from `tesseract-ocr/tessdata_best` (LSTM,
|
||||||
|
slower but higher accuracy than the legacy `tessdata` set, and only
|
||||||
|
~12 MB compressed → ~16 MB uncompressed):
|
||||||
|
|
||||||
|
```
|
||||||
|
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
There is also `tessdata_fast/` (~4 MB, lower accuracy) if you ever
|
||||||
|
want to optimise for bundle size over recognition quality. For bank
|
||||||
|
statements (the only OCR use case so far), the extra accuracy of the
|
||||||
|
`_best` model is worth the 10 MB.
|
||||||
|
|
||||||
|
### Why we don't vendor it in git
|
||||||
|
|
||||||
|
* ~16 MB binary file — bloats clone times for everyone, including
|
||||||
|
contributors who never touch the OCR code path.
|
||||||
|
* Apache-2.0-licensed and stable; the file rarely changes upstream
|
||||||
|
(last touched 2021), so a build-time fetch is safe.
|
||||||
|
* The Tesseract project explicitly distributes these via GitHub
|
||||||
|
raw URLs — they're meant to be downloaded, not redistributed
|
||||||
|
through other repos.
|
||||||
|
|
||||||
|
### How it gets populated
|
||||||
|
|
||||||
|
`build/make_release.py::fetch_tessdata()` checks for
|
||||||
|
`build/vendor/tessdata/eng.traineddata` on every run. If it's
|
||||||
|
missing, the script downloads it from the canonical URL above and
|
||||||
|
caches it here. Subsequent builds reuse the cached file.
|
||||||
|
|
||||||
|
On CI, the directory is restored from the GitHub Actions cache so we
|
||||||
|
don't pay the download cost on every run (`.github/workflows/build.yml`
|
||||||
|
caches `build/vendor/tessdata/` keyed on the URL above).
|
||||||
|
|
||||||
|
## Manual one-time fetch (if you're offline or behind a proxy)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p build/vendor/tessdata
|
||||||
|
curl -L -o build/vendor/tessdata/eng.traineddata \
|
||||||
|
https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
||||||
|
```
|
||||||
|
|
||||||
|
Verify the file is non-empty and starts with the magic bytes
|
||||||
|
`b"\x00\x00\x00\x00"` followed by a header that `pytesseract` can
|
||||||
|
read; the script does a basic sanity check after download.
|
||||||
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
0
build/vendor/tessdata/.gitkeep
vendored
Normal file
@@ -296,6 +296,37 @@ GUI / CLI handlers: use `format_for_user(exc, context="...")` to render.
|
|||||||
|
|
||||||
All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so existing handlers still catch them.
|
All `DataToolsError` subclasses extend stdlib `ValueError` or `OSError` so existing handlers still catch them.
|
||||||
|
|
||||||
|
## PDF Extractor — bundled Tesseract
|
||||||
|
|
||||||
|
Frozen builds (installer / portable .zip / AppImage) ship Tesseract OCR inside the bundle so scanned PDFs work without a separate system install. Source / `pip` developer environments still resolve Tesseract from `PATH`.
|
||||||
|
|
||||||
|
**Runtime layout (frozen bundles)**:
|
||||||
|
|
||||||
|
| Resource | Path |
|
||||||
|
|---|---|
|
||||||
|
| Tesseract binary | `Path(sys._MEIPASS) / "tesseract" / "tesseract"` (Linux/macOS), `…/tesseract/tesseract.exe` (Windows) |
|
||||||
|
| Tessdata directory | `Path(sys._MEIPASS) / "tesseract" / "tessdata"` |
|
||||||
|
| English model | `Path(sys._MEIPASS) / "tesseract" / "tessdata" / "eng.traineddata"` |
|
||||||
|
|
||||||
|
**Discovery order** (PDF Extractor runtime):
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var (override — explicit path to a `tesseract` binary).
|
||||||
|
2. Bundled path under `sys._MEIPASS` (frozen bundles only — falls through to step 3 otherwise).
|
||||||
|
3. `tesseract` on `PATH` (developer setups, source checkouts).
|
||||||
|
4. Windows well-known locations (`C:\Program Files\Tesseract-OCR\tesseract.exe`, etc.).
|
||||||
|
|
||||||
|
**Where the bytes come from**:
|
||||||
|
|
||||||
|
- **Tessdata** is fetched at build time into `build/vendor/tessdata/eng.traineddata` (git-ignored, not committed) — the "best" English model from [tessdata_best](https://github.com/tesseract-ocr/tessdata_best). PyInstaller's spec copies it into `tesseract/tessdata/` inside the bundle.
|
||||||
|
- **Tesseract binary** is fetched at build time by `build/make_release.py` — per-platform download URLs are pinned in that script. The current pin is **Tesseract 5.5.0**.
|
||||||
|
|
||||||
|
**To update Tesseract**:
|
||||||
|
|
||||||
|
1. Bump the version pin + the per-platform fetch URLs in `build/make_release.py`.
|
||||||
|
2. If upstream changed the `eng.traineddata` schema, refresh `build/vendor/tessdata/eng.traineddata` from `tessdata_best` at the matching tag.
|
||||||
|
3. Rebuild on each platform (`python build/make_release.py`) and smoke-test a scanned-PDF run through the PDF Extractor before tagging the release.
|
||||||
|
4. Update `LICENSE_TESSERACT.txt` at the repo root if the upstream license terms change (Tesseract is Apache-2.0 today).
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -122,6 +122,17 @@ Tag a release → 3 platform artifacts upload to GitHub Releases. Manual: copy t
|
|||||||
|
|
||||||
`demo/streamlit_app.py` → Streamlit Community Cloud. Configure deployment in Streamlit UI. Custom domain via CNAME (verify policy at deploy time). Fall back to $5/mo VPS if rate limits / branding constraints hit.
|
`demo/streamlit_app.py` → Streamlit Community Cloud. Configure deployment in Streamlit UI. Custom domain via CNAME (verify policy at deploy time). Fall back to $5/mo VPS if rate limits / branding constraints hit.
|
||||||
|
|
||||||
|
### 3.10 Bundled Tesseract (PDF Extractor OCR)
|
||||||
|
|
||||||
|
Frozen builds ship Tesseract 5.5 + `eng.traineddata` inside the PyInstaller bundle so scanned PDFs work without a separate install. Per-platform binary URLs pinned in `build/make_release.py`; tessdata vendored at `build/vendor/tessdata/eng.traineddata`. License attribution in `LICENSE_TESSERACT.txt` at the repo root.
|
||||||
|
|
||||||
|
**Discovery order at runtime** (see `docs/DEVELOPER.md` for the full Path layout):
|
||||||
|
|
||||||
|
1. `DATATOOLS_TESSERACT_BIN` env var override.
|
||||||
|
2. Bundled path under `sys._MEIPASS / "tesseract" /` (frozen bundles only).
|
||||||
|
3. `tesseract` on `PATH` (source / pip developer environments).
|
||||||
|
4. Windows well-known locations.
|
||||||
|
|
||||||
## 4. Libraries
|
## 4. Libraries
|
||||||
|
|
||||||
| Purpose | Library |
|
| Purpose | Library |
|
||||||
|
|||||||
@@ -103,7 +103,9 @@ La ventana del lanzador queda abierta en segundo plano. Cerrarla detiene el serv
|
|||||||
|
|
||||||
- Windows 10/11 (64 bits), macOS 11+, Linux moderno (2020+).
|
- Windows 10/11 (64 bits), macOS 11+, Linux moderno (2020+).
|
||||||
- Navegador moderno (Chrome, Edge, Firefox, Safari, últimos 3 años).
|
- Navegador moderno (Chrome, Edge, Firefox, Safari, últimos 3 años).
|
||||||
- ~400 MB de espacio libre en disco (el paquete ocupa ~200 MB; el resto es espacio de trabajo para CSV grandes).
|
- ~500 MB de espacio libre en disco (el paquete ocupa ~300 MB; el resto es espacio de trabajo para CSV grandes).
|
||||||
|
|
||||||
|
**OCR para PDFs escaneados viene incluido** — Tesseract 5.5 y el modelo en inglés `eng.traineddata` vienen dentro de cada instalador / portable / AppImage. La ruta de extracción de PDFs escaneados del Extractor de PDF funciona sin configuración adicional; no hace falta instalar nada por separado. (Quien ejecute desde un checkout con `pip install -r requirements.txt` sigue necesitando Tesseract del sistema en el `PATH` — ver [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract) (solo en inglés).)
|
||||||
|
|
||||||
Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés).
|
Matriz de soporte completa: [REQUIREMENTS.md](REQUIREMENTS.md) (solo en inglés).
|
||||||
|
|
||||||
|
|||||||
@@ -103,7 +103,9 @@ The launcher window stays open in the background. Closing it stops the server
|
|||||||
|
|
||||||
- Windows 10/11 (64-bit), macOS 11+, modern Linux (2020+).
|
- Windows 10/11 (64-bit), macOS 11+, modern Linux (2020+).
|
||||||
- Modern browser (Chrome, Edge, Firefox, Safari, last 3 years).
|
- Modern browser (Chrome, Edge, Firefox, Safari, last 3 years).
|
||||||
- ~400 MB free disk space (the bundle itself is ~200 MB; the rest is working scratch space for large CSVs).
|
- ~500 MB free disk space (the bundle itself is ~300 MB; the rest is working scratch space for large CSVs).
|
||||||
|
|
||||||
|
**OCR for scanned PDFs is bundled** — Tesseract 5.5 + the English `eng.traineddata` model ship inside every installer / portable / AppImage. The PDF Extractor's scanned-statement path works out of the box; no separate install required. (Developers running from a `pip install -r requirements.txt` checkout still need system Tesseract on `PATH` — see [DEVELOPER.md §PDF Extractor — bundled Tesseract](DEVELOPER.md#pdf-extractor--bundled-tesseract).)
|
||||||
|
|
||||||
Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
Full numbered support matrix: [REQUIREMENTS.md](REQUIREMENTS.md).
|
||||||
|
|
||||||
|
|||||||
187
layout-review/01_deduplicator.html
Normal file
187
layout-review/01_deduplicator.html
Normal file
@@ -0,0 +1,187 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Find Duplicates</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="01_deduplicator">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Find Duplicates</strong>, shown with a file imported and a completed run (results + match-group review). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Find Duplicates</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Find rows that repeat, then keep one and remove the extras.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding & delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">customers_export.csv</span>
|
||||||
|
<span class="size">2.1 MB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Delimiter selector (CSV) -->
|
||||||
|
<div class="dt-field" style="max-width:320px">
|
||||||
|
<label class="dt-label">Delimiter</label>
|
||||||
|
<div class="dt-select">Comma (,)</div>
|
||||||
|
<div class="dt-help-text">Auto-detected on upload. Change if the preview looks wrong.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>jane doe</td><td>JANE@ACME.IO</td><td>austin</td><td>(512) 555-0190</td><td>01/04/2024</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R. Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Options expander -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<details class="dt-expander" style="margin-top:0">
|
||||||
|
<summary>Advanced Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Match on columns</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Leave empty to auto-detect</span></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Strong keys</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">email <span class="x">✕</span></span></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Fuzzy columns</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">name <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Fuzzy algorithm</label><div class="dt-select">jaro_winkler</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Similarity threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:70%"></div><div class="knob" style="left:70%"></div></div><div class="val">85</div></div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Survivor rule</label><div class="dt-select">most-complete</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on" style="margin-top:6px"><span class="box"><span class="dt-mi">check</span></span> Merge mode — fill missing fields in the surviving row</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Find Duplicates</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Original rows</div><div class="value">18,442</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Duplicate rows</div><div class="value">312</div><div class="delta down">−312 removed</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Match groups</div><div class="value">147</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Rows kept</div><div class="value">18,130</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-btn-row" style="max-width:560px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download deduplicated CSV</button>
|
||||||
|
<button class="dt-btn">Download removed rows</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Match groups -->
|
||||||
|
<h2>Match Groups</h2>
|
||||||
|
<div class="dt-cols-3" style="max-width:520px">
|
||||||
|
<button class="dt-btn">Accept All</button>
|
||||||
|
<button class="dt-btn">Reject All</button>
|
||||||
|
<button class="dt-btn">Clear Decisions</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Match group card 1 -->
|
||||||
|
<div class="dt-match-card">
|
||||||
|
<div class="dt-match-head">
|
||||||
|
<span class="title">Group 1 · 2 rows</span>
|
||||||
|
<span class="conf"><span class="dt-count-pill success">98% match</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-match-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>keep</th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="dt-keep-row"><td><span class="dt-keep-tag">keep</span></td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td><span class="dt-caption">remove</span></td><td class="dt-cell-flag">jane doe</td><td class="dt-cell-flag">JANE@ACME.IO</td><td class="dt-cell-flag">austin</td><td class="dt-cell-flag">(512) 555-0190</td><td class="dt-cell-flag">01/04/2024</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Differing columns highlighted. The survivor row is kept; uncheck rows to split the group.</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Match group card 2 -->
|
||||||
|
<div class="dt-match-card">
|
||||||
|
<div class="dt-match-head">
|
||||||
|
<span class="title">Group 2 · 2 rows</span>
|
||||||
|
<span class="conf"><span class="dt-count-pill warn">87% match</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-match-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>keep</th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr class="dt-keep-row"><td><span class="dt-keep-tag">keep</span></td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td><span class="dt-caption">remove</span></td><td class="dt-cell-flag">R. Smith</td><td>bob@globex.com</td><td>Denver</td><td>720-555-7781</td><td>2024-02-11</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption" style="margin-top:14px">Decisions: 1 merged, 1 pending</p>
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block" style="margin-top:8px">Apply Review Decisions & Download</button>
|
||||||
|
|
||||||
|
<!-- Processing log -->
|
||||||
|
<details class="dt-expander" style="margin-top:18px">
|
||||||
|
<summary>Processing Log</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-code">[00:00.01] Loaded 18,442 rows from customers_export.csv
|
||||||
|
[00:00.04] Strategy: exact(email) + fuzzy(name, jaro_winkler ≥ 85)
|
||||||
|
[00:00.91] Compared 18,442 rows → 147 match groups
|
||||||
|
[00:01.02] Survivor rule: most-complete · merge=on
|
||||||
|
[00:01.05] 312 rows flagged for removal</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
208
layout-review/02_text_cleaner.html
Normal file
208
layout-review/02_text_cleaner.html
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Clean Text</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
<style>
|
||||||
|
/* Hidden-character badges — mirrors src/core/text_clean.py:hidden_char_css(),
|
||||||
|
not part of app.css so reproduced inline against the same palette. */
|
||||||
|
.hidden-char { display: inline-block; padding: 0 2px; margin: 0 1px; border-radius: 3px; font-family: var(--font-mono); font-size: 0.85em; cursor: help; }
|
||||||
|
.hidden-char.hidden-whitespace { background: #fff3cd; color: #856404; border: 1px solid #ffeaa7; }
|
||||||
|
.hidden-char.hidden-special { background: #d1ecf1; color: #0c5460; border: 1px solid #bee5eb; }
|
||||||
|
.hidden-char.hidden-control { background: #f8d7da; color: #721c24; border: 1px solid #f5c6cb; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body data-page="02_text_cleaner">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Clean Text</strong>, shown with a file imported and a completed run (results metrics, changes-by-column, before/after examples, cleaned preview, downloads). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Clean Text</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Trim extra spaces and strip out odd characters.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">contacts_messy.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: contacts_messy.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">4,120 rows, 4 columns</p>
|
||||||
|
<div class="dt-check on" style="margin-top:2px"><span class="box"><span class="dt-mi">check</span></span> Show hidden characters in preview</div>
|
||||||
|
<div class="dt-table-wrap" style="margin-top:8px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>company</th><th>notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td><span class="hidden-char hidden-whitespace" title="U+0020 SP LEAD">·</span>Jane Doe<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>jane@acme.io</td><td>Acme<span class="hidden-char hidden-whitespace" title="U+00A0 NBSP">·</span>Inc.</td><td>VIP<span class="hidden-char hidden-special" title="U+201D RIGHT DOUBLE QUOTE">”</span></td></tr>
|
||||||
|
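<tr><td class="idx">1</td><td>Bob<span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span><span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span>Smith</td><td>bob@globex.com<span class="hidden-char hidden-special" title="U+200B ZWSP">∅</span></td><td>Globex</td><td>—<span class="hidden-char hidden-control" title="U+0007 CTRL">␣</span></td></tr>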
|
||||||
|
<tr><td class="idx">2</td><td>Ana López</td><td>ana@initech.com</td><td>Initech<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>follow up</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td><span class="hidden-char hidden-whitespace" title="U+0009 TAB">→</span>Wei Chen</td><td>WEI@umbrella.co</td><td>Umbrella</td><td>“key<span class="hidden-char hidden-special" title="U+2014 EM DASH">—</span>account”</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> excel-hygiene (recommended)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> minimal</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> paranoid</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">excel-hygiene: trim, collapse whitespace, fold smart quotes, strip invisible chars, normalize line endings, NFC.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Trim leading/trailing whitespace</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Collapse internal whitespace</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Normalize line endings (\r\n → \n)</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip control characters</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip BOM</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Fold smart characters (curly quotes, em-dash, NBSP)</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Strip zero-width / invisible characters</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Unicode NFC normalization</div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Unicode NFKC compat fold (lossy: ① → 1, ﬁ → fi)</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Scope</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to clean (default: all string columns)</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">name <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">email <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">company <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">notes <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to skip even if they look like text</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Choose columns to leave untouched</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Case conversion</h4>
|
||||||
|
<div class="dt-field" style="max-width:360px">
|
||||||
|
<label class="dt-label">Apply case conversion to selected columns</label>
|
||||||
|
<div class="dt-select">None</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Clean Text</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Cells scanned</div><div class="value">16,480</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells changed</div><div class="value">3,947</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% changed</div><div class="value">24.0%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Columns processed</div><div class="value">4</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Show hidden characters (NBSP, ZWSP, smart quotes, control chars…)</div>
|
||||||
|
|
||||||
|
<h4>Changes by column</h4>
|
||||||
|
<div class="dt-table-wrap" style="max-width:360px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>cells_changed</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">company</td><td>1,604</td></tr>
|
||||||
|
<tr><td class="idx">name</td><td>1,210</td></tr>
|
||||||
|
<tr><td class="idx">notes</td><td>982</td></tr>
|
||||||
|
<tr><td class="idx">email</td><td>151</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Examples (first 25 changes)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Row</th><th>Column</th><th>Before</th><th>After</th><th>Ops applied</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>1</td><td>name</td><td><span class="hidden-char hidden-whitespace" title="U+0020 SP LEAD">·</span>Jane Doe<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>Jane Doe</td><td>trim</td></tr>
|
||||||
|
<tr><td>1</td><td>company</td><td>Acme<span class="hidden-char hidden-whitespace" title="U+00A0 NBSP">·</span>Inc.</td><td>Acme Inc.</td><td>fold_smart</td></tr>
|
||||||
|
<tr><td>1</td><td>notes</td><td>VIP<span class="hidden-char hidden-special" title="U+201D RIGHT DOUBLE QUOTE">”</span></td><td>VIP"</td><td>fold_smart</td></tr>
|
||||||
|
<tr><td>2</td><td>name</td><td>Bob<span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span><span class="hidden-char hidden-whitespace" title="U+0020 SP">·</span>Smith</td><td>Bob Smith</td><td>collapse_ws</td></tr>
|
||||||
|
<tr><td>2</td><td>email</td><td>bob@globex.com<span class="hidden-char hidden-special" title="U+200B ZWSP">∅</span></td><td>bob@globex.com</td><td>strip_zero_width</td></tr>
|
||||||
|
<tr><td>2</td><td>notes</td><td>—<span class="hidden-char hidden-control" title="U+0007 CTRL">␣</span></td><td>—</td><td>strip_control</td></tr>
|
||||||
|
<tr><td>3</td><td>company</td><td>Initech<span class="hidden-char hidden-whitespace" title="U+0020 SP TRAIL">·</span></td><td>Initech</td><td>trim</td></tr>
|
||||||
|
<tr><td>4</td><td>name</td><td><span class="hidden-char hidden-whitespace" title="U+0009 TAB">→</span>Wei Chen</td><td>Wei Chen</td><td>trim</td></tr>
|
||||||
|
<tr><td>4</td><td>notes</td><td>“key<span class="hidden-char hidden-special" title="U+2014 EM DASH">—</span>account”</td><td>"key-account"</td><td>fold_smart, nfc</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Cleaned preview (first 10 rows)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>company</th><th>notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td class="dt-cell-add">Jane Doe</td><td>jane@acme.io</td><td class="dt-cell-add">Acme Inc.</td><td class="dt-cell-add">VIP"</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td class="dt-cell-add">Bob Smith</td><td class="dt-cell-add">bob@globex.com</td><td>Globex</td><td class="dt-cell-add">—</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Ana López</td><td>ana@initech.com</td><td class="dt-cell-add">Initech</td><td>follow up</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td class="dt-cell-add">Wei Chen</td><td>WEI@umbrella.co</td><td>Umbrella</td><td class="dt-cell-add">"key-account"</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Changed cells highlighted. Toggle “Show hidden characters” to inspect the invisibles being removed.</p>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download cleaned CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
224
layout-review/03_format_standardizer.html
Normal file
224
layout-review/03_format_standardizer.html
Normal file
@@ -0,0 +1,224 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Standardize Formats</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="03_format_standardizer">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Standardize Formats</strong>, shown with a file imported from the upload screen and a completed run (results + changes audit + standardized preview). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Standardize Formats</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Make dates, phones, currency, and names look the same throughout.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- File pickup banner (using file from upload screen) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">description</span>
|
||||||
|
<span>Using <strong>customers_export.csv</strong> from the upload screen.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn" style="margin-bottom:4px">Use a different file</button>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>phone</th><th>amount</th><th>signup_date</th><th>active</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>jane DOE</td><td>(512) 555-0190</td><td>$1,234.5</td><td>01/04/2024</td><td>Y</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>bob smith</td><td>720.555.7781</td><td>$99</td><td>2024-2-11</td><td>yes</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>ALICIA REYES</td><td>+1 415 555 2233</td><td>$45,000</td><td>Mar 3, 2024</td><td>n</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>m. okafor</td><td>2125550148</td><td>$7.999</td><td>2024/04/22</td><td>true</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (collapsed after run; opened here to show the most informative content) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<h3 style="margin-top:0">Column types</h3>
|
||||||
|
<p class="dt-caption">Assign each column to a field type. Auto-detected suggestions are pre-filled; pick <strong>(skip)</strong> to leave a column untouched.</p>
|
||||||
|
|
||||||
|
<!-- Per-column type selectboxes, 3 per row -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">full_name</label><div class="dt-select">Name</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">phone</label><div class="dt-select">Phone</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">amount</label><div class="dt-select">Currency</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">signup_date</label><div class="dt-select">Date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">active</label><div class="dt-select">Boolean</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">notes</label><div class="dt-select">(skip)</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<h3>Format options</h3>
|
||||||
|
|
||||||
|
<!-- Standards preset radio (vertical) -->
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Standards preset</label>
|
||||||
|
<div style="display:flex;flex-direction:column;gap:8px;margin-top:4px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> US (default) — ISO 8601 dates · E.164 phones · USD</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> European — DMY input · INTL phones · EUR comma decimal</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> UK — DD/MM/YYYY · GB phones · Yes/No booleans</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> ISO Strict — ISO 8601 · bare-number currency · true/false</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Legacy US — MM/DD/YYYY · National phones · Yes/No</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Custom — keep current settings</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">Pick a published standard or regional convention as the baseline. Every option below is still individually overridable.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Two-column format options -->
|
||||||
|
<div class="dt-cols-2" style="margin-top:14px">
|
||||||
|
<!-- Left column: Dates + Phones -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0"><strong>Dates</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output format</label><div class="dt-select">YYYY-MM-DD (ISO)</div></div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Ambiguous input order (e.g. 01/02/2024)</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> MDY (US)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> DMY (EU)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4><strong>Phones</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output format</label><div class="dt-select">E.164 (+15551234567)</div></div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Default region (ISO-2)</label>
|
||||||
|
<div class="dt-input">US</div>
|
||||||
|
<div class="dt-help-text">Region used when the input has no country code. US, GB, DE, etc.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Right column: Currency + Names + Booleans -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0"><strong>Currency</strong></h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Decimal separator in input</label>
|
||||||
|
<div class="dt-radio-row">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> dot (1,234.56)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> comma (1.234,56)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field" style="max-width:200px"><label class="dt-label">Round to decimals</label><div class="dt-input">2</div></div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Preserve original precision (don't round)</div>
|
||||||
|
<div class="dt-check"><span class="box"></span> Preserve currency code (emit <code>USD 1234.56</code>, <code>EUR 99.00</code>, etc.)</div>
|
||||||
|
|
||||||
|
<h4><strong>Names</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Casing</label><div class="dt-select">Title Case</div></div>
|
||||||
|
|
||||||
|
<h4><strong>Booleans</strong></h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Output style</label><div class="dt-select">True/False</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Standardize Formats</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Cells scanned</div><div class="value">92,210</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells changed</div><div class="value">61,838</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% changed</div><div class="value">67.1%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unparseable</div><div class="value">47</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>47 cell(s) in typed columns didn't match a recognizable shape and were left as-is. Check the changes audit below to find them, or re-classify the column to <strong>(skip)</strong>.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Changes by column -->
|
||||||
|
<p style="margin-bottom:6px"><strong>Changes by column</strong></p>
|
||||||
|
<div class="dt-table-wrap" style="max-width:520px">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>field_type</th><th>cells_changed</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>amount</td><td>currency</td><td>17,902</td></tr>
|
||||||
|
<tr><td>full_name</td><td>name</td><td>16,041</td></tr>
|
||||||
|
<tr><td>phone</td><td>phone</td><td>14,388</td></tr>
|
||||||
|
<tr><td>signup_date</td><td>date</td><td>11,205</td></tr>
|
||||||
|
<tr><td>active</td><td>boolean</td><td>2,302</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Examples (first 25 changes) -->
|
||||||
|
<p style="margin:14px 0 6px"><strong>Examples (first 25 changes)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>row</th><th>column</th><th>field_type</th><th>before</th><th>after</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>1</td><td>full_name</td><td>name</td><td class="dt-cell-del">jane DOE</td><td class="dt-cell-add">Jane Doe</td></tr>
|
||||||
|
<tr><td>1</td><td>phone</td><td>phone</td><td class="dt-cell-del">(512) 555-0190</td><td class="dt-cell-add">+15125550190</td></tr>
|
||||||
|
<tr><td>1</td><td>amount</td><td>currency</td><td class="dt-cell-del">$1,234.5</td><td class="dt-cell-add">1234.50</td></tr>
|
||||||
|
<tr><td>1</td><td>signup_date</td><td>date</td><td class="dt-cell-del">01/04/2024</td><td class="dt-cell-add">2024-01-04</td></tr>
|
||||||
|
<tr><td>1</td><td>active</td><td>boolean</td><td class="dt-cell-del">Y</td><td class="dt-cell-add">True</td></tr>
|
||||||
|
<tr><td>2</td><td>full_name</td><td>name</td><td class="dt-cell-del">bob smith</td><td class="dt-cell-add">Bob Smith</td></tr>
|
||||||
|
<tr><td>2</td><td>phone</td><td>phone</td><td class="dt-cell-del">720.555.7781</td><td class="dt-cell-add">+17205557781</td></tr>
|
||||||
|
<tr><td>2</td><td>signup_date</td><td>date</td><td class="dt-cell-del">2024-2-11</td><td class="dt-cell-add">2024-02-11</td></tr>
|
||||||
|
<tr><td>3</td><td>signup_date</td><td>date</td><td class="dt-cell-del">Mar 3, 2024</td><td class="dt-cell-add">2024-03-03</td></tr>
|
||||||
|
<tr><td>4</td><td>amount</td><td>currency</td><td class="dt-cell-del">$7.999</td><td class="dt-cell-add">8.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Standardized preview -->
|
||||||
|
<p style="margin:14px 0 6px"><strong>Standardized preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>phone</th><th>amount</th><th>signup_date</th><th>active</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>+15125550190</td><td>1234.50</td><td>2024-01-04</td><td>True</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>+17205557781</td><td>99.00</td><td>2024-02-11</td><td>True</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Alicia Reyes</td><td>+14155552233</td><td>45000.00</td><td>2024-03-03</td><td>False</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>M. Okafor</td><td>+12125550148</td><td>8.00</td><td>2024-04-22</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download standardized CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
271
layout-review/04_missing_handler.html
Normal file
271
layout-review/04_missing_handler.html
Normal file
@@ -0,0 +1,271 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Fix Missing Values</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="04_missing_handler">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Fix Missing Values</strong>, shown with a file imported and a completed run (per-column missingness profile + before/after results). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Fix Missing Values</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Find blank cells (even hidden ones) and fill them in or remove them.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<p class="dt-caption">Tip: files imported on the Home screen are picked up here automatically.</p>
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">survey_responses.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: survey_responses.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">2,150 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>respondent_id</th><th>age</th><th>region</th><th>income</th><th>satisfaction</th><th>comments</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>R-1001</td><td>34</td><td>West</td><td>52000</td><td>4</td><td>great service</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>R-1002</td><td class="dt-cell-flag">N/A</td><td>East</td><td class="dt-cell-flag"></td><td>3</td><td class="dt-cell-flag">?</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>R-1003</td><td>41</td><td class="dt-cell-flag">-</td><td>61000</td><td class="dt-cell-flag">NULL</td><td>none</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R-1004</td><td>29</td><td>South</td><td class="dt-cell-flag">N/A</td><td>5</td><td>quick</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (Missingness profile + Strategy) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<h3>Missingness profile</h3>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Rows</div><div class="value">2,150</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells missing</div><div class="value">1,043</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">% cells missing</div><div class="value">8.1%</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Complete rows</div><div class="value">1,388</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>dtype</th><th>missing</th><th>missing_pct</th><th>disguised</th><th>has_missing</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>respondent_id</td><td>object</td><td>0</td><td>0.0%</td><td>0</td><td>False</td></tr>
|
||||||
|
<tr><td>age</td><td>float64</td><td>187</td><td>8.7%</td><td>61</td><td>True</td></tr>
|
||||||
|
<tr><td>region</td><td>object</td><td>142</td><td>6.6%</td><td>142</td><td>True</td></tr>
|
||||||
|
<tr><td>income</td><td>float64</td><td>329</td><td>15.3%</td><td>118</td><td>True</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td>float64</td><td>95</td><td>4.4%</td><td>40</td><td>True</td></tr>
|
||||||
|
<tr><td>comments</td><td>object</td><td>290</td><td>13.5%</td><td>290</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<h3>Strategy</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column;gap:10px">
|
||||||
|
<span class="dt-radio"><span class="dot"></span> detect-only (standardize sentinels to NaN, no fill or drop)</span>
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> safe-fill (numeric → median, categorical → mode)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> drop-incomplete (drop any row with missing)</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">detect-only: replace 'N/A', '-', 'NULL', etc. with real NaN, then stop. safe-fill: also fill — numeric columns with median, others with mode. drop-incomplete: also drop every row that has any missing cell.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Advanced options expander (open — most informative) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<h4>Detection</h4>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Standardize disguised nulls to NaN</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Sentinel values (comma-separated)</label>
|
||||||
|
<div class="dt-input">N/A, n/a, NA, NULL, null, None, -, --, ?, #N/A</div>
|
||||||
|
<div class="dt-help-text">Matched case-insensitively after stripping whitespace.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<h4>Strategy override</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Global strategy</label>
|
||||||
|
<div class="dt-select">(use preset)</div>
|
||||||
|
<div class="dt-help-text">drop_row / drop_col use the thresholds below. mean / median / interpolate are numeric only — non-numeric columns fall back to the categorical strategy.</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Categorical fallback (for non-numeric columns)</label>
|
||||||
|
<div class="dt-select">mode</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Drop thresholds</h4>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Row drop threshold (drop rows with ≥ this fraction missing across selected cols)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:100%"></div><div class="knob" style="left:100%"></div></div><div class="val">1.00</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Column drop threshold (drop columns with ≥ this fraction missing)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:100%"></div><div class="knob" style="left:100%"></div></div><div class="val">1.00</div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Scope</h4>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to handle (default: all)</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">respondent_id <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">age <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">region <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">income <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">satisfaction <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">comments <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Columns to skip</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-placeholder">Choose columns</span></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Per-column strategy overrides (optional)</h4>
|
||||||
|
<p class="dt-caption">Set a different strategy for specific columns. Leave any row blank to use the global strategy.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Column</th><th>Override</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>age</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">median</span></td></tr>
|
||||||
|
<tr><td>region</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">mode</span></td></tr>
|
||||||
|
<tr><td>income</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px"></span></td></tr>
|
||||||
|
<tr><td>satisfaction</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px"></span></td></tr>
|
||||||
|
<tr><td>comments</td><td><span class="dt-select" style="display:inline-block;min-width:160px;padding:4px 24px 4px 10px">constant</span></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Handle Missing Values</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<div id="missing-results-anchor"></div>
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Sentinels → NaN</div><div class="value">651</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Cells filled</div><div class="value">1,043</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Rows dropped</div><div class="value">0</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Columns dropped</div><div class="value">0</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Missingness — before vs. after</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>before_missing</th><th>before_pct</th><th>after_missing</th><th>after_pct</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>respondent_id</td><td>0</td><td>0.0</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>age</td><td class="dt-cell-flag">187</td><td>8.7</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>region</td><td class="dt-cell-flag">142</td><td>6.6</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>income</td><td class="dt-cell-flag">329</td><td>15.3</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td class="dt-cell-flag">95</td><td>4.4</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
<tr><td>comments</td><td class="dt-cell-flag">290</td><td>13.5</td><td class="dt-cell-add">0</td><td class="dt-cell-add">0.0</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Strategy applied per column</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>column</th><th>strategy</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>age</td><td>median</td></tr>
|
||||||
|
<tr><td>region</td><td>mode</td></tr>
|
||||||
|
<tr><td>income</td><td>median</td></tr>
|
||||||
|
<tr><td>satisfaction</td><td>median</td></tr>
|
||||||
|
<tr><td>comments</td><td>constant</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Audit (first 50 changes)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>row</th><th>column</th><th>old_value</th><th>new_value</th><th>reason</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2</td><td>age</td><td class="dt-cell-flag">N/A</td><td class="dt-cell-add">37.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>2</td><td>income</td><td class="dt-cell-flag">(blank)</td><td class="dt-cell-add">54000.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>2</td><td>comments</td><td class="dt-cell-flag">?</td><td class="dt-cell-add">(no comment)</td><td>fill: constant</td></tr>
|
||||||
|
<tr><td>3</td><td>region</td><td class="dt-cell-flag">-</td><td class="dt-cell-add">West</td><td>fill: mode</td></tr>
|
||||||
|
<tr><td>3</td><td>satisfaction</td><td class="dt-cell-flag">NULL</td><td class="dt-cell-add">4.0</td><td>fill: median</td></tr>
|
||||||
|
<tr><td>4</td><td>income</td><td class="dt-cell-flag">N/A</td><td class="dt-cell-add">54000.0</td><td>fill: median</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">… and 1,037 more (download the full audit below).</p>
|
||||||
|
|
||||||
|
<p><strong>Handled preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>respondent_id</th><th>age</th><th>region</th><th>income</th><th>satisfaction</th><th>comments</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>R-1001</td><td>34.0</td><td>West</td><td>52000.0</td><td>4.0</td><td>great service</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>R-1002</td><td class="dt-cell-add">37.0</td><td>East</td><td class="dt-cell-add">54000.0</td><td>3.0</td><td class="dt-cell-add">(no comment)</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>R-1003</td><td>41.0</td><td class="dt-cell-add">West</td><td>61000.0</td><td class="dt-cell-add">4.0</td><td>none</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R-1004</td><td>29.0</td><td>South</td><td class="dt-cell-add">54000.0</td><td>5.0</td><td>quick</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (html_download_button anchors) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download handled CSV</button>
|
||||||
|
<button class="dt-btn">Download changes audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
222
layout-review/05_column_mapper.html
Normal file
222
layout-review/05_column_mapper.html
Normal file
@@ -0,0 +1,222 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Map Columns</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="05_column_mapper">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Map Columns</strong>, shown with a file imported, an interactive target schema + mapping configured, and a completed run (results + mapped preview). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Map Columns</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Rename columns, change their order, and set each one as text, number, or date.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<p class="dt-caption">You can also import a file on the home screen and pick it up here.</p>
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding &amp; delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">crm_contacts_raw.csv</span>
|
||||||
|
<span class="size">684 KB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed after a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: crm_contacts_raw.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">4,210 rows, 6 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>Full Name</th><th>EmailAddr</th><th>Phone #</th><th>Signup</th><th>Amount Spent</th><th>Notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>512-555-0190</td><td>01/04/2024</td><td>$1,204.50</td><td>VIP</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>720-555-7781</td><td>02/11/2024</td><td>$88.00</td><td></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.net</td><td>415-555-3322</td><td>03/02/2024</td><td>$612.10</td><td>renewal</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dev Patel</td><td>dev@umbrella.co</td><td>206-555-9043</td><td>03/19/2024</td><td>$0.00</td><td></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options expander (open — heart of the tool) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<!-- ===== Target schema ===== -->
|
||||||
|
<h3 style="margin-top:0">Target schema</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">How would you like to define the target schema?</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column; gap:8px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> Build interactively (start from current columns)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Import schema JSON</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Skip (rename / coerce only — no schema)</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">An interactive build is fastest for one-off cleanup. Import a JSON when you have a fixed contract (a CRM import format, db schema). Skip when you only want to rename or coerce specific columns.</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption">Edit the table to define your target schema. Add rows for fields the input doesn't have yet (with a default), or remove rows for columns you want to drop.</p>
|
||||||
|
|
||||||
|
<!-- Schema editor (st.data_editor, num_rows=dynamic) -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Target name</th><th>Type</th><th>Required</th><th>Default (for added cols)</th><th>Aliases (comma-sep, helps fuzzy-match)</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>full_name</td><td>string</td><td>✗</td><td></td><td>Full Name, name</td></tr>
|
||||||
|
<tr><td>email</td><td>string</td><td>✓</td><td></td><td>EmailAddr, email_address</td></tr>
|
||||||
|
<tr><td>phone</td><td>string</td><td>✗</td><td></td><td>Phone #, tel</td></tr>
|
||||||
|
<tr><td>signup_date</td><td>date</td><td>✗</td><td></td><td>Signup</td></tr>
|
||||||
|
<tr><td>amount_spent</td><td>float</td><td>✗</td><td>0.0</td><td>Amount Spent</td></tr>
|
||||||
|
<tr><td>source</td><td>string</td><td>✗</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx" style="color:var(--ink-tertiary)"><span class="dt-mi" style="font-size:16px;vertical-align:-3px">add</span> add row</td><td></td><td></td><td></td><td></td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">6 target fields · 1 added field (<code>source</code>) not present in the input.</p>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- ===== Strategy ===== -->
|
||||||
|
<h3>Strategy</h3>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Preset</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column; gap:8px">
|
||||||
|
<span class="dt-radio"><span class="dot"></span> rename-only (just rename, leave types alone, keep extras)</span>
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> lenient-schema (rename + coerce + reorder, keep extras)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> strict-schema (rename + coerce + reorder, drop extras)</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Advanced options expander -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Advanced options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Unmapped source columns</label>
|
||||||
|
<div class="dt-select">keep</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Coerce types per schema</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Reorder to schema order</div>
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Auto-infer mapping (fuzzy match)</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Fuzzy match threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:80%"></div><div class="knob" style="left:80%"></div></div><div class="val">0.80</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on"><span class="box"><span class="dt-mi">check</span></span> Enforce required fields</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- ===== Mapping ===== -->
|
||||||
|
<h3>Mapping</h3>
|
||||||
|
<!-- schema is set → source→target selectbox editor with auto-suggested flag -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>Source</th><th>Target</th><th>Auto-suggested</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Full Name</td><td>full_name</td><td>✓</td></tr>
|
||||||
|
<tr><td>EmailAddr</td><td>email</td><td>✓</td></tr>
|
||||||
|
<tr><td>Phone #</td><td>phone</td><td>✓</td></tr>
|
||||||
|
<tr><td>Signup</td><td>signup_date</td><td>✓</td></tr>
|
||||||
|
<tr><td>Amount Spent</td><td>amount_spent</td><td>✓</td></tr>
|
||||||
|
<tr><td>Notes</td><td>(unmapped)</td><td>✗</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Pick a target for each source column. <code>Notes</code> stays unmapped — with the lenient preset it is kept as-is. <code>source</code> is added from the schema default.</p>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Apply Column Mapping</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- ===== Results ===== -->
|
||||||
|
<div id="colmap-results-anchor" style="height:1px"></div>
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Renamed</div><div class="value">5</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Dropped</div><div class="value">0</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Added</div><div class="value">1</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Coerce fails</div><div class="value">3</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-alert info"><span class="dt-mi">info</span><span>Added (with defaults): <code>source</code></span></div>
|
||||||
|
<div class="dt-alert warn"><span class="dt-mi">warning</span><span>Some cells could not be coerced and were left as NaN: amount_spent (3)</span></div>
|
||||||
|
|
||||||
|
<p><strong>Resolved mapping</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>source</th><th>target</th><th>auto</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>Full Name</td><td>full_name</td><td>True</td></tr>
|
||||||
|
<tr><td>EmailAddr</td><td>email</td><td>True</td></tr>
|
||||||
|
<tr><td>Phone #</td><td>phone</td><td>True</td></tr>
|
||||||
|
<tr><td>Signup</td><td>signup_date</td><td>True</td></tr>
|
||||||
|
<tr><td>Amount Spent</td><td>amount_spent</td><td>True</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p><strong>Mapped preview (first 10 rows)</strong></p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>full_name</th><th>email</th><th>phone</th><th>signup_date</th><th>amount_spent</th><th class="dt-cell-add">source</th><th>Notes</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>512-555-0190</td><td>2024-01-04</td><td>1204.5</td><td>crm-import</td><td>VIP</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>720-555-7781</td><td>2024-02-11</td><td>88.0</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.net</td><td>415-555-3322</td><td>2024-03-02</td><td>612.1</td><td>crm-import</td><td>renewal</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dev Patel</td><td>dev@umbrella.co</td><td>206-555-9043</td><td>2024-03-19</td><td>0.0</td><td>crm-import</td><td></td></tr>
|
||||||
|
<tr><td class="idx">4</td><td>Mei Lin</td><td>mei@hooli.com</td><td>503-555-1188</td><td>2024-04-07</td><td class="dt-cell-flag">NaN</td><td>crm-import</td><td>trial</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary">Download mapped CSV</button>
|
||||||
|
<button class="dt-btn">Download mapping audit</button>
|
||||||
|
<button class="dt-btn">Download config JSON</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
91
layout-review/06_outlier_detector.html
Normal file
91
layout-review/06_outlier_detector.html
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Find Unusual Values</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="06_outlier_detector">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Find Unusual Values</strong> — a <strong>Coming Soon</strong> tool. The page is a stub/teaser: an "under development" notice, a list of planned features, and disabled placeholder controls (only the file uploader is live). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Find Unusual Values</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Spot values that look wrong — way too high, too low, or breaking your rules.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- st.info: under development -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Planned features (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li>Z-score detection (configurable threshold)</li>
|
||||||
|
<li>IQR (interquartile range) detection</li>
|
||||||
|
<li>MAD (median absolute deviation) detection</li>
|
||||||
|
<li>Domain-rule violations (e.g., age &lt; 0, price &gt; $1M)</li>
|
||||||
|
<li>Visual outlier highlighting in data preview</li>
|
||||||
|
<li>Handling: flag only, remove, cap/winsorize to bounds</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- File upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS · Import a file to preview. Processing is not yet available.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options (all disabled) -->
|
||||||
|
<h3>Detection Method</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px">
|
||||||
|
<label class="dt-label">Method</label>
|
||||||
|
<div class="dt-select" style="opacity:.55;cursor:not-allowed">Z-Score</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px;opacity:.55">
|
||||||
|
<label class="dt-label">Z-Score threshold</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:50%"></div><div class="knob" style="left:50%"></div></div><div class="val">3.0</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px;opacity:.55">
|
||||||
|
<label class="dt-label">IQR multiplier</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:25%"></div><div class="knob" style="left:25%"></div></div><div class="val">1.5</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Handling</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:420px">
|
||||||
|
<label class="dt-label">Action</label>
|
||||||
|
<div class="dt-select" style="opacity:.55;cursor:not-allowed">Flag only (add column)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Detect Outliers</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
83
layout-review/07_multi_file_merger.html
Normal file
83
layout-review/07_multi_file_merger.html
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Combine Files</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="07_multi_file_merger">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Combine Files</strong> — a Coming-Soon tool. The page is a stub: an "under development" notice, a planned-features list, a working multi-file uploader, and disabled placeholder options. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Combine Files</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Combine several CSV or Excel files into one — even if columns differ.</p>
|
||||||
|
|
||||||
|
<!-- Under-development notice (st.info) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Planned features (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul style="font-size:14px;line-height:1.55;color:var(--ink);margin:0 0 0.6rem;padding-left:22px">
|
||||||
|
<li>Import multiple CSV/Excel files at once</li>
|
||||||
|
<li>Automatic schema alignment (matching columns by name)</li>
|
||||||
|
<li>Append mode: stack files vertically (union)</li>
|
||||||
|
<li>Join mode: merge files on shared key columns</li>
|
||||||
|
<li>Handle mismatched columns (fill missing with nulls or drop)</li>
|
||||||
|
<li>Source file tracking column</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Multi-file upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel files</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop files here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS · multiple files allowed</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text">Import multiple files to preview. Processing is not yet available.</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options (all disabled) -->
|
||||||
|
<h3>Merge Strategy</h3>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Mode</label>
|
||||||
|
<div class="dt-select" style="color:var(--ink-tertiary);background-color:var(--surface-hover)">Append (stack vertically)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Mismatched columns</label>
|
||||||
|
<div class="dt-select" style="color:var(--ink-tertiary);background-color:var(--surface-hover)">Fill with null</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-check on" style="opacity:0.6">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span> Add source filename column
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Merge Files</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
93
layout-review/08_validator_reporter.html
Normal file
93
layout-review/08_validator_reporter.html
Normal file
@@ -0,0 +1,93 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Quality Check</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="08_validator_reporter">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Quality Check</strong>, a Coming-Soon tool. The page is a stub: an "under development" notice, a feature list, a working file uploader, and disabled placeholder controls. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Quality Check</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Check your file against rules you set, and export a PDF or Excel report.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Under-development notice (st.info) -->
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>This tool is under development.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Features list (st.markdown) -->
|
||||||
|
<p><strong>Features:</strong></p>
|
||||||
|
<ul>
|
||||||
|
<li>Column-level validation rules (not null, unique, regex pattern, range, enum)</li>
|
||||||
|
<li>Cross-column validation (e.g., start_date &lt; end_date)</li>
|
||||||
|
<li>Data quality score per column and overall</li>
|
||||||
|
<li>Generate PDF quality report</li>
|
||||||
|
<li>Generate Excel report with flagged rows highlighted</li>
|
||||||
|
<li>Summary dashboard: pass/fail counts, severity breakdown</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- File upload (functional) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Import a file to preview. Processing is not yet available.</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Placeholder options -->
|
||||||
|
<h3>Validation Rules</h3>
|
||||||
|
|
||||||
|
<label class="dt-label">Load rules file (JSON)</label>
|
||||||
|
<div class="dt-uploader" style="opacity:0.55">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">JSON</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn is-disabled" disabled>Browse files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Quick checks</label>
|
||||||
|
<div class="dt-multiselect" style="opacity:0.55">
|
||||||
|
<span class="dt-ms-placeholder">Choose options</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Report Format</h3>
|
||||||
|
|
||||||
|
<div class="dt-field" style="max-width:320px">
|
||||||
|
<label class="dt-label">Output format</label>
|
||||||
|
<div class="dt-select" style="opacity:0.55">Excel (flagged rows)</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block is-disabled" disabled>Validate &amp; Generate Report</button>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
231
layout-review/09_pipeline_runner.html
Normal file
231
layout-review/09_pipeline_runner.html
Normal file
@@ -0,0 +1,231 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Automated Workflows</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="09_pipeline_runner">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Automated Workflows</strong> (Pipeline Runner), shown with a file imported, a four-step pipeline configured, and a completed run (results + per-step summary). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Automated Workflows</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Run several tools in a row — save the steps once, reuse them anytime.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Upload (file staged) -->
|
||||||
|
<label class="dt-label">Import CSV or Excel file</label>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">Up to 1.5 GB · CSV, TSV, XLSX, XLS · encoding & delimiter auto-detected</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">customers_export.csv</span>
|
||||||
|
<span class="size">2.1 MB</span>
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Preview expander (collapsed once a result exists) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview: customers_export.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">18,442 rows, 5 columns</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td> Jane Doe </td><td>jane@acme.io</td><td>Austin</td><td>512-555-0190</td><td>2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>jane doe</td><td>JANE@ACME.IO</td><td>austin</td><td>(512) 555-0190</td><td>01/04/2024</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td>720.555.7781</td><td>2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>R. Smith</td><td>bob@globex.com</td><td>—</td><td>720-555-7781</td><td>Feb 11 2024</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Options: pipeline builder (collapsed once a result exists; opened here to show structure) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
|
||||||
|
<!-- Mode radio -->
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">How would you like to define the pipeline?</label>
|
||||||
|
<div class="dt-radio-row" style="flex-direction:column;gap:9px">
|
||||||
|
<span class="dt-radio on"><span class="dot"></span> Use the recommended default (text-clean → format → missing → dedup)</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Build interactively</span>
|
||||||
|
<span class="dt-radio"><span class="dot"></span> Import a saved pipeline JSON</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="dt-caption" style="margin:10px 0">
|
||||||
|
Edit the table to add, remove, reorder (drag the row index), enable, or configure each step.
|
||||||
|
Tool order is recommended, not enforced — violations surface as warnings below the table.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<!-- Pipeline editor (st.data_editor: Tool selectbox · Enabled checkbox · Options JSON) -->
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="idx"></th>
|
||||||
|
<th>Tool</th>
|
||||||
|
<th>Enabled</th>
|
||||||
|
<th>Options (JSON)</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 0</td>
|
||||||
|
<td>text_clean <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"trim": true, "collapse_whitespace": true}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 1</td>
|
||||||
|
<td>format_standardize <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"column_types": {"phone": "phone", "signup_date": "date"}}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 2</td>
|
||||||
|
<td>missing <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"strategy": "flag", "sentinels": ["N/A", "—"]}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx">≡ 3</td>
|
||||||
|
<td>dedup <span class="dt-mi" style="font-size:14px;vertical-align:-2px;color:var(--ink-tertiary)">expand_more</span></td>
|
||||||
|
<td><span class="dt-check on" style="margin:0;justify-content:center"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>{"survivor_rule": "most_complete", "merge": true}</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td class="idx" style="color:var(--ink-tertiary)">+</td>
|
||||||
|
<td colspan="3" style="color:var(--ink-tertiary);font-family:var(--font-sans)">Add row</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Validation: pipeline is in recommended order, so no warning shown (warning block omitted) -->
|
||||||
|
|
||||||
|
<!-- Nested explainer expander -->
|
||||||
|
<details class="dt-expander" open style="margin-top:14px">
|
||||||
|
<summary>Recommended tool order — why each step belongs where it does</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p><strong>text_clean</strong> before <strong>format_standardize</strong> — format parsers (phone / currency / date) fail on smart-quote-contaminated or NBSP-padded input — clean text first</p>
|
||||||
|
<p><strong>text_clean</strong> before <strong>missing</strong> — sentinel detection misses cells padded with NBSP / zero-width characters — clean text first</p>
|
||||||
|
<p><strong>text_clean</strong> before <strong>dedup</strong> — fuzzy matching treats NBSP-padded values as different — clean text first</p>
|
||||||
|
<p><strong>format_standardize</strong> before <strong>missing</strong> — numeric imputation needs numeric dtypes; canonical phones / currencies improve sentinel detection</p>
|
||||||
|
<p><strong>format_standardize</strong> before <strong>dedup</strong> — canonical phones / lowercase emails enable cross-format duplicate matching</p>
|
||||||
|
<p style="margin-bottom:0"><strong>missing</strong> before <strong>dedup</strong> — deduping rows with mixed NaN sentinels produces brittle merges — resolve missing values first</p>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Run -->
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Run Pipeline</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Initial rows</div><div class="value">18,442</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Final rows</div><div class="value">18,130</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Steps run</div><div class="value">4</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Elapsed</div><div class="value">1.84 s</div></div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Per-step summary</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr><th>step</th><th>status</th><th>elapsed_ms</th><th>summary</th><th>error</th></tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>text_clean</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>214</td>
|
||||||
|
<td>{"cells_changed": 1204, "columns": ["name", "city"]}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>format_standardize</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>388</td>
|
||||||
|
<td>{"phone": 18301, "signup_date": 17996}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>missing</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>121</td>
|
||||||
|
<td>{"flagged_cells": 642, "sentinels_found": ["—"]}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>dedup</td>
|
||||||
|
<td><span class="dt-count-pill success">ok</span></td>
|
||||||
|
<td>911</td>
|
||||||
|
<td>{"input_rows": 18442, "output_rows": 18130, "duplicates_removed": 312, "groups": 147}</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h4>Output preview (first 10 rows)</h4>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th class="idx"></th><th>name</th><th>email</th><th>city</th><th>phone</th><th>signup_date</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="idx">0</td><td>Jane Doe</td><td>jane@acme.io</td><td>Austin</td><td class="dt-cell-add">+1 512-555-0190</td><td class="dt-cell-add">2024-01-04</td></tr>
|
||||||
|
<tr><td class="idx">1</td><td>Bob Smith</td><td>bob@globex.com</td><td>Denver</td><td class="dt-cell-add">+1 720-555-7781</td><td class="dt-cell-add">2024-02-11</td></tr>
|
||||||
|
<tr><td class="idx">2</td><td>Carla Reyes</td><td>carla@initech.co</td><td>Phoenix</td><td class="dt-cell-add">+1 480-555-3320</td><td class="dt-cell-add">2024-03-02</td></tr>
|
||||||
|
<tr><td class="idx">3</td><td>Dan Okafor</td><td>dan@umbrella.net</td><td><span class="dt-cell-flag">⚑ missing</span></td><td class="dt-cell-add">+1 206-555-7745</td><td class="dt-cell-add">2024-03-18</td></tr>
|
||||||
|
<tr><td class="idx">4</td><td>Emily Tran</td><td>emily@hooli.com</td><td>Seattle</td><td class="dt-cell-add">+1 206-555-1182</td><td class="dt-cell-add">2024-04-05</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (3 columns) -->
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<button class="dt-btn dt-btn-primary"><span class="dt-mi">download</span> Download cleaned CSV</button>
|
||||||
|
<button class="dt-btn"><span class="dt-mi">download</span> Download pipeline JSON</button>
|
||||||
|
<button class="dt-btn"><span class="dt-mi">download</span> Download run audit</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
189
layout-review/10_pdf_extractor.html
Normal file
189
layout-review/10_pdf_extractor.html
Normal file
@@ -0,0 +1,189 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — PDF to CSV</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="10_pdf_extractor">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>PDF to CSV</strong>, shown with two bank-statement PDFs imported and a completed scan (candidate transactions in the editable preview table). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>PDF to CSV</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Pull transactions out of bank-statement PDFs into a clean CSV file.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Scan options expander (collapsed by default) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Scan options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-check on">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span>
|
||||||
|
Treat (4.50) as negative
|
||||||
|
</div>
|
||||||
|
<div class="dt-check on">
|
||||||
|
<span class="box"><span class="dt-mi">check</span></span>
|
||||||
|
Use OCR for scanned pages
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-help-text" style="margin:0 0 10px">OCR status: ready (bundled Tesseract). Most modern bank PDFs are text-based and don't need OCR — only enable for image-based scans.</p>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Output date format</label>
|
||||||
|
<div class="dt-select">YYYY-MM-DD (2026-01-13)</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field">
|
||||||
|
<label class="dt-label">Override year for short dates (optional)</label>
|
||||||
|
<input class="dt-input" type="text" placeholder="" value="" disabled>
|
||||||
|
<div class="dt-help-text">Leave blank for automatic (statement period → filename year → this override).</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Files section head -->
|
||||||
|
<div class="dt-files-section-head">
|
||||||
|
<h2>Files</h2>
|
||||||
|
<span class="dt-section-meta">2 files · 318.4 KB total</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Files card (Home-style bordered list + Add more files) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:0">
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove statement-jan-2026.pdf">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">statement-jan-2026.pdf</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">171.2 KB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove statement-feb-2026.pdf">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">statement-feb-2026.pdf</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">147.2 KB</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-file-add">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M12 5v14M5 12h14"/></svg> Add more files
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Action buttons -->
|
||||||
|
<div class="dt-btn-row" style="margin-top:16px;max-width:340px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Scan</button>
|
||||||
|
<button class="dt-btn">Clear all files</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Warnings expander (collapsed) -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Warnings (1)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-alert warn">
|
||||||
|
<span class="dt-mi">warning</span>
|
||||||
|
<span>[statement-feb-2026.pdf] 2 lines matched a date but no amount — skipped (likely a wrapped description). Check the source if a transaction looks missing.</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h4>47 candidate transaction(s) from 2 file(s)</h4>
|
||||||
|
<p class="dt-caption">Uncheck rows to exclude. Edit any cell to fix a value the scanner got wrong. The <code>raw</code> column shows the original PDF text for that row.</p>
|
||||||
|
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Include</th>
|
||||||
|
<th>date</th>
|
||||||
|
<th>description</th>
|
||||||
|
<th>amount_debit</th>
|
||||||
|
<th>amount_credit</th>
|
||||||
|
<th>account_number</th>
|
||||||
|
<th>source_file</th>
|
||||||
|
<th>page</th>
|
||||||
|
<th>raw</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-03</td><td>OPENING BALANCE</td><td></td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/03 OPENING BALANCE 2,140.55</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-05</td><td>POS PURCHASE WHOLE FOODS MKT</td><td>84.12</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/05 POS PURCHASE WHOLE FOODS MKT (84.12)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-08</td><td>ACH DEPOSIT PAYROLL ACME CORP</td><td></td><td>3,250.00</td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">1</td><td>01/08 ACH DEPOSIT PAYROLL ACME CORP 3,250.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-11</td><td>ONLINE TRANSFER TO SAVINGS</td><td>500.00</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/11 ONLINE TRANSFER TO SAVINGS (500.00)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check" style="margin:0"><span class="box"></span></span></td>
|
||||||
|
<td class="dt-cell-flag">2026-01-12</td><td class="dt-cell-flag">INTEREST RATE 0.50% APY DETAIL</td><td></td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/12 INTEREST RATE 0.50% APY DETAIL 0.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-01-14</td><td>DEBIT CARD SHELL OIL #2287</td><td>52.40</td><td></td><td>****4821</td><td>statement-jan-2026.pdf</td><td class="idx">2</td><td>01/14 DEBIT CARD SHELL OIL #2287 (52.40)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-02</td><td>POS PURCHASE TRADER JOES #511</td><td>61.88</td><td></td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">1</td><td>02/02 POS PURCHASE TRADER JOES #511 (61.88)</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-06</td><td>ACH DEPOSIT PAYROLL ACME CORP</td><td></td><td>3,250.00</td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">2</td><td>02/06 ACH DEPOSIT PAYROLL ACME CORP 3,250.00</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><span class="dt-check on" style="margin:0"><span class="box"><span class="dt-mi">check</span></span></span></td>
|
||||||
|
<td>2026-02-09</td><td>CHECK #1043</td><td>1,200.00</td><td></td><td>****4821</td><td>statement-feb-2026.pdf</td><td class="idx">2</td><td>02/09 CHECK #1043 (1,200.00)</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Download row: download button (left) + columns multiselect (right) -->
|
||||||
|
<div class="dt-row" style="margin-top:14px;align-items:flex-start">
|
||||||
|
<div style="flex:2">
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Download 46 rows as CSV</button>
|
||||||
|
<p class="dt-caption" style="margin-top:8px">46 of 47 rows selected.</p>
|
||||||
|
</div>
|
||||||
|
<div style="flex:3">
|
||||||
|
<div class="dt-field" style="margin:0">
|
||||||
|
<label class="dt-label">Columns to include in CSV</label>
|
||||||
|
<div class="dt-multiselect">
|
||||||
|
<span class="dt-ms-chip">date <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">description <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">amount_debit <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">amount_credit <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">account_number <span class="x">✕</span></span>
|
||||||
|
<span class="dt-ms-chip">source_file <span class="x">✕</span></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-help-text"><code>page</code> and <code>raw</code> are kept off by default; tick them if you want them in the file.</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
251
layout-review/11_reconciler.html
Normal file
251
layout-review/11_reconciler.html
Normal file
@@ -0,0 +1,251 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — Reconcile Two Files</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="11_reconciler">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of <strong>Reconcile Two Files</strong>, shown with both files imported, key columns mapped, and a completed reconciliation (matched / review / unmatched results). <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Tool header -->
|
||||||
|
<div class="dt-tool-header">
|
||||||
|
<h1>Reconcile Two Files</h1>
|
||||||
|
<button class="dt-help-btn"><span class="dt-mi">help_outline</span> Help</button>
|
||||||
|
</div>
|
||||||
|
<p class="dt-tool-caption">Compare two lists of transactions (e.g. bank vs. ledger) and flag what doesn't match.</p>
|
||||||
|
|
||||||
|
<div class="dt-spacer"></div>
|
||||||
|
|
||||||
|
<!-- Side-by-side upload (st.columns(2) → two _side_panel) -->
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<!-- Left side -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Left (e.g. bank feed)</h4>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">bank_feed_may.csv</span>
|
||||||
|
<span class="size">214 KB</span>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption" style="margin-top:6px"><code>bank_feed_may.csv</code> — 1,204 rows, 4 columns</p>
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview left (e.g. bank feed)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>posted_date</th><th>description</th><th>amount</th><th>ref</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>ACME SUPPLIES</td><td>-1240.00</td><td>CHK1041</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>PAYROLL RUN</td><td>-8800.00</td><td>ACH5520</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>CLIENT GLOBEX</td><td>5200.00</td><td>DEP0090</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>UTILITY CO</td><td>-318.42</td><td>CHK1042</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
<!-- Right side -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Right (e.g. ledger)</h4>
|
||||||
|
<div class="dt-uploader">
|
||||||
|
<div class="dt-uploader-text">
|
||||||
|
<span class="hint"><span class="dt-mi" style="vertical-align:-4px">upload_file</span> Drag and drop file here</span>
|
||||||
|
<span class="sub">CSV, TSV, XLSX, XLS</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-btn">Browse files</button>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-chip">
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="name">ledger_may.xlsx</span>
|
||||||
|
<span class="size">96 KB</span>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption" style="margin-top:6px"><code>ledger_may.xlsx</code> — 1,198 rows, 5 columns</p>
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Preview right (e.g. ledger)</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>txn_date</th><th>memo</th><th>value</th><th>invoice_no</th><th>account</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>Acme Supplies Inc</td><td>-1240.00</td><td>INV-1041</td><td>5000</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>Monthly payroll</td><td>-8800.00</td><td>INV-5520</td><td>6000</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>Globex retainer</td><td>5200.00</td><td>INV-0090</td><td>4000</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>City Utilities</td><td>-318.40</td><td>INV-1042</td><td>6100</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Match settings -->
|
||||||
|
<h2>Match settings</h2>
|
||||||
|
<div class="dt-cols-2">
|
||||||
|
<!-- Left pickers (file order: posted_date, description, amount → date, desc, amount) -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Left columns</h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date column (optional)</label><div class="dt-select">posted_date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description column (optional)</label><div class="dt-select">description</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount column</label><div class="dt-select">amount</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Reference columns (optional, e.g. check / invoice no.)</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">ref <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
<!-- Right pickers (file order: txn_date, memo, value → date, desc, amount) -->
|
||||||
|
<div>
|
||||||
|
<h4 style="margin-top:0">Right columns</h4>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date column (optional)</label><div class="dt-select">txn_date</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description column (optional)</label><div class="dt-select">memo</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount column</label><div class="dt-select">value</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Reference columns (must match left count)</label>
|
||||||
|
<div class="dt-multiselect"><span class="dt-ms-chip">invoice_no <span class="x">✕</span></span></div></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Tolerances & options (expanded=True) -->
|
||||||
|
<details class="dt-expander" open>
|
||||||
|
<summary>Tolerances & options</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<div class="dt-cols-3">
|
||||||
|
<div class="dt-field"><label class="dt-label">Amount tolerance</label>
|
||||||
|
<div class="dt-input">0.0200</div>
|
||||||
|
<div class="dt-help-text">Absolute tolerance on amount (e.g. 0.01 to absorb cent rounding).</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Date tolerance (days)</label>
|
||||||
|
<div class="dt-input">1</div>
|
||||||
|
<div class="dt-help-text">Allow N calendar days of drift between posting dates.</div></div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Invert right amount sign</label>
|
||||||
|
<div class="dt-check" style="margin-top:8px"><span class="box"></span> Invert right amount sign</div>
|
||||||
|
<div class="dt-help-text">Use when one side records debits as positive and the other as negative.</div></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-field"><label class="dt-label">Description similarity boost (0 disables)</label>
|
||||||
|
<div class="dt-slider"><div class="track"><div class="fill" style="width:80%"></div><div class="knob" style="left:80%"></div></div><div class="val">80</div></div>
|
||||||
|
<div class="dt-help-text">When both sides have a description column set, accept matches with this minimum fuzzy similarity even if amount/date are merely within tolerance. Lower = more permissive.</div></div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<button class="dt-btn dt-btn-primary dt-btn-block">Reconcile</button>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Results -->
|
||||||
|
<h2>Results</h2>
|
||||||
|
<div class="dt-metrics">
|
||||||
|
<div class="dt-metric"><div class="label">Matched</div><div class="value">1,173</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Review</div><div class="value">9</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unmatched left</div><div class="value">22</div></div>
|
||||||
|
<div class="dt-metric"><div class="label">Unmatched right</div><div class="value">16</div></div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-caption">Coverage: 97.4% of the larger side</p>
|
||||||
|
|
||||||
|
<!-- Tabs (st.tabs) — Matched active -->
|
||||||
|
<div class="dt-tabs">
|
||||||
|
<span class="dt-tab is-active">Matched (1,173)</span>
|
||||||
|
<span class="dt-tab">Review (9)</span>
|
||||||
|
<span class="dt-tab">Unmatched left (22)</span>
|
||||||
|
<span class="dt-tab">Unmatched right (16)</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Matched tab content -->
|
||||||
|
<p class="dt-caption">Preview of first 25 of 1,173 rows — download the CSV below for the full set.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr>
|
||||||
|
<th>left_posted_date</th><th>left_description</th><th>left_amount</th>
|
||||||
|
<th>right_txn_date</th><th>right_memo</th><th>right_value</th><th>amount_diff</th>
|
||||||
|
</tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>2026-05-01</td><td>ACME SUPPLIES</td><td>-1240.00</td><td>2026-05-01</td><td>Acme Supplies Inc</td><td>-1240.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-02</td><td>PAYROLL RUN</td><td>-8800.00</td><td>2026-05-02</td><td>Monthly payroll</td><td>-8800.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-03</td><td>CLIENT GLOBEX</td><td>5200.00</td><td>2026-05-03</td><td>Globex retainer</td><td>5200.00</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
<tr><td>2026-05-04</td><td>UTILITY CO</td><td>-318.42</td><td>2026-05-04</td><td>City Utilities</td><td>-318.40</td><td class="dt-cell-flag">0.02</td></tr>
|
||||||
|
<tr><td>2026-05-06</td><td>OFFICE DEPOT</td><td>-89.15</td><td>2026-05-07</td><td>Office supplies</td><td>-89.15</td><td class="dt-cell-add">0.00</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Other tab previews shown as collapsed expanders for review context -->
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Review (9) — ambiguous candidates</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Pairs flagged because the algorithm couldn't pick a single best match (e.g. multiple equally-good candidates). Use the left/right indices to disambiguate manually.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>left_idx</th><th>left_amount</th><th>right_idx</th><th>right_value</th><th>candidates</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td>118</td><td>-450.00</td><td>121, 209</td><td>-450.00</td><td class="dt-cell-flag">2 equal</td></tr>
|
||||||
|
<tr><td>203</td><td>1000.00</td><td>198, 244</td><td>1000.00</td><td class="dt-cell-flag">2 equal</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Unmatched left (22) — only in bank_feed_may.csv</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Showing all 22 rows.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>posted_date</th><th>description</th><th>amount</th><th>ref</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-09</td><td class="dt-cell-del">BANK FEE</td><td class="dt-cell-del">-12.00</td><td class="dt-cell-del">FEE0001</td></tr>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-14</td><td class="dt-cell-del">ATM WITHDRAWAL</td><td class="dt-cell-del">-200.00</td><td class="dt-cell-del">ATM7781</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<details class="dt-expander">
|
||||||
|
<summary>Unmatched right (16) — only in ledger_may.xlsx</summary>
|
||||||
|
<div class="dt-expander-body">
|
||||||
|
<p class="dt-caption">Showing all 16 rows.</p>
|
||||||
|
<div class="dt-table-wrap">
|
||||||
|
<table class="dt-table">
|
||||||
|
<thead><tr><th>txn_date</th><th>memo</th><th>value</th><th>invoice_no</th><th>account</th></tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-11</td><td class="dt-cell-del">Accrued interest</td><td class="dt-cell-del">37.50</td><td class="dt-cell-del">INV-9001</td><td class="dt-cell-del">7000</td></tr>
|
||||||
|
<tr><td class="dt-cell-del">2026-05-22</td><td class="dt-cell-del">Depreciation</td><td class="dt-cell-del">-410.00</td><td class="dt-cell-del">INV-9044</td><td class="dt-cell-del">8000</td></tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Downloads (st.columns(4) of html_download_button) -->
|
||||||
|
<div class="dt-btn-row">
|
||||||
|
<button class="dt-btn dt-btn-primary">Matched CSV</button>
|
||||||
|
<button class="dt-btn">Review CSV</button>
|
||||||
|
<button class="dt-btn">Unmatched left</button>
|
||||||
|
<button class="dt-btn">Unmatched right</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
473
layout-review/app.css
Normal file
473
layout-review/app.css
Normal file
@@ -0,0 +1,473 @@
|
|||||||
|
/* ===========================================================================
|
||||||
|
DataTools — static layout-review stylesheet
|
||||||
|
---------------------------------------------------------------------------
|
||||||
|
Faithful reproduction of the live Streamlit app's design system for human
|
||||||
|
review of page layouts. Tokens are copied verbatim from src/gui/theme.py
|
||||||
|
(§3 color + type scale) and the component values from
|
||||||
|
src/gui/components/_legacy.py:_DESIGN_TOKENS_CSS.
|
||||||
|
|
||||||
|
The live app applies these styles to Streamlit's data-testid DOM; here we
|
||||||
|
re-express the same look against clean semantic classes so the static HTML
|
||||||
|
stays readable. Where the app uses real .dt-* classes (page header, files
|
||||||
|
card, findings, stats) the class names are kept identical.
|
||||||
|
=========================================================================== */
|
||||||
|
|
||||||
|
@import url("https://fonts.googleapis.com/css2?family=Geist:wght@400;500;600;700&family=Geist+Mono:wght@400;500&display=swap");
|
||||||
|
@import url("https://fonts.googleapis.com/css2?family=Material+Symbols+Outlined:opsz,wght,FILL,GRAD@20..48,400,0,0&display=block");
|
||||||
|
|
||||||
|
:root {
|
||||||
|
--font-sans: "Geist", -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif;
|
||||||
|
--font-mono: "Geist Mono", ui-monospace, "SF Mono", Menlo, monospace;
|
||||||
|
|
||||||
|
--ink: #1c1917;
|
||||||
|
--ink-secondary: #57534e;
|
||||||
|
--ink-tertiary: #a8a29e;
|
||||||
|
--bg: #fafaf7;
|
||||||
|
--surface: #ffffff;
|
||||||
|
--surface-hover: #f8f7f3;
|
||||||
|
--border: #e7e5dc;
|
||||||
|
--border-strong: #d6d3c7;
|
||||||
|
--accent: #c2410c;
|
||||||
|
--accent-hover: #9a3412;
|
||||||
|
--accent-fill: #fef4ed;
|
||||||
|
--accent-fill-strong: #fde4d3;
|
||||||
|
|
||||||
|
--warn: #b45309;
|
||||||
|
--warn-fill: #fef3c7;
|
||||||
|
--info: #0369a1;
|
||||||
|
--info-fill: #e0f2fe;
|
||||||
|
--success: #15803d;
|
||||||
|
--success-fill: #dcfce7;
|
||||||
|
--danger: #b91c1c;
|
||||||
|
--danger-fill: #fee2e2;
|
||||||
|
|
||||||
|
--r-sm: 6px;
|
||||||
|
--r-md: 10px;
|
||||||
|
--r-lg: 14px;
|
||||||
|
|
||||||
|
--sidebar-w: 264px;
|
||||||
|
}
|
||||||
|
|
||||||
|
* { box-sizing: border-box; }
|
||||||
|
|
||||||
|
html, body {
|
||||||
|
margin: 0;
|
||||||
|
padding: 0;
|
||||||
|
background: var(--bg);
|
||||||
|
color: var(--ink);
|
||||||
|
font-family: var(--font-sans);
|
||||||
|
font-feature-settings: "ss01", "cv01", "cv11";
|
||||||
|
-webkit-font-smoothing: antialiased;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ---------- Type scale (theme.py §4) ---------- */
|
||||||
|
h1 { font-size: 32px; font-weight: 600; letter-spacing: -0.035em; line-height: 1.1; margin: 0 0 4px; }
|
||||||
|
h2 { font-size: 22px; font-weight: 600; letter-spacing: -0.025em; line-height: 1.2; margin: 1.5rem 0 0.75rem; }
|
||||||
|
h3 { font-size: 18px; font-weight: 500; letter-spacing: -0.018em; line-height: 1.25; margin: 1.25rem 0 0.5rem; }
|
||||||
|
h4 { font-size: 15px; font-weight: 500; letter-spacing: -0.012em; line-height: 1.35; margin: 1rem 0 0.5rem; }
|
||||||
|
p { font-size: 14px; font-weight: 400; line-height: 1.55; color: var(--ink); margin: 0 0 0.6rem; }
|
||||||
|
strong { font-weight: 500; color: var(--ink); }
|
||||||
|
a { color: var(--accent); text-decoration: none; }
|
||||||
|
a:hover { color: var(--accent-hover); text-decoration: underline; }
|
||||||
|
code, .dt-mono { font-family: var(--font-mono); font-size: 0.92em; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
App frame — sidebar + main + sticky footer
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-app { display: flex; min-height: 100vh; }
|
||||||
|
|
||||||
|
/* ---------- Sidebar (cream paper) ---------- */
|
||||||
|
.dt-sidebar {
|
||||||
|
width: var(--sidebar-w);
|
||||||
|
flex-shrink: 0;
|
||||||
|
background: #f5f4ef;
|
||||||
|
border-right: 1px solid var(--border);
|
||||||
|
padding: 18px 14px 90px;
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
align-self: flex-start;
|
||||||
|
height: 100vh;
|
||||||
|
overflow-y: auto;
|
||||||
|
}
|
||||||
|
.dt-brand { display: flex; align-items: center; gap: 10px; padding: 0 4px 18px; }
|
||||||
|
.dt-brand-mark {
|
||||||
|
width: 28px; height: 28px; border-radius: 7px;
|
||||||
|
background: var(--ink); color: var(--accent-fill);
|
||||||
|
display: inline-flex; align-items: center; justify-content: center;
|
||||||
|
font-weight: 700; font-size: 16px; letter-spacing: -0.04em; line-height: 1; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-brand-name { display: flex; flex-direction: column; gap: 1px; line-height: 1.05; }
|
||||||
|
.dt-brand-eyebrow {
|
||||||
|
font-size: 9.5px; font-weight: 600; letter-spacing: 0.14em;
|
||||||
|
text-transform: uppercase; color: var(--ink-tertiary); line-height: 1;
|
||||||
|
}
|
||||||
|
.dt-brand-word { font-weight: 600; font-size: 15px; letter-spacing: -0.02em; color: var(--ink); }
|
||||||
|
|
||||||
|
.dt-nav { display: flex; flex-direction: column; }
|
||||||
|
.dt-nav-section {
|
||||||
|
font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em;
|
||||||
|
color: var(--ink-tertiary); font-weight: 500;
|
||||||
|
padding: 14px 10px 4px; margin: 0;
|
||||||
|
display: flex; align-items: center; justify-content: space-between;
|
||||||
|
}
|
||||||
|
.dt-nav-section .dt-nav-indicator { font-size: 16px; color: var(--ink-tertiary); }
|
||||||
|
.dt-nav-link {
|
||||||
|
display: flex; align-items: center; gap: 8px;
|
||||||
|
color: var(--ink-secondary); font-size: 13px; font-weight: 500; line-height: 1.3;
|
||||||
|
padding: 5px 10px; border-radius: var(--r-sm); margin-bottom: 1px;
|
||||||
|
text-decoration: none; transition: background 0.12s ease, color 0.12s ease;
|
||||||
|
}
|
||||||
|
.dt-nav-link:hover { background: rgba(0,0,0,0.04); color: var(--ink); text-decoration: none; }
|
||||||
|
.dt-nav-link.is-active { background: rgba(0,0,0,0.04); color: var(--ink); font-weight: 600; }
|
||||||
|
.dt-nav-link .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; color: var(--ink-secondary); line-height: 1; }
|
||||||
|
.dt-nav-link.is-active .dt-mi { color: var(--ink); }
|
||||||
|
.dt-nav-link.is-soon { opacity: 0.55; }
|
||||||
|
.dt-nav-soon-tag {
|
||||||
|
margin-left: auto; font-size: 9px; font-weight: 600; letter-spacing: 0.06em;
|
||||||
|
text-transform: uppercase; color: var(--ink-tertiary);
|
||||||
|
border: 1px solid var(--border-strong); border-radius: 999px; padding: 1px 6px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.dt-sidebar-foot { margin-top: 22px; padding-top: 16px; border-top: 1px solid var(--border); display: flex; flex-direction: column; gap: 10px; }
|
||||||
|
.dt-sidebar-label { font-size: 11.5px; font-weight: 500; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); margin-bottom: 4px; }
|
||||||
|
.dt-license-badge { font-size: 12.5px; color: var(--ink-secondary); }
|
||||||
|
|
||||||
|
/* ---------- Main column ---------- */
|
||||||
|
.dt-main { flex: 1; min-width: 0; padding: 40px 56px 96px; }
|
||||||
|
.dt-main-inner { max-width: 920px; margin: 0 auto; }
|
||||||
|
|
||||||
|
/* Review banner above every mockup */
|
||||||
|
.dt-review-banner {
|
||||||
|
max-width: 920px; margin: 0 auto 20px; display: flex; gap: 10px; align-items: center;
|
||||||
|
background: var(--info-fill); color: var(--info);
|
||||||
|
border: 1px solid transparent; border-radius: var(--r-md);
|
||||||
|
padding: 8px 14px; font-size: 12.5px; line-height: 1.4;
|
||||||
|
}
|
||||||
|
.dt-review-banner a { color: var(--info); text-decoration: underline; }
|
||||||
|
.dt-review-banner .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
|
||||||
|
/* ---------- Sticky footer ---------- */
|
||||||
|
.dt-footer {
|
||||||
|
position: fixed; bottom: 0; left: var(--sidebar-w); right: 0;
|
||||||
|
/* Prefixed form first: older Safari releases only recognise -webkit-backdrop-filter. */
background: rgba(255,255,255,0.97); -webkit-backdrop-filter: blur(8px); backdrop-filter: blur(8px);
|
||||||
|
border-top: 1px solid var(--border-strong);
|
||||||
|
padding: 8px 20px; z-index: 50;
|
||||||
|
display: flex; align-items: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-footer-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 8px;
|
||||||
|
color: var(--ink-secondary); font-size: 13px; font-weight: 500; line-height: 1.3;
|
||||||
|
padding: 5px 10px; border-radius: var(--r-sm);
|
||||||
|
background: transparent; border: none; cursor: pointer; text-decoration: none;
|
||||||
|
}
|
||||||
|
.dt-footer-btn:hover { background: rgba(0,0,0,0.04); color: var(--ink); text-decoration: none; }
|
||||||
|
.dt-footer-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 16px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Page header (brand + privacy pill) — .dt-page-* mirror the live app
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-page-header {
|
||||||
|
display: flex; align-items: center; justify-content: space-between; gap: 24px;
|
||||||
|
margin: 0 0 24px; padding-bottom: 22px; border-bottom: 1px solid var(--border);
|
||||||
|
}
|
||||||
|
.dt-page-brand { display: flex; flex-direction: column; gap: 8px; }
|
||||||
|
.dt-page-brand-row { display: flex; align-items: center; gap: 18px; }
|
||||||
|
.dt-page-brand-mark {
|
||||||
|
width: 56px; height: 56px; border-radius: 14px; background: var(--ink);
|
||||||
|
color: var(--accent-fill); display: inline-flex; align-items: center; justify-content: center;
|
||||||
|
font-weight: 700; font-size: 32px; letter-spacing: -0.04em; line-height: 1; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-page-brand-words { display: flex; flex-direction: column; gap: 2px; line-height: 1; }
|
||||||
|
.dt-page-eyebrow { font-size: 11.5px; font-weight: 600; letter-spacing: 0.14em; text-transform: uppercase; color: var(--ink-tertiary); line-height: 1.2; }
|
||||||
|
.dt-page-wordmark { margin: 0; font-weight: 600; font-size: 32px; letter-spacing: -0.035em; line-height: 1.1; color: var(--ink); }
|
||||||
|
.dt-page-subtitle { margin: 4px 0 0; color: var(--ink-secondary); font-size: 14px; line-height: 1.5; }
|
||||||
|
.dt-privacy-pill {
|
||||||
|
display: inline-flex; align-items: center; gap: 6px; padding: 6px 11px;
|
||||||
|
background: var(--success-fill); color: var(--success); border-radius: 999px;
|
||||||
|
font-size: 12px; font-weight: 500; white-space: nowrap; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-privacy-pill svg { width: 13px; height: 13px; stroke-width: 2; }
|
||||||
|
|
||||||
|
/* ---------- Tool header (title + Help popover) ---------- */
|
||||||
|
.dt-tool-header { display: flex; align-items: flex-start; justify-content: space-between; gap: 16px; }
|
||||||
|
.dt-tool-header h1 { margin: 0; }
|
||||||
|
.dt-help-btn {
|
||||||
|
display: inline-flex; align-items: center; gap: 6px; white-space: nowrap;
|
||||||
|
background: var(--surface); color: var(--ink); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-md); padding: 9px 16px; font-size: 13.5px; font-weight: 500;
|
||||||
|
cursor: pointer; flex-shrink: 0; margin-top: 6px;
|
||||||
|
}
|
||||||
|
.dt-help-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; }
|
||||||
|
.dt-tool-caption { font-size: 12.5px; color: var(--ink-tertiary); line-height: 1.5; margin: 2px 0 0; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Buttons
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-btn {
|
||||||
|
border-radius: var(--r-md); font-family: var(--font-sans); font-weight: 500;
|
||||||
|
font-size: 13.5px; letter-spacing: -0.005em; line-height: 1; padding: 9px 16px;
|
||||||
|
border: 1px solid var(--border-strong); background: var(--surface); color: var(--ink);
|
||||||
|
cursor: pointer; transition: background 0.12s ease, border-color 0.12s ease, color 0.12s ease;
|
||||||
|
display: inline-flex; align-items: center; justify-content: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-btn:hover { background: var(--surface-hover); border-color: var(--ink-tertiary); }
|
||||||
|
.dt-btn-primary { background: var(--ink); color: var(--bg); border-color: var(--ink); }
|
||||||
|
.dt-btn-primary:hover { background: #292524; border-color: #292524; color: var(--bg); }
|
||||||
|
.dt-btn-tertiary { background: transparent; border: none; color: var(--ink-tertiary); padding: 4px 8px; }
|
||||||
|
.dt-btn-tertiary:hover { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-btn:disabled, .dt-btn.is-disabled {
|
||||||
|
background: var(--surface-hover); color: var(--ink-tertiary);
|
||||||
|
border: 1px solid var(--border); cursor: not-allowed;
|
||||||
|
}
|
||||||
|
.dt-btn-block { width: 100%; }
|
||||||
|
/* Icon glyph inside a button. letter-spacing is reset because .dt-btn sets
   -0.005em, and browsers may skip ligature substitution (which turns the icon
   name into its glyph) whenever letter-spacing is non-zero. */
.dt-btn .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; letter-spacing: normal; }
|
||||||
|
|
||||||
|
.dt-btn-row { display: flex; gap: 10px; flex-wrap: wrap; }
|
||||||
|
.dt-btn-row > .dt-btn { flex: 1; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
File uploader (cream dropzone)
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-uploader {
|
||||||
|
background: var(--surface-hover); border: 1px dashed var(--border-strong);
|
||||||
|
border-radius: var(--r-md); padding: 22px 20px;
|
||||||
|
display: flex; align-items: center; justify-content: space-between; gap: 16px;
|
||||||
|
}
|
||||||
|
.dt-uploader-text { display: flex; flex-direction: column; gap: 2px; }
|
||||||
|
.dt-uploader-text .hint { font-size: 14px; color: var(--ink); }
|
||||||
|
.dt-uploader-text .sub { font-size: 12.5px; color: var(--ink-tertiary); }
|
||||||
|
.dt-uploader .dt-mi { font-family: "Material Symbols Outlined"; font-size: 24px; color: var(--ink-tertiary); }
|
||||||
|
|
||||||
|
/* Staged-file chip */
|
||||||
|
.dt-file-chip {
|
||||||
|
display: flex; align-items: center; gap: 12px;
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-sm);
|
||||||
|
padding: 10px 14px; margin-top: 10px;
|
||||||
|
}
|
||||||
|
.dt-file-chip .name { font-family: var(--font-mono); font-size: 13px; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-chip .size { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); margin-left: auto; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Expanders / bordered cards
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-expander {
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg);
|
||||||
|
overflow: hidden; box-shadow: 0 1px 2px rgba(28,25,23,0.03); margin: 10px 0;
|
||||||
|
}
|
||||||
|
.dt-expander > summary, .dt-expander-head {
|
||||||
|
background: var(--surface-hover); border-bottom: 1px solid var(--border);
|
||||||
|
padding: 12px 16px; font-weight: 500; color: var(--ink); font-size: 14px;
|
||||||
|
cursor: pointer; list-style: none; display: flex; align-items: center; gap: 8px;
|
||||||
|
}
|
||||||
|
.dt-expander > summary::-webkit-details-marker { display: none; }
|
||||||
|
.dt-expander > summary::before {
|
||||||
|
content: "expand_more"; font-family: "Material Symbols Outlined"; font-size: 20px;
|
||||||
|
color: var(--ink-tertiary); transition: transform 0.15s ease;
|
||||||
|
}
|
||||||
|
.dt-expander[open] > summary::before { transform: rotate(180deg); }
|
||||||
|
.dt-expander-body, .dt-expander > .dt-expander-body { padding: 14px 16px; }
|
||||||
|
.dt-expander:not([open]) > summary { border-bottom: none; }
|
||||||
|
|
||||||
|
.dt-card {
|
||||||
|
background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg);
|
||||||
|
box-shadow: 0 1px 2px rgba(28,25,23,0.03); padding: 16px; margin: 10px 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Alerts
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-alert {
|
||||||
|
border-radius: var(--r-md); border: 1px solid transparent;
|
||||||
|
padding: 10px 14px; font-size: 13.5px; line-height: 1.45; margin: 10px 0;
|
||||||
|
display: flex; gap: 10px; align-items: flex-start;
|
||||||
|
}
|
||||||
|
.dt-alert .dt-mi { font-family: "Material Symbols Outlined"; font-size: 18px; flex-shrink: 0; margin-top: 1px; }
|
||||||
|
.dt-alert.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-alert.success { background: var(--success-fill); color: var(--success); }
|
||||||
|
.dt-alert.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-alert.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-alert code { background: rgba(0,0,0,0.05); padding: 1px 5px; border-radius: 4px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Inputs (static representations of Streamlit widgets)
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-field { margin: 10px 0; }
|
||||||
|
.dt-label { font-size: 13px; font-weight: 500; color: var(--ink); margin-bottom: 5px; display: block; }
|
||||||
|
.dt-label .req { color: var(--accent); }
|
||||||
|
.dt-input, .dt-select, .dt-textarea {
|
||||||
|
width: 100%; background: var(--surface); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-sm); padding: 8px 11px; font-family: var(--font-sans);
|
||||||
|
font-size: 13.5px; color: var(--ink);
|
||||||
|
}
|
||||||
|
.dt-select { appearance: none; background-image: linear-gradient(45deg, transparent 50%, var(--ink-tertiary) 50%), linear-gradient(135deg, var(--ink-tertiary) 50%, transparent 50%); background-position: calc(100% - 16px) 14px, calc(100% - 11px) 14px; background-size: 5px 5px, 5px 5px; background-repeat: no-repeat; }
|
||||||
|
.dt-textarea { min-height: 76px; resize: vertical; font-family: var(--font-mono); font-size: 13px; }
|
||||||
|
.dt-help-text { font-size: 12px; color: var(--ink-tertiary); margin-top: 4px; }
|
||||||
|
|
||||||
|
/* Multiselect — chips inside a box */
|
||||||
|
.dt-multiselect {
|
||||||
|
width: 100%; background: var(--surface); border: 1px solid var(--border-strong);
|
||||||
|
border-radius: var(--r-sm); padding: 6px 8px; min-height: 38px;
|
||||||
|
display: flex; flex-wrap: wrap; gap: 6px; align-items: center;
|
||||||
|
}
|
||||||
|
.dt-ms-chip {
|
||||||
|
display: inline-flex; align-items: center; gap: 5px; background: var(--accent-fill);
|
||||||
|
color: var(--accent-hover); border-radius: var(--r-sm); padding: 3px 8px;
|
||||||
|
font-size: 12.5px; font-weight: 500;
|
||||||
|
}
|
||||||
|
.dt-ms-chip .x { color: var(--accent); font-size: 13px; }
|
||||||
|
.dt-ms-placeholder { color: var(--ink-tertiary); font-size: 13px; padding: 2px 4px; }
|
||||||
|
|
||||||
|
/* Checkbox / radio */
|
||||||
|
.dt-check { display: flex; align-items: center; gap: 9px; margin: 8px 0; font-size: 13.5px; color: var(--ink); }
|
||||||
|
.dt-check .box {
|
||||||
|
width: 18px; height: 18px; border-radius: 5px; border: 1px solid var(--border-strong);
|
||||||
|
background: var(--surface); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0;
|
||||||
|
}
|
||||||
|
.dt-check.on .box { background: var(--ink); border-color: var(--ink); color: var(--bg); }
|
||||||
|
.dt-check.on .box .dt-mi { font-family: "Material Symbols Outlined"; font-size: 14px; }
|
||||||
|
.dt-radio-row { display: flex; gap: 18px; flex-wrap: wrap; margin: 8px 0; }
|
||||||
|
.dt-radio { display: inline-flex; align-items: center; gap: 7px; font-size: 13.5px; }
|
||||||
|
.dt-radio .dot { width: 16px; height: 16px; border-radius: 50%; border: 1px solid var(--border-strong); display: inline-block; flex-shrink: 0; }
|
||||||
|
.dt-radio.on .dot { border: 5px solid var(--ink); }
|
||||||
|
|
||||||
|
/* Slider */
|
||||||
|
.dt-slider { margin: 14px 0 6px; }
|
||||||
|
.dt-slider .track { position: relative; height: 4px; background: var(--border-strong); border-radius: 2px; }
|
||||||
|
.dt-slider .fill { position: absolute; left: 0; top: 0; height: 4px; background: var(--ink); border-radius: 2px; }
|
||||||
|
.dt-slider .knob { position: absolute; top: 50%; width: 16px; height: 16px; border-radius: 50%; background: var(--ink); transform: translate(-50%, -50%); }
|
||||||
|
.dt-slider .val { font-family: var(--font-mono); font-size: 12px; color: var(--ink-secondary); margin-top: 8px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Layout helpers
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-row { display: flex; gap: 16px; }
|
||||||
|
.dt-row > * { flex: 1; min-width: 0; }
|
||||||
|
.dt-cols-2 { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
|
||||||
|
.dt-cols-3 { display: grid; grid-template-columns: repeat(3, 1fr); gap: 16px; }
|
||||||
|
.dt-divider { border: none; border-top: 1px solid var(--border); margin: 22px 0; }
|
||||||
|
.dt-caption { font-size: 12.5px; color: var(--ink-tertiary); line-height: 1.5; }
|
||||||
|
.dt-spacer { height: 12px; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
DataFrame / preview table
|
||||||
|
=========================================================================== */
|
||||||
|
/* Rounded frame around a preview table. Header and body cells are
   white-space: nowrap, so a wide table scrolls horizontally instead of being
   clipped; the non-visible overflow still clips the corners to the radius. */
.dt-table-wrap { border: 1px solid var(--border); border-radius: var(--r-md); overflow-x: auto; overflow-y: hidden; margin: 8px 0; }
|
||||||
|
table.dt-table { width: 100%; border-collapse: collapse; font-size: 13px; }
|
||||||
|
table.dt-table th {
|
||||||
|
background: var(--surface-hover); color: var(--ink-secondary); font-weight: 500;
|
||||||
|
text-align: left; padding: 8px 12px; border-bottom: 1px solid var(--border);
|
||||||
|
font-size: 12px; text-transform: none; white-space: nowrap;
|
||||||
|
}
|
||||||
|
table.dt-table td {
|
||||||
|
padding: 7px 12px; border-bottom: 1px solid var(--border);
|
||||||
|
font-family: var(--font-mono); font-size: 12.5px; color: var(--ink); font-feature-settings: "ss02"; white-space: nowrap;
|
||||||
|
}
|
||||||
|
table.dt-table tr:last-child td { border-bottom: none; }
|
||||||
|
table.dt-table tr:nth-child(even) td { background: #fcfbf8; }
|
||||||
|
/* Index gutter cell. The extra `tr` raises specificity to (0,2,3) so this rule
   ties the zebra-stripe rule (tr:nth-child(even) td) and wins on source order;
   without it, even rows lost the gutter background. */
table.dt-table tr td.idx { color: var(--ink-tertiary); background: var(--surface-hover); }
|
||||||
|
/* Flagged (warning) cell. The table-scoped selector is required: the bare class
   (0,1,0) is outranked by `table.dt-table td { color: var(--ink) }` (0,1,2),
   so on its own the colour never applies inside a .dt-table. */
.dt-cell-flag, table.dt-table td.dt-cell-flag { color: var(--warn); }
|
||||||
|
/* Removed / unmatched cell. The table-scoped selector is required: the bare
   class (0,1,0) is outranked by `table.dt-table td { color: var(--ink) }`
   (0,1,2), so on its own only the line-through applied inside a .dt-table. */
.dt-cell-del, table.dt-table td.dt-cell-del { color: var(--danger); text-decoration: line-through; }
|
||||||
|
/* Added / OK cell. The table-scoped selector is required: the bare class
   (0,1,0) is outranked by `table.dt-table td { color: var(--ink) }` (0,1,2),
   so on its own the colour never applies inside a .dt-table. */
.dt-cell-add, table.dt-table td.dt-cell-add { color: var(--success); }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Stats overview (home) — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-stats { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin: 8px 0 20px; }
|
||||||
|
.dt-stat { background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); padding: 16px 18px; box-shadow: 0 1px 2px rgba(28,25,23,0.03); }
|
||||||
|
.dt-stat-label { font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); font-weight: 500; margin-bottom: 6px; line-height: 1.4; }
|
||||||
|
.dt-stat-value { font-size: 28px; font-weight: 600; letter-spacing: -0.03em; line-height: 1; color: var(--ink); display: flex; align-items: baseline; gap: 6px; }
|
||||||
|
.dt-stat-unit { font-size: 12px; font-weight: 400; color: var(--ink-tertiary); letter-spacing: 0; }
|
||||||
|
.dt-stat.is-warn .dt-stat-value { color: var(--warn); }
|
||||||
|
.dt-stat.is-info .dt-stat-value { color: var(--info); }
|
||||||
|
.dt-stat.is-success .dt-stat-value { color: var(--success); }
|
||||||
|
@media (max-width: 900px) { .dt-stats { grid-template-columns: repeat(2, 1fr); } }
|
||||||
|
|
||||||
|
/* Metric (st.metric) */
|
||||||
|
.dt-metrics { display: flex; gap: 28px; flex-wrap: wrap; margin: 6px 0 14px; }
|
||||||
|
.dt-metric .label { font-size: 12.5px; color: var(--ink-tertiary); margin-bottom: 4px; }
|
||||||
|
.dt-metric .value { font-size: 26px; font-weight: 600; letter-spacing: -0.03em; color: var(--ink); line-height: 1; }
|
||||||
|
.dt-metric .delta { font-size: 12.5px; margin-top: 3px; }
|
||||||
|
.dt-metric .delta.up { color: var(--success); }
|
||||||
|
.dt-metric .delta.down { color: var(--danger); }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Files card (home) — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-files-section-head { display: flex; align-items: baseline; justify-content: space-between; margin: 4px 0 10px; gap: 12px; }
|
||||||
|
.dt-files-section-head h2 { margin: 0; }
|
||||||
|
.dt-section-meta { font-size: 12.5px; color: var(--ink-tertiary); }
|
||||||
|
.dt-file-row { display: flex; align-items: center; gap: 12px; }
|
||||||
|
.dt-file-icon-chip { width: 28px; height: 28px; border-radius: var(--r-sm); background: var(--accent-fill); color: var(--accent); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.dt-file-icon-chip svg { width: 14px; height: 14px; stroke-width: 1.8; }
|
||||||
|
.dt-file-name { font-family: var(--font-mono); font-size: 13px; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-size { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); font-feature-settings: "ss02"; }
|
||||||
|
.dt-file-add {
|
||||||
|
display: flex; align-items: center; justify-content: center; gap: 8px;
|
||||||
|
width: 100%; padding: 12px 16px; background: var(--surface-hover);
|
||||||
|
border: none; border-top: 1px dashed var(--border-strong);
|
||||||
|
border-radius: 0 0 var(--r-lg) var(--r-lg); cursor: pointer;
|
||||||
|
font-size: 13px; font-weight: 500; color: var(--ink-secondary); margin-top: 14px;
|
||||||
|
}
|
||||||
|
.dt-file-add:hover { background: var(--accent-fill); color: var(--accent); }
|
||||||
|
.dt-file-add svg { width: 14px; height: 14px; stroke-width: 2; }
|
||||||
|
|
||||||
|
/* ===========================================================================
|
||||||
|
Findings panel — copied from _legacy.py
|
||||||
|
=========================================================================== */
|
||||||
|
.dt-finding-group-head {
|
||||||
|
display: flex; align-items: center; gap: 12px; padding: 16px 22px;
|
||||||
|
border-bottom: 1px solid var(--border); background: var(--surface-hover);
|
||||||
|
margin: -16px -16px 1.2rem; border-radius: var(--r-lg) var(--r-lg) 0 0;
|
||||||
|
cursor: pointer; user-select: none;
|
||||||
|
}
|
||||||
|
.dt-finding-group-chevron { color: var(--ink-tertiary); font-family: "Material Symbols Outlined"; font-size: 20px; line-height: 1; flex-shrink: 0; }
|
||||||
|
.dt-severity-dot { width: 8px; height: 8px; border-radius: 50%; flex-shrink: 0; display: inline-block; }
|
||||||
|
.dt-severity-dot.warn { background: var(--warn); }
|
||||||
|
.dt-severity-dot.info { background: var(--info); }
|
||||||
|
.dt-severity-dot.error { background: var(--danger); }
|
||||||
|
.dt-severity-dot.success { background: var(--success); }
|
||||||
|
.dt-group-filename { font-family: var(--font-mono); font-size: 13.5px; font-weight: 500; color: var(--ink); font-feature-settings: "ss02"; }
|
||||||
|
.dt-group-counts { margin-left: auto; display: flex; align-items: center; gap: 8px; }
|
||||||
|
.dt-count-pill { display: inline-flex; align-items: center; padding: 3px 9px; border-radius: 999px; font-size: 11.5px; font-weight: 500; line-height: 1.4; white-space: nowrap; }
|
||||||
|
.dt-count-pill.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-count-pill.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-count-pill.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-count-pill.success { background: var(--success-fill); color: var(--success); }
|
||||||
|
.dt-finding-row { display: flex; align-items: flex-start; gap: 12px; padding: 12px 0; border-top: 1px solid var(--border); }
|
||||||
|
/* Drop the separator above the first finding row. `:first-of-type` counts
   siblings by tag name, so on its own it only matches when the row is the
   first <div> in its container; the adjacent-sibling selector also covers a
   row that directly follows the (also <div>) group head. */
.dt-finding-row:first-of-type,
.dt-finding-group-head + .dt-finding-row { border-top: none; }
|
||||||
|
.dt-finding-icon { width: 24px; height: 24px; border-radius: var(--r-sm); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.dt-finding-icon.warn { background: var(--warn-fill); color: var(--warn); }
|
||||||
|
.dt-finding-icon.info { background: var(--info-fill); color: var(--info); }
|
||||||
|
.dt-finding-icon.error { background: var(--danger-fill); color: var(--danger); }
|
||||||
|
.dt-finding-icon .dt-mi { font-family: "Material Symbols Outlined"; font-size: 16px; line-height: 1; }
|
||||||
|
.dt-finding-body { flex: 1; min-width: 0; }
|
||||||
|
.dt-finding-title { font-size: 14px; color: var(--ink); margin: 0 0 2px; line-height: 1.4; letter-spacing: -0.005em; }
|
||||||
|
.dt-finding-title strong { font-weight: 500; }
|
||||||
|
.dt-finding-meta { font-family: var(--font-mono); font-size: 12px; color: var(--ink-tertiary); line-height: 1.4; margin: 0; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
/* Match-group review card (dedup) */
|
||||||
|
.dt-match-card { background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); box-shadow: 0 1px 2px rgba(28,25,23,0.03); margin: 12px 0; overflow: hidden; }
|
||||||
|
.dt-match-head { background: var(--surface-hover); border-bottom: 1px solid var(--border); padding: 12px 16px; display: flex; align-items: center; gap: 12px; }
|
||||||
|
.dt-match-head .title { font-weight: 500; font-size: 14px; }
|
||||||
|
.dt-match-head .conf { margin-left: auto; }
|
||||||
|
.dt-match-body { padding: 14px 16px; }
|
||||||
|
.dt-keep-row { background: var(--success-fill); }
|
||||||
|
.dt-keep-tag { display: inline-flex; align-items: center; gap: 4px; background: var(--success-fill); color: var(--success); border-radius: 999px; padding: 2px 8px; font-size: 11px; font-weight: 500; }
|
||||||
|
|
||||||
|
/* Progress bar */
|
||||||
|
.dt-progress { height: 6px; background: var(--border); border-radius: 3px; overflow: hidden; margin: 10px 0; }
|
||||||
|
.dt-progress .bar { height: 100%; background: var(--ink); border-radius: 3px; }
|
||||||
|
|
||||||
|
/* Tabs */
|
||||||
|
.dt-tabs { display: flex; gap: 18px; border-bottom: 1px solid var(--border); margin: 10px 0 16px; }
|
||||||
|
.dt-tab { font-size: 13.5px; color: var(--ink-secondary); padding: 8px 2px; border-bottom: 2px solid transparent; cursor: pointer; }
|
||||||
|
.dt-tab.is-active { color: var(--ink); font-weight: 500; border-bottom-color: var(--accent); }
|
||||||
|
|
||||||
|
/* Code block */
|
||||||
|
.dt-code { background: var(--surface-hover); border: 1px solid var(--border); border-radius: var(--r-md); padding: 12px 14px; font-family: var(--font-mono); font-size: 12.5px; color: var(--ink); white-space: pre; overflow-x: auto; font-feature-settings: "ss02"; }
|
||||||
|
|
||||||
|
@media (max-width: 1100px) {
|
||||||
|
.dt-footer { left: 0; }
|
||||||
|
.dt-sidebar { display: none; }
|
||||||
|
.dt-main { padding: 28px 24px 96px; }
|
||||||
|
}
|
||||||
164
layout-review/home.html
Normal file
164
layout-review/home.html
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>Layout review — File Analysis (Home)</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
</head>
|
||||||
|
<body data-page="home">
|
||||||
|
<div class="dt-app">
|
||||||
|
<aside class="dt-sidebar" id="dt-sidebar"></aside>
|
||||||
|
<main class="dt-main">
|
||||||
|
<div class="dt-review-banner">
|
||||||
|
<span class="dt-mi">visibility</span>
|
||||||
|
<span>Static layout preview of the <strong>Home / File Analysis</strong> page, shown with three imported files in the post-analysis state. <a href="index.html">All pages →</a></span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-main-inner">
|
||||||
|
|
||||||
|
<!-- Page header: brand block + privacy pill -->
|
||||||
|
<header class="dt-page-header">
|
||||||
|
<div class="dt-page-brand">
|
||||||
|
<div class="dt-page-brand-row">
|
||||||
|
<div class="dt-page-brand-mark">D</div>
|
||||||
|
<div class="dt-page-brand-words">
|
||||||
|
<span class="dt-page-eyebrow">UNALOGIX</span>
|
||||||
|
<h1 class="dt-page-wordmark">DataTools</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-page-subtitle">Clean. Normalize. Transform.</p>
|
||||||
|
</div>
|
||||||
|
<span class="dt-privacy-pill">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor">
|
||||||
|
<rect x="4" y="11" width="16" height="10" rx="2"/>
|
||||||
|
<path d="M8 11V7a4 4 0 018 0v4"/>
|
||||||
|
</svg>
|
||||||
|
Runs 100% locally
|
||||||
|
</span>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<!-- Files section head -->
|
||||||
|
<div class="dt-files-section-head">
|
||||||
|
<h2>Files</h2>
|
||||||
|
<span class="dt-section-meta">3 files · 4.7 MB total</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Files card -->
|
||||||
|
<div class="dt-card" style="padding-bottom:0">
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">customers_export.csv</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">2.1 MB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">q3_transactions.xlsx</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">1.8 MB</span>
|
||||||
|
</div>
|
||||||
|
<div class="dt-file-row" style="padding:6px 0">
|
||||||
|
<button class="dt-btn dt-btn-tertiary" title="Remove">✕</button>
|
||||||
|
<span class="dt-file-icon-chip"><svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M14 2H6a2 2 0 00-2 2v16a2 2 0 002 2h12a2 2 0 002-2V8z"/><path d="M14 2v6h6"/></svg></span>
|
||||||
|
<span class="dt-file-name">vendor_list.csv</span>
|
||||||
|
<span class="dt-file-size" style="margin-left:auto">0.8 MB</span>
|
||||||
|
</div>
|
||||||
|
<button class="dt-file-add" style="margin-left:-16px;margin-right:-16px;width:calc(100% + 32px)">
|
||||||
|
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor"><path d="M12 5v14M5 12h14"/></svg> Add more files
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Action bar -->
|
||||||
|
<div class="dt-btn-row" style="margin-top:16px;max-width:340px">
|
||||||
|
<button class="dt-btn dt-btn-primary">Run analysis</button>
|
||||||
|
<button class="dt-btn">Clear results</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<hr class="dt-divider">
|
||||||
|
|
||||||
|
<!-- Stats overview -->
|
||||||
|
<div class="dt-stats">
|
||||||
|
<div class="dt-stat">
|
||||||
|
<div class="dt-stat-label">Files analyzed</div>
|
||||||
|
<div class="dt-stat-value">3</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat">
|
||||||
|
<div class="dt-stat-label">Total findings</div>
|
||||||
|
<div class="dt-stat-value">14</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat is-warn">
|
||||||
|
<div class="dt-stat-label">Warnings</div>
|
||||||
|
<div class="dt-stat-value">9 <span class="dt-stat-unit">to review</span></div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-stat is-info">
|
||||||
|
<div class="dt-stat-label">Info</div>
|
||||||
|
<div class="dt-stat-value">5 <span class="dt-stat-unit">suggestions</span></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #1 -->
|
||||||
|
<div class="dt-card">
|
||||||
|
<div class="dt-finding-group-head">
|
||||||
|
<span class="dt-finding-group-chevron" style="transform:rotate(90deg)">chevron_right</span>
|
||||||
|
<span class="dt-severity-dot warn"></span>
|
||||||
|
<span class="dt-group-filename">customers_export.csv</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill warn">6 warnings</span>
|
||||||
|
<span class="dt-count-pill info">2 info</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="dt-finding-row">
|
||||||
|
<span class="dt-finding-icon warn"><span class="dt-mi">priority_high</span></span>
|
||||||
|
<div class="dt-finding-body">
|
||||||
|
<p class="dt-finding-title"><strong>312 duplicate rows</strong> across exact + near matches</p>
|
||||||
|
<p class="dt-finding-meta">column: email · Find Duplicates →</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-finding-row">
|
||||||
|
<span class="dt-finding-icon warn"><span class="dt-mi">format_color_text</span></span>
|
||||||
|
<div class="dt-finding-body">
|
||||||
|
<p class="dt-finding-title"><strong>1,204 cells</strong> with leading / trailing whitespace</p>
|
||||||
|
<p class="dt-finding-meta">columns: name, city · Clean Text →</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="dt-finding-row">
|
||||||
|
<span class="dt-finding-icon info"><span class="dt-mi">event</span></span>
|
||||||
|
<div class="dt-finding-body">
|
||||||
|
<p class="dt-finding-title">Mixed date formats in <strong>signup_date</strong></p>
|
||||||
|
<p class="dt-finding-meta">3 formats detected · Standardize Formats →</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #2 (collapsed) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:16px">
|
||||||
|
<div class="dt-finding-group-head" style="margin-bottom:-16px;border-radius:var(--r-lg);border-bottom:none">
|
||||||
|
<span class="dt-finding-group-chevron">chevron_right</span>
|
||||||
|
<span class="dt-severity-dot warn"></span>
|
||||||
|
<span class="dt-group-filename">q3_transactions.xlsx</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill warn">3 warnings</span>
|
||||||
|
<span class="dt-count-pill info">3 info</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Per-file findings panel #3 (clean) -->
|
||||||
|
<div class="dt-card" style="padding-bottom:16px">
|
||||||
|
<div class="dt-finding-group-head" style="margin-bottom:-16px;border-radius:var(--r-lg);border-bottom:none">
|
||||||
|
<span class="dt-severity-dot success"></span>
|
||||||
|
<span class="dt-group-filename">vendor_list.csv</span>
|
||||||
|
<div class="dt-group-counts">
|
||||||
|
<span class="dt-count-pill success">no issues</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</div>
|
||||||
|
<footer class="dt-footer" id="dt-footer"></footer>
|
||||||
|
<script src="shell.js"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
71
layout-review/index.html
Normal file
71
layout-review/index.html
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
<!doctype html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="utf-8">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
|
<title>DataTools — Layout Review</title>
|
||||||
|
<link rel="stylesheet" href="app.css">
|
||||||
|
<style>
|
||||||
|
.lr-wrap { max-width: 960px; margin: 0 auto; padding: 48px 32px 80px; }
|
||||||
|
.lr-grid { display: grid; grid-template-columns: repeat(2, 1fr); gap: 14px; margin-top: 18px; }
|
||||||
|
.lr-card { display: flex; align-items: center; gap: 14px; background: var(--surface); border: 1px solid var(--border); border-radius: var(--r-lg); padding: 16px 18px; box-shadow: 0 1px 2px rgba(28,25,23,0.03); text-decoration: none; transition: border-color .12s ease, box-shadow .12s ease; }
|
||||||
|
.lr-card:hover { border-color: var(--border-strong); box-shadow: 0 2px 8px rgba(28,25,23,0.06); text-decoration: none; }
|
||||||
|
.lr-ico { width: 40px; height: 40px; border-radius: var(--r-md); background: var(--accent-fill); color: var(--accent); display: inline-flex; align-items: center; justify-content: center; flex-shrink: 0; }
|
||||||
|
.lr-ico .dt-mi { font-family: "Material Symbols Outlined"; font-size: 22px; }
|
||||||
|
.lr-body { min-width: 0; }
|
||||||
|
.lr-name { font-size: 15px; font-weight: 600; color: var(--ink); letter-spacing: -0.01em; display:flex; align-items:center; gap:8px; }
|
||||||
|
.lr-desc { font-size: 12.5px; color: var(--ink-secondary); margin-top: 2px; line-height: 1.45; }
|
||||||
|
.lr-sec { font-size: 11.5px; text-transform: uppercase; letter-spacing: 0.08em; color: var(--ink-tertiary); font-weight: 600; margin: 26px 0 2px; }
|
||||||
|
.lr-soon { font-size: 9px; font-weight: 600; letter-spacing: .06em; text-transform: uppercase; color: var(--ink-tertiary); border: 1px solid var(--border-strong); border-radius: 999px; padding: 1px 6px; }
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div class="lr-wrap">
|
||||||
|
<header class="dt-page-header">
|
||||||
|
<div class="dt-page-brand">
|
||||||
|
<div class="dt-page-brand-row">
|
||||||
|
<div class="dt-page-brand-mark">D</div>
|
||||||
|
<div class="dt-page-brand-words">
|
||||||
|
<span class="dt-page-eyebrow">UNALOGIX · LAYOUT REVIEW</span>
|
||||||
|
<h1 class="dt-page-wordmark">DataTools</h1>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<p class="dt-page-subtitle">Static HTML reproductions of every tool page, built from the live app's design tokens for human review of layouts.</p>
|
||||||
|
</div>
|
||||||
|
</header>
|
||||||
|
|
||||||
|
<div class="dt-alert info">
|
||||||
|
<span class="dt-mi">info</span>
|
||||||
|
<span>These are faithful static mockups — not the running Streamlit app. Colors, type scale, spacing, and components are copied verbatim from <code>theme.py</code> and <code>components/_legacy.py</code>. Each page is shown in a representative <strong>populated</strong> state so the layout can be reviewed end-to-end. Fonts load from Google Fonts (needs network); the chrome (sidebar + footer) is shared across every page.</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Analysis</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="home.html"><span class="lr-ico"><span class="dt-mi">insert_chart_outlined</span></span><span class="lr-body"><span class="lr-name">File Analysis (Home)</span><span class="lr-desc">Import files, run the analyzer, browse per-file findings.</span></span></a>
|
||||||
|
<a class="lr-card" href="11_reconciler.html"><span class="lr-ico"><span class="dt-mi">compare_arrows</span></span><span class="lr-body"><span class="lr-name">Reconcile Two Files</span><span class="lr-desc">Compare two lists of transactions and flag what doesn't match.</span></span></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Data Cleaners</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="04_missing_handler.html"><span class="lr-ico"><span class="dt-mi">help_outline</span></span><span class="lr-body"><span class="lr-name">Fix Missing Values</span><span class="lr-desc">Find blank cells (even hidden ones) and fill them in or remove them.</span></span></a>
|
||||||
|
<a class="lr-card" href="06_outlier_detector.html"><span class="lr-ico"><span class="dt-mi">insights</span></span><span class="lr-body"><span class="lr-name">Find Unusual Values <span class="lr-soon">Soon</span></span><span class="lr-desc">Spot values that look wrong — too high, too low, or rule-breaking.</span></span></a>
|
||||||
|
<a class="lr-card" href="02_text_cleaner.html"><span class="lr-ico"><span class="dt-mi">text_format</span></span><span class="lr-body"><span class="lr-name">Clean Text</span><span class="lr-desc">Trim extra spaces and strip out odd characters.</span></span></a>
|
||||||
|
<a class="lr-card" href="03_format_standardizer.html"><span class="lr-ico"><span class="dt-mi">format_list_bulleted</span></span><span class="lr-body"><span class="lr-name">Standardize Formats</span><span class="lr-desc">Make dates, phones, currency, and names look the same throughout.</span></span></a>
|
||||||
|
<a class="lr-card" href="01_deduplicator.html"><span class="lr-ico"><span class="dt-mi">search</span></span><span class="lr-body"><span class="lr-name">Find Duplicates</span><span class="lr-desc">Find rows that repeat, then keep one and remove the extras.</span></span></a>
|
||||||
|
<a class="lr-card" href="08_validator_reporter.html"><span class="lr-ico"><span class="dt-mi">check_circle</span></span><span class="lr-body"><span class="lr-name">Quality Check <span class="lr-soon">Soon</span></span><span class="lr-desc">Check your file against rules and export a PDF or Excel report.</span></span></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Transformations</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="05_column_mapper.html"><span class="lr-ico"><span class="dt-mi">view_column</span></span><span class="lr-body"><span class="lr-name">Map Columns</span><span class="lr-desc">Rename columns, reorder, and set each one as text, number, or date.</span></span></a>
|
||||||
|
<a class="lr-card" href="07_multi_file_merger.html"><span class="lr-ico"><span class="dt-mi">account_tree</span></span><span class="lr-body"><span class="lr-name">Combine Files <span class="lr-soon">Soon</span></span><span class="lr-desc">Combine several CSV or Excel files into one — even if columns differ.</span></span></a>
|
||||||
|
<a class="lr-card" href="10_pdf_extractor.html"><span class="lr-ico"><span class="dt-mi">picture_as_pdf</span></span><span class="lr-body"><span class="lr-name">PDF to CSV</span><span class="lr-desc">Pull transactions out of bank-statement PDFs into a clean CSV file.</span></span></a>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="lr-sec">Automations</div>
|
||||||
|
<div class="lr-grid">
|
||||||
|
<a class="lr-card" href="09_pipeline_runner.html"><span class="lr-ico"><span class="dt-mi">auto_awesome</span></span><span class="lr-body"><span class="lr-name">Automated Workflows</span><span class="lr-desc">Run several tools in a row — save the steps and reuse them anytime.</span></span></a>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
74
layout-review/shell.js
Normal file
74
layout-review/shell.js
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
/* Shared app chrome (sidebar nav + sticky footer) for the static layout
|
||||||
|
review pages. Mirrors src/gui/app.py:_build_navigation() ordering and
|
||||||
|
src/gui/components/_legacy.py:render_sticky_footer(). Each page sets
|
||||||
|
<body data-page="<tool_id|home>"> to mark the active nav item. */
|
||||||
|
(function () {
  // Sections + entries in the same order app.py registers them.
  // `soon: true` marks an entry that gets the "Soon" tag and the
  // `is-soon` modifier class in the sidebar.
  var NAV = [
    { label: "Analysis", items: [
      { id: "home", icon: "insert_chart_outlined", name: "File Analysis", href: "home.html" },
      { id: "11_reconciler", icon: "compare_arrows", name: "Reconcile Two Files", href: "11_reconciler.html" },
    ]},
    { label: "Data Cleaners", items: [
      { id: "04_missing_handler", icon: "help_outline", name: "Fix Missing Values", href: "04_missing_handler.html" },
      { id: "06_outlier_detector", icon: "insights", name: "Find Unusual Values", href: "06_outlier_detector.html", soon: true },
      { id: "02_text_cleaner", icon: "text_format", name: "Clean Text", href: "02_text_cleaner.html" },
      { id: "03_format_standardizer", icon: "format_list_bulleted", name: "Standardize Formats", href: "03_format_standardizer.html" },
      { id: "01_deduplicator", icon: "search", name: "Find Duplicates", href: "01_deduplicator.html" },
      { id: "08_validator_reporter", icon: "check_circle", name: "Quality Check", href: "08_validator_reporter.html", soon: true },
    ]},
    { label: "Transformations", items: [
      { id: "05_column_mapper", icon: "view_column", name: "Map Columns", href: "05_column_mapper.html" },
      { id: "07_multi_file_merger", icon: "account_tree", name: "Combine Files", href: "07_multi_file_merger.html", soon: true },
      { id: "10_pdf_extractor", icon: "picture_as_pdf", name: "PDF to CSV", href: "10_pdf_extractor.html" },
    ]},
    { label: "Automations", items: [
      { id: "09_pipeline_runner", icon: "auto_awesome", name: "Automated Workflows", href: "09_pipeline_runner.html" },
    ]},
  ];

  // Id of the current page, taken from <body data-page="...">; an empty
  // string (no nav entry highlighted) when the attribute is absent.
  var active = document.body.getAttribute("data-page") || "";

  // ---- Sidebar -----------------------------------------------------------
  // Only rendered when the page provides an element with id "dt-sidebar".
  var sb = document.getElementById("dt-sidebar");
  if (sb) {
    // Brand block (links back to the page index) followed by the nav list.
    var html = '' +
      '<a class="dt-brand" href="index.html" style="text-decoration:none">' +
        '<span class="dt-brand-mark">D</span>' +
        '<span class="dt-brand-name">' +
          '<span class="dt-brand-eyebrow">UNALOGIX</span>' +
          '<span class="dt-brand-word">DataTools</span>' +
        '</span>' +
      '</a>' +
      '<nav class="dt-nav">';
    NAV.forEach(function (sec) {
      // The indicator glyph is identical for every section: this script
      // contains no collapse state, so there is nothing to branch on.
      var indicator = "−";
      html += '<div class="dt-nav-section">' + sec.label +
        '<span class="dt-nav-indicator">' + indicator + '</span></div>';
      sec.items.forEach(function (it) {
        // Modifier classes: highlight the current page, dim "soon" entries.
        var cls = "dt-nav-link" + (it.id === active ? " is-active" : "") + (it.soon ? " is-soon" : "");
        html += '<a class="' + cls + '" href="' + it.href + '">' +
          '<span class="dt-mi">' + it.icon + '</span>' +
          '<span>' + it.name + '</span>' +
          (it.soon ? '<span class="dt-nav-soon-tag">Soon</span>' : '') +
          '</a>';
      });
    });
    // Static sidebar footer: language selector (inert) + license badge.
    html += '</nav>' +
      '<div class="dt-sidebar-foot">' +
        '<div><div class="dt-sidebar-label">Language</div>' +
        '<div class="dt-select" style="pointer-events:none">English</div></div>' +
        '<div class="dt-license-badge">Core · 1,820 days left</div>' +
      '</div>';
    // All interpolated values come from the hard-coded NAV table above,
    // so assigning via innerHTML does not carry external input.
    sb.innerHTML = html;
  }

  // ---- Sticky footer -----------------------------------------------------
  // Only rendered when the page provides an element with id "dt-footer".
  var ft = document.getElementById("dt-footer");
  if (ft) {
    ft.innerHTML =
      '<a class="dt-footer-btn" href="index.html"><span class="dt-mi">close</span>Close</a>' +
      '<button class="dt-footer-btn" type="button"><span class="dt-mi">help_outline</span>Help</button>' +
      '<span style="margin-left:auto;font-size:11.5px;color:var(--ink-tertiary)">DataTools · local-first · static layout preview</span>';
  }
})();
|
||||||
@@ -24,6 +24,7 @@ import io
|
|||||||
import os
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -286,10 +287,96 @@ def page_has_extractable_text(page: Page, min_words: int = 5) -> bool:
|
|||||||
return len(page.words) >= min_words
|
return len(page.words) >= min_words
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Tesseract discovery
|
||||||
|
#
|
||||||
|
# Discovery order (shared with the PyInstaller build agent):
|
||||||
|
#
|
||||||
|
# 1. ``DATATOOLS_TESSERACT_PATH`` env var override (user escape hatch)
|
||||||
|
# 2. Bundled binary inside the PyInstaller frozen bundle
|
||||||
|
# (``sys._MEIPASS / "tesseract" / "tesseract[.exe]"``) — only
|
||||||
|
# present when running from a frozen DataTools installer/portable
|
||||||
|
# build. No-op in a dev checkout.
|
||||||
|
# 3. System PATH lookup (``pytesseract.get_tesseract_version()``)
|
||||||
|
# 4. Windows well-known install dirs (legacy fallback for users who
|
||||||
|
# installed UB Mannheim's Tesseract-OCR themselves)
|
||||||
|
#
|
||||||
|
# When a bundled tessdata directory exists, ``TESSDATA_PREFIX`` is set
|
||||||
|
# so Tesseract picks up the bundled ``eng.traineddata``. User-supplied
|
||||||
|
# ``TESSDATA_PREFIX`` is never clobbered.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _bundled_tesseract_path() -> Path | None:
|
||||||
|
"""Return the path to the bundled Tesseract binary, or ``None``.
|
||||||
|
|
||||||
|
Only returns a non-None value when running from a PyInstaller
|
||||||
|
frozen bundle (``sys.frozen`` is truthy AND ``sys._MEIPASS`` is
|
||||||
|
set). The bundled binary lives at
|
||||||
|
``<_MEIPASS>/tesseract/tesseract`` (``.exe`` on Windows) per the
|
||||||
|
contract shared with the build agent.
|
||||||
|
|
||||||
|
The file is NOT required to exist for this helper to return a
|
||||||
|
path — callers ``stat`` / ``.exists()``-check it themselves so a
|
||||||
|
missing bundled binary is treated the same as "not bundled" and
|
||||||
|
discovery falls through to PATH lookup.
|
||||||
|
"""
|
||||||
|
if not getattr(sys, "frozen", False):
|
||||||
|
return None
|
||||||
|
meipass = getattr(sys, "_MEIPASS", None)
|
||||||
|
if not meipass:
|
||||||
|
return None
|
||||||
|
binary = "tesseract.exe" if platform.system() == "Windows" else "tesseract"
|
||||||
|
return Path(meipass) / "tesseract" / binary
|
||||||
|
|
||||||
|
|
||||||
|
def _bundled_tessdata_dir() -> Path | None:
|
||||||
|
"""Return the bundled ``tessdata`` directory or ``None``.
|
||||||
|
|
||||||
|
Same frozen-state gating as ``_bundled_tesseract_path``; the dir
|
||||||
|
lives at ``<_MEIPASS>/tesseract/tessdata``. Callers use this to
|
||||||
|
point Tesseract at the bundled language data via the
|
||||||
|
``TESSDATA_PREFIX`` env var.
|
||||||
|
"""
|
||||||
|
if not getattr(sys, "frozen", False):
|
||||||
|
return None
|
||||||
|
meipass = getattr(sys, "_MEIPASS", None)
|
||||||
|
if not meipass:
|
||||||
|
return None
|
||||||
|
return Path(meipass) / "tesseract" / "tessdata"
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_bundled_tessdata_prefix() -> None:
    """Export ``TESSDATA_PREFIX`` for the bundled language data.

    When ``TESSDATA_PREFIX`` is unset (or empty) and the bundled
    ``tessdata`` directory exists on disk, the variable is set to that
    directory's path in ``os.environ``.

    Nothing is changed when:

    * ``TESSDATA_PREFIX`` already holds a non-empty value — an
      explicit user choice always wins;
    * ``_bundled_tessdata_dir()`` returns ``None`` (not a frozen
      bundle); or
    * the bundled directory is missing from the filesystem.
    """
    if not os.environ.get("TESSDATA_PREFIX"):
        bundled_dir = _bundled_tessdata_dir()
        if bundled_dir is None or not bundled_dir.exists():
            return
        os.environ["TESSDATA_PREFIX"] = str(bundled_dir)
|
||||||
|
|
||||||
|
|
||||||
def _autodetect_tesseract_path() -> str | None:
|
def _autodetect_tesseract_path() -> str | None:
|
||||||
"""Probe well-known install locations for ``tesseract.exe`` on
|
"""Locate a Tesseract binary outside the user's ``PATH``.
|
||||||
Windows. No-op on macOS/Linux where Tesseract is on PATH via
|
|
||||||
the system package manager."""
|
Tries the bundled binary first (only present in PyInstaller
|
||||||
|
frozen builds) so installer/portable users get a working OCR
|
||||||
|
without touching their system. Falls back to the legacy Windows
|
||||||
|
well-known install locations so users who installed UB
|
||||||
|
Mannheim's Tesseract-OCR themselves keep working too.
|
||||||
|
"""
|
||||||
|
bundled = _bundled_tesseract_path()
|
||||||
|
if bundled is not None and bundled.exists():
|
||||||
|
return str(bundled)
|
||||||
|
|
||||||
if platform.system() != "Windows":
|
if platform.system() != "Windows":
|
||||||
return None
|
return None
|
||||||
candidates = [
|
candidates = [
|
||||||
@@ -309,17 +396,30 @@ def ocr_available() -> tuple[bool, str]:
|
|||||||
"""Return ``(available, reason)`` — is OCR usable right now?
|
"""Return ``(available, reason)`` — is OCR usable right now?
|
||||||
|
|
||||||
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
|
Discovery order: ``DATATOOLS_TESSERACT_PATH`` env var override,
|
||||||
then PATH-based lookup, then well-known Windows install
|
then the bundled binary (only present in a frozen build), then
|
||||||
locations.
|
PATH-based lookup, then well-known Windows install locations.
|
||||||
|
See the module-level discovery block for the full contract.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import pytesseract # noqa: PLC0415
|
import pytesseract # noqa: PLC0415
|
||||||
except ImportError:
|
except ImportError:
|
||||||
return False, "pytesseract is not installed."
|
return False, "pytesseract is not installed."
|
||||||
|
|
||||||
|
# Point Tesseract at the bundled tessdata (if any) BEFORE the
|
||||||
|
# first ``get_tesseract_version`` call so the bundled language
|
||||||
|
# data is loaded even when the user happens to also have a
|
||||||
|
# system Tesseract that we'd otherwise fall through to.
|
||||||
|
_apply_bundled_tessdata_prefix()
|
||||||
|
|
||||||
override = os.environ.get("DATATOOLS_TESSERACT_PATH")
|
override = os.environ.get("DATATOOLS_TESSERACT_PATH")
|
||||||
if override:
|
if override:
|
||||||
pytesseract.pytesseract.tesseract_cmd = override
|
pytesseract.pytesseract.tesseract_cmd = override
|
||||||
|
else:
|
||||||
|
# Probe the bundled binary BEFORE PATH so frozen builds use
|
||||||
|
# their own Tesseract instead of any incidental system one.
|
||||||
|
bundled = _bundled_tesseract_path()
|
||||||
|
if bundled is not None and bundled.exists():
|
||||||
|
pytesseract.pytesseract.tesseract_cmd = str(bundled)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pytesseract.get_tesseract_version()
|
pytesseract.get_tesseract_version()
|
||||||
|
|||||||
@@ -12,9 +12,16 @@ a fixture statement at test time.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from src import pdf_extract
|
||||||
from src.pdf_extract import (
|
from src.pdf_extract import (
|
||||||
Page,
|
Page,
|
||||||
WordBox,
|
WordBox,
|
||||||
|
_apply_bundled_tessdata_prefix,
|
||||||
|
_bundled_tessdata_dir,
|
||||||
|
_bundled_tesseract_path,
|
||||||
_extract_account_number,
|
_extract_account_number,
|
||||||
_extract_statement_period,
|
_extract_statement_period,
|
||||||
_find_amount_tokens,
|
_find_amount_tokens,
|
||||||
@@ -456,3 +463,131 @@ class TestYearFromFilename:
|
|||||||
def test_empty_filename(self):
|
def test_empty_filename(self):
|
||||||
assert year_from_filename("") is None
|
assert year_from_filename("") is None
|
||||||
assert year_from_filename(None) is None
|
assert year_from_filename(None) is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestBundledTesseractPath:
    """Pin the runtime side of the frozen-bundle Tesseract layout.

    The build packages the binary at
    ``<sys._MEIPASS>/tesseract/tesseract[.exe]`` and the language data at
    ``<sys._MEIPASS>/tesseract/tessdata``; the discovery helpers must
    resolve exactly those locations, and only when frozen.
    """

    @staticmethod
    def _freeze(monkeypatch, root):
        # Make the interpreter look like a PyInstaller bundle rooted at
        # ``root``.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(root), raising=False)

    def test_returns_none_when_not_frozen(self, monkeypatch):
        # A regular dev interpreter carries neither PyInstaller attribute.
        for dotted in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(dotted, raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_returns_none_when_frozen_but_no_meipass(self, monkeypatch):
        # Not expected from a real PyInstaller bundle, but the helpers
        # must degrade to ``None`` instead of failing on a missing root.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.delattr("sys._MEIPASS", raising=False)
        assert _bundled_tesseract_path() is None
        assert _bundled_tessdata_dir() is None

    def test_frozen_linux_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._freeze(monkeypatch, tmp_path)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        bundle_dir = tmp_path / "tesseract"
        assert _bundled_tesseract_path() == bundle_dir / "tesseract"

    def test_frozen_macos_returns_unsuffixed_binary(
        self, monkeypatch, tmp_path,
    ):
        self._freeze(monkeypatch, tmp_path)
        monkeypatch.setattr("platform.system", lambda: "Darwin")
        bundle_dir = tmp_path / "tesseract"
        assert _bundled_tesseract_path() == bundle_dir / "tesseract"

    def test_frozen_windows_returns_exe_binary(self, monkeypatch, tmp_path):
        self._freeze(monkeypatch, tmp_path)
        monkeypatch.setattr("platform.system", lambda: "Windows")
        bundle_dir = tmp_path / "tesseract"
        assert _bundled_tesseract_path() == bundle_dir / "tesseract.exe"

    def test_frozen_returns_tessdata_dir(self, monkeypatch, tmp_path):
        self._freeze(monkeypatch, tmp_path)
        bundle_dir = tmp_path / "tesseract"
        assert _bundled_tessdata_dir() == bundle_dir / "tessdata"
|
||||||
|
|
||||||
|
|
||||||
|
class TestAutodetectFavoursBundled:
    """``_autodetect_tesseract_path`` must prefer the bundled binary.

    A frozen build should not depend on the user's system Tesseract, even
    on Windows where well-known install locations are also probed.
    """

    def test_bundled_wins_over_windows_program_files(
        self, monkeypatch, tmp_path,
    ):
        # Lay out a fake frozen Windows bundle with an (empty) binary.
        meipass_root = tmp_path / "bundle"
        tesseract_dir = meipass_root / "tesseract"
        tesseract_dir.mkdir(parents=True)
        fake_exe = tesseract_dir / "tesseract.exe"
        fake_exe.write_bytes(b"")

        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr(
            "sys._MEIPASS", str(meipass_root), raising=False,
        )
        monkeypatch.setattr("platform.system", lambda: "Windows")

        # Every path now "exists", including the Program Files
        # candidates; the bundled binary must still be returned because
        # it is probed first.
        monkeypatch.setattr(Path, "exists", lambda self: True)

        detected = pdf_extract._autodetect_tesseract_path()
        assert detected == str(fake_exe)

    def test_falls_through_when_not_frozen(self, monkeypatch):
        # Non-frozen, non-Windows dev environment: nothing to offer.
        for dotted in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(dotted, raising=False)
        monkeypatch.setattr("platform.system", lambda: "Linux")
        assert pdf_extract._autodetect_tesseract_path() is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestApplyBundledTessdataPrefix:
    """``TESSDATA_PREFIX`` handling for bundled language data.

    The bundled directory should be exported when available, but never at
    the expense of a value the user already set.
    """

    @staticmethod
    def _freeze(monkeypatch, root):
        # Make the interpreter look like a PyInstaller bundle rooted at
        # ``root``.
        monkeypatch.setattr("sys.frozen", True, raising=False)
        monkeypatch.setattr("sys._MEIPASS", str(root), raising=False)

    def test_no_op_when_not_frozen(self, monkeypatch):
        monkeypatch.delenv("TESSDATA_PREFIX", raising=False)
        for dotted in ("sys.frozen", "sys._MEIPASS"):
            monkeypatch.delattr(dotted, raising=False)

        _apply_bundled_tessdata_prefix()

        assert "TESSDATA_PREFIX" not in os.environ

    def test_sets_when_frozen_and_bundled_exists(
        self, monkeypatch, tmp_path,
    ):
        bundled_data = tmp_path / "tesseract" / "tessdata"
        bundled_data.mkdir(parents=True)
        self._freeze(monkeypatch, tmp_path)
        monkeypatch.delenv("TESSDATA_PREFIX", raising=False)

        _apply_bundled_tessdata_prefix()

        assert os.environ.get("TESSDATA_PREFIX") == str(bundled_data)

    def test_does_not_clobber_user_override(self, monkeypatch, tmp_path):
        bundled_data = tmp_path / "tesseract" / "tessdata"
        bundled_data.mkdir(parents=True)
        self._freeze(monkeypatch, tmp_path)
        monkeypatch.setenv("TESSDATA_PREFIX", "/user/picked/this")

        _apply_bundled_tessdata_prefix()

        assert os.environ["TESSDATA_PREFIX"] == "/user/picked/this"

    def test_no_op_when_bundled_dir_missing(self, monkeypatch, tmp_path):
        # Frozen bundle whose build did not ship a tessdata directory.
        self._freeze(monkeypatch, tmp_path)
        monkeypatch.delenv("TESSDATA_PREFIX", raising=False)

        _apply_bundled_tessdata_prefix()

        assert "TESSDATA_PREFIX" not in os.environ
|
||||||
|
|||||||
Reference in New Issue
Block a user