test(scripts): one-shot 1.25GB stress harness for the gate pipeline
Generates a synthetic messy CSV at the target size, then runs every pipeline stage end-to-end (detect_encoding, repair_bytes, analyze, auto_fix on sample + full file) capturing wall-clock and peak RSS at each stage. Not part of the automated suite — invoke directly via ``python scripts/stress_1_25gb.py``. ``--keep`` to preserve the file between runs, ``--target-gb`` to tune the size. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
289
scripts/stress_1_25gb.py
Normal file
289
scripts/stress_1_25gb.py
Normal file
@@ -0,0 +1,289 @@
|
|||||||
|
"""One-time 1.25 GB stress test for the analyzer + gate pipeline.
|
||||||
|
|
||||||
|
Not part of the automated suite. Generates a synthetic messy CSV at the
|
||||||
|
target size, runs every pipeline stage end-to-end, captures wall-clock
|
||||||
|
+ peak RSS at each stage, and prints a summary.
|
||||||
|
|
||||||
|
Run:
|
||||||
|
    python scripts/stress_1_25gb.py [--keep] [--skip-generate] [--target-gb 1.25]
|
||||||
|
|
||||||
|
The generated file lives in $TMPDIR (default /tmp). With --keep the file
is not deleted after the run; pass --skip-generate on a later run to reuse
it instead of regenerating.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import gc
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
import resource
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import tracemalloc
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
# Project root
|
||||||
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Generator — designed so every analyzer detector finds something.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# 10 columns × ~180 bytes/row ≈ 1.25 GB at 7M rows. Tune via --target-gb.
|
||||||
|
# Header row: "Name" carries a trailing space, "email" is wrapped in straight
# quotes, and the last column name is wrapped in curly quotes.
HEADER = "id,Name ,\"email\",phone,city,notes,status,amount,date,“description”\n"

# Names: one value padded with whitespace, one containing a curly apostrophe.
NAMES = [
    "Alice Smith",
    " Bob Jones ",
    "Carol O’Connor",
    "David Lee",
    "Eva Garcia",
    "Frank Miller",
    "Grace Kim",
    "Henry Davis",
    "Iris Wong",
]

# E-mail addresses in inconsistent letter case.
EMAILS = [
    "alice@example.com",
    "BOB@Example.COM",
    "carol@example.com",
    "DAVID@example.com",
    "eva@Example.com",
    "Frank@example.COM",
]

# City names, several with non-ASCII letters.
CITIES = [
    "New York",
    "Köln",
    "São Paulo",
    "Zürich",
    "Düsseldorf",
    "Madrid",
    "Tokyo",
]

# Free-text notes mixing plain ASCII with an em dash, an ellipsis, curly
# quotes and prime marks.
NOTES = [
    "VIP — contact ASAP…",
    "regular customer",
    "follow up next quarter",
    "needs “signed” agreement",
    "5′ 11″ height noted",
    "nice client",
]

# Status values, including placeholder-style entries and an empty string.
STATUSES = [
    "active",
    "N/A",
    "TBD",
    "",
    "active",
    "unknown",
    "active",
]

# Amounts with and without a currency symbol / thousands separator.
AMOUNTS = [
    "$1,500.00",
    "100.00",
    "250.50",
    "$50.00",
    "1,234.56",
    "75.00",
]

# ISO-formatted dates.
DATES = [
    "2024-01-15",
    "2024-02-20",
    "2024-03-12",
    "2024-04-08",
    "2024-05-30",
]

# Short descriptions; the first one uses curly single quotes.
DESCRIPTIONS = [
    "first ‘contact’ made",
    "long-time client",
    "referred by partner",
    "premium support tier",
]
|
||||||
|
|
||||||
|
|
||||||
|
def gen_chunk(start_id: int, n_rows: int) -> str:
    """Return ``n_rows`` CSV rows as one string, ids starting at ``start_id``.

    Every column value is chosen by cycling through the matching module-level
    pool (``pool[rid % len(pool)]``), so the same arguments always yield the
    same text.
    """

    def _pick(pool: list, rid: int) -> str:
        # Deterministic round-robin selection keyed on the row id.
        return pool[rid % len(pool)]

    def _row(rid: int) -> str:
        name = _pick(NAMES, rid)
        email = _pick(EMAILS, rid)
        city = _pick(CITIES, rid)
        note = _pick(NOTES, rid)
        status = _pick(STATUSES, rid)
        amount = _pick(AMOUNTS, rid)
        date = _pick(DATES, rid)
        desc = _pick(DESCRIPTIONS, rid)
        # Only the amount is quoted: it is the one field whose values can
        # contain a comma, which would otherwise split the column.
        return (
            f"{rid},{name},{email},555-0{rid % 10000:04d},{city},"
            f"{note},{status},\"{amount}\",{date},{desc}\n"
        )

    return "".join(_row(rid) for rid in range(start_id, start_id + n_rows))
|
||||||
|
|
||||||
|
|
||||||
|
def generate_file(path: Path, target_gb: float) -> None:
    """Stream the synthetic CSV to ``path`` in chunks until the target size is hit.

    Args:
        path: Destination file; it is created, or truncated if it exists.
        target_gb: Target size, converted to bytes as ``target_gb * 1024**3``.
            The loop stops after the first chunk that reaches the target, so
            the file can overshoot by up to one chunk.

    Progress is printed every 1,000,000 rows and once more at the end.
    """
    target_bytes = int(target_gb * 1024**3)
    rows_per_chunk = 50_000
    total_rows = 0
    print(f" writing → {path} (target {target_gb} GB)")
    t0 = time.perf_counter()
    # Binary mode lets each chunk be encoded exactly once: the same bytes are
    # written and counted. Text mode with newline="" produced identical output
    # but needed a second encode pass per chunk purely to measure its size.
    with path.open("wb") as fh:
        header_bytes = HEADER.encode("utf-8")
        fh.write(header_bytes)
        written = len(header_bytes)
        while written < target_bytes:
            chunk_bytes = gen_chunk(total_rows, rows_per_chunk).encode("utf-8")
            fh.write(chunk_bytes)
            written += len(chunk_bytes)
            total_rows += rows_per_chunk
            if total_rows % 1_000_000 == 0:
                print(
                    f" …{total_rows:,} rows, "
                    f"{written / 1024**3:.2f} GB in "
                    f"{time.perf_counter() - t0:.1f}s"
                )
    print(
        f" done: {total_rows:,} rows, "
        f"{written / 1024**3:.2f} GB in "
        f"{time.perf_counter() - t0:.1f}s"
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Stage runner — captures wall-clock + peak RSS delta per stage.
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _rss_mb() -> float:
|
||||||
|
"""Current process RSS in MB."""
|
||||||
|
return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
def stage(name: str, results: list) -> Iterator[None]:
    """Time the wrapped block and append one measurement record to ``results``.

    A garbage collection is forced first, then ``_rss_mb()`` and the clock are
    sampled before and after the block. Any ``Exception`` raised inside the
    block is captured and NOT re-raised: its ``repr`` is stored under
    ``"error"`` (empty string on success) so the caller can carry on with the
    next stage. A one-line report is printed as the block exits.

    The record keys are ``stage``, ``wall_s``, ``rss_before_mb``,
    ``rss_after_mb``, ``rss_delta_mb`` and ``error``.
    """
    gc.collect()
    mem_start = _rss_mb()
    started = time.perf_counter()
    failure: Exception | None = None
    try:
        yield
    except Exception as exc:
        # Deliberately swallowed; the failure is reported via the record.
        failure = exc
    finally:
        elapsed = time.perf_counter() - started
        mem_end = _rss_mb()
        mem_delta = mem_end - mem_start
        record = {
            "stage": name,
            "wall_s": elapsed,
            "rss_before_mb": mem_start,
            "rss_after_mb": mem_end,
            "rss_delta_mb": mem_delta,
            "error": repr(failure) if failure else "",
        }
        results.append(record)
        error_note = f" ERROR {failure!r}" if failure else ""
        report = (
            f" {name:<42} {elapsed:>8.2f}s "
            f"RSS {mem_start:>7.0f} → {mem_end:>7.0f} MB "
            f"(Δ {mem_delta:+.0f})"
        )
        print(report + error_note)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _run_pipeline(path: Path, results: list) -> None:
    """Run every pipeline stage against ``path``, one ``stage`` record each.

    ``stage`` swallows exceptions, so every value that a later step reads is
    given a fallback *before* its stage starts. A failing stage then leaves
    its output bound and the remaining stages can still run and be reported.
    """
    from src.core.io import detect_encoding, detect_delimiter, repair_bytes
    from src.core.analyze import analyze, _load_for_analysis
    from src.core.normalize import auto_fix

    enc_used = "utf-8"
    with stage("detect_encoding", results):
        enc_used = detect_encoding(path)
    print(f" detected encoding: {enc_used!r}")

    delim_used = ","
    with stage("detect_delimiter", results):
        delim_used = detect_delimiter(path, enc_used)
    print(f" detected delimiter: {delim_used!r}")

    raw_bytes = None
    with stage("path.read_bytes (1.25GB → memory)", results):
        raw_bytes = path.read_bytes()

    repair = None
    if raw_bytes is not None:
        with stage("repair_bytes (full file)", results):
            repair = repair_bytes(raw_bytes, encoding=enc_used, delimiter=delim_used)
    if repair:
        print(f" repair actions: {repair.summary()}")
        print(f" repaired size: {len(repair.repaired_bytes) / 1024**3:.2f} GB")

    # Drop the raw buffer before the next stage so it is not held in memory
    # through the rest of the run.
    del raw_bytes
    gc.collect()

    findings = []
    with stage("analyze (default sample_rows=1000)", results):
        findings = analyze(path, sample_rows=1000)
    print(f" findings: {sorted({f.id for f in findings})}")

    df_sample = None
    with stage("_load_for_analysis (1000 rows)", results):
        df_sample, _, _ = _load_for_analysis(path, sample_rows=1000)
    if df_sample is not None:
        print(f" sample df: {df_sample.shape}")

    if df_sample is not None and findings:
        sample_result = None
        with stage("auto_fix on 1000-row sample", results):
            sample_result = auto_fix(df_sample, findings)
        if sample_result is not None:
            print(f" fixes applied: {len(sample_result.applied)}, cells changed: "
                  f"{sum(a.cells_changed for a in sample_result.applied)}")

    # Now the heavy run: parse the FULL file, then auto_fix.
    print()
    print(" --- full-file pass (no sample cap) ---")
    full_df = None
    with stage("full read_csv via repaired bytes (full file)", results):
        if repair is not None:
            import pandas as pd
            full_df = pd.read_csv(
                io.BytesIO(repair.repaired_bytes),
                encoding="utf-8", delimiter=delim_used,
                dtype=str, keep_default_na=False, on_bad_lines="warn",
            )
    if full_df is not None:
        print(f" full df: {full_df.shape}, "
              f"approx mem: {full_df.memory_usage(deep=True).sum() / 1024**3:.2f} GB")

    if full_df is not None:
        # Re-run analysis on the full DataFrame to get findings against
        # actual content, not the 1000-row sample.
        full_findings = []
        with stage("analyze full df (no sample cap)", results):
            full_findings = analyze(full_df, sample_rows=10**9)
        print(f" full findings: {sorted({f.id for f in full_findings})}")

        full_result = None
        with stage("auto_fix full df (~7M rows)", results):
            full_result = auto_fix(full_df, full_findings)
        if full_result is not None:
            print(f" full fixes applied: {len(full_result.applied)}, "
                  f"cells changed: {sum(a.cells_changed for a in full_result.applied)}")
            print(f" cleaned_bytes: {len(full_result.cleaned_bytes) / 1024**3:.2f} GB")


def main() -> int:
    """Generate (or reuse) the fixture, run the pipeline, and print a summary.

    Command-line flags:
        --target-gb      Size of the generated fixture (default 1.25).
        --keep           Leave the fixture on disk when the run ends.
        --skip-generate  Reuse the fixture if it already exists at the
                         target path; it is generated when missing.

    Returns:
        0 once the summary has been printed. Failures inside individual
        stages are recorded and flagged in the summary rather than raised.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--target-gb", type=float, default=1.25)
    ap.add_argument("--keep", action="store_true",
                    help="Don't delete the test file at the end.")
    ap.add_argument("--skip-generate", action="store_true",
                    help="Reuse an existing file at the target path.")
    args = ap.parse_args()

    tmp = Path(os.environ.get("TMPDIR", "/tmp"))
    path = tmp / f"stress_{args.target_gb}gb.csv"

    print(f"=== Stress test: {args.target_gb} GB ===")
    print(f"Path: {path}")
    print()

    results: list[dict] = []
    try:
        if not args.skip_generate or not path.exists():
            print("[1/2] Generating fixture")
            gen_t0 = time.perf_counter()
            generate_file(path, args.target_gb)
            gen_wall = time.perf_counter() - gen_t0
            print(f"Generation total: {gen_wall:.1f}s")
            print()
        else:
            print(f"[1/2] Reusing existing fixture ({path.stat().st_size / 1024**3:.2f} GB)")
            print()

        actual_gb = path.stat().st_size / 1024**3
        print(f"[2/2] Pipeline run on {actual_gb:.2f} GB file")
        print(f" {'stage':<42} {'wall':>9} {'RSS':>23}")

        _run_pipeline(path, results)
    finally:
        # Cleanup runs even when something outside a stage raises (failed
        # import, interrupted generation, Ctrl-C), so a multi-GB fixture is
        # never left behind unless --keep was requested.
        if not args.keep and path.exists():
            path.unlink()
            print()
            print(f"Removed {path}")

    # Summary table
    print()
    print("=== Summary ===")
    print(f"{'stage':<46} {'wall (s)':>10} {'RSS Δ (MB)':>12}")
    for r in results:
        suffix = " ⚠" if r["error"] else ""
        print(f"{r['stage']:<46} {r['wall_s']:>10.2f} {r['rss_delta_mb']:>+12.0f}{suffix}")
    print(f"\nPeak RSS: {max((r['rss_after_mb'] for r in results), default=0.0):.0f} MB")
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|
||||||
Reference in New Issue
Block a user