Tools shipped this batch (4 → 6 of 9 Ready):
04 Missing Value Handler src/core/missing.py + cli_missing.py + GUI
05 Column Mapper src/core/column_mapper.py + cli_column_map.py + GUI
09 Pipeline Runner src/core/pipeline.py + cli_pipeline.py + GUI
with soft tool-dependency graph (recommended,
not enforced) and JSON save/load for repeatable
weekly cleanups.
Format Standardizer reworked for 1 GB international files:
• Vectorised dispatch + LRU cache over phone/date/currency/boolean/email
• Per-row country / address columns drive parsing
• Audit cap (default 10 k rows, ~50 MB RAM)
• standardize_file(): chunked streaming entry point (~165 k rows/sec)
• currency_decimal="auto" for EU comma-decimal locales
• R$ / kr / zł multi-char currency prefixes
• cli_format.py with auto-stream above 100 MB inputs
Encoding detection arbiter + language-aware probe:
Closes the last 4 xfails (cp1250 / mac_iceland / shift_jis_2004 / lying-BOM)
via tied-confidence arbiter + Cyrillic / EE-Latin coverage probes.
Distribution-readiness assets:
• streamlit_app.py — Streamlit Community Cloud entry shim
• src/gui/app_demo.py — single-page demo, ?p=<persona> routing,
100-row cap + watermark, free-vs-paid boundary enforced at surface
• samples/demo/ — 3 niche datasets + pre-tuned pipeline JSONs
• landing/ — 4 static HTML pages (apex chooser + 3 niche),
shared CSS, deploy.py URL-substitution script,
auto-generated robots.txt + sitemap.xml + 404.html + favicon
• docs/PLAN.md, DEMO-PLAN.md, DEPLOYMENT.md, POST-LAUNCH.md, NEXT-STEPS.md
— full strategy + measurement + deployment + master checklist
Test counts:
before: 1,520 passed · 4 skipped · 17 xfailed
after: 1,729 passed · 0 skipped · 0 xfailed
Tier-1 corpora added:
• missing-corpus 3 use cases + 16 edge cases
• column-mapper-corpus 3 use cases + 5 edge cases
• format-cleaner intl 20-row 13-country stress fixture
Engine hardening flushed out by the corpora:
• interpolate guards against object-dtype columns
• mean/median skip all-NaN columns (silences numpy warning)
• fillna runs under future.no_silent_downcasting (silences pandas warning)
• mojibake test no longer skips when ftfy installed (monkeypatch path)
• drop-row threshold semantics: strict-greater (consistent across rows / cols)
• currency_decimal validator allow-set updated for "auto"
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
464 lines
19 KiB
Python
464 lines
19 KiB
Python
"""Acceptance corpus for the Missing Value Handler.
|
|
|
|
Loads every fixture in ``test-cases/missing-corpus/test_data/`` and
|
|
asserts the documented behaviour. The fixtures are split into:
|
|
|
|
* ``uc##`` — three target-client use cases (Shopify operator,
|
|
marketing analyst, consultant intake).
|
|
* ``ec##`` — edge cases the engine must handle without surprise:
|
|
all-NaN columns, zeros that aren't missing, Excel errors, unicode
|
|
whitespace, mixed dtypes, padding, single row/column, every default
|
|
sentinel, per-column constants, drop thresholds, leading-NaN ffill,
|
|
numeric-strategy fallback for non-numeric columns, headers-only,
|
|
idempotency.
|
|
|
|
Each test runs through the public API (``handle_missing``) so any
|
|
regression in the engine surfaces here. Fixture files double as living
|
|
documentation for what the tool is supposed to do.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import pytest
|
|
|
|
from src.core.missing import (
|
|
MissingOptions,
|
|
handle_missing,
|
|
is_missing_like,
|
|
profile_missing,
|
|
)
|
|
|
|
# Corpus root, resolved relative to this file: ``parents[1]`` is the directory
# one level above the directory that contains this module.
CORPUS = Path(__file__).resolve().parents[1].joinpath("test-cases", "missing-corpus")
# Directory holding the CSV fixtures loaded by the tests below.
TEST_DATA = CORPUS.joinpath("test_data")
|
|
|
|
|
|
def _read(name: str, *, dtype_str: bool = False) -> pd.DataFrame:
    """Load one corpus CSV from ``TEST_DATA``.

    Args:
        name: File name of the fixture inside ``TEST_DATA``.
        dtype_str: When false (the default) pandas infers dtypes, which is
            the most realistic intake path (Excel exports keep numeric
            columns numeric). When true, every column is read as ``str``
            with pandas' default NA parsing disabled, so sentinels stay
            visible in columns that would otherwise be coerced to float.

    Returns:
        The fixture as a DataFrame.
    """
    read_kwargs = {"dtype": str, "keep_default_na": False} if dtype_str else {}
    return pd.read_csv(TEST_DATA / name, **read_kwargs)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Use case 1 — Shopify operator: detect-only
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestUC01ShopifyExport:
    """SMB operator standardizes disguised nulls before reimporting."""

    def test_detect_only_replaces_sentinels(self):
        """The detect-only preset standardizes sentinels without filling or dropping."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        options = MissingOptions.from_preset("detect-only")
        result = handle_missing(frame, options)

        # Spot-check known sentinels from the fixture
        assert result.sentinels_standardized > 0
        assert result.cells_filled == 0
        assert result.rows_dropped == 0

        # Fields that contained 'N/A', '-', 'NULL', '(blank)', '#N/A',
        # 'n/a', '?', '(none)' should now be NaN.
        expected_nan_cells = (
            (1, "phone"),            # 'N/A'
            (2, "city"),             # '-'
            (3, "total_orders"),     # 'NULL'
            (5, "phone"),            # whitespace-only
            (5, "last_order_date"),  # '(blank)'
            (6, "last_order_date"),  # '#N/A'
            (7, "phone"),            # 'n/a'
            (8, "city"),             # '?'
            (9, "total_orders"),     # '(none)'
        )
        for row, col in expected_nan_cells:
            cell = result.handled_df.iloc[row][col]
            assert pd.isna(cell), f"Expected NaN at row {row} col {col}, got {cell!r}"

    def test_real_values_preserved(self):
        """Genuine values in the first fixture row pass through unchanged."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))

        # First row should be untouched.
        first_row = result.handled_df.iloc[0]
        assert first_row["first_name"] == "Alice"
        assert first_row["email"] == "alice@shop.com"
        assert first_row["lifetime_value"] == "1240.50"

    def test_audit_log_complete(self):
        """The change log has one 'standardize:' entry per sentinel replaced."""
        frame = _read("uc01_shopify_export.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions.from_preset("detect-only"))

        # One audit row per sentinel replacement.
        assert len(result.changes) == result.sentinels_standardized

        is_standardize = result.changes["action"].apply(
            lambda action: action.startswith("standardize:")
        )
        assert set(is_standardize) == {True}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Use case 2 — Marketing analyst: safe-fill
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestUC02MarketingAudience:
    """Marketer fills numeric columns with median, categorical with mode."""

    def test_safe_fill_clears_all_missing(self):
        """The safe-fill preset leaves no missing cell in the after-profile."""
        df = _read("uc02_marketing_audience.csv")
        opts = MissingOptions.from_preset("safe-fill")
        res = handle_missing(df, opts)
        # Every cell in scope should be filled.
        assert res.profile_after.cells_missing == 0
        assert res.cells_filled > 0

    def test_numeric_uses_median_categorical_uses_mode(self):
        """Strategy resolution: numeric columns get median, object columns get mode."""
        df = _read("uc02_marketing_audience.csv")
        opts = MissingOptions.from_preset("safe-fill")
        res = handle_missing(df, opts)
        # 'age' is numeric → median strategy
        assert res.strategy_per_column["age"] == "median"
        # 'segment' / 'region' / 'source' are object → mode fallback
        assert res.strategy_per_column["segment"] == "mode"
        assert res.strategy_per_column["region"] == "mode"

    def test_per_column_override(self):
        """A per-column constant strategy overrides the preset for that column."""
        df = _read("uc02_marketing_audience.csv")
        opts = MissingOptions.from_preset("safe-fill")
        opts.column_strategies = {"source": "constant"}
        opts.column_fill_values = {"source": "unknown"}
        res = handle_missing(df, opts)
        # Cells previously holding sentinels in 'source' should now equal "unknown".
        assert (res.handled_df["source"] == "unknown").sum() >= 3

    def test_consent_real_false_not_dropped(self):
        """Existing consent "true" values survive the safe-fill pass.

        The fixture is loaded with inferred dtypes, so pandas may parse a
        true/false column into Python booleans rather than leaving the text
        "true" in place. Comparing against the literal string alone would
        then match nothing and reduce the final assertion to ``0 >= 0``.
        Both sides are therefore normalised to lower-case text before
        counting, which covers the string and the boolean representation.
        """

        def count_true(series):
            # str(True) == "True" and "true" both normalise to "true";
            # NaN becomes "nan" and is never counted.
            return int((series.astype(str).str.lower() == "true").sum())

        # 'consent' column has empty cells but also explicit "true"; mode fill
        # must not silently change a real "true" to anything else.
        df = _read("uc02_marketing_audience.csv")
        res = handle_missing(df, MissingOptions.from_preset("safe-fill"))
        original_trues = count_true(df["consent"])
        result_trues = count_true(res.handled_df["consent"])
        # Filled rows can become "true" (mode) but should not lose existing trues.
        assert result_trues >= original_trues
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Use case 3 — Consultant intake: threshold drops + fill
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestUC03ConsultantIntake:
    """Drop sparse columns and rows, then fill the survivors."""

    def test_drop_col_removes_legacy_fields(self):
        """Columns that are entirely missing are reported as dropped."""
        frame = _read("uc03_consultant_intake.csv", dtype_str=True)
        # internal_id_legacy and beta_field are 100% missing — drop them.
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_col",
            col_drop_threshold=0.99,
        )
        dropped = handle_missing(frame, options).columns_dropped
        assert "internal_id_legacy" in dropped
        assert "beta_field" in dropped

    def test_drop_row_removes_mostly_empty_respondents(self):
        """Mostly-empty respondents are dropped while complete ones survive."""
        frame = _read("uc03_consultant_intake.csv", dtype_str=True)
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="drop_both",
            col_drop_threshold=0.99,  # drop the legacy / beta cols first
            row_drop_threshold=0.5,   # then drop rows with >50% missing
        )
        result = handle_missing(frame, options)

        # R-002, R-005, R-007, R-010 are mostly-empty respondents.
        assert result.rows_dropped >= 4

        # Non-empty respondents survive.
        kept_ids = set(result.handled_df["respondent_id"].tolist())
        expected_survivors = ("R-001", "R-003", "R-006", "R-008", "R-009", "R-012")
        for survivor in expected_survivors:
            assert survivor in kept_ids
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Edge cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestEC01AllNanColumn:
    """Edge case: a column in which every value is missing."""

    def test_fill_skips_all_nan_column(self):
        """Mean fill must leave an all-NaN column untouched."""
        frame = _read("ec01_all_nan_column.csv")
        result = handle_missing(frame, MissingOptions(strategy="mean"))
        # Mean of all-NaN is NaN — engine must NOT fabricate a value.
        still_missing = result.handled_df["deprecated_field"].isna()
        assert still_missing.all()
        assert result.cells_filled == 0

    def test_drop_col_catches_all_nan(self):
        """drop_col removes the all-NaN column and keeps a populated one."""
        frame = _read("ec01_all_nan_column.csv")
        options = MissingOptions(strategy="drop_col", col_drop_threshold=0.99)
        dropped = handle_missing(frame, options).columns_dropped
        assert "deprecated_field" in dropped
        assert "name" not in dropped
|
|
|
|
|
|
class TestEC02NoMissing:
    """Edge case: a file with nothing to clean."""

    def test_clean_file_is_noop(self):
        """safe-fill on a clean file records no changes and returns an equal frame."""
        frame = _read("ec02_no_missing.csv")
        options = MissingOptions.from_preset("safe-fill")
        result = handle_missing(frame, options)

        assert result.sentinels_standardized == 0
        assert result.cells_filled == 0
        assert result.rows_dropped == 0
        pd.testing.assert_frame_equal(result.handled_df, frame)
|
|
|
|
|
|
class TestEC03ZeroIsNotMissing:
    """Edge case: zeros are real data, not missing values."""

    def test_zero_preserved(self):
        """The number of zeros per column is unchanged and nothing is recorded."""
        frame = _read("ec03_zero_is_not_missing.csv")
        result = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        handled = result.handled_df

        # Original zeros remain zero.
        assert (handled["balance"] == 0).sum() == (frame["balance"] == 0).sum()
        assert (handled["count"] == 0).sum() == (frame["count"] == 0).sum()

        # No spurious changes recorded.
        assert result.cells_filled == 0
        assert result.sentinels_standardized == 0

    def test_is_missing_like_zero_predicate(self):
        """``is_missing_like`` rejects every zero-ish / false-ish value."""
        # Direct predicate check — zeros, false, "0" must all be non-missing.
        for real_value in (0, 0.0, False, "0", "0.00"):
            assert not is_missing_like(real_value)
|
|
|
|
|
|
class TestEC04ExcelErrors:
    """Edge case: Excel error literals count as sentinels."""

    def test_excel_error_sentinels_recognized(self):
        """All six Excel error cells in the fixture are standardized."""
        frame = _read("ec04_excel_errors.csv", dtype_str=True)
        options = MissingOptions(strategy="none")
        result = handle_missing(frame, options)
        # 6 error sentinels in the fixture: #N/A, #NULL!, #VALUE!, #N/A, #N/A, #NULL!
        expected_sentinels = 6
        assert result.sentinels_standardized == expected_sentinels
|
|
|
|
|
|
class TestEC05UnicodeWhitespace:
    """Edge case: non-ASCII whitespace-only cells."""

    def test_nbsp_and_ideographic_space_count_as_missing(self):
        """Whitespace-only notes become NaN; real notes are kept verbatim."""
        frame = _read("ec05_unicode_whitespace.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))
        notes = result.handled_df["note"]

        # rows 1, 2, 4 contain NBSP / tab / ideographic space respectively
        assert notes.isna().sum() == 3
        assert result.handled_df.iloc[0]["note"] == "hello"
        assert result.handled_df.iloc[3]["note"] == "real"
|
|
|
|
|
|
class TestEC06MixedDtypes:
    """Edge case: a column mixing text and numbers next to a numeric one."""

    def test_mixed_column_falls_back_to_mode(self):
        """median resolves to mode for the mixed column and stays median for floats."""
        # Read with native dtypes so 'real_num' stays numeric.
        frame = _read("ec06_mixed_dtypes.csv")
        options = MissingOptions(
            standardize_sentinels=True,
            strategy="median",
            categorical_strategy="mode",
        )
        resolved = handle_missing(frame, options).strategy_per_column

        # mixed_col holds 'N/A' / 'hello' alongside numbers → object dtype,
        # median falls back to mode.
        assert resolved["mixed_col"] == "mode"
        # real_num is float dtype → median runs.
        assert resolved["real_num"] == "median"
|
|
|
|
|
|
class TestEC07RealDataWithPadding:
    """Edge case: real values surrounded by padding whitespace."""

    def test_padded_real_data_not_treated_as_missing(self):
        """Padded real values are returned exactly as read."""
        frame = _read("ec07_real_data_with_padding.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))
        handled = result.handled_df

        # Only row 1 (name=" ") and row 2 (city=blank) should become NaN.
        # " Alice ", " Bob ", " SF" must remain.
        assert handled.iloc[0]["name"] == " Alice "
        assert handled.iloc[2]["name"] == " Bob "
        assert handled.iloc[3]["city"] == " SF"
|
|
|
|
|
|
class TestEC08SingleRow:
    """Edge case: a file with exactly one data row."""

    def test_single_row_handles_cleanly(self):
        """Both detection and safe-fill work on a one-row frame."""
        frame = _read("ec08_single_row.csv", dtype_str=True)

        # detect-only
        detected = handle_missing(frame, MissingOptions(strategy="none"))
        assert detected.sentinels_standardized == 2  # 'N/A' + ''

        # safe-fill on a one-row file: median/mode of a single value is itself.
        filled = handle_missing(frame, MissingOptions.from_preset("safe-fill"))
        assert filled.handled_df.iloc[0]["name"] == "Alice"
|
|
|
|
|
|
class TestEC09SingleColumn:
    """Edge case: a file with exactly one column."""

    def test_single_column_works(self):
        """Three sentinels are standardized, leaving three NaN cells."""
        frame = _read("ec09_single_column.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))

        # 'N/A', whitespace-only ' ', '-' = 3 sentinels
        expected = 3
        assert result.sentinels_standardized == expected
        assert result.handled_df["value"].isna().sum() == expected
|
|
|
|
|
|
class TestEC10AllSentinelVariants:
    """Edge case: one fixture row per default sentinel spelling."""

    def test_every_default_sentinel_recognized(self):
        """All 20 sentinel rows are standardized and the real value survives."""
        frame = _read("ec10_all_sentinel_variants.csv", dtype_str=True)
        result = handle_missing(frame, MissingOptions(strategy="none"))

        # 20 sentinels + 1 real value
        assert result.sentinels_standardized == 20

        # The 'real_value' row stays.
        real_rows = result.handled_df["sentinel_value"] == "real_value"
        assert real_rows.sum() == 1
|
|
|
|
|
|
class TestEC11ConstantPerColumn:
    """Edge case: constant strategy with a different fill value per column."""

    def test_per_column_fill_values(self):
        """Each column is filled with its own constant; real values are kept."""
        frame = _read("ec11_constant_per_column.csv", dtype_str=True)
        fill_values = {
            "country": "USA",
            "salary": "0",
            "department": "Unassigned",
        }
        options = MissingOptions(strategy="constant", column_fill_values=fill_values)
        handled = handle_missing(frame, options).handled_df

        # Fixture has 1 UK row + 2 USA rows + 2 blanks. Filling blanks with
        # "USA" yields 4 USA total; UK is preserved.
        country = handled["country"]
        assert (country == "USA").sum() == 4
        assert (country == "UK").sum() == 1
        assert (handled["department"] == "Unassigned").sum() >= 2
|
|
|
|
|
|
class TestEC12DropThresholdBoundary:
    """Edge case: row-drop threshold boundaries (strict-greater comparison)."""

    @staticmethod
    def _rows_dropped(threshold, *, scoped=True):
        """Run ``drop_row`` on the boundary fixture and return ``rows_dropped``.

        When ``scoped`` is true the run is limited to columns a–d; otherwise
        no ``columns`` argument is passed to ``MissingOptions`` at all.
        """
        frame = _read("ec12_drop_threshold_boundary.csv")
        option_kwargs = {"strategy": "drop_row", "row_drop_threshold": threshold}
        if scoped:
            # exclude id from the scope
            option_kwargs["columns"] = ["a", "b", "c", "d"]
        return handle_missing(frame, MissingOptions(**option_kwargs)).rows_dropped

    def test_threshold_one_never_drops(self):
        """A threshold of 1.0 can never be exceeded, so no row is dropped."""
        # threshold 1.0 + strict-greater = never drop.
        assert self._rows_dropped(1.0, scoped=False) == 0

    def test_threshold_just_under_one_drops_fully_missing(self):
        """A threshold of 0.99 drops only the fully-missing row."""
        # threshold 0.99: drop only fully-missing rows (frac > 0.99 → frac == 1.0).
        # Only row 3 (id=4, all four are NaN) qualifies.
        assert self._rows_dropped(0.99) == 1

    def test_threshold_half_drops_majority_missing(self):
        """A threshold of 0.5 drops rows with more than half their cells missing."""
        # Missing fractions across [a,b,c,d]:
        #   row 0: 0/4=0.0   keep
        #   row 1: 2/4=0.5   keep (strict >, not equal)
        #   row 2: 3/4=0.75  drop
        #   row 3: 4/4=1.0   drop
        #   row 4: 2/4=0.5   keep
        assert self._rows_dropped(0.5) == 2

    def test_threshold_zero_drops_any_missing(self):
        """A threshold of 0.0 drops every row with at least one missing cell."""
        # Every body row except row 0 has at least one missing.
        assert self._rows_dropped(0.0) == 4
|
|
|
|
|
|
class TestEC13FfillLeadingNan:
    """Edge case: forward fill with a leading run of NaN."""

    def test_leading_nan_run_survives_ffill(self):
        """Leading NaNs stay NaN; later gaps take the last seen value."""
        frame = _read("ec13_ffill_leading_nan.csv")
        result = handle_missing(frame, MissingOptions(strategy="ffill"))
        price = result.handled_df["price"]

        # First two rows (leading NaN) remain NaN — there's nothing to fill from.
        for position in (0, 1):
            assert pd.isna(price.iloc[position])

        # Mid-series gets filled forward.
        for position in (3, 4):
            assert price.iloc[position] == 100.0

        # Trailing NaN gets filled by the last seen value.
        assert price.iloc[6] == 150.0
|
|
|
|
|
|
class TestEC14InterpolateFallback:
    """Edge case: interpolate requested on non-numeric columns."""

    def test_interpolate_on_non_numeric_falls_back(self):
        """interpolate resolves to the categorical strategy for object columns."""
        frame = _read("ec14_interpolate_fallback.csv", dtype_str=True)
        options = MissingOptions(strategy="interpolate", categorical_strategy="mode")
        resolved = handle_missing(frame, options).strategy_per_column

        # All columns are object dtype here → fallback to mode.
        for column in ("category", "value"):
            assert resolved[column] == "mode"
|
|
|
|
|
|
class TestEC15HeadersOnly:
    """Edge case: a file with a header row and no body."""

    def test_empty_body_does_not_crash(self):
        """Every preset runs on an empty body and reports no work done."""
        frame = _read("ec15_headers_only.csv")
        presets = ("detect-only", "safe-fill", "drop-incomplete")

        # All operations must be no-ops on an empty body.
        for preset in presets:
            result = handle_missing(frame, MissingOptions.from_preset(preset))
            assert len(result.handled_df) == 0
            assert result.cells_filled == 0
            assert result.rows_dropped == 0
|
|
|
|
|
|
class TestEC16Idempotency:
    """Edge case: applying the same preset twice changes nothing the second time."""

    def test_safe_fill_is_idempotent(self):
        """A second safe-fill pass yields an equal frame and records no changes."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("safe-fill")
        first_pass = handle_missing(frame, options)
        second_pass = handle_missing(first_pass.handled_df, options)

        # Second pass should make no further changes.
        after_second = second_pass.handled_df.reset_index(drop=True)
        after_first = first_pass.handled_df.reset_index(drop=True)
        pd.testing.assert_frame_equal(after_second, after_first)
        assert second_pass.cells_filled == 0
        assert second_pass.sentinels_standardized == 0

    def test_detect_only_is_idempotent(self):
        """A second detect-only pass finds no sentinels left to standardize."""
        frame = _read("ec16_idempotent_apply.csv", dtype_str=True)
        options = MissingOptions.from_preset("detect-only")
        first_pass = handle_missing(frame, options)
        second_pass = handle_missing(first_pass.handled_df, options)
        assert second_pass.sentinels_standardized == 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Whole-corpus property tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# File names (not full paths) of every CSV fixture directly inside TEST_DATA,
# sorted so the parametrized test order is stable.
ALL_FIXTURES = sorted(csv_path.name for csv_path in TEST_DATA.glob("*.csv"))
|
|
|
|
|
|
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_handle_missing_does_not_mutate_input(fixture):
    """Every fixture must leave the input DataFrame untouched.

    The fixture is read as all-string with default NA parsing disabled, a
    deep copy is taken as a snapshot, ``handle_missing`` is run with the
    safe-fill preset, and the input frame is compared to the snapshot.

    A completely empty file is skipped. ``pd.read_csv`` raises
    ``EmptyDataError`` for such a file instead of returning a zero-column
    frame, so the exception is caught here; the zero-column check is kept
    as a fallback.
    """
    try:
        df = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        pytest.skip(f"{fixture}: completely empty file")
    if df.empty and len(df.columns) == 0:
        pytest.skip(f"{fixture}: completely empty file")
    snapshot = df.copy(deep=True)
    handle_missing(df, MissingOptions.from_preset("safe-fill"))
    pd.testing.assert_frame_equal(df, snapshot)
|
|
|
|
|
|
@pytest.mark.parametrize("fixture", ALL_FIXTURES)
def test_profile_runs_on_every_fixture(fixture):
    """``profile_missing`` must succeed on every corpus file.

    The reported row and cell totals are checked against the shape of the
    frame as read (all-string, default NA parsing disabled).
    """
    frame = pd.read_csv(TEST_DATA / fixture, dtype=str, keep_default_na=False)
    profile = profile_missing(frame, MissingOptions())
    row_count = len(frame)
    column_count = len(frame.columns)
    assert profile.rows_total == row_count
    assert profile.cells_total == row_count * column_count
|