feat(analyze): add mixed_line_endings + near_duplicate_rows detectors

Two more detectors close the analyzer gap list:

mixed_line_endings (warn, tool=02): scans raw bytes for combinations of
  CRLF / LF / bare CR. This is a common failure pattern after multi-source
  concatenation (Windows + macOS + Linux exports stitched together).
  Operates on raw
  bytes only — DataFrame-mode analyze() skips it because raw bytes
  aren't available. _load_for_analysis now returns the raw bytes
  alongside the DataFrame and repair result so the detector has them.

near_duplicate_rows (info, tool=01): cheap dedup signal — strip and
  lowercase every string column, then count df.duplicated(). Catches the
  most common case (same customer entered twice with subtle formatting
  differences) without paying for fuzzy matching. Anything more
  sophisticated stays in tool 01.

Six new tests cover both detectors plus the dataframe-mode skip path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-04-29 16:09:42 +00:00
parent 0671ef277e
commit 8dfc6ad8ae
2 changed files with 146 additions and 7 deletions

View File

@@ -173,6 +173,57 @@ class TestLeadingZeroIds:
assert "leading_zero_ids" not in _ids(findings)
# ---------------------------------------------------------------------------
# Near-duplicate rows
# ---------------------------------------------------------------------------
class TestNearDuplicates:
    """Checks when ``analyze`` reports the ``near_duplicate_rows`` finding."""

    def test_finds_case_insensitive_dupes(self):
        """Rows differing only in case / trailing space produce the finding."""
        # Rows 0 and 1 hold the same values apart from letter case and a
        # trailing space in "name"; row 2 is a distinct record.
        frame = pd.DataFrame(
            {
                "name": ["Alice", "alice ", "Bob"],
                "email": ["a@b.com", "A@B.COM", "bob@b.com"],
            }
        )
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Fully distinct rows must not produce the finding."""
        frame = pd.DataFrame(
            {
                "name": ["Alice", "Bob", "Carol"],
                "email": ["a@x.com", "b@x.com", "c@x.com"],
            }
        )
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame has nothing to duplicate, so no finding."""
        frame = pd.DataFrame({"x": ["only"]})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported
# ---------------------------------------------------------------------------
# Mixed line endings
# ---------------------------------------------------------------------------
class TestMixedLineEndings:
    """Checks when ``analyze`` reports the ``mixed_line_endings`` finding."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file combining CRLF and LF terminators produces the finding."""
        csv_path = tmp_path / "mixed.csv"
        # Header and last row end in CRLF; the middle row ends in a bare LF.
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file that uses LF on every line must not produce the finding."""
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """Passing a DataFrame (no file on disk) never yields the finding."""
        # No raw bytes -> mixed_line_endings cannot be detected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported
# ---------------------------------------------------------------------------
# Findings synthesized from RepairResult
# ---------------------------------------------------------------------------