feat(analyze): add mixed_line_endings + near_duplicate_rows detectors
Two more detectors close the analyzer gap list:

- mixed_line_endings (warn, tool=02): scans raw bytes for combinations of CRLF / LF / bare CR. This is a common disaster pattern after multi-source concatenation (Windows + macOS + Linux exports stitched together). The detector operates on raw bytes only — DataFrame-mode analyze() skips it because raw bytes aren't available. _load_for_analysis now returns the raw bytes alongside the DataFrame and repair result so the detector has them.

- near_duplicate_rows (info, tool=01): a cheap dedup signal — strip and lowercase every string column, then count df.duplicated(). This catches the most common case (same customer entered twice with subtle formatting differences) without paying for fuzzy matching. Anything more sophisticated stays in tool 01.

Six new tests cover both detectors plus the DataFrame-mode skip path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -173,6 +173,57 @@ class TestLeadingZeroIds:
|
||||
assert "leading_zero_ids" not in _ids(findings)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Near-duplicate rows
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNearDuplicates:
    """Tests for the ``near_duplicate_rows`` finding reported by ``analyze``."""

    def test_finds_case_insensitive_dupes(self):
        """Rows differing only in letter case / trailing space are flagged."""
        names = ["Alice", "alice ", "Bob"]
        emails = ["a@b.com", "A@B.COM", "bob@b.com"]
        frame = pd.DataFrame({"name": names, "email": emails})

        reported = _ids(analyze(frame))

        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        """Fully distinct rows must not produce the finding."""
        names = ["Alice", "Bob", "Carol"]
        emails = ["a@x.com", "b@x.com", "c@x.com"]
        frame = pd.DataFrame({"name": names, "email": emails})

        reported = _ids(analyze(frame))

        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        """A one-row frame has nothing to duplicate, so no finding."""
        frame = pd.DataFrame({"x": ["only"]})

        reported = _ids(analyze(frame))

        assert "near_duplicate_rows" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mixed line endings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMixedLineEndings:
    """Tests for the ``mixed_line_endings`` finding reported by ``analyze``."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        """A file mixing CRLF and LF terminators is flagged."""
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")

        reported = _ids(analyze(csv_path))

        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        """A file using LF terminators throughout is not flagged."""
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")

        reported = _ids(analyze(csv_path))

        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        """Passing a DataFrame directly never yields the finding."""
        # No raw bytes are available in this mode, so line endings
        # cannot be inspected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})

        reported = _ids(analyze(frame))

        assert "mixed_line_endings" not in reported
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Findings synthesized from RepairResult
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user