def _detect_near_duplicates(df: pd.DataFrame) -> list[Finding]:
    """Detect rows that become identical once string cells are normalized.

    Cheap pass: every object/string column is cast to ``str``, stripped and
    lowercased, then ``DataFrame.duplicated(keep=False)`` marks every member
    of every duplicate group. This catches the most common dedup signal (the
    same customer entered twice with subtle case or padding differences)
    without paying the cost of fuzzy matching. Rows that were already exact
    duplicates are flagged as well. Anything more sophisticated belongs in
    tool 01.

    Args:
        df: Frame to scan. It is not modified; normalization happens on a
            copy.

    Returns:
        An empty list when *df* has fewer than two rows or no duplicate
        group. Otherwise a single ``near_duplicate_rows`` finding whose
        ``count`` is the total number of rows belonging to a duplicate group
        and whose ``samples`` hold up to five
        ``(row_position, column_name, value)`` tuples. ``row_position`` is
        the 0-based position of the row in *df*; for a default
        ``RangeIndex`` that is also its index label.
    """
    if len(df) < 2:
        return []

    norm = df.copy()
    for col in norm.columns:
        if pdtypes.is_object_dtype(norm[col]) or pdtypes.is_string_dtype(norm[col]):
            norm[col] = norm[col].astype(str).str.strip().str.lower()

    dup_mask = norm.duplicated(keep=False)
    n_dupes = int(dup_mask.sum())
    if n_dupes < 2:
        return []

    # Plain boolean array: selects rows by position, independent of the index.
    flagged = dup_mask.to_numpy()

    # Number of distinct duplicate groups (one representative row per group).
    n_groups = len(norm[flagged].drop_duplicates())

    # Sample rows are addressed by *position*, never by index label. Mixing
    # labels with ``.iloc`` only works for a default RangeIndex; on a
    # filtered, shuffled or string index it raises or reads the wrong row.
    n_cols = df.shape[1]
    samples: list[tuple[int, str, str]] = []
    for pos in flagged.nonzero()[0][:5]:
        row = int(pos)
        # Render the first textual cell of the row; fall back to column 0.
        col = next(
            (j for j in range(n_cols) if isinstance(df.iat[row, j], str)),
            0,
        )
        samples.append((row, str(df.columns[col]), str(df.iat[row, col])))

    return [Finding(
        id="near_duplicate_rows",
        severity="info",
        tool=TOOL_DEDUPLICATOR,
        count=n_dupes,
        description=(
            f"{n_dupes} row(s) across ~{n_groups} group(s) are duplicates "
            f"after stripping whitespace and lowercasing string columns. "
            f"Run the deduplicator to merge or remove."
        ),
        samples=samples,
    )]


def _detect_mixed_line_endings(raw: bytes) -> list[Finding]:
    """Flag files that mix CRLF, LF, and bare CR line terminators.

    Mixed endings are a classic disaster pattern after multi-source concat
    (Windows + macOS + Linux exports stitched together). Operates on raw
    bytes only — DataFrame-mode :func:`analyze` skips this detector.

    The scan is a plain byte count: it has no notion of CSV quoting, so a
    terminator inside a quoted multi-line field is counted like any other.

    Args:
        raw: File content as read from disk.

    Returns:
        An empty list when *raw* is empty or uses at most one terminator
        style. Otherwise a single ``mixed_line_endings`` finding whose
        ``count`` is the number of distinct styles present (2 or 3), not the
        number of line breaks.
    """
    # NOTE(review): matching looks for the ASCII bytes ``\r`` / ``\n``. In
    # UTF-16/32 data those bytes are separated by NULs, so CRLF would be
    # seen as bare CR plus bare LF — confirm callers never pass such bytes.
    if not raw:
        return []

    n_crlf = raw.count(b"\r\n")
    # Every CRLF contributes one "\r" and one "\n"; subtracting them leaves
    # only the standalone terminators.
    n_lf = raw.count(b"\n") - n_crlf
    n_cr = raw.count(b"\r") - n_crlf

    present = [
        (n, label)
        for n, label in ((n_crlf, "CRLF"), (n_lf, "LF"), (n_cr, "CR"))
        if n > 0
    ]
    kinds_present = len(present)
    if kinds_present <= 1:
        return []

    breakdown = [f"{n} {label}" for n, label in present]
    return [Finding(
        id="mixed_line_endings",
        severity="warn",
        tool=TOOL_TEXT_CLEANER,
        count=kinds_present,
        description=(
            f"File mixes {kinds_present} line-ending styles "
            f"({', '.join(breakdown)}). Naive splits on one style produce "
            f"ghost rows or merged lines. Run the text cleaner to normalize."
        ),
    )]
""" + raw_for_byte_scan: Optional[bytes] = None if isinstance(source, (str, Path)): - df, internal_repair = _load_for_analysis(Path(source), sample_rows=sample_rows) + df, internal_repair, raw_for_byte_scan = _load_for_analysis( + Path(source), sample_rows=sample_rows, + ) # Caller-supplied repair_result wins over the internally produced one, # since the caller may have used non-default repair flags. if repair_result is None: @@ -464,6 +549,8 @@ def analyze( findings: list[Finding] = [] if repair_result is not None: findings.extend(_findings_from_repair(repair_result)) + if raw_for_byte_scan is not None: + findings.extend(_detect_mixed_line_endings(raw_for_byte_scan)) findings.extend(_detect_smart_punctuation(df)) findings.extend(_detect_invisible_chars(df)) findings.extend(_detect_whitespace_padding(df)) @@ -471,18 +558,19 @@ def analyze( findings.extend(_detect_mojibake(df)) findings.extend(_detect_mixed_case_email(df)) findings.extend(_detect_leading_zero_ids(df)) + findings.extend(_detect_near_duplicates(df)) return findings def _load_for_analysis( path: Path, *, sample_rows: int, -) -> tuple[pd.DataFrame, Optional[RepairResult]]: +) -> tuple[pd.DataFrame, Optional[RepairResult], Optional[bytes]]: """Read just enough of *path* to scan, with the same robust pre-parse repair the tool pages will use. - Returns ``(df, repair_result)``. The repair result is *None* for Excel - files since the byte-level repair step (BOM/NUL/smart-quote folding) - is CSV-specific. + Returns ``(df, repair_result, raw_bytes)``. The repair result and raw + bytes are *None* for Excel files since the byte-level repair step + (BOM/NUL/smart-quote folding) and line-ending scan are CSV-specific. 
# ---------------------------------------------------------------------------
# Near-duplicate rows
# ---------------------------------------------------------------------------

class TestNearDuplicates:
    """``near_duplicate_rows`` appears only when normalized rows collide."""

    def test_finds_case_insensitive_dupes(self):
        # Rows 0 and 1 differ only in letter case and trailing padding.
        frame = pd.DataFrame({
            "name": ["Alice", "alice ", "Bob"],
            "email": ["a@b.com", "A@B.COM", "bob@b.com"],
        })
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" in reported

    def test_unique_rows_no_finding(self):
        # Every row stays distinct even after normalization.
        frame = pd.DataFrame({
            "name": ["Alice", "Bob", "Carol"],
            "email": ["a@x.com", "b@x.com", "c@x.com"],
        })
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported

    def test_single_row_no_finding(self):
        # A lone row has nothing to be a duplicate of.
        frame = pd.DataFrame({"x": ["only"]})
        reported = _ids(analyze(frame))
        assert "near_duplicate_rows" not in reported


# ---------------------------------------------------------------------------
# Mixed line endings
# ---------------------------------------------------------------------------

class TestMixedLineEndings:
    """``mixed_line_endings`` needs raw file bytes with two or more styles."""

    def test_crlf_plus_lf_flagged(self, tmp_path):
        # Header and last row end in CRLF, the middle row in bare LF.
        csv_path = tmp_path / "mixed.csv"
        csv_path.write_bytes(b"id,name\r\n1,Alice\n2,Bob\r\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" in reported

    def test_uniform_lf_not_flagged(self, tmp_path):
        csv_path = tmp_path / "uniform.csv"
        csv_path.write_bytes(b"id,name\n1,Alice\n2,Bob\n")
        reported = _ids(analyze(csv_path))
        assert "mixed_line_endings" not in reported

    def test_dataframe_mode_skips_detector(self):
        # No raw bytes -> mixed_line_endings cannot be detected.
        frame = pd.DataFrame({"id": ["1"], "name": ["Alice"]})
        reported = _ids(analyze(frame))
        assert "mixed_line_endings" not in reported