- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
131 lines
4.4 KiB
Python
131 lines
4.4 KiB
Python
"""Tests for src.core.io — file reading, encoding/delimiter detection."""
|
|
|
|
import pandas as pd
|
|
import pytest
|
|
from pathlib import Path
|
|
|
|
from src.core.io import (
|
|
detect_encoding,
|
|
detect_delimiter,
|
|
detect_header_row,
|
|
read_file,
|
|
write_file,
|
|
list_sheets,
|
|
)
|
|
|
|
|
|
class TestDetectEncoding:
    """Checks for ``detect_encoding`` on UTF-8, empty, BOM and Latin-1 files."""

    def test_utf8_file(self, sample_csv_path):
        """The sample CSV is reported as a UTF-8 or ASCII flavour."""
        detected = detect_encoding(sample_csv_path)
        # Fold case and drop hyphens so spellings such as "UTF-8" and
        # "utf8" compare equal.
        normalized = detected.lower().replace("-", "")
        assert normalized in ("utf8", "ascii", "utf8sig")

    def test_empty_file(self, tmp_path):
        """A zero-byte file is reported as ``utf-8``."""
        empty_path = tmp_path / "empty.csv"
        empty_path.write_bytes(b"")
        assert detect_encoding(empty_path) == "utf-8"

    def test_bom_file(self, tmp_path):
        """A file starting with the UTF-8 BOM is reported as ``utf-8-sig``."""
        bom_path = tmp_path / "bom.csv"
        bom_path.write_bytes(b"\xef\xbb\xbfname,email\nAlice,a@b.com\n")
        assert detect_encoding(bom_path) == "utf-8-sig"

    def test_latin1_file(self, tmp_path):
        """Latin-1 encoded bytes yield one of the accepted encoding names."""
        # Should detect something compatible with latin-1 family
        accepted = (
            "iso-8859-1", "latin-1", "windows-1252", "cp1252",
            "iso-8859-9", "cp1250", "iso-8859-15", "utf-8",
        )
        latin_path = tmp_path / "latin.csv"
        latin_path.write_bytes("name,city\nJosé,São Paulo\n".encode("latin-1"))
        assert detect_encoding(latin_path) in accepted
|
|
|
|
|
|
class TestDetectDelimiter:
    """Checks that ``detect_delimiter`` recognises comma, tab, semicolon and pipe."""

    def test_comma(self, sample_csv_path):
        """The sample CSV is comma-separated."""
        found = detect_delimiter(sample_csv_path)
        assert found == ","

    def test_tab(self, tmp_path):
        """A two-column tab-separated file yields a tab."""
        tsv_path = tmp_path / "tabs.tsv"
        tsv_path.write_text("name\temail\nAlice\ta@b.com\n")
        found = detect_delimiter(tsv_path)
        assert found == "\t"

    def test_semicolon(self, tmp_path):
        """A three-column semicolon-separated file yields ``;``."""
        semi_path = tmp_path / "semi.csv"
        semi_path.write_text("name;email;phone\nAlice;a@b.com;555\n")
        found = detect_delimiter(semi_path)
        assert found == ";"

    def test_pipe(self, tmp_path):
        """A three-column pipe-separated file yields ``|``."""
        pipe_path = tmp_path / "pipe.csv"
        pipe_path.write_text("name|email|phone\nAlice|a@b.com|555\n")
        found = detect_delimiter(pipe_path)
        assert found == "|"
|
|
|
|
|
|
class TestDetectHeaderRow:
    """Checks for ``detect_header_row`` on clean and junk-prefixed files."""

    def test_standard_csv(self, sample_csv_path):
        """The sample CSV has its header on the first row (index 0)."""
        header_index = detect_header_row(sample_csv_path)
        assert header_index == 0

    def test_with_junk_rows(self, tmp_path):
        """A title line and a blank line precede the real header row."""
        junk_path = tmp_path / "junk.csv"
        junk_path.write_text(
            "Report generated 2024-01-01\n"
            "\n"
            "name,email,phone\n"
            "Alice,a@b.com,555\n"
        )
        # Row 0 ("Report generated...") is a single non-numeric string, while
        # row 2 ("name,email,phone") looks like real headers. The heuristic
        # checks all cells, so row 0 may match when it parses as one cell.
        header_index = detect_header_row(junk_path)
        # Either answer is accepted; which one depends on delimiter detection.
        assert header_index in (0, 2)
|
|
|
|
|
|
class TestReadFile:
    """Checks for ``read_file``: plain reads, missing files, overrides, chunking."""

    def test_read_csv(self, sample_csv_path):
        """Reading the sample CSV gives a 50-row DataFrame with the expected column."""
        df = read_file(sample_csv_path)
        assert isinstance(df, pd.DataFrame)
        assert len(df) == 50
        assert "customer_name" in df.columns

    def test_read_nonexistent(self, tmp_path):
        """A path that does not exist raises ``FileNotFoundError``."""
        # Build the missing path inside pytest's per-test temp directory
        # instead of hard-coding "/tmp/...": that location is POSIX-only and
        # nothing guarantees the file is absent there. The path is still
        # passed as a string, as before.
        missing = tmp_path / "nonexistent_file_xyz.csv"
        with pytest.raises(FileNotFoundError):
            read_file(str(missing))

    def test_read_with_encoding_override(self, sample_csv_path):
        """An explicit ``encoding`` argument still reads all 50 rows."""
        df = read_file(sample_csv_path, encoding="utf-8")
        assert len(df) == 50

    def test_chunked_reading(self, sample_csv_path):
        """With ``chunk_size=10`` the 50 rows arrive as 5 chunks."""
        chunks = read_file(sample_csv_path, chunk_size=10)
        # The result is an iterable of chunks; materialise it so it can be
        # counted and summed.
        all_chunks = list(chunks)
        assert len(all_chunks) == 5
        total_rows = sum(len(chunk) for chunk in all_chunks)
        assert total_rows == 50
|
|
|
|
|
|
class TestWriteFile:
    """Checks for ``write_file``: CSV and XLSX output plus the default BOM."""

    def test_write_csv(self, tmp_path, simple_df):
        """A written CSV exists and reads back with the same row count."""
        csv_path = tmp_path / "output.csv"
        write_file(simple_df, csv_path)
        assert csv_path.exists()
        # Read back with a BOM-tolerant codec.
        round_tripped = pd.read_csv(csv_path, encoding="utf-8-sig")
        assert len(round_tripped) == len(simple_df)

    def test_write_xlsx(self, tmp_path, simple_df):
        """A written XLSX exists and reads back with the same row count."""
        xlsx_path = tmp_path / "output.xlsx"
        write_file(simple_df, xlsx_path)
        assert xlsx_path.exists()
        round_tripped = pd.read_excel(xlsx_path)
        assert len(round_tripped) == len(simple_df)

    def test_utf8_bom_default(self, tmp_path, simple_df):
        """CSV output begins with the three-byte UTF-8 BOM by default."""
        csv_path = tmp_path / "bom.csv"
        write_file(simple_df, csv_path)
        payload = csv_path.read_bytes()
        assert payload.startswith(b"\xef\xbb\xbf")
|
|
|
|
|
|
class TestListSheets:
    """Checks that ``list_sheets`` reports workbook sheet names in order."""

    def test_list_sheets(self, tmp_path, simple_df):
        """A workbook written with two sheets lists both names in write order."""
        workbook_path = tmp_path / "multi.xlsx"
        with pd.ExcelWriter(workbook_path, engine="openpyxl") as writer:
            for sheet_name in ("Sheet1", "Sheet2"):
                simple_df.to_excel(writer, sheet_name=sheet_name, index=False)
        # Query only after the writer context has closed, i.e. once the
        # workbook has been saved to disk.
        sheets = list_sheets(workbook_path)
        assert sheets == ["Sheet1", "Sheet2"]
|