feat: add documentation, Streamlit GUI, and full source tree

- Rewrite README.md with project overview, quick-start, and CLI summary
- Add docs/CLI-REFERENCE.md with full flag reference and 8 recipe sections
- Add docs/DEVELOPER.md with architecture, data flow, and extension guides
- Rewrite src/core/__init__.py with public API exports and module docstring
- Add Streamlit GUI (src/gui/) with file upload, advanced options, interactive
  match group review with side-by-side diff, and download buttons
- Add .gitignore, requirements.txt, all source code, tests, and sample data
- Add streamlit to requirements.txt

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-28 23:06:39 +00:00
parent 0613dc420c
commit b871ab24fc
47 changed files with 4413 additions and 2 deletions

47
tests/conftest.py Normal file
View File

@@ -0,0 +1,47 @@
"""Shared test fixtures."""
import pandas as pd
import pytest
from pathlib import Path
# Sample data lives in the "samples" directory two levels up from this file
# (i.e. a sibling of the directory that contains this conftest).
_TESTS_DIR = Path(__file__).parent
SAMPLES_DIR = _TESTS_DIR.parent / "samples"
@pytest.fixture
def sample_csv_path() -> Path:
    """Return the path to the ``messy_sales.csv`` sample file.

    The path is built from ``SAMPLES_DIR``; the file's existence is not
    checked here, so a missing sample surfaces when a test opens it.
    """
    return SAMPLES_DIR / "messy_sales.csv"
@pytest.fixture
def sample_df(sample_csv_path: Path) -> pd.DataFrame:
    """Load the sample CSV into a DataFrame of raw strings.

    ``dtype=str`` keeps every column as text (no numeric inference, so values
    such as ``"001"`` keep their leading zeros), and ``keep_default_na=False``
    leaves empty cells and markers such as ``"NA"`` as literal strings instead
    of converting them to NaN.

    Args:
        sample_csv_path: Location of the CSV file to read (supplied by the
            ``sample_csv_path`` fixture).

    Returns:
        The file's contents with every cell as ``str``.
    """
    return pd.read_csv(sample_csv_path, dtype=str, keep_default_na=False)
@pytest.fixture
def simple_df() -> pd.DataFrame:
    """Small DataFrame with obvious duplicates for unit testing.

    Rows 0, 1 and 4 share the same email and phone and differ only in the
    letter case of ``name``; the "Bob" and "Charlie" rows are unique.

    Returns:
        A five-row frame with string columns ``name``, ``email``, ``phone``.
    """
    return pd.DataFrame(
        {
            "name": ["Alice", "alice", "Bob", "Charlie", "ALICE"],
            "email": [
                "alice@test.com",
                "alice@test.com",
                "bob@test.com",
                "charlie@test.com",
                "alice@test.com",
            ],
            "phone": ["555-1234", "555-1234", "555-5678", "555-9012", "555-1234"],
        }
    )
@pytest.fixture
def merge_df() -> pd.DataFrame:
    """DataFrame with partial records that benefit from merge.

    The two "John Doe" rows share name and email but are complementary: the
    first has only a phone, the second only an address (missing values are
    empty strings). The "Jane Smith" row is complete.

    Returns:
        A three-row frame with string columns ``name``, ``email``, ``phone``
        and ``address``.
    """
    return pd.DataFrame(
        {
            "name": ["John Doe", "John Doe", "Jane Smith"],
            "email": ["john@test.com", "john@test.com", "jane@test.com"],
            "phone": ["555-1111", "", "555-3333"],
            "address": ["", "123 Main St", "456 Oak Ave"],
        }
    )
@pytest.fixture
def tmp_csv(tmp_path: Path, simple_df: pd.DataFrame) -> Path:
    """Write simple_df to a temp CSV and return the path.

    Args:
        tmp_path: Directory in which the file is created (pytest's per-test
            temporary directory when used as a fixture).
        simple_df: Frame to serialize.

    Returns:
        Path to ``test_input.csv`` inside ``tmp_path``.
    """
    path = tmp_path / "test_input.csv"
    # index=False: keep the row index out of the file so it holds only the
    # frame's own columns.
    simple_df.to_csv(path, index=False)
    return path