feat: konfigurierbarer Dateiname + Archiv-Modus für Original (v0.3.0)
Neue [output]-Section:
- name_mode: prefix | suffix | none (suffix wird vor der Extension eingefügt)
- name_tag: String, der verbatim (unverändert) eingefügt wird
- original_on_success: delete | archive
- archive_dir mit Kollisions-Schutz (Timestamp-Suffix)

20 neue Tests (50 insgesamt, alle grün).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
"""Tests für Feature: konfigurierbare Dateinamen und Original-Behandlung."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf_ocr_hotfolder.config import OcrConfig, OutputConfig, VeraPdfConfig
|
||||
from pdf_ocr_hotfolder.processor import build_output_name, process_pdf
|
||||
from pdf_ocr_hotfolder.service import PreflightError, check_output_config
|
||||
|
||||
|
||||
# ---------------- build_output_name ----------------
|
||||
|
||||
@pytest.mark.parametrize(
    ("src", "mode", "tag", "expected"),
    [
        # prefix
        ("scan.pdf", "prefix", "OCR_", "OCR_scan.pdf"),
        ("scan.pdf", "prefix", "[OCR] ", "[OCR] scan.pdf"),
        # suffix (tag goes before the extension)
        ("scan.pdf", "suffix", "_OCR", "scan_OCR.pdf"),
        ("scan.pdf", "suffix", "-ocr", "scan-ocr.pdf"),
        # none
        ("scan.pdf", "none", "OCR_", "scan.pdf"),
        # an empty tag behaves like none
        ("scan.pdf", "prefix", "", "scan.pdf"),
        ("scan.pdf", "suffix", "", "scan.pdf"),
        # several dots in the name: only the last extension counts
        ("rechnung.2026.pdf", "suffix", "_OCR", "rechnung.2026_OCR.pdf"),
        ("rechnung.2026.pdf", "prefix", "OCR_", "OCR_rechnung.2026.pdf"),
        # name without an extension
        ("NO_EXT", "suffix", "_OCR", "NO_EXT_OCR"),
    ],
)
def test_build_output_name(src, mode, tag, expected) -> None:
    """Check build_output_name against a table of (src, mode, tag) inputs."""
    actual = build_output_name(src, mode, tag)
    assert actual == expected
|
||||
|
||||
|
||||
def test_build_output_name_invalid_mode() -> None:
    """An unknown mode must raise ValueError whose message mentions name_mode."""
    with pytest.raises(ValueError) as excinfo:
        build_output_name("x.pdf", "bogus", "OCR_")
    # ExceptionInfo.match performs the same regex search as raises(match=...).
    excinfo.match("name_mode")
|
||||
|
||||
|
||||
# ---------------- check_output_config ----------------
|
||||
|
||||
def test_check_output_config_delete_ok() -> None:
    """Mode "delete" with an empty archive dir passes the check without raising."""
    mode, archive_dir = "delete", ""
    check_output_config(mode, archive_dir)  # must not raise
|
||||
|
||||
|
||||
def test_check_output_config_archive_requires_dir() -> None:
    """Mode "archive" with an empty archive dir raises PreflightError naming archive_dir."""
    with pytest.raises(PreflightError) as excinfo:
        check_output_config("archive", "")
    # ExceptionInfo.match performs the same regex search as raises(match=...).
    excinfo.match("archive_dir")
|
||||
|
||||
|
||||
def test_check_output_config_archive_with_dir_ok() -> None:
    """Mode "archive" with a non-empty archive dir passes the check without raising."""
    mode, archive_dir = "archive", "/var/archive"
    check_output_config(mode, archive_dir)  # must not raise
|
||||
|
||||
|
||||
def test_check_output_config_invalid_mode() -> None:
    """An unknown original_on_success value raises PreflightError ("ungültig")."""
    with pytest.raises(PreflightError) as excinfo:
        check_output_config("trash", "")
    # ExceptionInfo.match performs the same regex search as raises(match=...).
    excinfo.match("ungültig")
|
||||
|
||||
|
||||
# ---------------- process_pdf mit Original-Behandlung ----------------
|
||||
|
||||
def _fake_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Simulate ocrmypdf: create dst from a marker header plus the bytes of src.

    cfg is accepted but not used.
    """
    original_bytes = src.read_bytes()
    marker = b"%PDF-1.4 OCRed\n"
    dst.write_bytes(marker + original_bytes)
|
||||
|
||||
|
||||
def _prepare(tmp_path: Path) -> dict:
    """Create the test directory layout under tmp_path plus one input PDF.

    Creates the directories working, outgoing, error, archive and incoming,
    and writes incoming/scan.pdf. Returns a dict with the key "src" (path of
    that PDF) followed by one key per directory name.
    """
    folder_names = ("working", "outgoing", "error", "archive", "incoming")
    folders = {name: tmp_path / name for name in folder_names}
    for folder in folders.values():
        folder.mkdir(parents=True, exist_ok=True)

    pdf_path = folders["incoming"] / "scan.pdf"
    pdf_path.write_bytes(b"%PDF-1.4 original\n")

    env = {"src": pdf_path}
    env.update(folders)
    return env
|
||||
|
||||
|
||||
def test_process_pdf_prefix_delete(tmp_path: Path) -> None:
    """Prefix mode with delete: output is OCR_scan.pdf and the original is removed."""
    env = _prepare(tmp_path)
    output_cfg = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="delete",
    )
    # Replace the real OCR step with the local fake for the duration of the call.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        call_kwargs = {
            "src": env["src"],
            "working_dir": env["working"],
            "outgoing_dir": env["outgoing"],
            "error_dir": env["error"],
            "ocr_cfg": OcrConfig(),
            "vera_cfg": VeraPdfConfig(enabled=False),
            "output_cfg": output_cfg,
        }
        result = process_pdf(**call_kwargs)

    assert result.success
    ocr_output = env["outgoing"] / "OCR_scan.pdf"
    assert ocr_output.exists()
    # The original is gone: neither in incoming nor in working.
    assert not env["src"].exists()
    leftover = env["working"] / "scan.pdf"
    assert not leftover.exists()
|
||||
|
||||
|
||||
def test_process_pdf_suffix_delete(tmp_path: Path) -> None:
    """Suffix mode with delete: the output lands in outgoing as scan_OCR.pdf."""
    env = _prepare(tmp_path)
    output_cfg = OutputConfig(
        name_mode="suffix",
        name_tag="_OCR",
        original_on_success="delete",
    )
    # Replace the real OCR step with the local fake for the duration of the call.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        call_kwargs = {
            "src": env["src"],
            "working_dir": env["working"],
            "outgoing_dir": env["outgoing"],
            "error_dir": env["error"],
            "ocr_cfg": OcrConfig(),
            "vera_cfg": VeraPdfConfig(enabled=False),
            "output_cfg": output_cfg,
        }
        result = process_pdf(**call_kwargs)

    assert result.success
    ocr_output = env["outgoing"] / "scan_OCR.pdf"
    assert ocr_output.exists()
|
||||
|
||||
|
||||
def test_process_pdf_none_mode(tmp_path: Path) -> None:
    """Mode none: the configured tag is ignored and the output keeps the name scan.pdf."""
    env = _prepare(tmp_path)
    output_cfg = OutputConfig(
        name_mode="none",
        name_tag="OCR_",
        original_on_success="delete",
    )
    # Replace the real OCR step with the local fake for the duration of the call.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        call_kwargs = {
            "src": env["src"],
            "working_dir": env["working"],
            "outgoing_dir": env["outgoing"],
            "error_dir": env["error"],
            "ocr_cfg": OcrConfig(),
            "vera_cfg": VeraPdfConfig(enabled=False),
            "output_cfg": output_cfg,
        }
        result = process_pdf(**call_kwargs)

    assert result.success
    # The output carries the SAME name as the original.
    same_name_output = env["outgoing"] / "scan.pdf"
    assert same_name_output.exists()
|
||||
|
||||
|
||||
def test_process_pdf_archive_original(tmp_path: Path) -> None:
    """Archive mode: the untouched original ends up in the archive directory."""
    env = _prepare(tmp_path)
    output_cfg = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(env["archive"]),
    )
    # Replace the real OCR step with the local fake for the duration of the call.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        call_kwargs = {
            "src": env["src"],
            "working_dir": env["working"],
            "outgoing_dir": env["outgoing"],
            "error_dir": env["error"],
            "ocr_cfg": OcrConfig(),
            "vera_cfg": VeraPdfConfig(enabled=False),
            "output_cfg": output_cfg,
        }
        result = process_pdf(**call_kwargs)

    assert result.success
    ocr_output = env["outgoing"] / "OCR_scan.pdf"
    assert ocr_output.exists()
    # The original now lives in the archive, byte-for-byte unchanged.
    archived_original = env["archive"] / "scan.pdf"
    assert archived_original.exists()
    assert archived_original.read_bytes() == b"%PDF-1.4 original\n"
|
||||
|
||||
|
||||
def test_process_pdf_archive_name_collision(tmp_path: Path) -> None:
    """On a name collision in the archive the original gets a suffixed name.

    The archive already holds a scan.pdf. After processing, that file must be
    untouched and exactly one additional file matching scan_*.pdf must hold
    the bytes of the new original.
    """
    env = _prepare(tmp_path)
    # Pre-existing file that collides with the name of the incoming original.
    (env["archive"] / "scan.pdf").write_bytes(b"old")

    out_cfg = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(env["archive"]),
    )
    # Replace the real OCR step with the local fake for the duration of the call.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )

    # Check the run itself first, as the sibling tests do, so a processing
    # failure is reported as such rather than as a missing archive file.
    assert result.success
    # The old file is unchanged.
    assert (env["archive"] / "scan.pdf").read_bytes() == b"old"
    # The new file carries a suffix between stem and extension.
    archived = list(env["archive"].glob("scan_*.pdf"))
    assert len(archived) == 1
    assert archived[0].read_bytes() == b"%PDF-1.4 original\n"
|
||||
Reference in New Issue
Block a user