a23a3968ef
Neue [output]-Section: - name_mode: prefix | suffix | none (suffix wird vor Extension eingefügt) - name_tag: verbatim einfügbarer String - original_on_success: delete | archive - archive_dir mit Kollisions-Schutz (Timestamp-Suffix) 20 neue Tests (50 insgesamt, alle grün). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
191 lines
6.9 KiB
Python
191 lines
6.9 KiB
Python
"""Tests für Feature: konfigurierbare Dateinamen und Original-Behandlung."""
|
|
from __future__ import annotations
|
|
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
from pdf_ocr_hotfolder.config import OcrConfig, OutputConfig, VeraPdfConfig
|
|
from pdf_ocr_hotfolder.processor import build_output_name, process_pdf
|
|
from pdf_ocr_hotfolder.service import PreflightError, check_output_config
|
|
|
|
|
|
# ---------------- build_output_name ----------------
|
|
|
|
# Each case is (source file name, name_mode, name_tag, expected output name).
_BUILD_OUTPUT_NAME_CASES = [
    # prefix
    ("scan.pdf", "prefix", "OCR_", "OCR_scan.pdf"),
    ("scan.pdf", "prefix", "[OCR] ", "[OCR] scan.pdf"),
    # suffix (the tag goes in front of the extension)
    ("scan.pdf", "suffix", "_OCR", "scan_OCR.pdf"),
    ("scan.pdf", "suffix", "-ocr", "scan-ocr.pdf"),
    # none
    ("scan.pdf", "none", "OCR_", "scan.pdf"),
    # an empty tag is expected to behave like mode "none"
    ("scan.pdf", "prefix", "", "scan.pdf"),
    ("scan.pdf", "suffix", "", "scan.pdf"),
    # several dots in the name: only the last extension counts
    ("rechnung.2026.pdf", "suffix", "_OCR", "rechnung.2026_OCR.pdf"),
    ("rechnung.2026.pdf", "prefix", "OCR_", "OCR_rechnung.2026.pdf"),
    # name without an extension
    ("NO_EXT", "suffix", "_OCR", "NO_EXT_OCR"),
]


@pytest.mark.parametrize(
    ("src", "mode", "tag", "expected"),
    _BUILD_OUTPUT_NAME_CASES,
)
def test_build_output_name(src: str, mode: str, tag: str, expected: str) -> None:
    """Check that ``build_output_name`` returns the expected name for each case."""
    produced = build_output_name(src, mode, tag)
    assert produced == expected
|
|
|
|
|
|
def test_build_output_name_invalid_mode() -> None:
    """An unknown naming mode must raise ``ValueError`` mentioning ``name_mode``."""
    expect_rejection = pytest.raises(ValueError, match="name_mode")
    with expect_rejection:
        build_output_name("x.pdf", "bogus", "OCR_")
|
|
|
|
|
|
# ---------------- check_output_config ----------------
|
|
|
|
def test_check_output_config_delete_ok() -> None:
    """Mode ``"delete"`` with an empty archive directory must be accepted."""
    mode, archive_dir = "delete", ""
    # The test passes as long as the call does not raise.
    check_output_config(mode, archive_dir)
|
|
|
|
|
|
def test_check_output_config_archive_requires_dir() -> None:
    """Mode ``"archive"`` with an empty directory must raise ``PreflightError``."""
    expect_rejection = pytest.raises(PreflightError, match="archive_dir")
    with expect_rejection:
        check_output_config("archive", "")
|
|
|
|
|
|
def test_check_output_config_archive_with_dir_ok() -> None:
    """Mode ``"archive"`` together with a directory string must be accepted."""
    mode, archive_dir = "archive", "/var/archive"
    # The test passes as long as the call does not raise.
    check_output_config(mode, archive_dir)
|
|
|
|
|
|
def test_check_output_config_invalid_mode() -> None:
    """An unknown ``original_on_success`` value must raise ``PreflightError``."""
    # The matched text is part of the (German) error message and must stay as is.
    expect_rejection = pytest.raises(PreflightError, match="ungültig")
    with expect_rejection:
        check_output_config("trash", "")
|
|
|
|
|
|
# ---------------- process_pdf mit Original-Behandlung ----------------
|
|
|
|
def _fake_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Simulate ocrmypdf for tests: copy the content and create the target file.

    ``dst`` is written as the fixed line ``%PDF-1.4 OCRed`` followed by the
    bytes of ``src``. ``cfg`` is accepted but not used.
    """
    original_bytes = src.read_bytes()
    dst.write_bytes(b"%PDF-1.4 OCRed\n" + original_bytes)
|
|
|
|
|
|
def _prepare(tmp_path: Path) -> dict[str, Path]:
    """Create the test directory layout and one sample input PDF.

    Creates the sub-directories ``working``, ``outgoing``, ``error``,
    ``archive`` and ``incoming`` below ``tmp_path`` and writes
    ``incoming/scan.pdf`` with a small fixed payload.

    Returns:
        A dict with the key ``"src"`` (path of the sample PDF) followed by one
        key per directory name mapping to that directory's path.
    """
    folder_names = ("working", "outgoing", "error", "archive", "incoming")
    layout = {name: tmp_path / name for name in folder_names}
    for folder in layout.values():
        folder.mkdir(parents=True, exist_ok=True)

    sample = layout["incoming"] / "scan.pdf"
    sample.write_bytes(b"%PDF-1.4 original\n")

    # "src" goes first so the key order matches the previous dict literal.
    env = {"src": sample}
    env.update(layout)
    return env
|
|
|
|
|
|
def test_process_pdf_prefix_delete(tmp_path: Path) -> None:
    """Prefix naming yields ``OCR_scan.pdf``; mode "delete" removes the original."""
    env = _prepare(tmp_path)
    naming = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="delete",
    )

    # Replace the real OCR call with the local stub for the duration of the run.
    ocr_stub = patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr)
    with ocr_stub:
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=naming,
        )

    delivered = env["outgoing"] / "OCR_scan.pdf"
    leftover = env["working"] / "scan.pdf"

    assert result.success
    assert delivered.exists()
    # The original is gone, neither in incoming nor in working.
    assert not env["src"].exists()
    assert not leftover.exists()
|
|
|
|
|
|
def test_process_pdf_suffix_delete(tmp_path: Path) -> None:
    """Suffix naming yields ``scan_OCR.pdf``; mode "delete" removes the original.

    Besides the output name, this also verifies the "delete" half of the test
    name, using the same checks as the prefix variant of this test.
    """
    env = _prepare(tmp_path)
    out_cfg = OutputConfig(
        name_mode="suffix",
        name_tag="_OCR",
        original_on_success="delete",
    )

    # Replace the real OCR call with the local stub for the duration of the run.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )

    assert result.success
    assert (env["outgoing"] / "scan_OCR.pdf").exists()
    # The original is gone, neither in incoming nor in working.
    assert not env["src"].exists()
    assert not (env["working"] / "scan.pdf").exists()
|
|
|
|
|
|
def test_process_pdf_none_mode(tmp_path: Path) -> None:
    """With ``name_mode="none"`` the configured tag is ignored for the output name."""
    env = _prepare(tmp_path)
    naming = OutputConfig(
        name_mode="none",
        name_tag="OCR_",
        original_on_success="delete",
    )

    # Replace the real OCR call with the local stub for the duration of the run.
    ocr_stub = patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr)
    with ocr_stub:
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=naming,
        )

    # The delivered file carries the SAME name as the original.
    delivered = env["outgoing"] / "scan.pdf"

    assert result.success
    assert delivered.exists()
|
|
|
|
|
|
def test_process_pdf_archive_original(tmp_path: Path) -> None:
    """Mode "archive" keeps the unmodified original in the archive directory."""
    env = _prepare(tmp_path)
    naming = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(env["archive"]),
    )

    # Replace the real OCR call with the local stub for the duration of the run.
    ocr_stub = patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr)
    with ocr_stub:
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=naming,
        )

    delivered = env["outgoing"] / "OCR_scan.pdf"
    archived = env["archive"] / "scan.pdf"

    assert result.success
    assert delivered.exists()
    # The original now lives in the archive, byte-for-byte unchanged.
    assert archived.exists()
    assert archived.read_bytes() == b"%PDF-1.4 original\n"
|
|
|
|
|
|
def test_process_pdf_archive_name_collision(tmp_path: Path) -> None:
    """On a name collision in the archive, the original gets a suffixed name.

    A file named ``scan.pdf`` already exists in the archive. After the run it
    must be untouched, and the newly archived original must appear exactly
    once under a name matching ``scan_*.pdf`` (the suffix is meant to be a
    timestamp; only the name shape is checked here).

    The run must also report success and deliver the OCR output, so that a
    failed run cannot go unnoticed behind the archive checks.
    """
    env = _prepare(tmp_path)
    # Pre-existing file that collides with the original's name.
    (env["archive"] / "scan.pdf").write_bytes(b"old")

    out_cfg = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(env["archive"]),
    )

    # Replace the real OCR call with the local stub for the duration of the run.
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )

    # Same success checks as the non-colliding archive test.
    assert result.success
    assert (env["outgoing"] / "OCR_scan.pdf").exists()
    # The old file is unchanged.
    assert (env["archive"] / "scan.pdf").read_bytes() == b"old"
    # The new file carries a suffix between the stem and the extension.
    archived = list(env["archive"].glob("scan_*.pdf"))
    assert len(archived) == 1
    assert archived[0].read_bytes() == b"%PDF-1.4 original\n"
|