Files
pdf-ocr-hotfolder/tests/test_output_naming.py
T
techadmin a23a3968ef feat: konfigurierbarer Dateiname + Archiv-Modus für Original (v0.3.0)
Neue [output]-Section:
- name_mode: prefix | suffix | none (suffix wird vor Extension eingefügt)
- name_tag: verbatim einfügbarer String
- original_on_success: delete | archive
- archive_dir mit Kollisions-Schutz (Timestamp-Suffix)

20 neue Tests (50 insgesamt, alle grün).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-09 22:32:41 +02:00

191 lines
6.9 KiB
Python

"""Tests für Feature: konfigurierbare Dateinamen und Original-Behandlung."""
from __future__ import annotations
from pathlib import Path
from unittest.mock import patch
import pytest
from pdf_ocr_hotfolder.config import OcrConfig, OutputConfig, VeraPdfConfig
from pdf_ocr_hotfolder.processor import build_output_name, process_pdf
from pdf_ocr_hotfolder.service import PreflightError, check_output_config
# ---------------- build_output_name ----------------
@pytest.mark.parametrize(
    "src,mode,tag,expected",
    [
        # prefix: tag goes in front of the file name
        ("scan.pdf", "prefix", "OCR_", "OCR_scan.pdf"),
        ("scan.pdf", "prefix", "[OCR] ", "[OCR] scan.pdf"),
        # suffix: tag goes directly before the extension
        ("scan.pdf", "suffix", "_OCR", "scan_OCR.pdf"),
        ("scan.pdf", "suffix", "-ocr", "scan-ocr.pdf"),
        # none: the tag is ignored
        ("scan.pdf", "none", "OCR_", "scan.pdf"),
        # an empty tag is expected to behave like "none"
        ("scan.pdf", "prefix", "", "scan.pdf"),
        ("scan.pdf", "suffix", "", "scan.pdf"),
        # several dots in the name: only the last extension counts
        ("rechnung.2026.pdf", "suffix", "_OCR", "rechnung.2026_OCR.pdf"),
        ("rechnung.2026.pdf", "prefix", "OCR_", "OCR_rechnung.2026.pdf"),
        # name without any extension
        ("NO_EXT", "suffix", "_OCR", "NO_EXT_OCR"),
    ],
)
def test_build_output_name(src, mode, tag, expected) -> None:
    """Check ``build_output_name`` against a table of name/mode/tag cases."""
    produced = build_output_name(src, mode, tag)
    assert produced == expected
def test_build_output_name_invalid_mode() -> None:
    """An unknown mode must raise ``ValueError`` whose message matches ``name_mode``."""
    expect_error = pytest.raises(ValueError, match="name_mode")
    with expect_error:
        build_output_name("x.pdf", "bogus", "OCR_")
# ---------------- check_output_config ----------------
def test_check_output_config_delete_ok() -> None:
    """``delete`` mode with an empty archive dir must pass the check without raising."""
    mode, archive_dir = "delete", ""
    check_output_config(mode, archive_dir)
def test_check_output_config_archive_requires_dir() -> None:
    """``archive`` mode with an empty archive dir must raise ``PreflightError``."""
    expect_error = pytest.raises(PreflightError, match="archive_dir")
    with expect_error:
        check_output_config("archive", "")
def test_check_output_config_archive_with_dir_ok() -> None:
    """``archive`` mode with a non-empty archive dir must pass without raising."""
    mode, archive_dir = "archive", "/var/archive"
    check_output_config(mode, archive_dir)
def test_check_output_config_invalid_mode() -> None:
    """An unknown original-handling mode must raise ``PreflightError``."""
    expect_error = pytest.raises(PreflightError, match="ungültig")
    with expect_error:
        check_output_config("trash", "")
# ---------------- process_pdf mit Original-Behandlung ----------------
def _fake_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Simulate ocrmypdf: create ``dst`` from a marker header plus the bytes of ``src``.

    ``cfg`` is accepted so the signature matches the patched function; it is
    not used here.
    """
    original_bytes = src.read_bytes()
    dst.write_bytes(b"%PDF-1.4 OCRed\n" + original_bytes)
def _prepare(tmp_path: Path) -> dict:
    """Create the test directory layout under ``tmp_path`` plus one input PDF.

    Creates the directories ``working``, ``outgoing``, ``error``, ``archive``
    and ``incoming`` and writes ``incoming/scan.pdf``.

    Returns:
        A dict mapping each directory name to its path, plus ``"src"`` for
        the path of the written input file.
    """
    layout = {
        name: tmp_path / name
        for name in ("working", "outgoing", "error", "archive", "incoming")
    }
    for folder in layout.values():
        folder.mkdir(parents=True, exist_ok=True)

    source_pdf = layout["incoming"] / "scan.pdf"
    source_pdf.write_bytes(b"%PDF-1.4 original\n")
    return {"src": source_pdf, **layout}
def test_process_pdf_prefix_delete(tmp_path: Path) -> None:
    """Prefix naming with ``delete``: tagged output exists, the original is gone."""
    ctx = _prepare(tmp_path)
    naming = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="delete",
    )
    ocr_patch = patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr)
    with ocr_patch:
        outcome = process_pdf(
            src=ctx["src"],
            working_dir=ctx["working"],
            outgoing_dir=ctx["outgoing"],
            error_dir=ctx["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=naming,
        )

    assert outcome.success
    tagged_output = ctx["outgoing"] / "OCR_scan.pdf"
    assert tagged_output.exists()
    # The original must be gone from both incoming and working.
    assert not ctx["src"].exists()
    leftover_in_working = ctx["working"] / "scan.pdf"
    assert not leftover_in_working.exists()
def test_process_pdf_suffix_delete(tmp_path: Path) -> None:
    """Suffix naming with ``delete``: tagged output exists, the original is gone.

    The deletion checks mirror those of the prefix/delete test, so the
    "delete" half of this test's name is actually verified.
    """
    env = _prepare(tmp_path)
    out_cfg = OutputConfig(
        name_mode="suffix",
        name_tag="_OCR",
        original_on_success="delete",
    )
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )

    assert result.success
    assert (env["outgoing"] / "scan_OCR.pdf").exists()
    # The original must be gone from both incoming and working.
    assert not env["src"].exists()
    assert not (env["working"] / "scan.pdf").exists()
def test_process_pdf_none_mode(tmp_path: Path) -> None:
    """With ``name_mode="none"`` the output keeps the original file name."""
    ctx = _prepare(tmp_path)
    naming = OutputConfig(
        name_mode="none",
        name_tag="OCR_",
        original_on_success="delete",
    )
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        call_args = {
            "src": ctx["src"],
            "working_dir": ctx["working"],
            "outgoing_dir": ctx["outgoing"],
            "error_dir": ctx["error"],
            "ocr_cfg": OcrConfig(),
            "vera_cfg": VeraPdfConfig(enabled=False),
            "output_cfg": naming,
        }
        outcome = process_pdf(**call_args)

    assert outcome.success
    # The outgoing file has the SAME name as the original.
    same_name_output = ctx["outgoing"] / "scan.pdf"
    assert same_name_output.exists()
def test_process_pdf_archive_original(tmp_path: Path) -> None:
    """With ``archive`` mode the unmodified original ends up in the archive dir."""
    ctx = _prepare(tmp_path)
    archive_root = ctx["archive"]
    naming = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(archive_root),
    )
    ocr_patch = patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr)
    with ocr_patch:
        outcome = process_pdf(
            src=ctx["src"],
            working_dir=ctx["working"],
            outgoing_dir=ctx["outgoing"],
            error_dir=ctx["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=naming,
        )

    assert outcome.success
    tagged_output = ctx["outgoing"] / "OCR_scan.pdf"
    assert tagged_output.exists()
    # The original now lives in the archive, byte-for-byte unchanged.
    kept_original = archive_root / "scan.pdf"
    assert kept_original.exists()
    assert kept_original.read_bytes() == b"%PDF-1.4 original\n"
def test_process_pdf_archive_name_collision(tmp_path: Path) -> None:
    """On a name collision in the archive, a timestamp suffix is appended.

    The archive already holds a ``scan.pdf``; archiving a second file with
    that name must leave the existing one untouched and store the new
    original under a ``scan_*.pdf`` name.
    """
    env = _prepare(tmp_path)
    # Pre-existing file that collides with the name of the incoming original.
    (env["archive"] / "scan.pdf").write_bytes(b"old")
    out_cfg = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(env["archive"]),
    )
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )

    # Fail early and clearly if processing itself did not succeed, instead of
    # only noticing indirectly through the archive contents below.
    assert result.success
    assert (env["outgoing"] / "OCR_scan.pdf").exists()
    # The old file is unchanged.
    assert (env["archive"] / "scan.pdf").read_bytes() == b"old"
    # The new file carries a timestamp suffix.
    archived = list(env["archive"].glob("scan_*.pdf"))
    assert len(archived) == 1
    assert archived[0].read_bytes() == b"%PDF-1.4 original\n"