"""Tests für Feature: konfigurierbare Dateinamen und Original-Behandlung.""" from __future__ import annotations from pathlib import Path from unittest.mock import patch import pytest from pdf_ocr_hotfolder.config import OcrConfig, OutputConfig, VeraPdfConfig from pdf_ocr_hotfolder.processor import build_output_name, process_pdf from pdf_ocr_hotfolder.service import PreflightError, check_output_config # ---------------- build_output_name ---------------- @pytest.mark.parametrize("src,mode,tag,expected", [ # prefix ("scan.pdf", "prefix", "OCR_", "OCR_scan.pdf"), ("scan.pdf", "prefix", "[OCR] ", "[OCR] scan.pdf"), # suffix (Tag vor Extension) ("scan.pdf", "suffix", "_OCR", "scan_OCR.pdf"), ("scan.pdf", "suffix", "-ocr", "scan-ocr.pdf"), # none ("scan.pdf", "none", "OCR_", "scan.pdf"), # leerer Tag = none ("scan.pdf", "prefix", "", "scan.pdf"), ("scan.pdf", "suffix", "", "scan.pdf"), # Mehrfach-Punkte im Namen: nur letzte Extension zählt ("rechnung.2026.pdf", "suffix", "_OCR", "rechnung.2026_OCR.pdf"), ("rechnung.2026.pdf", "prefix", "OCR_", "OCR_rechnung.2026.pdf"), # Name ohne Extension ("NO_EXT", "suffix", "_OCR", "NO_EXT_OCR"), ]) def test_build_output_name(src, mode, tag, expected) -> None: assert build_output_name(src, mode, tag) == expected def test_build_output_name_invalid_mode() -> None: with pytest.raises(ValueError, match="name_mode"): build_output_name("x.pdf", "bogus", "OCR_") # ---------------- check_output_config ---------------- def test_check_output_config_delete_ok() -> None: check_output_config("delete", "") # ok def test_check_output_config_archive_requires_dir() -> None: with pytest.raises(PreflightError, match="archive_dir"): check_output_config("archive", "") def test_check_output_config_archive_with_dir_ok() -> None: check_output_config("archive", "/var/archive") # ok def test_check_output_config_invalid_mode() -> None: with pytest.raises(PreflightError, match="ungültig"): check_output_config("trash", "") # ---------------- process_pdf mit Original-Behandlung ---------------- def _fake_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None: """Simuliert ocrmypdf: kopiert Inhalt, erzeugt Zieldatei.""" dst.write_bytes(b"%PDF-1.4 OCRed\n" + src.read_bytes()) def _prepare(tmp_path: Path) -> dict: dirs = { "working": tmp_path / "working", "outgoing": tmp_path / "outgoing", "error": tmp_path / "error", "archive": tmp_path / "archive", "incoming": tmp_path / "incoming", } for d in dirs.values(): d.mkdir(parents=True, exist_ok=True) src = dirs["incoming"] / "scan.pdf" src.write_bytes(b"%PDF-1.4 original\n") return {"src": src, **dirs} def test_process_pdf_prefix_delete(tmp_path: Path) -> None: env = _prepare(tmp_path) out_cfg = OutputConfig(name_mode="prefix", name_tag="OCR_", original_on_success="delete") with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr): result = process_pdf( src=env["src"], working_dir=env["working"], outgoing_dir=env["outgoing"], error_dir=env["error"], ocr_cfg=OcrConfig(), vera_cfg=VeraPdfConfig(enabled=False), output_cfg=out_cfg, ) assert result.success assert (env["outgoing"] / "OCR_scan.pdf").exists() # Original ist weg, weder in incoming noch in working assert not env["src"].exists() assert not (env["working"] / "scan.pdf").exists() def test_process_pdf_suffix_delete(tmp_path: Path) -> None: env = _prepare(tmp_path) out_cfg = OutputConfig(name_mode="suffix", name_tag="_OCR", original_on_success="delete") with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr): result = process_pdf( src=env["src"], working_dir=env["working"], outgoing_dir=env["outgoing"], error_dir=env["error"], ocr_cfg=OcrConfig(), vera_cfg=VeraPdfConfig(enabled=False), output_cfg=out_cfg, ) assert result.success assert (env["outgoing"] / "scan_OCR.pdf").exists() def test_process_pdf_none_mode(tmp_path: Path) -> None: env = _prepare(tmp_path) out_cfg = OutputConfig(name_mode="none", name_tag="OCR_", original_on_success="delete") with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr): result = process_pdf( src=env["src"], working_dir=env["working"], outgoing_dir=env["outgoing"], error_dir=env["error"], ocr_cfg=OcrConfig(), vera_cfg=VeraPdfConfig(enabled=False), output_cfg=out_cfg, ) assert result.success # Ausgang hat GLEICHEN Namen wie Original assert (env["outgoing"] / "scan.pdf").exists() def test_process_pdf_archive_original(tmp_path: Path) -> None: env = _prepare(tmp_path) out_cfg = OutputConfig(name_mode="prefix", name_tag="OCR_", original_on_success="archive", archive_dir=str(env["archive"])) with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr): result = process_pdf( src=env["src"], working_dir=env["working"], outgoing_dir=env["outgoing"], error_dir=env["error"], ocr_cfg=OcrConfig(), vera_cfg=VeraPdfConfig(enabled=False), output_cfg=out_cfg, ) assert result.success assert (env["outgoing"] / "OCR_scan.pdf").exists() # Original liegt jetzt im Archiv archived = env["archive"] / "scan.pdf" assert archived.exists() assert archived.read_bytes() == b"%PDF-1.4 original\n" def test_process_pdf_archive_name_collision(tmp_path: Path) -> None: """Bei Namens-Kollision im Archiv wird Timestamp angehängt.""" env = _prepare(tmp_path) # Vorhandene Kollisions-Datei (env["archive"] / "scan.pdf").write_bytes(b"old") out_cfg = OutputConfig(name_mode="prefix", name_tag="OCR_", original_on_success="archive", archive_dir=str(env["archive"])) with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr): process_pdf( src=env["src"], working_dir=env["working"], outgoing_dir=env["outgoing"], error_dir=env["error"], ocr_cfg=OcrConfig(), vera_cfg=VeraPdfConfig(enabled=False), output_cfg=out_cfg, ) # Alte Datei unverändert assert (env["archive"] / "scan.pdf").read_bytes() == b"old" # Neue Datei mit Timestamp-Suffix archived = list(env["archive"].glob("scan_*.pdf")) assert len(archived) == 1 assert archived[0].read_bytes() == b"%PDF-1.4 original\n"