From 6f7cadfc6376acd1f1cf45e7486058d7906bcdf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20H=C3=B6fling?= Date: Thu, 9 Apr 2026 07:24:00 +0200 Subject: [PATCH] fix: Preflight-Check und Exit-Code in --once Modus (v0.2.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - #1: check_preflight() prüft beim Start tesseract + gs, wirft PreflightError. CLI endet mit Exit 2 statt grün zu bleiben. - #2: run_once() gibt Anzahl fehlgeschlagener PDFs zurück, CLI endet mit Exit 1 wenn mindestens eine Datei scheiterte. - pytest-Suite mit 11 Tests für beide Szenarien - ocrmypdf-Import lazy in processor.py (Tests ohne ocrmypdf möglich) Closes #1, #2 Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + CHANGELOG.md | 12 +++++ VERSION | 2 +- pdf_ocr_hotfolder/__main__.py | 16 ++++-- pdf_ocr_hotfolder/processor.py | 4 +- pdf_ocr_hotfolder/service.py | 57 +++++++++++++++++++- tests/__init__.py | 0 tests/conftest.py | 52 ++++++++++++++++++ tests/test_once_exit_code.py | 96 ++++++++++++++++++++++++++++++++++ tests/test_preflight.py | 75 ++++++++++++++++++++++++++ 10 files changed, 305 insertions(+), 10 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_once_exit_code.py create mode 100644 tests/test_preflight.py diff --git a/.gitignore b/.gitignore index c4f3e69..60e283c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ venv/ env/ .venv/ +.pytest_cache/ *.egg-info/ build/ dist/ diff --git a/CHANGELOG.md b/CHANGELOG.md index c04cb41..8c856a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## [0.2.1] - 2026-04-09 + +### Fixed +- **Issue #1**: Preflight-Check beim Start prüft jetzt `tesseract` und `gs` (Ghostscript). Fehlt eine Abhängigkeit, beendet sich der Service sofort mit Exit-Code 2 und klarer Fehlermeldung statt erst bei der ersten Datei. +- **Issue #2**: `--once`-Modus liefert jetzt Exit-Code `1`, sobald **mindestens ein** PDF fehlgeschlagen ist. Exit-Code `0` nur bei vollständigem Erfolg (inkl. "keine Dateien vorhanden"). Exit-Code `2` bei Preflight-Fehler. + +### Added +- Public API: `HotfolderService.run_once()`, `.success_count`, `.error_count`, `.ensure_dirs()` +- `check_preflight()` / `PreflightError` in `pdf_ocr_hotfolder.service` +- pytest-Test-Suite (`tests/`) mit 11 Tests — deckt alle Szenarien aus Issue #1 und #2 ab +- `ocrmypdf`-Import in `processor.py` ist jetzt lazy (Tests ohne ocrmypdf-Installation möglich) + ## [0.2.0] - 2026-04-08 ### Added diff --git a/VERSION b/VERSION index 0ea3a94..0c62199 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.0 +0.2.1 diff --git a/pdf_ocr_hotfolder/__main__.py b/pdf_ocr_hotfolder/__main__.py index a4fc3d6..2c23e2f 100644 --- a/pdf_ocr_hotfolder/__main__.py +++ b/pdf_ocr_hotfolder/__main__.py @@ -8,7 +8,7 @@ from pathlib import Path from . import __version__ from .config import load_config -from .service import HotfolderService +from .service import HotfolderService, PreflightError def _setup_logging(level: str) -> None: @@ -40,14 +40,20 @@ def main() -> int: _setup_logging(cfg.log_level) service = HotfolderService(cfg) + if args.once: - service._ensure_dirs() # noqa: SLF001 - service._scan_existing() # noqa: SLF001 - service._executor.shutdown(wait=True) # noqa: SLF001 - return 0 + try: + errors = service.run_once() + except PreflightError as e: + print(f"FEHLER: {e}", file=sys.stderr) + return 2 + return 1 if errors > 0 else 0 try: service.run() + except PreflightError as e: + print(f"FEHLER: {e}", file=sys.stderr) + return 2 except KeyboardInterrupt: pass return 0 diff --git a/pdf_ocr_hotfolder/processor.py b/pdf_ocr_hotfolder/processor.py index b138720..06ce78b 100644 --- a/pdf_ocr_hotfolder/processor.py +++ b/pdf_ocr_hotfolder/processor.py @@ -7,8 +7,6 @@ import subprocess from dataclasses import dataclass from pathlib import Path -import ocrmypdf - from .config import OcrConfig, VeraPdfConfig log = logging.getLogger(__name__) @@ -25,6 +23,8 @@ class ProcessResult: def run_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None: """Führt ocrmypdf als Library-Call aus (kein Subprozess-Overhead).""" + import ocrmypdf # lazy, damit Tests ohne ocrmypdf laufen + kwargs: dict = { "language": cfg.languages, "jobs": cfg.jobs, diff --git a/pdf_ocr_hotfolder/service.py b/pdf_ocr_hotfolder/service.py index 91066f0..93c5a27 100644 --- a/pdf_ocr_hotfolder/service.py +++ b/pdf_ocr_hotfolder/service.py @@ -2,6 +2,7 @@ from __future__ import annotations import logging +import shutil import signal import threading import time @@ -18,6 +19,27 @@ from .uploaders import notify_email, upload_folder, upload_nextcloud, upload_sft log = logging.getLogger(__name__) +class PreflightError(RuntimeError): + """Erforderliche externe Binaries fehlen.""" + + +# Pflicht-Binaries für ocrmypdf +_REQUIRED_BINARIES = ("tesseract", "gs") + + +def check_preflight() -> None: + """Prüft, ob alle externen Abhängigkeiten (Tesseract, Ghostscript) installiert sind. + + Wirft PreflightError mit Liste der fehlenden Binaries. + """ + missing = [b for b in _REQUIRED_BINARIES if shutil.which(b) is None] + if missing: + raise PreflightError( + "Fehlende Abhängigkeiten: " + ", ".join(missing) + + ". Bitte installieren: sudo apt install tesseract-ocr ghostscript" + ) + + def _is_pdf(path: Path) -> bool: return path.suffix.lower() == ".pdf" and path.is_file() @@ -70,10 +92,20 @@ class HotfolderService: self._stop = threading.Event() self._inflight: set[str] = set() self._lock = threading.Lock() + self._success_count = 0 + self._error_count = 0 + + @property + def success_count(self) -> int: + return self._success_count + + @property + def error_count(self) -> int: + return self._error_count # ---- Setup ---- - def _ensure_dirs(self) -> None: + def ensure_dirs(self) -> None: for p in (self.cfg.paths.incoming, self.cfg.paths.outgoing, self.cfg.paths.working, self.cfg.paths.error): p.mkdir(parents=True, exist_ok=True) @@ -81,7 +113,8 @@ class HotfolderService: # ---- Lifecycle ---- def run(self) -> None: - self._ensure_dirs() + check_preflight() + self.ensure_dirs() self._scan_existing() self._observer = Observer() @@ -98,6 +131,20 @@ class HotfolderService: finally: self.shutdown() + def run_once(self) -> int: + """Verarbeitet alle bereits im incoming-Ordner liegenden PDFs und beendet sich. + + Returns: + Anzahl fehlgeschlagener PDFs (0 = alles ok). + """ + check_preflight() + self.ensure_dirs() + self._scan_existing() + self._executor.shutdown(wait=True) + log.info("One-shot fertig: %d ok, %d Fehler", + self._success_count, self._error_count) + return self._error_count + def shutdown(self) -> None: log.info("Shutdown läuft...") if self._observer: @@ -150,6 +197,12 @@ class HotfolderService: vera_cfg=self.cfg.verapdf, ) + with self._lock: + if result.success: + self._success_count += 1 + else: + self._error_count += 1 + if result.success: self._dispatch_uploads(result.output) self._notify(result) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..002b9b6 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,52 @@ +"""Gemeinsame pytest-Fixtures.""" +from __future__ import annotations + +from pathlib import Path + +import pytest + +from pdf_ocr_hotfolder.config import ( + Config, + EmailNotify, + FolderUpload, + NextcloudUpload, + OcrConfig, + Paths, + SftpUpload, + VeraPdfConfig, +) + + +@pytest.fixture +def tmp_config(tmp_path: Path) -> Config: + """Minimal-Config mit tmp_path-Verzeichnissen, alle Uploads deaktiviert.""" + paths = Paths( + incoming=tmp_path / "incoming", + outgoing=tmp_path / "outgoing", + working=tmp_path / "working", + error=tmp_path / "error", + ) + for p in (paths.incoming, paths.outgoing, paths.working, paths.error): + p.mkdir(parents=True, exist_ok=True) + + return Config( + paths=paths, + ocr=OcrConfig(max_workers=1), + verapdf=VeraPdfConfig(enabled=False), + folder=FolderUpload(enabled=False), + nextcloud=NextcloudUpload(enabled=False), + sftp=SftpUpload(enabled=False), + email=EmailNotify(enabled=False), + log_level="DEBUG", + ) + + +@pytest.fixture +def dummy_pdf(tmp_config: Config) -> Path: + """Legt eine Datei mit .pdf-Extension im incoming-Ordner ab. + + Achtung: kein echtes PDF. Für Tests wird `process_pdf` gemockt. + """ + pdf = tmp_config.paths.incoming / "test.pdf" + pdf.write_bytes(b"%PDF-1.4 fake\n") + return pdf diff --git a/tests/test_once_exit_code.py b/tests/test_once_exit_code.py new file mode 100644 index 0000000..7471bca --- /dev/null +++ b/tests/test_once_exit_code.py @@ -0,0 +1,96 @@ +"""Tests für Issue #2: --once Modus muss Exit-Code != 0 bei Fehlern liefern.""" +from __future__ import annotations + +from pathlib import Path +from unittest.mock import patch + +from pdf_ocr_hotfolder.processor import ProcessResult +from pdf_ocr_hotfolder.service import HotfolderService + + +def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera_cfg): + out = outgoing_dir / f"OCR_{src.name}" + out.parent.mkdir(parents=True, exist_ok=True) + out.write_bytes(b"%PDF-1.4 ocr\n") + src.unlink(missing_ok=True) + return ProcessResult(src, out, True) + + +def _fake_failure(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera_cfg): + error_dir.mkdir(parents=True, exist_ok=True) + dest = error_dir / src.name + src.rename(dest) + return ProcessResult(src, outgoing_dir / f"OCR_{src.name}", False, + error="fake ocr failure") + + +def _run(tmp_config, fake_process): + """Helper: führt run_once() mit gemocktem process_pdf und preflight aus.""" + with patch("pdf_ocr_hotfolder.service.check_preflight", return_value=None), \ + patch("pdf_ocr_hotfolder.service.process_pdf", side_effect=fake_process), \ + patch("pdf_ocr_hotfolder.service._wait_until_stable", return_value=True): + service = HotfolderService(tmp_config) + try: + return service.run_once() + finally: + service._executor.shutdown(wait=False) + + +def test_once_exit_0_when_no_files(tmp_config) -> None: + """Szenario: Keine PDFs vorhanden → Exit 0.""" + errors = _run(tmp_config, _fake_success) + assert errors == 0 + + +def test_once_exit_0_when_all_success(tmp_config) -> None: + """Szenario: Alle PDFs erfolgreich → Exit 0.""" + (tmp_config.paths.incoming / "a.pdf").write_bytes(b"%PDF-1.4\n") + (tmp_config.paths.incoming / "b.pdf").write_bytes(b"%PDF-1.4\n") + + errors = _run(tmp_config, _fake_success) + assert errors == 0 + + +def test_once_exit_nonzero_when_all_fail(tmp_config) -> None: + """Szenario: Alle PDFs fehlgeschlagen → Exit != 0 (Issue #2).""" + (tmp_config.paths.incoming / "a.pdf").write_bytes(b"%PDF-1.4\n") + (tmp_config.paths.incoming / "b.pdf").write_bytes(b"%PDF-1.4\n") + + errors = _run(tmp_config, _fake_failure) + assert errors == 2 + + +def test_once_exit_nonzero_when_some_fail(tmp_config) -> None: + """Szenario: Teilweise fehlgeschlagen → Exit != 0.""" + (tmp_config.paths.incoming / "ok.pdf").write_bytes(b"%PDF-1.4\n") + (tmp_config.paths.incoming / "bad.pdf").write_bytes(b"%PDF-1.4\n") + + def mixed(src, *args, **kwargs): + if "bad" in src.name: + return _fake_failure(src, *args, **kwargs) + return _fake_success(src, *args, **kwargs) + + errors = _run(tmp_config, mixed) + assert errors == 1 + + +def test_counters_track_success_and_failure(tmp_config) -> None: + """success_count und error_count sollen korrekt mitzählen.""" + (tmp_config.paths.incoming / "ok.pdf").write_bytes(b"%PDF-1.4\n") + (tmp_config.paths.incoming / "bad.pdf").write_bytes(b"%PDF-1.4\n") + + def mixed(src, *args, **kwargs): + if "bad" in src.name: + return _fake_failure(src, *args, **kwargs) + return _fake_success(src, *args, **kwargs) + + with patch("pdf_ocr_hotfolder.service.check_preflight", return_value=None), \ + patch("pdf_ocr_hotfolder.service.process_pdf", side_effect=mixed), \ + patch("pdf_ocr_hotfolder.service._wait_until_stable", return_value=True): + service = HotfolderService(tmp_config) + try: + service.run_once() + assert service.success_count == 1 + assert service.error_count == 1 + finally: + service._executor.shutdown(wait=False) diff --git a/tests/test_preflight.py b/tests/test_preflight.py new file mode 100644 index 0000000..2edcc15 --- /dev/null +++ b/tests/test_preflight.py @@ -0,0 +1,75 @@ +"""Tests für Issue #1: Preflight-Check bei fehlendem Tesseract.""" +from __future__ import annotations + +import sys +from unittest.mock import patch + +import pytest + +from pdf_ocr_hotfolder.service import ( + HotfolderService, + PreflightError, + check_preflight, +) + + +def test_preflight_passes_when_all_binaries_present() -> None: + """Wenn tesseract + gs im PATH sind, darf kein Fehler fliegen.""" + with patch("pdf_ocr_hotfolder.service.shutil.which", return_value="/usr/bin/fake"): + check_preflight() # darf nicht werfen + + +def test_preflight_fails_when_tesseract_missing() -> None: + """Fehlendes tesseract → PreflightError mit passender Meldung.""" + def fake_which(name: str) -> str | None: + return None if name == "tesseract" else "/usr/bin/fake" + + with patch("pdf_ocr_hotfolder.service.shutil.which", side_effect=fake_which): + with pytest.raises(PreflightError, match="tesseract"): + check_preflight() + + +def test_preflight_fails_when_ghostscript_missing() -> None: + def fake_which(name: str) -> str | None: + return None if name == "gs" else "/usr/bin/fake" + + with patch("pdf_ocr_hotfolder.service.shutil.which", side_effect=fake_which): + with pytest.raises(PreflightError, match="gs"): + check_preflight() + + +def test_preflight_lists_all_missing_binaries() -> None: + """Bei mehreren fehlenden Binaries werden alle genannt.""" + with patch("pdf_ocr_hotfolder.service.shutil.which", return_value=None): + with pytest.raises(PreflightError) as exc_info: + check_preflight() + msg = str(exc_info.value) + assert "tesseract" in msg + assert "gs" in msg + + +def test_run_once_raises_preflight_error(tmp_config) -> None: + """HotfolderService.run_once() wirft PreflightError, wenn tesseract fehlt.""" + service = HotfolderService(tmp_config) + try: + with patch("pdf_ocr_hotfolder.service.shutil.which", return_value=None): + with pytest.raises(PreflightError): + service.run_once() + finally: + service._executor.shutdown(wait=False) + + +def test_main_returns_2_on_preflight_error(tmp_config, tmp_path, monkeypatch) -> None: + """CLI liefert Exit-Code 2 bei Preflight-Fehler (Issue #1 Szenario).""" + cfg_file = tmp_path / "cfg.toml" + cfg_file.write_text(f""" +[paths] +incoming = "{tmp_config.paths.incoming}" +outgoing = "{tmp_config.paths.outgoing}" +working = "{tmp_config.paths.working}" +error = "{tmp_config.paths.error}" +""") + monkeypatch.setattr(sys, "argv", ["pdf-ocr-hotfolder", "--config", str(cfg_file), "--once"]) + with patch("pdf_ocr_hotfolder.service.shutil.which", return_value=None): + from pdf_ocr_hotfolder.__main__ import main + assert main() == 2