"""OCR processing of a single PDF with ocrmypdf plus optional veraPDF validation."""

from __future__ import annotations

import logging
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path

import ocrmypdf

from .config import OcrConfig, VeraPdfConfig

log = logging.getLogger(__name__)


@dataclass
class ProcessResult:
    """Outcome of processing one PDF via :func:`process_pdf`."""

    # Path of the input file as it was handed to process_pdf.
    source: Path
    # Intended final path of the OCR'd file in the outgoing directory
    # (set even when processing failed and nothing was written there).
    output: Path
    # True only if the OCR'd file was delivered to ``output``.
    success: bool
    # Short failure description; empty on success.
    error: str = ""
    # veraPDF verdict; None when validation was not enabled or not reached.
    verapdf_passed: bool | None = None


def run_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Run ocrmypdf as a library call (no subprocess overhead).

    Args:
        src: PDF to read.
        dst: Path the OCR'd PDF is written to.
        cfg: OCR settings; ``cfg.pdfa_level`` selects ``pdfa-<level>`` output
            when truthy, otherwise plain ``pdf`` output is requested.

    Raises:
        Whatever ``ocrmypdf.ocr`` raises; nothing is caught here.
    """
    kwargs: dict = {
        "language": cfg.languages,
        "jobs": cfg.jobs,
        "deskew": cfg.deskew,
        "clean": cfg.clean,
        "oversample": cfg.oversample,
        "progress_bar": False,
        "skip_text": cfg.skip_text,
        "output_type": f"pdfa-{cfg.pdfa_level}" if cfg.pdfa_level else "pdf",
    }

    log.info("OCR start: %s", src.name)
    ocrmypdf.ocr(str(src), str(dst), **kwargs)
    log.info("OCR done: %s", dst.name)


def run_verapdf(pdf: Path, cfg: VeraPdfConfig) -> bool:
    """Validate a PDF/A file with the veraPDF CLI.

    Args:
        pdf: File to validate.
        cfg: veraPDF settings (``enabled``, ``binary``, ``flavour``).

    Returns:
        True if validation is disabled or the file is reported as conformant;
        False if the binary cannot be found or started, the run times out, or
        the file is not reported as conformant.
    """
    if not cfg.enabled:
        return True

    # Accept both an explicit path and a bare command name resolvable via PATH.
    binary_found = (
        Path(cfg.binary).exists() or shutil.which(str(cfg.binary)) is not None
    )
    if not binary_found:
        log.warning("veraPDF binary nicht gefunden: %s", cfg.binary)
        return False

    try:
        result = subprocess.run(
            [cfg.binary, "--flavour", cfg.flavour, "--format", "text", str(pdf)],
            capture_output=True,
            text=True,
            # Undecodable bytes in the tool output must not abort validation;
            # the "PASS" marker checked below is plain ASCII.
            errors="replace",
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        log.error("veraPDF Timeout: %s", pdf.name)
        return False
    except OSError:
        # E.g. the file exists but is not executable. Report as "not
        # validated" instead of letting the error escape to the caller.
        log.exception("veraPDF konnte nicht gestartet werden: %s", cfg.binary)
        return False

    ok = result.returncode == 0 and "PASS" in result.stdout
    log.info("veraPDF %s: %s", "PASS" if ok else "FAIL", pdf.name)
    return ok


def process_pdf(
    src: Path,
    working_dir: Path,
    outgoing_dir: Path,
    error_dir: Path,
    ocr_cfg: OcrConfig,
    vera_cfg: VeraPdfConfig,
) -> ProcessResult:
    """Process a single PDF: move -> OCR -> validate -> outgoing/error.

    Args:
        src: Incoming PDF; it is moved out of its current location.
        working_dir: Scratch directory for the source copy and the OCR output.
        outgoing_dir: Destination for successfully processed files.
        error_dir: Destination for files that failed OCR, validation or
            delivery.
        ocr_cfg: Settings passed to :func:`run_ocr`.
        vera_cfg: Settings passed to :func:`run_verapdf`.

    Returns:
        A ProcessResult; ``success`` is True only when the OCR'd file reached
        ``outgoing_dir``. Failures of the individual steps are reported through
        the result rather than raised.
    """
    work_src = working_dir / src.name
    work_out = working_dir / f"OCR_{src.name}"
    final_out = outgoing_dir / f"OCR_{src.name}"

    try:
        # Create the working directory on demand, like outgoing/error dirs.
        working_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(work_src))
    except OSError as e:
        return ProcessResult(src, final_out, False, f"move to working failed: {e}")

    try:
        run_ocr(work_src, work_out, ocr_cfg)
    except Exception as e:  # noqa: BLE001 - ocrmypdf raises many types
        log.exception("OCR fehlgeschlagen für %s", src.name)
        _move_to_error(work_src, error_dir)
        # Do not leave a partially written OCR output behind.
        _remove_quietly(work_out)
        return ProcessResult(src, final_out, False, f"ocr failed: {e}")

    vera_ok: bool | None = None
    if vera_cfg.enabled:
        vera_ok = run_verapdf(work_out, vera_cfg)
        if not vera_ok:
            # Keep the non-conformant OCR output for inspection; the source
            # copy in the working directory is discarded.
            _move_to_error(work_out, error_dir)
            _remove_quietly(work_src)
            return ProcessResult(
                src,
                final_out,
                False,
                "verapdf validation failed",
                verapdf_passed=False,
            )

    try:
        outgoing_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(work_out), str(final_out))
    except OSError as e:
        # Report delivery problems like every other step instead of raising,
        # and keep the original available in the error directory for a retry.
        log.exception("Konnte %s nicht nach %s verschieben", work_out, final_out)
        _move_to_error(work_src, error_dir)
        _remove_quietly(work_out)
        return ProcessResult(
            src,
            final_out,
            False,
            f"move to outgoing failed: {e}",
            verapdf_passed=vera_ok,
        )

    # The result is already delivered; a cleanup problem must not turn the
    # successful run into an exception.
    _remove_quietly(work_src)
    return ProcessResult(src, final_out, True, verapdf_passed=vera_ok)


def _move_to_error(p: Path, error_dir: Path) -> None:
    """Move ``p`` into ``error_dir`` (best effort; failures are only logged)."""
    try:
        # mkdir is inside the try so that this helper never raises OSError
        # while the caller is already handling another failure.
        error_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(p), str(error_dir / p.name))
    except OSError:
        log.exception("Konnte %s nicht in error-Verzeichnis verschieben", p)


def _remove_quietly(p: Path) -> None:
    """Delete ``p`` if it exists; log instead of raising when that fails."""
    try:
        p.unlink(missing_ok=True)
    except OSError:
        log.warning("Konnte %s nicht löschen", p, exc_info=True)