Initial commit: PDF OCR Hotfolder v0.1.0
Komplettes Rewrite des alten Bash-Tools `pdf-tool` in Python.

- ocrmypdf als Library, watchdog für Hotfolder, ThreadPool für Parallelität
- Upload-Targets: folder, Nextcloud (WebDAV), SFTP
- E-Mail-Notify, optional veraPDF
- Interaktiver Installer mit Service-User-Support (lokal + AD via SSSD)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,112 @@
|
||||
"""OCR-Verarbeitung einer einzelnen PDF mit ocrmypdf + optional veraPDF."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import ocrmypdf
|
||||
|
||||
from .config import OcrConfig, VeraPdfConfig
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ProcessResult:
    """Outcome of running a single PDF through the processing pipeline."""

    # Path of the input PDF as it was handed to the pipeline.
    source: Path
    # Path the OCR output is (or would have been) delivered to.
    output: Path
    # True when the PDF was processed and delivered.
    success: bool
    # Failure description; stays empty when no error is reported.
    error: str = ""
    # veraPDF verdict; None when no validation result is available.
    verapdf_passed: bool | None = None
|
||||
|
||||
|
||||
def run_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Run OCR on ``src`` and write the result to ``dst``.

    ocrmypdf is called in-process through its Python API instead of being
    spawned as a subprocess. Any exception raised by ocrmypdf propagates
    to the caller.

    Args:
        src: PDF to read.
        dst: Path the OCR output is written to.
        cfg: OCR settings; ``languages``, ``jobs``, ``deskew``, ``clean``,
            ``oversample``, ``skip_text`` and ``pdfa_level`` are read.
    """
    # A configured PDF/A level selects the matching "pdfa-<level>" output
    # type; without one a plain PDF is produced.
    output_type = f"pdfa-{cfg.pdfa_level}" if cfg.pdfa_level else "pdf"

    log.info("OCR start: %s", src.name)
    ocrmypdf.ocr(
        str(src),
        str(dst),
        language=cfg.languages,
        jobs=cfg.jobs,
        deskew=cfg.deskew,
        clean=cfg.clean,
        oversample=cfg.oversample,
        progress_bar=False,
        skip_text=cfg.skip_text,
        output_type=output_type,
    )
    log.info("OCR done: %s", dst.name)
|
||||
|
||||
|
||||
def run_verapdf(pdf: Path, cfg: VeraPdfConfig) -> bool:
    """Validate ``pdf`` against a PDF/A flavour with the veraPDF CLI.

    Args:
        pdf: File to validate.
        cfg: veraPDF settings; ``enabled``, ``binary`` and ``flavour`` are
            read. ``binary`` may be an explicit path or a command name that
            is looked up on PATH.

    Returns:
        True when validation is disabled or veraPDF reports the file as
        conforming. False when the binary cannot be resolved or started,
        the run exceeds the timeout, or the file does not pass.
    """
    if not cfg.enabled:
        return True

    # shutil.which handles explicit paths as well as bare command names and
    # only returns executable regular files. A plain existence check would
    # accept an empty string (resolves to "."), a directory or a file
    # without execute permission, all of which fail later at launch.
    executable = shutil.which(cfg.binary)
    if executable is None:
        log.warning("veraPDF binary nicht gefunden: %s", cfg.binary)
        return False

    command = [executable, "--flavour", cfg.flavour, "--format", "text", str(pdf)]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=300,
        )
    except subprocess.TimeoutExpired:
        log.error("veraPDF Timeout: %s", pdf.name)
        return False
    except OSError as exc:
        # The binary vanished or became unusable between lookup and launch.
        log.error("veraPDF Start fehlgeschlagen für %s: %s", pdf.name, exc)
        return False

    # Both signals are required: a zero exit status and a PASS marker in
    # the text report.
    ok = result.returncode == 0 and "PASS" in result.stdout
    log.info("veraPDF %s: %s", "PASS" if ok else "FAIL", pdf.name)
    return ok
|
||||
|
||||
|
||||
def process_pdf(
    src: Path,
    working_dir: Path,
    outgoing_dir: Path,
    error_dir: Path,
    ocr_cfg: OcrConfig,
    vera_cfg: VeraPdfConfig,
) -> ProcessResult:
    """Process one PDF: move to working, OCR, validate, then deliver.

    Every failure is reported through the returned ``ProcessResult``
    (``success=False`` plus an ``error`` text) rather than by raising.

    Args:
        src: Incoming PDF; it is moved into ``working_dir`` first.
        working_dir: Scratch directory holding the source and the OCR
            output while processing; created if missing.
        outgoing_dir: Destination for the finished ``OCR_<name>`` file;
            created if missing.
        error_dir: Destination for files that could not be processed.
        ocr_cfg: Settings passed on to ``run_ocr``.
        vera_cfg: Settings passed on to ``run_verapdf``; validation only
            runs when ``vera_cfg.enabled`` is true.

    Returns:
        A ``ProcessResult`` whose ``output`` is always the intended
        delivery path, even when processing failed.
    """
    work_src = working_dir / src.name
    work_out = working_dir / f"OCR_{src.name}"
    final_out = outgoing_dir / f"OCR_{src.name}"

    try:
        # Create the working directory on demand, as is done for the
        # outgoing and error directories.
        working_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(work_src))
    except OSError as e:
        return ProcessResult(src, final_out, False, f"move to working failed: {e}")

    try:
        run_ocr(work_src, work_out, ocr_cfg)
    except Exception as e:  # noqa: BLE001 - ocrmypdf raises many exception types
        log.exception("OCR fehlgeschlagen für %s", src.name)
        # Do not leave a partial or stale output file behind in working.
        _discard_file(work_out)
        _move_to_error(work_src, error_dir)
        return ProcessResult(src, final_out, False, f"ocr failed: {e}")

    vera_ok: bool | None = None
    if vera_cfg.enabled:
        vera_ok = run_verapdf(work_out, vera_cfg)
        if not vera_ok:
            # Keep the non-conforming output for inspection, drop the source.
            _move_to_error(work_out, error_dir)
            _discard_file(work_src)
            return ProcessResult(
                src,
                final_out,
                False,
                "verapdf validation failed",
                verapdf_passed=False,
            )

    try:
        outgoing_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(work_out), str(final_out))
    except OSError as e:
        # A delivery failure must not escape as an exception and strand
        # both files in working; park the OCR output in the error directory.
        log.exception("Auslieferung fehlgeschlagen für %s", src.name)
        _move_to_error(work_out, error_dir)
        _discard_file(work_src)
        return ProcessResult(
            src,
            final_out,
            False,
            f"move to outgoing failed: {e}",
            verapdf_passed=vera_ok,
        )

    # The output is already delivered; a cleanup problem must not turn the
    # result into a failure.
    _discard_file(work_src)
    return ProcessResult(src, final_out, True, verapdf_passed=vera_ok)


def _discard_file(path: Path) -> None:
    """Delete ``path`` if present; log instead of raising when that fails."""
    try:
        path.unlink(missing_ok=True)
    except OSError:
        log.warning("Konnte %s nicht löschen", path, exc_info=True)
|
||||
|
||||
|
||||
def _move_to_error(p: Path, error_dir: Path) -> None:
    """Move ``p`` into ``error_dir`` on a best-effort basis.

    The error directory is created when missing. Any ``OSError`` from
    creating the directory or moving the file is logged and swallowed, so
    this helper is safe to call from failure handlers.

    Args:
        p: File to move; it keeps its file name.
        error_dir: Directory that receives the file.
    """
    try:
        # Directory creation belongs inside the guarded section: it can
        # fail just like the move and must not raise out of this helper.
        error_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(p), str(error_dir / p.name))
    except OSError:
        log.exception("Konnte %s nicht in error-Verzeichnis verschieben", p)
|
||||
Reference in New Issue
Block a user