Initial commit: PDF OCR Hotfolder v0.1.0

Komplettes Rewrite des alten Bash-Tools `pdf-tool` in Python.
- ocrmypdf als Library, watchdog für Hotfolder, ThreadPool für Parallelität
- Upload-Targets: folder, Nextcloud (WebDAV), SFTP
- E-Mail-Notify, optional veraPDF
- Interaktiver Installer mit Service-User-Support (lokal + AD via SSSD)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-08 00:22:31 +02:00
commit 76c3a991df
16 changed files with 1261 additions and 0 deletions
+3
View File
@@ -0,0 +1,3 @@
"""PDF OCR Hotfolder — Scanner-PDFs automatisch durchsuchbar machen."""
__version__ = "0.1.0"
+57
View File
@@ -0,0 +1,57 @@
"""CLI-Entrypoint."""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
from . import __version__
from .config import load_config
from .service import HotfolderService
def _setup_logging(level: str) -> None:
logging.basicConfig(
level=getattr(logging, level.upper(), logging.INFO),
format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
def main() -> int:
    """Parse the command line, load the configuration and run the service.

    Returns the process exit code: ``2`` when the configuration file does not
    exist, ``0`` otherwise (including after a keyboard interrupt).
    """
    parser = argparse.ArgumentParser(
        prog="pdf-ocr-hotfolder",
        description="Wandelt eingehende PDFs per OCR in durchsuchbare PDFs um.",
    )
    parser.add_argument(
        "--config", "-c",
        default="/etc/pdf-ocr-hotfolder/config.toml",
        help="Pfad zur Konfigurationsdatei (TOML)",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
    )
    parser.add_argument(
        "--once",
        action="store_true",
        help="Nur bestehende Dateien verarbeiten und beenden",
    )
    args = parser.parse_args()

    cfg_path = Path(args.config)
    if not cfg_path.exists():
        print(f"Config nicht gefunden: {cfg_path}", file=sys.stderr)
        return 2

    settings = load_config(cfg_path)
    _setup_logging(settings.log_level)
    service = HotfolderService(settings)

    if not args.once:
        # Long-running mode: block until the service stops or is interrupted.
        try:
            service.run()
        except KeyboardInterrupt:
            pass
        return 0

    # One-shot mode: queue whatever already lies in the incoming directory,
    # then wait for the worker pool to drain before exiting.
    service._ensure_dirs()  # noqa: SLF001
    service._scan_existing()  # noqa: SLF001
    service._executor.shutdown(wait=True)  # noqa: SLF001
    return 0
if __name__ == "__main__":
sys.exit(main())
+129
View File
@@ -0,0 +1,129 @@
"""Konfigurations-Loader (TOML)."""
from __future__ import annotations
import tomllib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
@dataclass
class Paths:
    """The four directories the hotfolder service works with."""

    # Watched directory; PDFs appearing here are picked up for processing.
    incoming: Path
    # Destination directory for successfully processed ``OCR_<name>`` files.
    outgoing: Path
    # Scratch directory a PDF is moved into while it is being processed.
    working: Path
    # Directory that receives files whose processing failed.
    error: Path
@dataclass
class OcrConfig:
    """OCR settings; most fields are forwarded to ``ocrmypdf.ocr``."""

    # Passed to ocrmypdf as ``language``.
    languages: str = "deu+eng"
    # Passed to ocrmypdf as ``jobs``.
    jobs: int = 4
    # Passed to ocrmypdf as ``skip_text``.
    skip_text: bool = True
    # Passed to ocrmypdf as ``oversample``.
    oversample: int = 300
    # Output type becomes ``pdfa-<level>``; an empty string selects plain ``pdf``.
    pdfa_level: str = "2"
    # Passed to ocrmypdf as ``deskew``.
    deskew: bool = True
    # Passed to ocrmypdf as ``clean``.
    clean: bool = False
    # Size of the service's worker thread pool (PDFs processed in parallel).
    max_workers: int = 2
    # NOTE(review): not referenced anywhere in the visible code; presumably a
    # per-file OCR timeout in seconds — confirm before relying on it.
    timeout: int = 1800
@dataclass
class VeraPdfConfig:
    """Settings for the optional veraPDF validation step."""

    # When False the validation step is skipped entirely.
    enabled: bool = False
    # Path of the veraPDF command-line executable.
    binary: str = "/opt/verapdf/verapdf"
    # Value handed to the CLI's ``--flavour`` option.
    flavour: str = "1b"
@dataclass
class FolderUpload:
    """Settings for copying the result into a local folder."""

    # When False the folder upload is a no-op.
    enabled: bool = True
    # Target directory; an empty string means "use the caller's default target".
    target: str = ""
@dataclass
class NextcloudUpload:
    """Settings for uploading the result to Nextcloud via WebDAV."""

    # When False the Nextcloud upload is skipped.
    enabled: bool = False
    # Base URL of the server; ``/remote.php/dav/files/<username>/...`` is appended.
    url: str = ""
    # Used both for HTTP authentication and as part of the WebDAV URL.
    username: str = ""
    # Password for HTTP authentication.
    password: str = ""
    # Folder below the user's WebDAV root that receives the file.
    remote_path: str = ""
    # Forwarded to ``requests`` as ``verify``.
    verify_ssl: bool = True
@dataclass
class SftpUpload:
    """Settings for uploading the result via SFTP."""

    # When False the SFTP upload is skipped.
    enabled: bool = False
    # Server host name and port.
    host: str = ""
    port: int = 22
    # Login user.
    username: str = ""
    # Private key file; only passed to the SSH client when non-empty.
    key_file: str = ""
    # Password; only passed to the SSH client when non-empty.
    password: str = ""
    # Remote directory the file is written into.
    remote_path: str = ""
@dataclass
class EmailNotify:
    """Settings for e-mail notifications sent after a file was processed."""

    # When False no mail is ever sent.
    enabled: bool = False
    # SMTP server address and port.
    smtp_host: str = ""
    smtp_port: int = 587
    # SMTP login; authentication is only attempted when the user is non-empty.
    smtp_user: str = ""
    smtp_password: str = ""
    # Upgrade the connection with STARTTLS before logging in / sending.
    use_starttls: bool = True
    # Sender address and recipient list of the notification mail.
    from_addr: str = ""
    to_addrs: list[str] = field(default_factory=list)
    # When to send: "always", "errors" (failures only) or "never".
    on: str = "errors"  # always | errors | never
@dataclass
class Config:
    """Complete application configuration as produced by ``load_config``."""

    # Working directories ([paths] table).
    paths: Paths
    # OCR settings ([ocr] table).
    ocr: OcrConfig
    # veraPDF validation settings ([verapdf] table).
    verapdf: VeraPdfConfig
    # Upload targets ([upload.folder], [upload.nextcloud], [upload.sftp]).
    folder: FolderUpload
    nextcloud: NextcloudUpload
    sftp: SftpUpload
    # E-mail notification settings ([notify.email] table).
    email: EmailNotify
    # Log level name taken from ``[logging] level``.
    log_level: str = "INFO"
def _section(data: dict[str, Any], *keys: str) -> dict[str, Any]:
cur: Any = data
for k in keys:
cur = cur.get(k, {}) if isinstance(cur, dict) else {}
return cur if isinstance(cur, dict) else {}
def load_config(path: str | Path) -> Config:
    """Load the TOML configuration file at ``path`` into a ``Config``.

    The ``[paths]`` table is mandatory and must define ``incoming``,
    ``outgoing``, ``working`` and ``error``. All other tables are optional;
    keys that are not fields of the corresponding dataclass are ignored and
    missing keys keep their dataclass defaults.

    Raises:
        KeyError: if ``[paths]`` lacks one or more required keys. The message
            names every missing key.
        OSError: if the file cannot be opened.
        tomllib.TOMLDecodeError: if the file is not valid TOML.
    """
    path = Path(path)
    with path.open("rb") as f:
        data = tomllib.load(f)

    raw_paths = _section(data, "paths")
    required = ("incoming", "outgoing", "working", "error")
    missing = [name for name in required if name not in raw_paths]
    if missing:
        # Still a KeyError (as before), but naming the section and all
        # missing keys instead of only the first bare key name.
        raise KeyError(f"[paths] is missing required key(s): {', '.join(missing)}")
    paths = Paths(**{name: Path(raw_paths[name]) for name in required})

    return Config(
        paths=paths,
        ocr=_build_section(OcrConfig, _section(data, "ocr")),
        verapdf=_build_section(VeraPdfConfig, _section(data, "verapdf")),
        folder=_build_section(FolderUpload, _section(data, "upload", "folder")),
        nextcloud=_build_section(NextcloudUpload, _section(data, "upload", "nextcloud")),
        sftp=_build_section(SftpUpload, _section(data, "upload", "sftp")),
        email=_build_section(EmailNotify, _section(data, "notify", "email")),
        log_level=_section(data, "logging").get("level", "INFO"),
    )


def _build_section(cls: Any, raw: dict[str, Any]) -> Any:
    """Instantiate dataclass ``cls`` from ``raw``, dropping undeclared keys."""
    declared = cls.__annotations__
    return cls(**{name: value for name, value in raw.items() if name in declared})
+112
View File
@@ -0,0 +1,112 @@
"""OCR-Verarbeitung einer einzelnen PDF mit ocrmypdf + optional veraPDF."""
from __future__ import annotations
import logging
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
import ocrmypdf
from .config import OcrConfig, VeraPdfConfig
log = logging.getLogger(__name__)
@dataclass
class ProcessResult:
    """Outcome of processing a single PDF."""

    # Original path of the PDF as it was found (before it was moved away).
    source: Path
    # Intended final location of the OCR output in the outgoing directory.
    output: Path
    # True when the output was produced and delivered.
    success: bool
    # Short failure description; empty on success.
    error: str = ""
    # veraPDF verdict; None when validation did not run.
    verapdf_passed: bool | None = None
def run_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Run ocrmypdf in-process (library call, no subprocess) on ``src``.

    The result is written to ``dst``. Exceptions raised by ocrmypdf are not
    caught here.
    """
    # An empty pdfa_level selects plain PDF output instead of PDF/A.
    output_type = f"pdfa-{cfg.pdfa_level}" if cfg.pdfa_level else "pdf"
    options: dict = dict(
        language=cfg.languages,
        jobs=cfg.jobs,
        deskew=cfg.deskew,
        clean=cfg.clean,
        oversample=cfg.oversample,
        progress_bar=False,
        skip_text=cfg.skip_text,
        output_type=output_type,
    )

    log.info("OCR start: %s", src.name)
    ocrmypdf.ocr(str(src), str(dst), **options)
    log.info("OCR done: %s", dst.name)
def run_verapdf(pdf: Path, cfg: VeraPdfConfig) -> bool:
    """Validate ``pdf`` with the veraPDF command-line tool.

    Returns True when validation is disabled, or when veraPDF exits with
    status 0 and its text output contains ``PASS``. Returns False when the
    binary is missing, cannot be executed, times out (300 s) or reports a
    failure. This function never raises for those conditions.
    """
    if not cfg.enabled:
        return True
    if not Path(cfg.binary).exists():
        log.warning("veraPDF binary nicht gefunden: %s", cfg.binary)
        return False
    try:
        result = subprocess.run(
            [cfg.binary, "--flavour", cfg.flavour, "--format", "text", str(pdf)],
            capture_output=True, text=True, timeout=300,
        )
    except subprocess.TimeoutExpired:
        log.error("veraPDF Timeout: %s", pdf.name)
        return False
    except OSError as e:
        # The binary exists but cannot be started (not executable, wrong
        # format, ...). Treat this as a failed validation instead of letting
        # the exception escape into the worker thread.
        log.error("veraPDF konnte nicht gestartet werden (%s): %s", cfg.binary, e)
        return False
    ok = result.returncode == 0 and "PASS" in result.stdout
    log.info("veraPDF %s: %s", "PASS" if ok else "FAIL", pdf.name)
    return ok
def process_pdf(
    src: Path,
    working_dir: Path,
    outgoing_dir: Path,
    error_dir: Path,
    ocr_cfg: OcrConfig,
    vera_cfg: VeraPdfConfig,
) -> ProcessResult:
    """Process one PDF: move to working dir, OCR, optionally validate, deliver.

    Steps:
      1. ``src`` is moved into ``working_dir``.
      2. OCR writes ``OCR_<name>`` next to it.
      3. If veraPDF is enabled, the output is validated.
      4. The output is moved to ``outgoing_dir`` and the working copy of the
         original is deleted.

    Failure handling:
      * move into working dir fails: nothing else is touched.
      * OCR fails: a partial output is removed and the original is moved to
        ``error_dir``.
      * validation fails: the output is moved to ``error_dir`` and the
        working copy of the original is deleted.
      * delivery to ``outgoing_dir`` fails: both the output and the original
        are moved to ``error_dir``, so nothing stays behind in the working
        dir and nothing is lost.

    Returns a ``ProcessResult``; ``output`` always holds the intended final
    path, even on failure.
    """
    work_src = working_dir / src.name
    work_out = working_dir / f"OCR_{src.name}"
    final_out = outgoing_dir / f"OCR_{src.name}"

    try:
        shutil.move(str(src), str(work_src))
    except OSError as e:
        return ProcessResult(src, final_out, False, f"move to working failed: {e}")

    try:
        run_ocr(work_src, work_out, ocr_cfg)
    except Exception as e:  # noqa: BLE001 - ocrmypdf raises many exception types
        log.exception("OCR fehlgeschlagen für %s", src.name)
        _move_to_error(work_src, error_dir)
        # Do not leave a half-written output behind in the working dir.
        try:
            work_out.unlink(missing_ok=True)
        except OSError:
            log.exception("Konnte %s nicht entfernen", work_out)
        return ProcessResult(src, final_out, False, f"ocr failed: {e}")

    vera_ok: bool | None = None
    if vera_cfg.enabled:
        vera_ok = run_verapdf(work_out, vera_cfg)
        if not vera_ok:
            _move_to_error(work_out, error_dir)
            work_src.unlink(missing_ok=True)
            return ProcessResult(src, final_out, False,
                                 "verapdf validation failed", verapdf_passed=False)

    try:
        outgoing_dir.mkdir(parents=True, exist_ok=True)
        shutil.move(str(work_out), str(final_out))
    except OSError as e:
        # Delivery failed (permissions, full disk, ...). Report it as a
        # result instead of raising, and park both files in the error dir.
        log.exception("Konnte %s nicht nach outgoing verschieben", work_out.name)
        _move_to_error(work_out, error_dir)
        _move_to_error(work_src, error_dir)
        return ProcessResult(src, final_out, False,
                             f"move to outgoing failed: {e}", verapdf_passed=vera_ok)

    work_src.unlink(missing_ok=True)
    return ProcessResult(src, final_out, True, verapdf_passed=vera_ok)
def _move_to_error(p: Path, error_dir: Path) -> None:
error_dir.mkdir(parents=True, exist_ok=True)
try:
shutil.move(str(p), str(error_dir / p.name))
except OSError:
log.exception("Konnte %s nicht in error-Verzeichnis verschieben", p)
+173
View File
@@ -0,0 +1,173 @@
"""Hauptservice: Hotfolder via watchdog, ThreadPool für PDF-Verarbeitung."""
from __future__ import annotations
import logging
import signal
import threading
import time
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from watchdog.events import FileSystemEvent, FileSystemEventHandler
from watchdog.observers import Observer
from .config import Config
from .processor import ProcessResult, process_pdf
from .uploaders import notify_email, upload_folder, upload_nextcloud, upload_sftp
log = logging.getLogger(__name__)
def _is_pdf(path: Path) -> bool:
return path.suffix.lower() == ".pdf" and path.is_file()
def _wait_until_stable(path: Path, checks: int = 3, interval: float = 1.0) -> bool:
"""Wartet bis Datei nicht mehr wächst (Scanner schreibt mehrmals)."""
last = -1
stable_count = 0
for _ in range(60): # max ~60s
try:
size = path.stat().st_size
except FileNotFoundError:
return False
if size == last and size > 0:
stable_count += 1
if stable_count >= checks:
return True
else:
stable_count = 0
last = size
time.sleep(interval)
return False
class _Handler(FileSystemEventHandler):
    """watchdog event handler that forwards file events to the service queue.

    Directory events are ignored. For created and closed events the event's
    source path is enqueued; for moved events the destination path is.
    """

    def __init__(self, service: "HotfolderService") -> None:
        self.service = service

    def on_created(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        created = Path(event.src_path)
        self.service.enqueue(created)

    def on_moved(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        moved_to = Path(event.dest_path)
        self.service.enqueue(moved_to)

    def on_closed(self, event: FileSystemEvent) -> None:
        if event.is_directory:
            return
        closed = Path(event.src_path)
        self.service.enqueue(closed)
class HotfolderService:
    """Watches the incoming directory and processes every PDF that appears.

    Each PDF is handed to a thread pool (``cfg.ocr.max_workers`` workers)
    which waits for the file to stop growing, runs OCR, dispatches the
    configured uploads and sends the e-mail notification. Paths that are
    already queued or running are tracked in ``_inflight`` so that several
    file-system events for the same file do not start duplicate jobs.
    """

    def __init__(self, cfg: Config) -> None:
        self.cfg = cfg
        self._executor = ThreadPoolExecutor(
            max_workers=cfg.ocr.max_workers,
            thread_name_prefix="ocr",
        )
        self._observer: Observer | None = None
        self._stop = threading.Event()
        # Resolved paths currently queued or being processed; guarded by _lock.
        self._inflight: set[str] = set()
        self._lock = threading.Lock()

    # ---- Setup ----
    def _ensure_dirs(self) -> None:
        """Create the incoming, outgoing, working and error directories."""
        for p in (self.cfg.paths.incoming, self.cfg.paths.outgoing,
                  self.cfg.paths.working, self.cfg.paths.error):
            p.mkdir(parents=True, exist_ok=True)

    # ---- Lifecycle ----
    def run(self) -> None:
        """Start watching and block until SIGTERM/SIGINT, then shut down."""
        self._ensure_dirs()
        self._scan_existing()
        self._observer = Observer()
        self._observer.schedule(_Handler(self), str(self.cfg.paths.incoming), recursive=False)
        self._observer.start()
        log.info("Hotfolder läuft. Watching: %s", self.cfg.paths.incoming)

        # Both signals only set the stop flag; the loop below notices it.
        signal.signal(signal.SIGTERM, lambda *_: self._stop.set())
        signal.signal(signal.SIGINT, lambda *_: self._stop.set())

        try:
            while not self._stop.is_set():
                self._stop.wait(1.0)
        finally:
            self.shutdown()

    def shutdown(self) -> None:
        """Stop the observer and wait for all queued jobs to finish."""
        log.info("Shutdown läuft...")
        if self._observer:
            self._observer.stop()
            self._observer.join(timeout=5)
        self._executor.shutdown(wait=True, cancel_futures=False)
        log.info("Shutdown ok.")

    # ---- Queue ----
    def _scan_existing(self) -> None:
        """On start-up: pick up PDFs that already lie in the incoming directory."""
        for p in self.cfg.paths.incoming.iterdir():
            if _is_pdf(p):
                self.enqueue(p)

    def enqueue(self, path: Path) -> None:
        """Queue ``path`` for processing unless it is not a PDF or already in flight."""
        if not _is_pdf(path):
            return
        key = str(path.resolve())
        with self._lock:
            if key in self._inflight:
                return
            self._inflight.add(key)
        try:
            fut = self._executor.submit(self._process, path)
        except RuntimeError:
            # submit() raises RuntimeError once the executor has been shut
            # down. Release the key again and drop the late event instead of
            # crashing the watchdog thread.
            with self._lock:
                self._inflight.discard(key)
            log.warning("Service wird beendet, ignoriere: %s", path)
            return
        fut.add_done_callback(lambda f, k=key: self._done(k, f))

    def _done(self, key: str, fut: Future) -> None:
        """Done-callback: release ``key`` and log an exception raised by the worker."""
        with self._lock:
            self._inflight.discard(key)
        if fut.cancelled():
            # exception() would raise CancelledError on a cancelled future.
            return
        exc = fut.exception()
        if exc:
            log.exception("Worker-Exception", exc_info=exc)

    # ---- Processing ----
    def _process(self, path: Path) -> None:
        """Worker body: wait for a stable file, run OCR, upload and notify."""
        if not _wait_until_stable(path):
            log.warning("Datei nicht stabilisiert, überspringe: %s", path)
            return
        if not path.exists():
            return

        result: ProcessResult = process_pdf(
            src=path,
            working_dir=self.cfg.paths.working,
            outgoing_dir=self.cfg.paths.outgoing,
            error_dir=self.cfg.paths.error,
            ocr_cfg=self.cfg.ocr,
            vera_cfg=self.cfg.verapdf,
        )

        if result.success:
            failed = self._dispatch_uploads(result.output)
            if failed:
                # A failed upload must not be reported as "OK": turn the
                # result into an error so the notification reflects it.
                result.success = False
                result.error = "upload failed: " + ", ".join(failed)

        self._notify(result)

    def _dispatch_uploads(self, pdf: Path) -> list[str]:
        """Run all enabled uploads for ``pdf``.

        Every target is attempted even if an earlier one failed. Returns the
        names of the targets that reported failure (empty list when all
        succeeded).
        """
        failed: list[str] = []
        if not upload_folder(pdf, self.cfg.folder, self.cfg.paths.outgoing):
            failed.append("folder")
        if self.cfg.nextcloud.enabled and not upload_nextcloud(pdf, self.cfg.nextcloud):
            failed.append("nextcloud")
        if self.cfg.sftp.enabled and not upload_sftp(pdf, self.cfg.sftp):
            failed.append("sftp")
        return failed

    def _notify(self, result: ProcessResult) -> None:
        """Build subject and body for ``result`` and hand them to ``notify_email``."""
        if result.success:
            subject = f"[pdf-ocr] OK: {result.source.name}"
            body = f"Datei verarbeitet: {result.output}\n"
            if result.verapdf_passed is not None:
                body += f"veraPDF: {'PASS' if result.verapdf_passed else 'FAIL'}\n"
        else:
            subject = f"[pdf-ocr] FEHLER: {result.source.name}"
            body = f"Fehler beim Verarbeiten von {result.source}\n\n{result.error}\n"
        notify_email(self.cfg.email, subject, body, result.success)
+104
View File
@@ -0,0 +1,104 @@
"""Upload-Ziele: lokaler Ordner, Nextcloud (WebDAV), SFTP. Plus E-Mail-Notify."""
from __future__ import annotations
import logging
import smtplib
import ssl
from email.message import EmailMessage
from pathlib import Path
from urllib.parse import quote
import paramiko
import requests
from .config import EmailNotify, FolderUpload, NextcloudUpload, SftpUpload
log = logging.getLogger(__name__)
def upload_folder(pdf: Path, cfg: FolderUpload, default_target: Path) -> bool:
    """Copy ``pdf`` into the configured local folder.

    The target is ``cfg.target`` or, when that is empty, ``default_target``.
    The directory is created if necessary. If ``pdf`` already is the
    destination file, nothing is copied.

    Returns True on success (or when the upload is disabled), False when
    creating the directory or copying failed; the error is logged.
    """
    if not cfg.enabled:
        return True
    import shutil  # local import: this module does not import shutil at file level

    target = Path(cfg.target) if cfg.target else default_target
    dest = target / pdf.name
    try:
        # mkdir is inside the try so that a failure is reported as False
        # like every other I/O error of this upload.
        target.mkdir(parents=True, exist_ok=True)
        if pdf.resolve() == dest.resolve():
            return True
        # Stream the copy instead of loading the whole PDF into memory.
        shutil.copyfile(pdf, dest)
        log.info("Folder upload OK: %s", dest)
        return True
    except OSError as e:
        log.error("Folder upload failed: %s", e)
        return False
def upload_nextcloud(pdf: Path, cfg: NextcloudUpload) -> bool:
    """Upload ``pdf`` to Nextcloud with a WebDAV ``PUT`` request.

    The URL is ``<url>/remote.php/dav/files/<username>/<remote_path>/<file>``
    with every path segment percent-encoded. An empty ``remote_path`` puts
    the file directly into the user's root folder.

    Returns True when the server answers 200, 201 or 204 (or when the upload
    is disabled). Returns False on any other status, on a request error, or
    when the local file cannot be read; the error is logged.
    """
    if not cfg.enabled:
        return True
    base = cfg.url.rstrip("/")
    # Build the path from non-empty segments only, so an empty or "/"
    # remote_path does not produce a double slash in the URL.
    segments = [quote(part) for part in cfg.remote_path.split("/") if part]
    segments.append(quote(pdf.name))
    url = "/".join([f"{base}/remote.php/dav/files/{quote(cfg.username)}", *segments])
    try:
        with pdf.open("rb") as f:
            r = requests.put(url, data=f, auth=(cfg.username, cfg.password),
                             verify=cfg.verify_ssl, timeout=300)
    except (requests.RequestException, OSError) as e:
        # OSError is named explicitly so that a local read failure is clearly
        # part of the handled cases.
        log.error("Nextcloud upload failed: %s", e)
        return False
    if r.status_code in (200, 201, 204):
        log.info("Nextcloud upload OK: %s", pdf.name)
        return True
    log.error("Nextcloud upload HTTP %s: %s", r.status_code, r.text[:200])
    return False
def upload_sftp(pdf: Path, cfg: SftpUpload) -> bool:
    """Upload ``pdf`` via SFTP into ``cfg.remote_path``.

    ``key_file`` and ``password`` are only passed to the SSH client when
    they are non-empty. The SSH client is always closed, including when
    connecting or opening the SFTP session fails.

    Returns True on success (or when the upload is disabled), False on an
    SSH or I/O error; the error is logged.
    """
    if not cfg.enabled:
        return True
    client = paramiko.SSHClient()
    # NOTE(security): AutoAddPolicy accepts unknown host keys without
    # verification, which allows man-in-the-middle attacks. Kept as is to
    # not break existing deployments; consider verifying against known_hosts.
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    connect_kwargs: dict = {
        "hostname": cfg.host, "port": cfg.port, "username": cfg.username,
        "timeout": 30,
    }
    if cfg.key_file:
        connect_kwargs["key_filename"] = cfg.key_file
    if cfg.password:
        connect_kwargs["password"] = cfg.password
    try:
        client.connect(**connect_kwargs)
        sftp = client.open_sftp()
        try:
            remote = f"{cfg.remote_path.rstrip('/')}/{pdf.name}"
            sftp.put(str(pdf), remote)
        finally:
            sftp.close()
        log.info("SFTP upload OK: %s", remote)
        return True
    except (paramiko.SSHException, OSError) as e:
        log.error("SFTP upload failed: %s", e)
        return False
    finally:
        # Runs on every path, so a failed connect()/open_sftp() no longer
        # leaks the client and its transport.
        client.close()
def notify_email(cfg: EmailNotify, subject: str, body: str, success: bool) -> None:
    """Send a plain-text notification mail according to ``cfg``.

    No mail is sent when notifications are disabled, when ``cfg.on`` is
    ``"never"``, or when ``cfg.on`` is ``"errors"`` and ``success`` is True.
    SMTP and I/O errors are logged, not raised.
    """
    suppressed = (
        not cfg.enabled
        or cfg.on == "never"
        or (cfg.on == "errors" and success)
    )
    if suppressed:
        return

    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = cfg.from_addr
    message["To"] = ", ".join(cfg.to_addrs)
    message.set_content(body)

    try:
        with smtplib.SMTP(cfg.smtp_host, cfg.smtp_port, timeout=30) as smtp:
            if cfg.use_starttls:
                tls_context = ssl.create_default_context()
                smtp.starttls(context=tls_context)
            if cfg.smtp_user:
                smtp.login(cfg.smtp_user, cfg.smtp_password)
            smtp.send_message(message)
        log.info("E-Mail-Notify gesendet: %s", subject)
    except (smtplib.SMTPException, OSError) as err:
        log.error("E-Mail-Notify fehlgeschlagen: %s", err)