feat: konfigurierbarer Dateiname + Archiv-Modus für Original (v0.3.0)

Neue [output]-Section:
- name_mode: prefix | suffix | none (suffix wird vor Extension eingefügt)
- name_tag: verbatim einfügbarer String
- original_on_success: delete | archive
- archive_dir mit Kollisions-Schutz (Timestamp-Suffix)

20 neue Tests (50 insgesamt, alle grün).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-09 22:32:41 +02:00
parent 9cdc9ae443
commit a23a3968ef
10 changed files with 337 additions and 8 deletions
+16 -1
View File
@@ -28,6 +28,18 @@ class OcrConfig:
timeout: int = 1800
@dataclass
class OutputConfig:
    """Settings read from the ``[output]`` section of the configuration.

    Controls how the OCR result file is named and what happens to the
    original PDF once processing has succeeded.
    """

    # How ``name_tag`` is applied to the output name:
    # "prefix" | "suffix" | "none".
    name_mode: str = "prefix"

    # Tag string, inserted verbatim (empty string = no tag).
    name_tag: str = "OCR_"

    # Fate of the original after a successful run: "delete" | "archive".
    original_on_success: str = "delete"

    # Target directory for archived originals (expected to be an absolute
    # path); mandatory when original_on_success == "archive".
    archive_dir: str = ""
@dataclass
class VeraPdfConfig:
enabled: bool = False
@@ -79,6 +91,7 @@ class EmailNotify:
class Config:
paths: Paths
ocr: OcrConfig
output: OutputConfig
verapdf: VeraPdfConfig
folder: FolderUpload
nextcloud: NextcloudUpload
@@ -109,6 +122,8 @@ def load_config(path: str | Path) -> Config:
ocr = OcrConfig(**{k: v for k, v in _section(data, "ocr").items()
if k in OcrConfig.__annotations__})
output = OutputConfig(**{k: v for k, v in _section(data, "output").items()
if k in OutputConfig.__annotations__})
verapdf = VeraPdfConfig(**{k: v for k, v in _section(data, "verapdf").items()
if k in VeraPdfConfig.__annotations__})
folder = FolderUpload(**{k: v for k, v in _section(data, "upload", "folder").items()
@@ -123,7 +138,7 @@ def load_config(path: str | Path) -> Config:
log_level = _section(data, "logging").get("level", "INFO")
return Config(
paths=paths, ocr=ocr, verapdf=verapdf,
paths=paths, ocr=ocr, output=output, verapdf=verapdf,
folder=folder, nextcloud=nextcloud, sftp=sftp, email=email,
log_level=log_level,
)
+60 -4
View File
@@ -7,11 +7,37 @@ import subprocess
from dataclasses import dataclass
from pathlib import Path
from .config import OcrConfig, VeraPdfConfig
from .config import OcrConfig, OutputConfig, VeraPdfConfig
log = logging.getLogger(__name__)
def build_output_name(src_name: str, mode: str, tag: str) -> str:
    """Build the target file name for an OCR'd PDF.

    Args:
        src_name: Original file name (e.g. "scan.pdf").
        mode: "prefix" | "suffix" | "none".
        tag: String to insert verbatim; an empty tag means "no tag".

    Returns:
        The file name with the tag applied according to ``mode``.

    Raises:
        ValueError: If ``mode`` is not one of the known modes. Note that the
            mode is only checked when a non-empty tag is given; with an
            empty tag the name is returned unchanged whatever the mode is.

    Examples:
        prefix "OCR_":  "scan.pdf"        -> "OCR_scan.pdf"
        suffix "_OCR":  "scan.pdf"        -> "scan_OCR.pdf"
        suffix "_OCR":  "scan.tar.gz.pdf" -> "scan.tar.gz_OCR.pdf"
        none:           "scan.pdf"        -> "scan.pdf"
    """
    if tag and mode != "none":
        if mode == "prefix":
            return f"{tag}{src_name}"
        if mode == "suffix":
            # Split off only the last extension so that a multi-dot name
            # such as "foo.bar.pdf" keeps everything before ".pdf" intact.
            source = Path(src_name)
            return f"{source.stem}{tag}{source.suffix}"
        raise ValueError(f"Unbekannter name_mode: {mode!r}")

    # Nothing to insert: either tagging is disabled or the tag is empty.
    return src_name
@dataclass
class ProcessResult:
source: Path
@@ -71,11 +97,13 @@ def process_pdf(
error_dir: Path,
ocr_cfg: OcrConfig,
vera_cfg: VeraPdfConfig,
output_cfg: OutputConfig,
) -> ProcessResult:
"""Verarbeitet eine einzelne PDF: move→OCR→validate→outgoing/error."""
out_name = build_output_name(src.name, output_cfg.name_mode, output_cfg.name_tag)
work_src = working_dir / src.name
work_out = working_dir / f"OCR_{src.name}"
final_out = outgoing_dir / f"OCR_{src.name}"
work_out = working_dir / f"__ocr_{out_name}" # Temp-Name, damit er != src.name ist
final_out = outgoing_dir / out_name
try:
shutil.move(str(src), str(work_src))
@@ -100,10 +128,38 @@ def process_pdf(
outgoing_dir.mkdir(parents=True, exist_ok=True)
shutil.move(str(work_out), str(final_out))
work_src.unlink(missing_ok=True)
_dispose_original(work_src, src.name, output_cfg)
return ProcessResult(src, final_out, True, verapdf_passed=vera_ok)
def _dispose_original(work_src: Path, original_name: str, cfg: OutputConfig) -> None:
    """Dispose of the original PDF after a successful OCR run.

    Depending on ``cfg.original_on_success`` the file at ``work_src`` is
    either deleted or moved into ``cfg.archive_dir`` under ``original_name``.
    An existing archive entry is not overwritten: on a name collision a
    timestamp is appended to the stem, and if that name is taken as well
    (e.g. several collisions within the same second) a running counter is
    added until an unused name is found.

    The function falls back to deleting the file (and logs the reason) when
    archive mode is requested without an ``archive_dir`` or when the mode is
    unknown. It does nothing if ``work_src`` no longer exists.

    Args:
        work_src: Current location of the original file.
        original_name: File name to use inside the archive directory.
        cfg: Output settings; ``original_on_success`` and ``archive_dir``
            are read.
    """
    if not work_src.exists():
        return

    mode = cfg.original_on_success
    if mode == "delete":
        work_src.unlink(missing_ok=True)
        return

    if mode == "archive":
        if not cfg.archive_dir:
            log.error("original_on_success=archive aber archive_dir ist leer — lösche stattdessen")
            work_src.unlink(missing_ok=True)
            return

        archive = Path(cfg.archive_dir)
        archive.mkdir(parents=True, exist_ok=True)
        dest = archive / original_name

        # On a name collision, rename with a timestamp.
        if dest.exists():
            from datetime import datetime

            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
            stem, ext = dest.stem, dest.suffix
            dest = archive / f"{stem}_{ts}{ext}"
            # The timestamp only has one-second resolution, so the
            # timestamped name may itself already exist; keep counting up
            # rather than replacing a previously archived original.
            counter = 1
            while dest.exists():
                dest = archive / f"{stem}_{ts}_{counter}{ext}"
                counter += 1

        shutil.move(str(work_src), str(dest))
        log.info("Original archiviert: %s", dest)
        return

    log.warning("Unbekannter original_on_success=%r — lösche stattdessen", mode)
    work_src.unlink(missing_ok=True)
def _move_to_error(p: Path, error_dir: Path) -> None:
error_dir.mkdir(parents=True, exist_ok=True)
try:
+19
View File
@@ -72,6 +72,20 @@ def detect_ghostscript_version() -> str | None:
return result.stdout.strip() or None
def check_output_config(mode: str, archive_dir: str) -> None:
    """Validate the ``[output]`` settings that govern the original file.

    Args:
        mode: Value of ``[output].original_on_success``.
        archive_dir: Value of ``[output].archive_dir``.

    Raises:
        PreflightError: If ``mode`` is neither "delete" nor "archive", or if
            "archive" is selected while ``archive_dir`` is empty.
    """
    allowed = {"delete", "archive"}

    problem = None
    if mode not in allowed:
        problem = (
            f"[output].original_on_success={mode!r} ungültig. "
            f"Erlaubt: {sorted(allowed)}"
        )
    elif mode == "archive" and not archive_dir:
        # Archiving needs a destination directory to move the file into.
        problem = "[output].original_on_success='archive' erfordert [output].archive_dir"

    if problem is not None:
        raise PreflightError(problem)
def check_preflight(pdfa_level: str = "") -> None:
"""Prüft externe Abhängigkeiten.
@@ -173,6 +187,8 @@ class HotfolderService:
def run(self) -> None:
check_preflight(self.cfg.ocr.pdfa_level)
check_output_config(self.cfg.output.original_on_success,
self.cfg.output.archive_dir)
self.ensure_dirs()
self._scan_existing()
@@ -197,6 +213,8 @@ class HotfolderService:
Anzahl fehlgeschlagener PDFs (0 = alles ok).
"""
check_preflight(self.cfg.ocr.pdfa_level)
check_output_config(self.cfg.output.original_on_success,
self.cfg.output.archive_dir)
self.ensure_dirs()
self._scan_existing()
self._executor.shutdown(wait=True)
@@ -254,6 +272,7 @@ class HotfolderService:
error_dir=self.cfg.paths.error,
ocr_cfg=self.cfg.ocr,
vera_cfg=self.cfg.verapdf,
output_cfg=self.cfg.output,
)
with self._lock: