Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| a23a3968ef |
@@ -1,5 +1,20 @@
|
||||
# Changelog
|
||||
|
||||
## [0.3.0] - 2026-04-09
|
||||
|
||||
### Added
|
||||
- Neue Config-Sektion `[output]` mit:
|
||||
- `name_mode` — Platzierung des Tags im Dateinamen: `"prefix"`, `"suffix"` (vor Extension), `"none"`
|
||||
- `name_tag` — verbatim einzufügender String, z.B. `"OCR_"` oder `"_OCR"`
|
||||
- `original_on_success` — `"delete"` (alter Default) oder `"archive"`
|
||||
- `archive_dir` — Zielverzeichnis für `"archive"`, mit Kollisions-Schutz (Timestamp-Suffix)
|
||||
- Runtime-Validierung der Output-Config in `check_output_config()`
|
||||
- 20 neue Tests für `build_output_name()`, `check_output_config()` und `process_pdf()`
|
||||
mit allen Kombinationen aus Modus + Original-Behandlung
|
||||
|
||||
### Changed
|
||||
- `process_pdf()` nimmt jetzt `output_cfg: OutputConfig` als Pflicht-Argument
|
||||
|
||||
## [0.2.2] - 2026-04-09
|
||||
|
||||
### Fixed
|
||||
|
||||
@@ -89,6 +89,22 @@ max_workers = 2 # parallele PDFs
|
||||
timeout = 1800
|
||||
```
|
||||
|
||||
### `[output]`
|
||||
```toml
|
||||
# Dateiname im outgoing/:
|
||||
# "prefix" → OCR_scan.pdf
|
||||
# "suffix" → scan_OCR.pdf (vor der Extension)
|
||||
# "none" → scan.pdf (unverändert)
|
||||
name_mode = "prefix"
|
||||
name_tag = "OCR_"
|
||||
|
||||
# Nach erfolgreichem OCR mit dem Original:
|
||||
# "delete" → löschen
|
||||
# "archive" → in archive_dir verschieben
|
||||
original_on_success = "delete"
|
||||
archive_dir = "" # absoluter Pfad, Pflicht bei "archive"
|
||||
```
|
||||
|
||||
### `[upload.nextcloud]`
|
||||
```toml
|
||||
enabled = true
|
||||
|
||||
@@ -34,6 +34,22 @@ max_workers = 2
|
||||
# Timeout pro PDF in Sekunden
|
||||
timeout = 1800
|
||||
|
||||
[output]
|
||||
# Wie soll die Ziel-Datei im outgoing/-Ordner benannt werden?
|
||||
# "prefix" : name_tag wird vor den Dateinamen gestellt (OCR_scan.pdf)
|
||||
# "suffix" : name_tag wird vor die Extension gestellt (scan_OCR.pdf)
|
||||
# "none" : Dateiname bleibt wie das Original
|
||||
name_mode = "prefix"
|
||||
# Verbatim einzufügender String. Leerer String = kein Tag (wie name_mode = "none").
|
||||
# Beispiele: "OCR_", "[OCR]_", "_OCR", "_searchable"
|
||||
name_tag = "OCR_"
|
||||
# Was passiert mit dem Original, wenn OCR erfolgreich war?
|
||||
# "delete" : Original wird gelöscht (alter Standard)
|
||||
# "archive" : Original wird in archive_dir verschoben
|
||||
original_on_success = "delete"
|
||||
# Absoluter Pfad; nur relevant wenn original_on_success = "archive"
|
||||
archive_dir = ""
|
||||
|
||||
[verapdf]
|
||||
# PDF/A-Validierung (optional)
|
||||
enabled = false
|
||||
|
||||
@@ -28,6 +28,18 @@ class OcrConfig:
|
||||
timeout: int = 1800
|
||||
|
||||
|
||||
@dataclass
class OutputConfig:
    """Settings that control output naming and handling of the original PDF."""

    # Placement of the tag in the file name: "prefix" | "suffix" | "none".
    name_mode: str = "prefix"

    # Tag string, inserted verbatim; an empty string means no tag.
    name_tag: str = "OCR_"

    # What to do with the original after success: "delete" | "archive".
    original_on_success: str = "delete"

    # Absolute path; required when original_on_success == "archive".
    archive_dir: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class VeraPdfConfig:
|
||||
enabled: bool = False
|
||||
@@ -79,6 +91,7 @@ class EmailNotify:
|
||||
class Config:
|
||||
paths: Paths
|
||||
ocr: OcrConfig
|
||||
output: OutputConfig
|
||||
verapdf: VeraPdfConfig
|
||||
folder: FolderUpload
|
||||
nextcloud: NextcloudUpload
|
||||
@@ -109,6 +122,8 @@ def load_config(path: str | Path) -> Config:
|
||||
|
||||
ocr = OcrConfig(**{k: v for k, v in _section(data, "ocr").items()
|
||||
if k in OcrConfig.__annotations__})
|
||||
output = OutputConfig(**{k: v for k, v in _section(data, "output").items()
|
||||
if k in OutputConfig.__annotations__})
|
||||
verapdf = VeraPdfConfig(**{k: v for k, v in _section(data, "verapdf").items()
|
||||
if k in VeraPdfConfig.__annotations__})
|
||||
folder = FolderUpload(**{k: v for k, v in _section(data, "upload", "folder").items()
|
||||
@@ -123,7 +138,7 @@ def load_config(path: str | Path) -> Config:
|
||||
log_level = _section(data, "logging").get("level", "INFO")
|
||||
|
||||
return Config(
|
||||
paths=paths, ocr=ocr, verapdf=verapdf,
|
||||
paths=paths, ocr=ocr, output=output, verapdf=verapdf,
|
||||
folder=folder, nextcloud=nextcloud, sftp=sftp, email=email,
|
||||
log_level=log_level,
|
||||
)
|
||||
|
||||
@@ -7,11 +7,37 @@ import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
from .config import OcrConfig, VeraPdfConfig
|
||||
from .config import OcrConfig, OutputConfig, VeraPdfConfig
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def build_output_name(src_name: str, mode: str, tag: str) -> str:
    """Build the target file name for an OCR'd PDF.

    Args:
        src_name: Original file name, e.g. "scan.pdf".
        mode: "prefix", "suffix" or "none".
        tag: String inserted verbatim; an empty tag leaves the name unchanged.

    Returns:
        ``src_name`` with ``tag`` in front of it ("prefix"), directly before
        the last extension ("suffix"), or unchanged ("none" or empty tag).

    Raises:
        ValueError: If a non-empty tag is combined with an unknown mode.

    Examples:
        prefix "OCR_":  "scan.pdf"        -> "OCR_scan.pdf"
        suffix "_OCR":  "scan.pdf"        -> "scan_OCR.pdf"
        suffix "_OCR":  "scan.tar.gz.pdf" -> "scan.tar.gz_OCR.pdf"
        none:           "scan.pdf"        -> "scan.pdf"
    """
    tagging_disabled = mode == "none" or not tag
    if tagging_disabled:
        return src_name

    if mode == "prefix":
        return tag + src_name

    if mode != "suffix":
        raise ValueError(f"Unbekannter name_mode: {mode!r}")

    # Split off only the last extension so "foo.bar.pdf" keeps "foo.bar".
    parsed = Path(src_name)
    return "".join((parsed.stem, tag, parsed.suffix))
|
||||
|
||||
|
||||
@dataclass
|
||||
class ProcessResult:
|
||||
source: Path
|
||||
@@ -71,11 +97,13 @@ def process_pdf(
|
||||
error_dir: Path,
|
||||
ocr_cfg: OcrConfig,
|
||||
vera_cfg: VeraPdfConfig,
|
||||
output_cfg: OutputConfig,
|
||||
) -> ProcessResult:
|
||||
"""Verarbeitet eine einzelne PDF: move→OCR→validate→outgoing/error."""
|
||||
out_name = build_output_name(src.name, output_cfg.name_mode, output_cfg.name_tag)
|
||||
work_src = working_dir / src.name
|
||||
work_out = working_dir / f"OCR_{src.name}"
|
||||
final_out = outgoing_dir / f"OCR_{src.name}"
|
||||
work_out = working_dir / f"__ocr_{out_name}" # Temp-Name, damit er != src.name ist
|
||||
final_out = outgoing_dir / out_name
|
||||
|
||||
try:
|
||||
shutil.move(str(src), str(work_src))
|
||||
@@ -100,10 +128,38 @@ def process_pdf(
|
||||
|
||||
outgoing_dir.mkdir(parents=True, exist_ok=True)
|
||||
shutil.move(str(work_out), str(final_out))
|
||||
work_src.unlink(missing_ok=True)
|
||||
_dispose_original(work_src, src.name, output_cfg)
|
||||
return ProcessResult(src, final_out, True, verapdf_passed=vera_ok)
|
||||
|
||||
|
||||
def _dispose_original(work_src: Path, original_name: str, cfg: OutputConfig) -> None:
    """Dispose of the original after a successful OCR run: delete or archive it.

    Args:
        work_src: Location of the original inside the working directory.
        original_name: File name the original gets inside the archive.
        cfg: Output settings; ``original_on_success`` selects the action and
            ``archive_dir`` names the archive directory.

    Archive mode with an empty ``archive_dir`` and unknown
    ``original_on_success`` values are logged and fall back to deleting the
    original. Nothing happens if ``work_src`` no longer exists.
    """
    if not work_src.exists():
        return

    mode = cfg.original_on_success

    if mode == "archive" and cfg.archive_dir:
        archive = Path(cfg.archive_dir)
        archive.mkdir(parents=True, exist_ok=True)
        dest = archive / original_name
        if dest.exists():
            from datetime import datetime

            # Name collision: append a timestamp. The timestamp only has
            # second resolution, so keep adding a counter until the name is
            # really free; otherwise the move could overwrite an earlier
            # archived original.
            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
            stem, suffix = dest.stem, dest.suffix
            dest = archive / f"{stem}_{ts}{suffix}"
            counter = 1
            while dest.exists():
                dest = archive / f"{stem}_{ts}_{counter}{suffix}"
                counter += 1
        shutil.move(str(work_src), str(dest))
        log.info("Original archiviert: %s", dest)
        return

    # Everything below ends in deletion; only the log line differs.
    if mode == "archive":
        log.error("original_on_success=archive aber archive_dir ist leer — lösche stattdessen")
    elif mode != "delete":
        log.warning("Unbekannter original_on_success=%r — lösche stattdessen", mode)
    work_src.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def _move_to_error(p: Path, error_dir: Path) -> None:
|
||||
error_dir.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
|
||||
@@ -72,6 +72,20 @@ def detect_ghostscript_version() -> str | None:
|
||||
return result.stdout.strip() or None
|
||||
|
||||
|
||||
def check_output_config(mode: str, archive_dir: str) -> None:
    """Validate the ``[output]`` settings that govern the original file.

    Args:
        mode: Value of ``original_on_success``.
        archive_dir: Value of ``archive_dir``.

    Raises:
        PreflightError: If ``mode`` is neither "delete" nor "archive", or if
            "archive" is selected while ``archive_dir`` is empty.
    """
    allowed = frozenset(("delete", "archive"))
    if mode not in allowed:
        raise PreflightError(
            f"[output].original_on_success={mode!r} ungültig. "
            f"Erlaubt: {sorted(allowed)}"
        )

    if mode != "archive":
        return

    # Archive mode is only usable with a target directory.
    if not archive_dir:
        raise PreflightError(
            "[output].original_on_success='archive' erfordert [output].archive_dir"
        )
|
||||
|
||||
|
||||
def check_preflight(pdfa_level: str = "") -> None:
|
||||
"""Prüft externe Abhängigkeiten.
|
||||
|
||||
@@ -173,6 +187,8 @@ class HotfolderService:
|
||||
|
||||
def run(self) -> None:
|
||||
check_preflight(self.cfg.ocr.pdfa_level)
|
||||
check_output_config(self.cfg.output.original_on_success,
|
||||
self.cfg.output.archive_dir)
|
||||
self.ensure_dirs()
|
||||
self._scan_existing()
|
||||
|
||||
@@ -197,6 +213,8 @@ class HotfolderService:
|
||||
Anzahl fehlgeschlagener PDFs (0 = alles ok).
|
||||
"""
|
||||
check_preflight(self.cfg.ocr.pdfa_level)
|
||||
check_output_config(self.cfg.output.original_on_success,
|
||||
self.cfg.output.archive_dir)
|
||||
self.ensure_dirs()
|
||||
self._scan_existing()
|
||||
self._executor.shutdown(wait=True)
|
||||
@@ -254,6 +272,7 @@ class HotfolderService:
|
||||
error_dir=self.cfg.paths.error,
|
||||
ocr_cfg=self.cfg.ocr,
|
||||
vera_cfg=self.cfg.verapdf,
|
||||
output_cfg=self.cfg.output,
|
||||
)
|
||||
|
||||
with self._lock:
|
||||
|
||||
@@ -11,6 +11,7 @@ from pdf_ocr_hotfolder.config import (
|
||||
FolderUpload,
|
||||
NextcloudUpload,
|
||||
OcrConfig,
|
||||
OutputConfig,
|
||||
Paths,
|
||||
SftpUpload,
|
||||
VeraPdfConfig,
|
||||
@@ -32,6 +33,7 @@ def tmp_config(tmp_path: Path) -> Config:
|
||||
return Config(
|
||||
paths=paths,
|
||||
ocr=OcrConfig(max_workers=1),
|
||||
output=OutputConfig(),
|
||||
verapdf=VeraPdfConfig(enabled=False),
|
||||
folder=FolderUpload(enabled=False),
|
||||
nextcloud=NextcloudUpload(enabled=False),
|
||||
|
||||
@@ -8,7 +8,7 @@ from pdf_ocr_hotfolder.processor import ProcessResult
|
||||
from pdf_ocr_hotfolder.service import HotfolderService
|
||||
|
||||
|
||||
def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera_cfg):
|
||||
def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, **kwargs):
|
||||
out = outgoing_dir / f"OCR_{src.name}"
|
||||
out.parent.mkdir(parents=True, exist_ok=True)
|
||||
out.write_bytes(b"%PDF-1.4 ocr\n")
|
||||
@@ -16,7 +16,7 @@ def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera
|
||||
return ProcessResult(src, out, True)
|
||||
|
||||
|
||||
def _fake_failure(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera_cfg):
|
||||
def _fake_failure(src: Path, working_dir, outgoing_dir, error_dir, **kwargs):
|
||||
error_dir.mkdir(parents=True, exist_ok=True)
|
||||
dest = error_dir / src.name
|
||||
src.rename(dest)
|
||||
|
||||
@@ -0,0 +1,190 @@
|
||||
"""Tests für Feature: konfigurierbare Dateinamen und Original-Behandlung."""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from pdf_ocr_hotfolder.config import OcrConfig, OutputConfig, VeraPdfConfig
|
||||
from pdf_ocr_hotfolder.processor import build_output_name, process_pdf
|
||||
from pdf_ocr_hotfolder.service import PreflightError, check_output_config
|
||||
|
||||
|
||||
# ---------------- build_output_name ----------------
|
||||
|
||||
@pytest.mark.parametrize(
    ("src", "mode", "tag", "expected"),
    [
        # prefix
        ("scan.pdf", "prefix", "OCR_", "OCR_scan.pdf"),
        ("scan.pdf", "prefix", "[OCR] ", "[OCR] scan.pdf"),
        # suffix (tag goes before the extension)
        ("scan.pdf", "suffix", "_OCR", "scan_OCR.pdf"),
        ("scan.pdf", "suffix", "-ocr", "scan-ocr.pdf"),
        # none
        ("scan.pdf", "none", "OCR_", "scan.pdf"),
        # an empty tag behaves like "none"
        ("scan.pdf", "prefix", "", "scan.pdf"),
        ("scan.pdf", "suffix", "", "scan.pdf"),
        # several dots in the name: only the last extension counts
        ("rechnung.2026.pdf", "suffix", "_OCR", "rechnung.2026_OCR.pdf"),
        ("rechnung.2026.pdf", "prefix", "OCR_", "OCR_rechnung.2026.pdf"),
        # name without an extension
        ("NO_EXT", "suffix", "_OCR", "NO_EXT_OCR"),
    ],
)
def test_build_output_name(src, mode, tag, expected) -> None:
    """Each mode/tag combination yields the expected target file name."""
    actual = build_output_name(src, mode, tag)
    assert actual == expected
|
||||
|
||||
|
||||
def test_build_output_name_invalid_mode() -> None:
    """An unknown mode combined with a non-empty tag raises ValueError."""
    with pytest.raises(ValueError) as excinfo:
        build_output_name("x.pdf", "bogus", "OCR_")
    # The message has to name the offending setting.
    assert "name_mode" in str(excinfo.value)
|
||||
|
||||
|
||||
# ---------------- check_output_config ----------------
|
||||
|
||||
def test_check_output_config_delete_ok() -> None:
    """"delete" is valid without an archive directory (must not raise)."""
    check_output_config(mode="delete", archive_dir="")
|
||||
|
||||
|
||||
def test_check_output_config_archive_requires_dir() -> None:
    """"archive" with an empty archive_dir is rejected."""
    with pytest.raises(PreflightError) as excinfo:
        check_output_config(mode="archive", archive_dir="")
    assert "archive_dir" in str(excinfo.value)
|
||||
|
||||
|
||||
def test_check_output_config_archive_with_dir_ok() -> None:
    """"archive" with a non-empty archive_dir is accepted (must not raise)."""
    check_output_config(mode="archive", archive_dir="/var/archive")
|
||||
|
||||
|
||||
def test_check_output_config_invalid_mode() -> None:
    """An unknown original_on_success value is rejected."""
    with pytest.raises(PreflightError) as excinfo:
        check_output_config(mode="trash", archive_dir="")
    assert "ungültig" in str(excinfo.value)
|
||||
|
||||
|
||||
# ---------------- process_pdf mit Original-Behandlung ----------------
|
||||
|
||||
def _fake_ocr(src: Path, dst: Path, cfg: OcrConfig) -> None:
    """Stand-in for the OCR step: write a marker header plus the source bytes to dst."""
    original_bytes = src.read_bytes()
    payload = b"%PDF-1.4 OCRed\n" + original_bytes
    dst.write_bytes(payload)
|
||||
|
||||
|
||||
def _prepare(tmp_path: Path) -> dict:
    """Create the test directory layout and one input PDF below tmp_path.

    Returns a dict with the key "src" (the PDF in incoming/) followed by one
    entry per created directory.
    """
    dir_names = ("working", "outgoing", "error", "archive", "incoming")
    dirs = {name: tmp_path / name for name in dir_names}
    for directory in dirs.values():
        directory.mkdir(parents=True, exist_ok=True)

    src = dirs["incoming"] / "scan.pdf"
    src.write_bytes(b"%PDF-1.4 original\n")
    return {"src": src, **dirs}
|
||||
|
||||
|
||||
def test_process_pdf_prefix_delete(tmp_path: Path) -> None:
    """Prefix mode with "delete": tagged output exists and the original is gone."""
    env = _prepare(tmp_path)
    call_args = {
        "src": env["src"],
        "working_dir": env["working"],
        "outgoing_dir": env["outgoing"],
        "error_dir": env["error"],
        "ocr_cfg": OcrConfig(),
        "vera_cfg": VeraPdfConfig(enabled=False),
        "output_cfg": OutputConfig(
            name_mode="prefix", name_tag="OCR_", original_on_success="delete"
        ),
    }
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(**call_args)

    assert result.success
    assert (env["outgoing"] / "OCR_scan.pdf").exists()
    # The original must be gone from both incoming and working.
    leftovers = [p for p in (env["src"], env["working"] / "scan.pdf") if p.exists()]
    assert not leftovers
|
||||
|
||||
|
||||
def test_process_pdf_suffix_delete(tmp_path: Path) -> None:
    """Suffix mode with "delete": tag sits before the extension, original is gone."""
    env = _prepare(tmp_path)
    out_cfg = OutputConfig(name_mode="suffix", name_tag="_OCR",
                           original_on_success="delete")
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )
    assert result.success
    assert (env["outgoing"] / "scan_OCR.pdf").exists()
    # The test name promises "delete": the original must be gone from both
    # incoming and working, exactly as in the prefix variant.
    assert not env["src"].exists()
    assert not (env["working"] / "scan.pdf").exists()
|
||||
|
||||
|
||||
def test_process_pdf_none_mode(tmp_path: Path) -> None:
    """Mode "none": the output keeps the original file name despite a tag being set."""
    env = _prepare(tmp_path)
    call_args = {
        "src": env["src"],
        "working_dir": env["working"],
        "outgoing_dir": env["outgoing"],
        "error_dir": env["error"],
        "ocr_cfg": OcrConfig(),
        "vera_cfg": VeraPdfConfig(enabled=False),
        "output_cfg": OutputConfig(
            name_mode="none", name_tag="OCR_", original_on_success="delete"
        ),
    }
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(**call_args)

    assert result.success
    # The output carries the SAME name as the original.
    expected_out = env["outgoing"] / "scan.pdf"
    assert expected_out.exists()
|
||||
|
||||
|
||||
def test_process_pdf_archive_original(tmp_path: Path) -> None:
    """Archive mode: the untouched original ends up in archive_dir."""
    env = _prepare(tmp_path)
    call_args = {
        "src": env["src"],
        "working_dir": env["working"],
        "outgoing_dir": env["outgoing"],
        "error_dir": env["error"],
        "ocr_cfg": OcrConfig(),
        "vera_cfg": VeraPdfConfig(enabled=False),
        "output_cfg": OutputConfig(
            name_mode="prefix",
            name_tag="OCR_",
            original_on_success="archive",
            archive_dir=str(env["archive"]),
        ),
    }
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(**call_args)

    assert result.success
    assert (env["outgoing"] / "OCR_scan.pdf").exists()
    # The original now lives in the archive with its content unchanged.
    archived = env["archive"] / "scan.pdf"
    assert archived.exists()
    assert archived.read_bytes() == b"%PDF-1.4 original\n"
|
||||
|
||||
|
||||
def test_process_pdf_archive_name_collision(tmp_path: Path) -> None:
    """On a name collision in the archive, a timestamp is appended to the new file."""
    env = _prepare(tmp_path)
    # Pre-existing file that collides with the original's name.
    (env["archive"] / "scan.pdf").write_bytes(b"old")

    out_cfg = OutputConfig(name_mode="prefix", name_tag="OCR_",
                           original_on_success="archive",
                           archive_dir=str(env["archive"]))
    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        result = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=out_cfg,
        )
    # Check success first so a failed run is reported as such rather than as
    # a confusing "no archived file found" below.
    assert result.success
    # The old file is untouched.
    assert (env["archive"] / "scan.pdf").read_bytes() == b"old"
    # The new file carries a timestamp suffix.
    archived = list(env["archive"].glob("scan_*.pdf"))
    assert len(archived) == 1
    assert archived[0].read_bytes() == b"%PDF-1.4 original\n"
|
||||
Reference in New Issue
Block a user