From a23a3968efca87db622cb7e86f4116b7a543174c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20H=C3=B6fling?= Date: Thu, 9 Apr 2026 22:32:41 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20konfigurierbarer=20Dateiname=20+=20Arch?= =?UTF-8?q?iv-Modus=20f=C3=BCr=20Original=20(v0.3.0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neue [output]-Section: - name_mode: prefix | suffix | none (suffix wird vor Extension eingefügt) - name_tag: verbatim einfügbarer String - original_on_success: delete | archive - archive_dir mit Kollisions-Schutz (Timestamp-Suffix) 20 neue Tests (50 insgesamt, alle grün). Co-Authored-By: Claude Opus 4.6 --- CHANGELOG.md | 15 +++ README.md | 16 +++ VERSION | 2 +- config.example.toml | 16 +++ pdf_ocr_hotfolder/config.py | 17 ++- pdf_ocr_hotfolder/processor.py | 64 ++++++++++- pdf_ocr_hotfolder/service.py | 19 ++++ tests/conftest.py | 2 + tests/test_once_exit_code.py | 4 +- tests/test_output_naming.py | 190 +++++++++++++++++++++++++++++++++ 10 files changed, 337 insertions(+), 8 deletions(-) create mode 100644 tests/test_output_naming.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 95421b1..afca8b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,20 @@ # Changelog +## [0.3.0] - 2026-04-09 + +### Added +- Neue Config-Sektion `[output]` mit: + - `name_mode` — Platzierung des Tags im Dateinamen: `"prefix"`, `"suffix"` (vor Extension), `"none"` + - `name_tag` — verbatim einzufügender String, z.B. 
`"OCR_"` oder `"_OCR"` + - `original_on_success` — `"delete"` (alter Default) oder `"archive"` + - `archive_dir` — Zielverzeichnis für `"archive"`, mit Kollisions-Schutz (Timestamp-Suffix) +- Runtime-Validierung der Output-Config in `check_output_config()` +- 20 neue Tests für `build_output_name()`, `check_output_config()` und `process_pdf()` + mit allen Kombinationen aus Modus + Original-Behandlung + +### Changed +- `process_pdf()` nimmt jetzt `output_cfg: OutputConfig` als Pflicht-Argument + ## [0.2.2] - 2026-04-09 ### Fixed diff --git a/README.md b/README.md index 580df88..4bffa1c 100644 --- a/README.md +++ b/README.md @@ -89,6 +89,22 @@ max_workers = 2 # parallele PDFs timeout = 1800 ``` +### `[output]` +```toml +# Dateiname im outgoing/: +# "prefix" → OCR_scan.pdf +# "suffix" → scan_OCR.pdf (vor der Extension) +# "none" → scan.pdf (unverändert) +name_mode = "prefix" +name_tag = "OCR_" + +# Nach erfolgreichem OCR mit dem Original: +# "delete" → löschen +# "archive" → in archive_dir verschieben +original_on_success = "delete" +archive_dir = "" # absoluter Pfad, Pflicht bei "archive" +``` + ### `[upload.nextcloud]` ```toml enabled = true diff --git a/VERSION b/VERSION index ee1372d..0d91a54 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.2.2 +0.3.0 diff --git a/config.example.toml b/config.example.toml index 42e3354..878e2c5 100644 --- a/config.example.toml +++ b/config.example.toml @@ -34,6 +34,22 @@ max_workers = 2 # Timeout pro PDF in Sekunden timeout = 1800 +[output] +# Wie soll die Ziel-Datei im outgoing/-Ordner benannt werden? +# "prefix" : name_tag wird vor den Dateinamen gestellt (OCR_scan.pdf) +# "suffix" : name_tag wird vor die Extension gestellt (scan_OCR.pdf) +# "none" : Dateiname bleibt wie das Original +name_mode = "prefix" +# Verbatim einzufügender String. Leerer String = kein Tag (wie mode="none"). +# Beispiele: "OCR_", "[OCR]_", "_OCR", "_searchable" +name_tag = "OCR_" +# Was passiert mit dem Original, wenn OCR erfolgreich war? 
@dataclass
class OutputConfig:
    """Settings read from the ``[output]`` section of the configuration.

    They decide how the OCR result file is named and what happens to the
    original PDF once OCR has succeeded.
    """

    # Placement of the tag in the file name: "prefix" | "suffix" | "none".
    name_mode: str = "prefix"
    # Tag text, inserted verbatim; an empty string means "no tag".
    name_tag: str = "OCR_"
    # Fate of the original after a successful run: "delete" | "archive".
    original_on_success: str = "delete"
    # Absolute path; mandatory when original_on_success == "archive".
    archive_dir: str = ""
def build_output_name(src_name: str, mode: str, tag: str) -> str:
    """Build the target file name for an OCRed PDF.

    Args:
        src_name: Name of the original file, e.g. ``"scan.pdf"``.
        mode: Tag placement: ``"prefix"``, ``"suffix"`` or ``"none"``.
        tag: Text inserted verbatim; an empty tag leaves the name untouched.

    Returns:
        ``src_name`` with ``tag`` placed according to ``mode``.

    Raises:
        ValueError: ``tag`` is non-empty and ``mode`` is none of the three
            known modes.

    Examples:
        prefix "OCR_": "scan.pdf"        -> "OCR_scan.pdf"
        suffix "_OCR": "scan.pdf"        -> "scan_OCR.pdf"
        suffix "_OCR": "scan.tar.gz.pdf" -> "scan.tar.gz_OCR.pdf"
        none:          "scan.pdf"        -> "scan.pdf"
    """
    tagging_disabled = mode == "none" or not tag
    if tagging_disabled:
        return src_name
    if mode == "prefix":
        return "{}{}".format(tag, src_name)
    if mode != "suffix":
        raise ValueError(f"Unbekannter name_mode: {mode!r}")
    # Split off only the last extension so inner dots ("foo.bar.pdf") survive.
    original = Path(src_name)
    return "{}{}{}".format(original.stem, tag, original.suffix)
def _dispose_original(work_src: Path, original_name: str, cfg: "OutputConfig") -> None:
    """Dispose of the original PDF after a successful OCR run.

    Depending on ``cfg.original_on_success`` the file is either deleted or
    moved into ``cfg.archive_dir``. If it is no longer present, nothing
    happens.

    Args:
        work_src: Current location of the original (inside the working dir).
        original_name: File name the original had when it was picked up;
            used as the name inside the archive.
        cfg: Output settings; only ``original_on_success`` and
            ``archive_dir`` are read.

    Notes:
        * ``"archive"`` with an empty ``archive_dir`` and any unknown mode
          fall back to deleting the original (an error or warning is
          logged).
        * If the archive already holds a file of the same name, a timestamp
          is appended to the stem. The timestamp only has one-second
          resolution, so a counter is appended as well until an unused name
          is found. Without it, a second collision in the same second would
          overwrite an already archived original.
    """
    if not work_src.exists():
        return
    mode = cfg.original_on_success
    if mode == "delete":
        work_src.unlink(missing_ok=True)
        return
    if mode == "archive":
        if not cfg.archive_dir:
            log.error("original_on_success=archive aber archive_dir ist leer — lösche stattdessen")
            work_src.unlink(missing_ok=True)
            return
        archive = Path(cfg.archive_dir)
        archive.mkdir(parents=True, exist_ok=True)
        dest = archive / original_name
        if dest.exists():
            # Name collision: rename with a timestamp, then keep probing
            # with a counter so an existing archive entry is never replaced.
            from datetime import datetime
            ts = datetime.now().strftime("%Y%m%d-%H%M%S")
            stem, ext = dest.stem, dest.suffix
            dest = archive / f"{stem}_{ts}{ext}"
            attempt = 1
            while dest.exists():
                dest = archive / f"{stem}_{ts}_{attempt}{ext}"
                attempt += 1
        shutil.move(str(work_src), str(dest))
        log.info("Original archiviert: %s", dest)
        return
    log.warning("Unbekannter original_on_success=%r — lösche stattdessen", mode)
    work_src.unlink(missing_ok=True)
@@ -173,6 +187,8 @@ class HotfolderService: def run(self) -> None: check_preflight(self.cfg.ocr.pdfa_level) + check_output_config(self.cfg.output.original_on_success, + self.cfg.output.archive_dir) self.ensure_dirs() self._scan_existing() @@ -197,6 +213,8 @@ class HotfolderService: Anzahl fehlgeschlagener PDFs (0 = alles ok). """ check_preflight(self.cfg.ocr.pdfa_level) + check_output_config(self.cfg.output.original_on_success, + self.cfg.output.archive_dir) self.ensure_dirs() self._scan_existing() self._executor.shutdown(wait=True) @@ -254,6 +272,7 @@ class HotfolderService: error_dir=self.cfg.paths.error, ocr_cfg=self.cfg.ocr, vera_cfg=self.cfg.verapdf, + output_cfg=self.cfg.output, ) with self._lock: diff --git a/tests/conftest.py b/tests/conftest.py index 002b9b6..a0b55af 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -11,6 +11,7 @@ from pdf_ocr_hotfolder.config import ( FolderUpload, NextcloudUpload, OcrConfig, + OutputConfig, Paths, SftpUpload, VeraPdfConfig, @@ -32,6 +33,7 @@ def tmp_config(tmp_path: Path) -> Config: return Config( paths=paths, ocr=OcrConfig(max_workers=1), + output=OutputConfig(), verapdf=VeraPdfConfig(enabled=False), folder=FolderUpload(enabled=False), nextcloud=NextcloudUpload(enabled=False), diff --git a/tests/test_once_exit_code.py b/tests/test_once_exit_code.py index 7471bca..a3f9d1a 100644 --- a/tests/test_once_exit_code.py +++ b/tests/test_once_exit_code.py @@ -8,7 +8,7 @@ from pdf_ocr_hotfolder.processor import ProcessResult from pdf_ocr_hotfolder.service import HotfolderService -def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera_cfg): +def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, **kwargs): out = outgoing_dir / f"OCR_{src.name}" out.parent.mkdir(parents=True, exist_ok=True) out.write_bytes(b"%PDF-1.4 ocr\n") @@ -16,7 +16,7 @@ def _fake_success(src: Path, working_dir, outgoing_dir, error_dir, ocr_cfg, vera return ProcessResult(src, out, True) -def _fake_failure(src: 
@pytest.mark.parametrize(
    ("src", "mode", "tag", "expected"),
    [
        # prefix
        ("scan.pdf", "prefix", "OCR_", "OCR_scan.pdf"),
        ("scan.pdf", "prefix", "[OCR] ", "[OCR] scan.pdf"),
        # suffix (tag goes in front of the extension)
        ("scan.pdf", "suffix", "_OCR", "scan_OCR.pdf"),
        ("scan.pdf", "suffix", "-ocr", "scan-ocr.pdf"),
        # none
        ("scan.pdf", "none", "OCR_", "scan.pdf"),
        # an empty tag behaves like "none"
        ("scan.pdf", "prefix", "", "scan.pdf"),
        ("scan.pdf", "suffix", "", "scan.pdf"),
        # several dots in the name: only the last extension counts
        ("rechnung.2026.pdf", "suffix", "_OCR", "rechnung.2026_OCR.pdf"),
        ("rechnung.2026.pdf", "prefix", "OCR_", "OCR_rechnung.2026.pdf"),
        # name without an extension
        ("NO_EXT", "suffix", "_OCR", "NO_EXT_OCR"),
    ],
)
def test_build_output_name(src, mode, tag, expected) -> None:
    """Every (src, mode, tag) combination yields the expected file name."""
    produced = build_output_name(src, mode, tag)
    assert produced == expected
def _prepare(tmp_path: Path) -> dict:
    """Create the hotfolder directories and one sample PDF below ``tmp_path``.

    Returns:
        A dict with the key ``"src"`` (path of the sample PDF inside
        ``incoming``) followed by one entry per created directory:
        ``"working"``, ``"outgoing"``, ``"error"``, ``"archive"`` and
        ``"incoming"``.
    """
    layout = {}
    for folder in ("working", "outgoing", "error", "archive", "incoming"):
        target = tmp_path / folder
        target.mkdir(parents=True, exist_ok=True)
        layout[folder] = target

    sample = layout["incoming"] / "scan.pdf"
    sample.write_bytes(b"%PDF-1.4 original\n")

    env = {"src": sample}
    env.update(layout)
    return env
def test_process_pdf_archive_original(tmp_path: Path) -> None:
    """With ``original_on_success="archive"`` the original ends up in archive_dir."""
    env = _prepare(tmp_path)
    archive_dir = env["archive"]
    settings = OutputConfig(
        name_mode="prefix",
        name_tag="OCR_",
        original_on_success="archive",
        archive_dir=str(archive_dir),
    )

    with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr):
        outcome = process_pdf(
            src=env["src"],
            working_dir=env["working"],
            outgoing_dir=env["outgoing"],
            error_dir=env["error"],
            ocr_cfg=OcrConfig(),
            vera_cfg=VeraPdfConfig(enabled=False),
            output_cfg=settings,
        )

    assert outcome.success
    assert (env["outgoing"] / "OCR_scan.pdf").exists()
    # The original now lives in the archive with its content unchanged.
    stored = archive_dir / "scan.pdf"
    assert stored.exists()
    assert stored.read_bytes() == b"%PDF-1.4 original\n"
"scan.pdf").write_bytes(b"old") + + out_cfg = OutputConfig(name_mode="prefix", name_tag="OCR_", + original_on_success="archive", + archive_dir=str(env["archive"])) + with patch("pdf_ocr_hotfolder.processor.run_ocr", side_effect=_fake_ocr): + process_pdf( + src=env["src"], + working_dir=env["working"], + outgoing_dir=env["outgoing"], + error_dir=env["error"], + ocr_cfg=OcrConfig(), + vera_cfg=VeraPdfConfig(enabled=False), + output_cfg=out_cfg, + ) + # Alte Datei unverändert + assert (env["archive"] / "scan.pdf").read_bytes() == b"old" + # Neue Datei mit Timestamp-Suffix + archived = list(env["archive"].glob("scan_*.pdf")) + assert len(archived) == 1 + assert archived[0].read_bytes() == b"%PDF-1.4 original\n"