Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9cdc9ae443 |
@@ -1,5 +1,17 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [0.2.2] - 2026-04-09
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- **Issue #3**: Ghostscript 10.0.0–10.02.0 (Debian 12 default) zerschießen OCR mit PDF/A + `skip_text=true`.
|
||||||
|
- `config.example.toml`: `pdfa_level = ""` als sicherer Default
|
||||||
|
- Runtime-Preflight: Prüft `gs --version` wenn `pdfa_level` gesetzt ist, bricht mit klarer Fehlermeldung ab
|
||||||
|
- `install.sh`: warnt bei betroffenen GS-Versionen mit Upgrade-Hinweis auf bookworm-backports
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- `is_ghostscript_broken()` / `detect_ghostscript_version()` in `pdf_ocr_hotfolder.service`
|
||||||
|
- 19 weitere pytest-Tests für GS-Versions-Detection (parametrisiert) und Preflight-Kombinationen
|
||||||
|
|
||||||
## [0.2.1] - 2026-04-09
|
## [0.2.1] - 2026-04-09
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
|
|||||||
+4
-1
@@ -21,7 +21,10 @@ skip_text = true
|
|||||||
# Auflösung für gerasterte Seiten
|
# Auflösung für gerasterte Seiten
|
||||||
oversample = 300
|
oversample = 300
|
||||||
# PDF/A-Konformitätsstufe ("1", "2", "3" oder leer für keinen PDF/A-Output)
|
# PDF/A-Konformitätsstufe ("1", "2", "3" oder leer für keinen PDF/A-Output)
|
||||||
pdfa_level = "2"
|
# ACHTUNG: Ghostscript 10.0.0 bis 10.02.0 (Debian 12 default!) haben einen Bug,
|
||||||
|
# der mit pdfa_level + skip_text=true ocrmypdf komplett blockiert.
|
||||||
|
# Sicherer Default ist "" — nur auf "1"/"2"/"3" setzen, wenn gs >= 10.02.1 installiert ist.
|
||||||
|
pdfa_level = ""
|
||||||
# Schiefe Scans automatisch begradigen
|
# Schiefe Scans automatisch begradigen
|
||||||
deskew = true
|
deskew = true
|
||||||
# Hintergrund säubern
|
# Hintergrund säubern
|
||||||
|
|||||||
+20
@@ -52,6 +52,26 @@ install_base() {
|
|||||||
icc-profiles-free ca-certificates curl
|
icc-profiles-free ca-certificates curl
|
||||||
log_info "System-Pakete ok ✓"
|
log_info "System-Pakete ok ✓"
|
||||||
|
|
||||||
|
# Ghostscript-Versions-Check (Issue #3)
|
||||||
|
if command -v gs >/dev/null 2>&1; then
|
||||||
|
GS_VER="$(gs --version 2>/dev/null || echo 0.0)"
|
||||||
|
log_info "Ghostscript: $GS_VER"
|
||||||
|
case "$GS_VER" in
|
||||||
|
10.0.0|10.00.0|10.01.*|10.02.0)
|
||||||
|
echo
|
||||||
|
log_warn "═══════════════════════════════════════════════════════════════"
|
||||||
|
log_warn "Ghostscript $GS_VER ist vom PDF/A-Bug betroffen (10.0.0–10.02.0)."
|
||||||
|
log_warn "Mit pdfa_level + skip_text=true kann ocrmypdf KEINE PDFs verarbeiten."
|
||||||
|
log_warn ""
|
||||||
|
log_warn "Workarounds:"
|
||||||
|
log_warn " 1. ghostscript aus bookworm-backports installieren (>=10.02.1)"
|
||||||
|
log_warn " 2. In der Config [ocr].pdfa_level = \"\" setzen (Default ab v0.2.2)"
|
||||||
|
log_warn "═══════════════════════════════════════════════════════════════"
|
||||||
|
echo
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
log_step "Default-User '$DEFAULT_USER' prüfen"
|
log_step "Default-User '$DEFAULT_USER' prüfen"
|
||||||
if id "$DEFAULT_USER" &>/dev/null; then
|
if id "$DEFAULT_USER" &>/dev/null; then
|
||||||
log_info "'$DEFAULT_USER' existiert bereits"
|
log_info "'$DEFAULT_USER' existiert bereits"
|
||||||
|
|||||||
@@ -2,8 +2,10 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import signal
|
import signal
|
||||||
|
import subprocess
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
from concurrent.futures import Future, ThreadPoolExecutor
|
from concurrent.futures import Future, ThreadPoolExecutor
|
||||||
@@ -26,11 +28,58 @@ class PreflightError(RuntimeError):
|
|||||||
# Pflicht-Binaries für ocrmypdf
|
# Pflicht-Binaries für ocrmypdf
|
||||||
_REQUIRED_BINARIES = ("tesseract", "gs")
|
_REQUIRED_BINARIES = ("tesseract", "gs")
|
||||||
|
|
||||||
|
# Ghostscript versions with the known PDF/A + skip_text bug (Issue #3):
# 10.0.0 .. 10.02.0 (inclusive). Usable again from 10.02.1 onwards.
# Both bounds are (major, minor, patch) tuples and are compared inclusively.
_GS_BROKEN_MIN = (10, 0, 0)
_GS_BROKEN_MAX = (10, 2, 0)
|
||||||
|
|
||||||
def check_preflight() -> None:
|
|
||||||
"""Prüft, ob alle externen Abhängigkeiten (Tesseract, Ghostscript) installiert sind.
|
|
||||||
|
|
||||||
Wirft PreflightError mit Liste der fehlenden Binaries.
|
def _parse_version(text: str) -> tuple[int, ...] | None:
|
||||||
|
"""Extrahiert die erste X.Y[.Z] Version aus einem String."""
|
||||||
|
m = re.search(r"(\d+)\.(\d+)(?:\.(\d+))?", text)
|
||||||
|
if not m:
|
||||||
|
return None
|
||||||
|
return tuple(int(x) if x is not None else 0 for x in m.groups())
|
||||||
|
|
||||||
|
|
||||||
|
def is_ghostscript_broken(version: str | None) -> bool:
|
||||||
|
"""Prüft, ob eine Ghostscript-Version vom PDF/A+skip_text Bug betroffen ist.
|
||||||
|
|
||||||
|
Betrifft 10.0.0 bis einschließlich 10.02.0. Ab 10.02.1 wieder sicher.
|
||||||
|
"""
|
||||||
|
if not version:
|
||||||
|
return False
|
||||||
|
parsed = _parse_version(version)
|
||||||
|
if parsed is None:
|
||||||
|
return False
|
||||||
|
# Auf 3-Tupel normalisieren
|
||||||
|
while len(parsed) < 3:
|
||||||
|
parsed = parsed + (0,)
|
||||||
|
parsed = parsed[:3]
|
||||||
|
return _GS_BROKEN_MIN <= parsed <= _GS_BROKEN_MAX
|
||||||
|
|
||||||
|
|
||||||
|
def detect_ghostscript_version() -> str | None:
|
||||||
|
"""Ruft `gs --version` auf und gibt den Versionsstring zurück (oder None)."""
|
||||||
|
gs = shutil.which("gs")
|
||||||
|
if gs is None:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
result = subprocess.run([gs, "--version"], capture_output=True,
|
||||||
|
text=True, timeout=5)
|
||||||
|
except (OSError, subprocess.TimeoutExpired):
|
||||||
|
return None
|
||||||
|
return result.stdout.strip() or None
|
||||||
|
|
||||||
|
|
||||||
|
def check_preflight(pdfa_level: str = "") -> None:
|
||||||
|
"""Prüft externe Abhängigkeiten.
|
||||||
|
|
||||||
|
- Tesseract und Ghostscript müssen im PATH sein
|
||||||
|
- Bei gesetztem pdfa_level wird die Ghostscript-Version gegen den
|
||||||
|
bekannten 10.0.0–10.02.0 Bug geprüft
|
||||||
|
|
||||||
|
Wirft PreflightError bei fehlenden Binaries oder unsicherem Ghostscript.
|
||||||
"""
|
"""
|
||||||
missing = [b for b in _REQUIRED_BINARIES if shutil.which(b) is None]
|
missing = [b for b in _REQUIRED_BINARIES if shutil.which(b) is None]
|
||||||
if missing:
|
if missing:
|
||||||
@@ -39,6 +88,16 @@ def check_preflight() -> None:
|
|||||||
+ ". Bitte installieren: sudo apt install tesseract-ocr ghostscript"
|
+ ". Bitte installieren: sudo apt install tesseract-ocr ghostscript"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if pdfa_level:
|
||||||
|
gs_version = detect_ghostscript_version()
|
||||||
|
if is_ghostscript_broken(gs_version):
|
||||||
|
raise PreflightError(
|
||||||
|
f"Ghostscript {gs_version} ist mit pdfa_level='{pdfa_level}' nicht "
|
||||||
|
"kompatibel (bekannter Bug in 10.0.0–10.02.0). "
|
||||||
|
"Entweder ghostscript auf >=10.02.1 upgraden (z.B. via bookworm-backports) "
|
||||||
|
"oder in der Config [ocr].pdfa_level = \"\" setzen."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _is_pdf(path: Path) -> bool:
|
def _is_pdf(path: Path) -> bool:
|
||||||
return path.suffix.lower() == ".pdf" and path.is_file()
|
return path.suffix.lower() == ".pdf" and path.is_file()
|
||||||
@@ -113,7 +172,7 @@ class HotfolderService:
|
|||||||
# ---- Lifecycle ----
|
# ---- Lifecycle ----
|
||||||
|
|
||||||
def run(self) -> None:
|
def run(self) -> None:
|
||||||
check_preflight()
|
check_preflight(self.cfg.ocr.pdfa_level)
|
||||||
self.ensure_dirs()
|
self.ensure_dirs()
|
||||||
self._scan_existing()
|
self._scan_existing()
|
||||||
|
|
||||||
@@ -137,7 +196,7 @@ class HotfolderService:
|
|||||||
Returns:
|
Returns:
|
||||||
Anzahl fehlgeschlagener PDFs (0 = alles ok).
|
Anzahl fehlgeschlagener PDFs (0 = alles ok).
|
||||||
"""
|
"""
|
||||||
check_preflight()
|
check_preflight(self.cfg.ocr.pdfa_level)
|
||||||
self.ensure_dirs()
|
self.ensure_dirs()
|
||||||
self._scan_existing()
|
self._scan_existing()
|
||||||
self._executor.shutdown(wait=True)
|
self._executor.shutdown(wait=True)
|
||||||
|
|||||||
@@ -0,0 +1,72 @@
|
|||||||
|
"""Tests für Issue #3: Ghostscript 10.0.0–10.02.0 PDF/A-Bug-Erkennung."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from pdf_ocr_hotfolder.service import (
|
||||||
|
PreflightError,
|
||||||
|
check_preflight,
|
||||||
|
is_ghostscript_broken,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("version,expected", [
    # Affected versions
    ("10.0.0", True),
    ("10.00.0", True),
    ("10.01.0", True),
    ("10.01.1", True),
    ("10.01.2", True),
    ("10.02.0", True),
    # Safe versions
    ("10.02.1", False),
    ("10.03.0", False),
    ("10.04.0", False),
    ("11.0.0", False),
    ("9.56.1", False),  # Debian 11 / Ubuntu 22.04
    ("9.55.0", False),
    # Edge cases
    ("", False),
    (None, False),
    ("garbage", False),
])
def test_is_ghostscript_broken(version, expected) -> None:
    """Check the affected-range detection for each listed version string."""
    assert is_ghostscript_broken(version) is expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_preflight_without_pdfa_passes_with_broken_gs() -> None:
    """Without a pdfa_level, an affected Ghostscript must be accepted."""
    # shutil.which is patched so both required binaries appear installed.
    with patch("pdf_ocr_hotfolder.service.shutil.which", return_value="/usr/bin/fake"), \
         patch("pdf_ocr_hotfolder.service.detect_ghostscript_version",
               return_value="10.0.0"):
        check_preflight(pdfa_level="")  # must not raise
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_preflight_with_pdfa_fails_on_broken_gs() -> None:
    """With pdfa_level and an affected Ghostscript, PreflightError is raised.

    The error message must name the detected Ghostscript version.
    """
    # shutil.which is patched so both required binaries appear installed.
    with patch("pdf_ocr_hotfolder.service.shutil.which", return_value="/usr/bin/fake"), \
         patch("pdf_ocr_hotfolder.service.detect_ghostscript_version",
               return_value="10.0.0"):
        with pytest.raises(PreflightError, match="Ghostscript 10.0.0"):
            check_preflight(pdfa_level="2")
|
||||||
|
|
||||||
|
|
||||||
|
def test_check_preflight_with_pdfa_passes_on_fixed_gs() -> None:
    """With pdfa_level and a fixed Ghostscript (10.02.1), preflight passes."""
    # shutil.which is patched so both required binaries appear installed.
    with patch("pdf_ocr_hotfolder.service.shutil.which", return_value="/usr/bin/fake"), \
         patch("pdf_ocr_hotfolder.service.detect_ghostscript_version",
               return_value="10.02.1"):
        check_preflight(pdfa_level="2")  # must not raise
|
||||||
|
|
||||||
|
|
||||||
|
def test_default_config_pdfa_level_is_empty() -> None:
    """The shipped example config must set pdfa_level='' (Issue #3)."""
    from pathlib import Path
    import tomllib

    # config.example.toml is looked up one directory above this test file.
    cfg_path = Path(__file__).parent.parent / "config.example.toml"
    with cfg_path.open("rb") as f:
        data = tomllib.load(f)
    assert data["ocr"]["pdfa_level"] == "", \
        "config.example.toml muss pdfa_level='' als sicheren Default haben"
|
||||||
Reference in New Issue
Block a user