SAFEHR-data · p-j-smith · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -50,21 +50,21 @@ services:
   marker:
     profiles: [marker]
     build:
-      context: src/ocr/marker
+      context: packages/ocr/marker
       dockerfile: Dockerfile
       args:
         <<: *build-args-common
         MARKER_API_PORT: ${MARKER_API_PORT}
     environment:
       <<: [*proxy-common, *common-env]
-      CONTAINER_DATA_FOLDER: /data
+      DATA_FOLDER: /data
       MARKER_API_PORT: ${MARKER_API_PORT}
     env_file:
       - ./.env
     ports:
       - "${MARKER_API_PORT}:${MARKER_API_PORT}"
     volumes:
-      - ${HOST_DATA_FOLDER}:${CONTAINER_DATA_FOLDER:-/data}
+      - ${PWD}/${DATA_FOLDER}:/data
     networks:
       - pyonb_ocr_api
     healthcheck:
@@ -84,21 +84,21 @@ services:
   paddleocr:
     profiles: [paddleocr]
     build:
-      context: src/ocr/paddleocr
+      context: packages/ocr/paddleocr
       dockerfile: Dockerfile
       args:
         <<: *build-args-common
         PADDLEOCR_API_PORT: ${PADDLEOCR_API_PORT}
     environment:
       <<: [*proxy-common, *common-env]
-      CONTAINER_DATA_FOLDER: /data
+      DATA_FOLDER: /data
       PADDLEOCR_API_PORT: ${PADDLEOCR_API_PORT}
     env_file:
       - ./.env
     ports:
       - "${PADDLEOCR_API_PORT}:${PADDLEOCR_API_PORT}"
     volumes:
-      - ${HOST_DATA_FOLDER}:${CONTAINER_DATA_FOLDER:-/data}
+      - ${PWD}/${DATA_FOLDER}:/data
     networks:
       - pyonb_ocr_api
     healthcheck:

diff --git a/packages/ocr/marker/Dockerfile b/packages/ocr/marker/Dockerfile
@@ -0,0 +1,19 @@
+FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
+
+SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
+
+WORKDIR /app
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+COPY ./pyproject.toml .
+COPY ./README.md .
+COPY ./src src/
+
+RUN uv venv
+RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked uv sync --no-editable --no-dev
+
+# make uvicorn etc available
+ENV PATH="/app/.venv/bin:$PATH"
+
+CMD uvicorn pyonb_marker.api:app --host 0.0.0.0 --port "$MARKER_API_PORT" --workers 4 --use-colors
diff --git a/packages/ocr/marker/README.md b/packages/ocr/marker/README.md
@@ -0,0 +1,30 @@
+# Instructions
+
+## Python
+
+First install `pyonb_marker`. From the top-level `pyonb` directory:
+
+```shell
+uv sync --extra marker
+```
+
+Then, to convert a PDF to markdown:
+
+```python
+import pyonb_marker
+
+result = pyonb_marker.convert_pdf_to_markdown(
+    filepath="path/to/data/input.pdf",
+)
+```
+
+## Docker compose
+
+From the `pyonb/packages/ocr/marker` directory:
+
+```shell
+docker compose run marker data/ms-note-one-page.pdf data/output.md
+```
+
+Note, you will need to set `DATA_FOLDER` in a `.env` file to the folder that
+contains your input PDFs, e.g.: `DATA_FOLDER=path/to/data`.
diff --git a/src/ocr/marker/docker-compose.yml → packages/ocr/marker/docker-compose.yml b/src/ocr/marker/docker-compose.yml → packages/ocr/marker/docker-compose.yml
diff --git a/packages/ocr/marker/pyproject.toml b/packages/ocr/marker/pyproject.toml
@@ -0,0 +1,19 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = ["hatchling"]
+
+[project]
+dependencies = [
+    "accelerate",
+    "fastapi[standard]",
+    "marker-pdf",
+    "ollama",
+    "python-dotenv",
+    "requests",
+    "uvicorn",
+]
+description = "pyonb wrapper around marker"
+name = "pyonb-marker"
+readme = "README.md"
+requires-python = ">=3.11"
+version = "0.1.0"
diff --git a/src/ocr/marker/__init__.py → ...s/ocr/marker/src/pyonb_marker/__init__.py b/src/ocr/marker/__init__.py → ...s/ocr/marker/src/pyonb_marker/__init__.py
diff --git a/src/ocr/marker/api.py → packages/ocr/marker/src/pyonb_marker/api.py b/src/ocr/marker/api.py → packages/ocr/marker/src/pyonb_marker/api.py
@@ -8,8 +8,11 @@
 from fastapi import FastAPI, File, HTTPException, UploadFile, status
 from fastapi.responses import JSONResponse, RedirectResponse
 
+from pyonb_marker.main import convert_pdf_to_markdown
+
+_today = datetime.datetime.now(datetime.UTC).strftime("%Y_%m_%d")  # type: ignore[attr-defined] # mypy complains that 'Module has no attribute "UTC"'
 logging.basicConfig(
-    filename="marker." + datetime.datetime.now(tz=datetime.UTC).strftime("%Y%m%d") + ".log",
+    filename=f"marker-{_today}.log",
     format="%(asctime)s %(message)s",
     filemode="a",
 )
@@ -18,18 +21,6 @@
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
 
-# TODO(tom): improve imports - below try statements horrible
-try:
-    # local
-    from .main import run_marker
-except Exception:
-    logger.exception("Detected inside Docker container.")
-    # Docker container
-    try:
-        from main import run_marker  # type: ignore  # noqa: PGH003
-    except Exception:
-        logger.exception("Marker imports not possible.")
-
 app = FastAPI(swagger_ui_parameters={"tryItOutEnabled": True})
 
 
@@ -71,7 +62,7 @@ async def inference(file: Annotated[UploadFile, File()] = None) -> JSONResponse:
                 # marker requires path to file rather than UploadFile object, so create temp copy of file
                 with Path(f"temp_api_file_{file.filename}").open("wb") as f:  # noqa: ASYNC230
                     f.write(content)
-                result, _ = run_marker(f"temp_api_file_{file.filename}")
+                result = convert_pdf_to_markdown(f"temp_api_file_{file.filename}")
             except Exception as e:
                 raise HTTPException(status_code=400, detail=f"Failed to run marker. Error: {e}") from e
         else:

diff --git a/src/ocr/marker/main.py → packages/ocr/marker/src/pyonb_marker/main.py b/src/ocr/marker/main.py → packages/ocr/marker/src/pyonb_marker/main.py
@@ -22,37 +22,44 @@ def setup_converter(config, config_parser) -> PdfConverter:  # noqa: ANN001
     )
 
 
-def convert_pdf_to_markdown(file_path: str | Path, output_format: str | Path = "markdown", use_llm: bool = True):  # noqa: ANN201
-    """Convert the PDF to markdown using Marker and optionally use LLM for improved accuracy."""
+def convert_pdf_to_markdown(  # noqa: ANN201
+    file_path: str | Path,
+    output_format: str | Path = "markdown",
+    use_llm: bool = True,
+):
+    """Convert the PDF to markdown using Marker and optionally use LLM for improved accuracy.
+
+    Args:
+        file_path: path to the PDF file to convert.
+        output_format: output format handed to Marker through the config dict.
+        use_llm: value of ``use_llm`` in the Marker config (the configured LLM service is Ollama).
+
+    Returns:
+        The text returned by ``text_from_rendered`` for the converted document.
+
+    Raises:
+        Exception: any error raised while converting or extracting text is logged
+            and then re-raised unchanged.
+    """
+    config = {
+        "output_format": output_format,
+        "use_llm": use_llm,
+        "llm_service": "marker.services.ollama.OllamaService",
+        "ollama_model": "llama3.2",
+        "ollama_base_url": "http://localhost:11434",
+        "disable_images": True,
+    }
+    config_parser = ConfigParser(config)
+    converter = setup_converter(config_parser.generate_config_dict(), config_parser)
     try:
-        # Optionally enable LLM for improved accuracy
-        config = {
-            "output_format": output_format,
-            "use_llm": use_llm,
-            "llm_service": "marker.services.ollama.OllamaService",
-            "ollama_model": "llama3.2",
-            "ollama_base_url": "http://localhost:11434",
-        }
-        config_parser = ConfigParser(config)
-        # Create the converter with the necessary settings
-        converter = setup_converter(config_parser.generate_config_dict(), config_parser)
-
-        # Process the PDF file and convert to the specified output format
         rendered = converter(str(file_path))
-
-        # Extract the text (Markdown, JSON, or HTML) from the rendered object
-        text, _, images = text_from_rendered(rendered)
+        text, _, _ = text_from_rendered(rendered)
     except Exception:
         logger.exception("Error processing PDF.")
-    else:
-        return text, images
-
-
-def run_marker(input_pdf_path: str | Path):  # noqa: ANN201
-    """Execute marker."""
-    res, images = convert_pdf_to_markdown(file_path=input_pdf_path, use_llm=True, output_format="json")
+        # Re-raise the real error; falling through would hit `return text` with `text` unbound.
+        raise
 
-    return res, images
+    return text
 
 
 if __name__ == "__main__":
@@ -64,11 +56,11 @@ def run_marker(input_pdf_path: str | Path):  # noqa: ANN201
     input_pdf_path = Path(sys.argv[1])
     output_txt_path = Path(sys.argv[2])
 
-    res, images = run_marker(input_pdf_path)
+    text = convert_pdf_to_markdown(input_pdf_path)
 
     try:
         with output_txt_path.open("w", encoding="utf-8") as f:
-            f.write(res)
+            f.write(text)
 
         logger.info("Text extracted to %s", output_txt_path)
 

diff --git a/packages/ocr/paddleocr/Dockerfile b/packages/ocr/paddleocr/Dockerfile
@@ -0,0 +1,35 @@
+FROM ghcr.io/astral-sh/uv:python3.13-bookworm AS app
+
+SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"]
+
+
+RUN --mount=type=cache,target=/var/cache/apt \
+    --mount=type=cache,target=/var/lib/apt \
+    apt-get update \
+    && apt-get install -y --no-install-recommends \
+       ccache \
+       cmake \
+       curl \
+       ffmpeg \
+       libpoppler-cpp-dev \
+       libsm6 \
+       libxext6 \
+       pkg-config \
+       poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+
+COPY ./pyproject.toml .
+COPY ./README.md .
+COPY ./src src/
+
+RUN uv venv
+RUN --mount=type=cache,target=/root/.cache/uv,sharing=locked uv sync --no-editable --no-dev
+
+# make uvicorn etc available
+ENV PATH="/app/.venv/bin:$PATH"
+
+CMD uvicorn pyonb_paddleocr.api:app --host 0.0.0.0 --port "$PADDLEOCR_API_PORT" --workers 4 --use-colors
diff --git a/packages/ocr/paddleocr/README.md b/packages/ocr/paddleocr/README.md
@@ -0,0 +1,24 @@
+# Instructions
+
+Before using the `paddleocr` API for OCR, you will need to set the `PADDLEOCR_API_PORT`
+environment variable in the top-level `.env` file.
+
+## Docker Compose
+
+You will need to define the `OCR_FORWARDING_API_PORT` in the `.env` file.
+
+Then, spin up the `ocr-forwarding-api` and `paddleocr` services:
+
+```shell
+docker-compose --profile paddleocr up --build --detach
+```
+
+You can then use `curl` to send a PDF to the forwarding API:
+
+```shell
+curl -v -X POST http://127.0.0.1:8110/paddleocr/inference_single \
+  -F "file=@document.pdf" \
+  -H "accept: application/json"
+```
+
+Note, this assumes you have set `OCR_FORWARDING_API_PORT` to `8110`.
diff --git a/packages/ocr/paddleocr/pyproject.toml b/packages/ocr/paddleocr/pyproject.toml
@@ -0,0 +1,22 @@
+[build-system]
+build-backend = "hatchling.build"
+requires = ["hatchling"]
+
+[project]
+dependencies = [
+    "fastapi",
+    "paddleocr==2.10.0",
+    "paddlepaddle",
+    "pdf2image",
+    "pillow",
+    "python-multipart",
+    "python-poppler",
+    "requests",
+    "setuptools",
+    "uvicorn",
+]
+description = "pyonb wrapper around paddleocr"
+name = "pyonb-paddleocr"
+readme = "README.md"
+requires-python = ">=3.11"
+version = "0.1.0"
diff --git a/src/ocr/paddleocr/app/__init__.py → ...paddleocr/src/pyonb_paddleocr/__init__.py b/src/ocr/paddleocr/app/__init__.py → ...paddleocr/src/pyonb_paddleocr/__init__.py