# rw-deepseek-ocr/backend/main.py

import os
import re
import uuid
import tempfile
import shutil
import base64
from typing import List, Dict, Any, Optional
from contextlib import asynccontextmanager
from datetime import datetime, timezone

from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
from pydantic import BaseModel
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image
import uvicorn
from decouple import config as env_config

# Import PDF and document conversion utilities
from pdf_utils import (
    pdf_to_images_high_quality,
    images_to_pdf,
    extract_ref_patterns,
    crop_images_from_refs,
    clean_markdown_content
)
from format_converter import DocumentConverter
from database import init_db, get_db

# Directory where committed job images are written; read from the
# OCR_IMAGES_DIR setting and defaulting to /data/ocr_images.
OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images")

# -----------------------------
# Lifespan context for model loading
# -----------------------------
# Module-level handles to the loaded model and tokenizer. Both stay None until
# lifespan() assigns them at startup; the OCR and PDF handlers answer 503 while
# either one is still None.
model = None
tokenizer = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the model and tokenizer on startup; log a message on shutdown.

    Startup steps, in order: create the image storage directory, initialise
    the database (a failure is logged and startup continues), load the
    tokenizer and model named by the MODEL_NAME setting, move the model to
    CUDA, and make sure a pad token is configured. The loaded objects are
    published through the module-level ``model`` and ``tokenizer`` globals.
    """
    global model, tokenizer

    # Image storage directory
    os.makedirs(OCR_IMAGES_DIR, exist_ok=True)

    # Database: best-effort, the OCR endpoints do not need it.
    try:
        init_db()
    except Exception as exc:
        print(f"Warning: database initialization failed: {exc}")

    # Environment setup
    os.environ.pop("TRANSFORMERS_CACHE", None)
    model_name = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR")
    # NOTE(review): the HF_HOME directory is created here but the value is not
    # exported or passed to from_pretrained -- confirm the process environment
    # already carries it.
    hf_home = env_config("HF_HOME", default="/models")
    os.makedirs(hf_home, exist_ok=True)

    # Load model
    print(f"🚀 Loading {model_name}...")
    torch_dtype = torch.bfloat16

    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
    )

    model = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        use_safetensors=True,
        attn_implementation="eager",
        torch_dtype=torch_dtype,
    ).eval().to("cuda")

    # Pad token setup: fall back to the EOS token when no pad token exists and
    # mirror the tokenizer's pad id onto the model config. This stays
    # best-effort, but a failure is now reported instead of silently dropped.
    try:
        if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None:
            tokenizer.pad_token = tokenizer.eos_token
        if getattr(model.config, "pad_token_id", None) is None and getattr(tokenizer, "pad_token_id", None) is not None:
            model.config.pad_token_id = tokenizer.pad_token_id
    except Exception as exc:
        print(f"Warning: pad token setup failed: {exc}")

    print("✅ Model loaded and ready!")

    yield

    # Cleanup
    print("🛑 Shutting down...")

# -----------------------------
# FastAPI app
# -----------------------------
app = FastAPI(
    title="DeepSeek-OCR API",
    description="Blazing fast OCR with DeepSeek-OCR model 🔥",
    version="2.0.0",
    lifespan=lifespan
)

# CORS middleware for React frontend
# CORS_ORIGINS is a comma-separated list of allowed origins; blank entries are
# dropped, and an empty list falls back to the local dev server below.
CORS_ORIGINS = env_config("CORS_ORIGINS", default="").split(",")
CORS_ORIGINS = [o.strip() for o in CORS_ORIGINS if o.strip()]

app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS if CORS_ORIGINS else ["http://localhost:3000"],
    allow_credentials=True,
    # PUT and DELETE must be listed as well: this file defines
    # PUT /api/jobs/{job_id}/review and DELETE /api/jobs/{job_id}, and a
    # cross-origin preflight for a method missing here is refused.
    allow_methods=["GET", "POST", "PUT", "DELETE"],
    allow_headers=["*"],
)

# -----------------------------
# Prompt builder
# -----------------------------
def build_prompt(
    mode: str,
    user_prompt: str,
    grounding: bool,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Assemble the model prompt for the requested OCR mode.

    The prompt is a newline-joined sequence: the ``<image>`` tag, an optional
    ``<|grounding|>`` tag, and the instruction text.

    Args:
        mode: OCR mode name; unknown modes fall back to "OCR this image."
        user_prompt: Instruction used verbatim (stripped) in ``freeform`` mode.
        grounding: Request the grounding tag explicitly. The ``find_ref``,
            ``layout_map`` and ``pii_redact`` modes always add it.
        find_term: Term to locate in ``find_ref`` mode; defaults to "Total".
        schema: JSON schema text for ``kv_json`` mode; defaults to "{}".
        include_caption: Append a request for a one-paragraph description
            (ignored in ``describe`` mode, which already is one).

    Returns:
        The complete prompt string.
    """
    default_instruction = "OCR this image."

    # Modes whose instruction does not depend on any request parameter.
    fixed_instructions = {
        "plain_ocr": "Free OCR.",
        "markdown": "Convert the document to markdown.",
        "tables_csv": (
            "Extract every table and output CSV only. "
            "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
        ),
        "tables_md": "Extract every table as GitHub-flavored Markdown tables. Output only the tables.",
        "figure_chart": (
            "Parse the figure. First extract any numeric series as a two-column table (x,y). "
            "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
        ),
        "layout_map": (
            'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
            '"box":[x1,y1,x2,y2]}. Do not include any text content.'
        ),
        "pii_redact": (
            'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
            'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
        ),
        "multilingual": "Free OCR. Detect the language automatically and output in the same script.",
        "describe": "Describe this image. Focus on visible key elements.",
    }

    parts: List[str] = ["<image>"]
    if grounding or mode in {"find_ref", "layout_map", "pii_redact"}:
        parts.append("<|grounding|>")

    # Parameterised modes treat a missing or whitespace-only value as absent,
    # so the model never receives an empty instruction or an empty schema.
    if mode == "kv_json":
        schema_text = (schema or "").strip() or "{}"
        instruction = (
            "Extract key fields and return strict JSON only. "
            f"Use this schema (fill the values): {schema_text}"
        )
    elif mode == "find_ref":
        key = (find_term or "").strip() or "Total"
        instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
    elif mode == "freeform":
        instruction = (user_prompt or "").strip() or default_instruction
    else:
        instruction = fixed_instructions.get(mode, default_instruction)

    if include_caption and mode != "describe":
        instruction = instruction + "\nThen add a one-paragraph description of the image."

    parts.append(instruction)
    return "\n".join(parts)

# -----------------------------
# Grounding parser
# -----------------------------
# Match one detection block and capture its coordinates as the entire list expression
# Examples of captured coords (including outer brackets):
#  - [[312, 339, 480, 681]]
#  - [[504, 700, 625, 910], [771, 570, 996, 996]]
#  - [[110, 310, 255, 800], [312, 343, 479, 680], ...]
# The bracket capture is lazy but must be followed by <|/det|>, so it still
# extends over every inner list of ONE block while stopping at that block's
# own closing tag. A greedy capture would run on to the last ']' in the text
# and merge every detection into a single, unparseable match.
DET_BLOCK = re.compile(
    r"<\|ref\|>(?P<label>.*?)<\|/ref\|>\s*<\|det\|>\s*(?P<coords>\[.*?\])\s*<\|/det\|>",
    re.DOTALL,
)

def clean_grounding_text(text: str) -> str:
    """Strip grounding markup from model output, keeping each label.

    Every ``<|ref|>label<|/ref|><|det|>[...]<|/det|>`` block is replaced by
    its label, standalone ``<|grounding|>`` tags are removed, and the result
    is stripped of surrounding whitespace.
    """
    # The coordinate list is matched lazily up to the block's own <|/det|>.
    # A greedy match would run from the first block's '[' to the last block's
    # ']' and delete all the text between the two detections.
    cleaned = re.sub(
        r"<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>\s*\[.*?\]\s*<\|/det\|>",
        r"\1",
        text,
        flags=re.DOTALL,
    )
    # Also remove any standalone grounding tags
    cleaned = re.sub(r"<\|grounding\|>", "", cleaned)
    return cleaned.strip()

def parse_detections(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
    """Extract labelled bounding boxes from grounded model output.

    Each DET_BLOCK match contributes one entry per box. Coordinates are
    divided by 999 and multiplied by the image width or height, then
    truncated to int. Accepted coordinate shapes:

    - flat:   <|ref|>label<|/ref|><|det|>[x1,y1,x2,y2]<|/det|>
    - nested: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2], ...]<|/det|>

    A block whose coordinates cannot be parsed is logged and skipped.
    Returns a list of ``{"label": str, "box": [x1, y1, x2, y2]}`` dicts.
    """
    import ast

    def _scale(value, extent: int) -> int:
        # Map a normalized coordinate onto the pixel grid.
        return int(float(value) / 999 * extent)

    results: List[Dict[str, Any]] = []
    for match in DET_BLOCK.finditer(text or ""):
        label = match.group("label").strip()
        raw_coords = match.group("coords").strip()

        print(f"🔍 DEBUG: Found detection for '{label}'")
        print(f"📦 Raw coords string (with brackets): {raw_coords}")

        try:
            # literal_eval only accepts Python literals, so the bracket
            # expression is parsed without executing anything.
            parsed = ast.literal_eval(raw_coords)
            if not isinstance(parsed, list):
                raise ValueError("Unsupported coords structure")

            is_flat_box = len(parsed) == 4 and all(
                isinstance(n, (int, float)) for n in parsed
            )
            if is_flat_box:
                # A bare [x1,y1,x2,y2] is wrapped so both shapes share one loop.
                candidates = [parsed]
                print("📦 Single box (flat list) detected")
            else:
                candidates = parsed
                print(f"📦 Boxes detected: {len(candidates)}")

            for number, box in enumerate(candidates, start=1):
                if not (isinstance(box, (list, tuple)) and len(box) >= 4):
                    print(f"  ⚠️ Skipping invalid box: {box}")
                    continue
                x1 = _scale(box[0], image_width)
                y1 = _scale(box[1], image_height)
                x2 = _scale(box[2], image_width)
                y2 = _scale(box[3], image_height)
                print(f"  Box {number}: {box} → [{x1}, {y1}, {x2}, {y2}]")
                results.append({"label": label, "box": [x1, y1, x2, y2]})
        except Exception as e:
            print(f"❌ Parsing failed: {e}")
            continue

    print(f"🎯 Total boxes parsed: {len(results)}")
    return results

# -----------------------------
# Routes
# -----------------------------
@app.get("/")
async def root():
    # Landing payload: a liveness message plus the path of the interactive docs.
    payload = {"message": "DeepSeek-OCR API is running! 🚀", "docs": "/docs"}
    return payload

@app.get("/health")
async def health():
    # "model_loaded" reports whether the module-level model handle is set yet.
    model_loaded = model is not None
    return {"status": "healthy", "model_loaded": model_loaded}

@app.post("/api/ocr")
async def ocr_inference(
    image: UploadFile = File(...),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    grounding: bool = Form(False),
    include_caption: bool = Form(False),
    find_term: Optional[str] = Form(None),
    schema: Optional[str] = Form(None),
    base_size: int = Form(1024),
    image_size: int = Form(640),
    crop_mode: bool = Form(True),
    test_compress: bool = Form(False),
):
    """
    Perform OCR inference on uploaded image

    - **image**: Image file to process
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
    - **grounding**: Enable grounding boxes
    - **include_caption**: Add image description
    - **find_term**: Term to find (for find_ref mode)
    - **schema**: JSON schema (for kv_json mode)
    - **base_size**: Base processing size
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode
    - **test_compress**: Test compression
    """
    # Refuse work until startup has published both the model and the tokenizer.
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    prompt_text = build_prompt(
        mode=mode,
        user_prompt=prompt,
        grounding=grounding,
        find_term=find_term,
        schema=schema,
        include_caption=include_caption,
    )

    def _text_of(result) -> str:
        # The model may hand back a string, a dict with "text", or a sequence.
        if isinstance(result, str):
            return result.strip()
        if isinstance(result, dict) and "text" in result:
            return str(result["text"]).strip()
        if isinstance(result, (list, tuple)):
            return "\n".join(map(str, result)).strip()
        return ""

    saved_path = None
    work_dir = None
    try:
        # Persist the upload: inference takes a file path, not bytes.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as handle:
            payload = await image.read()
            handle.write(payload)
            saved_path = handle.name

        # Original pixel size is needed to scale boxes; None when unreadable.
        dims = None
        try:
            with Image.open(saved_path) as probe:
                dims = probe.size
        except Exception:
            dims = None
        orig_w, orig_h = dims if dims is not None else (None, None)

        work_dir = tempfile.mkdtemp(prefix="dsocr_")

        raw_result = model.infer(
            tokenizer,
            prompt=prompt_text,
            image_file=saved_path,
            output_path=work_dir,
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=test_compress,
            eval_mode=True,
        )
        text = _text_of(raw_result)

        # Fallback: the model may have written its answer to result.mmd instead.
        fallback_file = os.path.join(work_dir, "result.mmd")
        if not text and os.path.exists(fallback_file):
            with open(fallback_file, "r", encoding="utf-8") as fh:
                text = fh.read().strip()
        if not text:
            text = "No text returned by model."

        has_ref = "<|ref|>" in text
        has_det = "<|det|>" in text

        # Boxes are scaled against the real image size (1x1 when it is unknown).
        if has_det or has_ref:
            boxes = parse_detections(text, orig_w or 1, orig_h or 1)
        else:
            boxes = []

        # The display text drops grounding markup but keeps the labels.
        if has_ref or "<|grounding|>" in text:
            display_text = clean_grounding_text(text)
        else:
            display_text = text

        # Nothing left after cleaning: show the detected labels instead.
        if not display_text and boxes:
            display_text = ", ".join(b["label"] for b in boxes)

        metadata = {
            "mode": mode,
            "grounding": grounding or (mode in {"find_ref","layout_map","pii_redact"}),
            "base_size": base_size,
            "image_size": image_size,
            "crop_mode": crop_mode
        }
        return JSONResponse({
            "success": True,
            "text": display_text,
            "raw_text": text,  # raw model output, kept for debugging
            "boxes": boxes,
            "image_dims": {"w": orig_w, "h": orig_h},
            "metadata": metadata,
        })

    except Exception as e:
        # Details go to the log only; the client gets a generic message.
        print(f"OCR inference error: {type(e).__name__}: {str(e)}")
        raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")

    finally:
        # Best-effort removal of the temp image and the scratch directory.
        if saved_path:
            try:
                os.remove(saved_path)
            except Exception:
                pass
        if work_dir:
            shutil.rmtree(work_dir, ignore_errors=True)

@app.post("/api/process-pdf")
async def process_pdf(
    pdf_file: UploadFile = File(...),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    output_format: str = Form("markdown"),  # markdown, html, docx, json
    grounding: bool = Form(False),
    include_caption: bool = Form(False),
    extract_images: bool = Form(True),
    dpi: int = Form(144),
    base_size: int = Form(1024),
    image_size: int = Form(640),
    crop_mode: bool = Form(True),
):
    """
    Process PDF document with OCR and convert to various formats

    - **pdf_file**: PDF file to process
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
    - **output_format**: Output format (markdown, html, docx, json)
    - **grounding**: Enable grounding boxes
    - **include_caption**: Add image descriptions
    - **extract_images**: Extract images from PDF
    - **dpi**: PDF rendering resolution (default: 144)
    - **base_size**: Base processing size
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode
    """
    # Function-scope import: only the in-memory JPEG encoding below needs it.
    import io

    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Validate output format
    if output_format not in ["markdown", "html", "docx", "json"]:
        raise HTTPException(status_code=400, detail="Invalid output format. Must be: markdown, html, docx, or json")

    try:
        # Read PDF file
        pdf_bytes = await pdf_file.read()

        # Convert PDF to images
        print(f"📄 Converting PDF to images (DPI: {dpi})...")
        images = pdf_to_images_high_quality(pdf_bytes, dpi=dpi)
        total_pages = len(images)
        print(f"✅ Converted {total_pages} pages")

        pages_content = []
        converter = DocumentConverter()

        # The prompt depends only on request parameters, so build it once
        # instead of once per page.
        prompt_text = build_prompt(
            mode=mode,
            user_prompt=prompt,
            grounding=grounding,
            find_term=None,
            schema=None,
            include_caption=include_caption,
        )

        for page_idx, img in enumerate(images):
            print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")

            tmp_img = None
            out_dir = None
            try:
                # Inference takes a file path, so the page is written to disk.
                # The path is recorded before saving so the finally block can
                # remove the file even when the save itself fails.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    tmp_img = tmp.name
                    img.save(tmp, format="PNG")

                orig_w, orig_h = img.size
                out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")

                # Run inference
                res = model.infer(
                    tokenizer,
                    prompt=prompt_text,
                    image_file=tmp_img,
                    output_path=out_dir,
                    base_size=base_size,
                    image_size=image_size,
                    crop_mode=crop_mode,
                    save_results=False,
                    test_compress=False,
                    eval_mode=True,
                )

                # Normalize response: string, dict with "text", or sequence.
                if isinstance(res, str):
                    text = res.strip()
                elif isinstance(res, dict) and "text" in res:
                    text = str(res["text"]).strip()
                elif isinstance(res, (list, tuple)):
                    text = "\n".join(map(str, res)).strip()
                else:
                    text = ""

                # Fallback: the model may have written result.mmd instead.
                if not text:
                    mmd = os.path.join(out_dir, "result.mmd")
                    if os.path.exists(mmd):
                        with open(mmd, "r", encoding="utf-8") as fh:
                            text = fh.read().strip()
                if not text:
                    text = f"No text returned for page {page_idx + 1}."

                # Extract images if requested
                page_images = []
                if extract_images:
                    matches, matches_image, matches_other = extract_ref_patterns(text)
                    if matches_image:
                        cropped = crop_images_from_refs(img, matches)
                        for cropped_img in cropped:
                            # Encode in memory. The earlier temp-file round
                            # trip never closed its handle and left the file
                            # behind whenever saving or reading failed.
                            jpeg_buffer = io.BytesIO()
                            cropped_img.save(jpeg_buffer, format="JPEG", quality=95)
                            page_images.append(
                                base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')
                            )

                        # Clean the text and add image placeholders
                        text = clean_markdown_content(text, matches_image, matches_other)
                        # NOTE(review): each placeholder is prepended, so they
                        # end up in descending index order ahead of the text.
                        # Confirm the converter expects that.
                        for img_idx in range(len(page_images)):
                            text = f"[IMAGE_{img_idx}]\n" + text

                # Parse grounding boxes
                boxes = parse_detections(text, orig_w, orig_h) if ("<|det|>" in text or "<|ref|>" in text) else []

                # Clean grounding tags from display text
                display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text

                pages_content.append({
                    'page_number': page_idx + 1,
                    'text': display_text,
                    'raw_text': text,
                    'boxes': boxes,
                    'images': page_images,
                    'image_dims': {'w': orig_w, 'h': orig_h}
                })

            finally:
                # Best-effort removal of this page's temp image and scratch dir.
                if tmp_img:
                    try:
                        os.remove(tmp_img)
                    except Exception:
                        pass
                if out_dir:
                    shutil.rmtree(out_dir, ignore_errors=True)

        print(f"✅ Processed all {total_pages} pages")

        # Convert to requested format
        if output_format == "json":
            return JSONResponse({
                "success": True,
                "total_pages": total_pages,
                "pages": pages_content,
                "metadata": {
                    "mode": mode,
                    "grounding": grounding,
                    "extract_images": extract_images,
                    "dpi": dpi
                }
            })
        elif output_format == "markdown":
            md_content = converter.to_markdown(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([md_content.encode('utf-8')]),
                media_type="text/markdown",
                headers={"Content-Disposition": "attachment; filename=ocr_result.md"}
            )
        elif output_format == "html":
            html_content = converter.to_html(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([html_content.encode('utf-8')]),
                media_type="text/html",
                headers={"Content-Disposition": "attachment; filename=ocr_result.html"}
            )
        elif output_format == "docx":
            docx_buffer = converter.to_docx(pages_content, include_images=extract_images)
            return StreamingResponse(
                docx_buffer,
                media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                headers={"Content-Disposition": "attachment; filename=ocr_result.docx"}
            )

    except Exception as e:
        # Full traceback goes to the log; the client gets a generic message.
        import traceback
        print(f"Error processing PDF: {e}")
        print(traceback.format_exc())
        raise HTTPException(status_code=500, detail="An internal error occurred during PDF processing.")

# -----------------------------
# Job management routes
# -----------------------------

class ReviewRequest(BaseModel):
    # Request body for PUT /api/jobs/{job_id}/review.
    # Required: the corrected text and the name of the person who reviewed it.
    reviewed_text: str
    reviewer_name: str
    # Optional job metadata. review_job writes all four to the row, storing
    # NULL for any value that is missing or empty.
    author: Optional[str] = None
    book: Optional[str] = None
    chapter: Optional[str] = None
    page: Optional[str] = None


def _job_row_to_dict(row) -> Dict[str, Any]:
    """Return a plain-dict copy of a DB row with JSON-friendly values.

    ``datetime`` values become ISO-8601 strings and values whose type is
    named ``UUID`` become their string form; everything else is copied as is.
    """
    def _plain(value):
        if isinstance(value, datetime):
            return value.isoformat()
        # Matched by type name, so the check does not depend on which UUID
        # class the database driver hands back.
        if type(value).__name__ == 'UUID':
            return str(value)
        return value

    return {column: _plain(value) for column, value in dict(row).items()}


# Extensions a stored job image may carry. The upload filename is untrusted
# and the stored file is served back later, so anything outside this set
# (for example ".html" or ".svg") is never used as the on-disk suffix.
_JOB_IMAGE_EXTENSIONS = frozenset(
    {".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp", ".tif", ".tiff"}
)

# Fallback extension by declared content type.
_JOB_IMAGE_CONTENT_TYPES = {
    "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg",
    "image/webp": ".webp", "image/gif": ".gif", "image/bmp": ".bmp",
}


def _job_image_extension(filename: str, content_type: Optional[str]) -> str:
    """Pick a safe extension: filename suffix if allowed, else content type, else .png."""
    ext = os.path.splitext(filename)[1].lower()
    if ext in _JOB_IMAGE_EXTENSIONS:
        return ext
    return _JOB_IMAGE_CONTENT_TYPES.get((content_type or "").lower(), ".png")


@app.post("/api/jobs")
async def commit_job(
    image: UploadFile = File(...),
    author: str = Form(""),
    book: str = Form(""),
    chapter: str = Form(""),
    page: str = Form(""),
    ocr_text: str = Form(""),
    mode: str = Form("plain_ocr"),
):
    """Commit an OCR job: save the image and insert a DB record."""
    job_id = str(uuid.uuid4())

    # The original filename is kept for the DB record only; the file on disk
    # is named after the job id plus an allow-listed extension.
    original_filename = image.filename or "image"
    ext = _job_image_extension(original_filename, image.content_type)
    image_path = os.path.join(OCR_IMAGES_DIR, f"{job_id}{ext}")

    try:
        content = await image.read()
        with open(image_path, "wb") as f:
            f.write(content)
    except Exception as exc:
        print(f"Job commit image save error: {exc}")
        # Do not leave a partially written file behind.
        try:
            os.remove(image_path)
        except OSError:
            pass
        raise HTTPException(status_code=500, detail="Failed to save image file.") from exc

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # Empty form fields are stored as NULL rather than ''.
                cur.execute(
                    """
                    INSERT INTO ocr_jobs
                        (id, author, book, chapter, page, image_path, original_filename,
                         ocr_text, mode, status)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
                    RETURNING *
                    """,
                    (job_id, author or None, book or None, chapter or None,
                     page or None, image_path, original_filename,
                     ocr_text or None, mode),
                )
                row = cur.fetchone()
    except Exception as exc:
        # Clean up saved image if DB insert fails
        try:
            os.remove(image_path)
        except OSError:
            pass
        # Unique constraint violation (author + chapter + page already exists)
        if getattr(exc, 'pgcode', None) == '23505':
            raise HTTPException(
                status_code=409,
                detail="A job with this Author, Chapter, and Page already exists."
            )
        print(f"Job commit DB error: {exc}")
        raise HTTPException(status_code=500, detail="Failed to save job to database.")

    return JSONResponse(_job_row_to_dict(row), status_code=201)


def _ilike_contains(term: str) -> str:
    """Build a 'contains' ILIKE pattern that matches *term* literally."""
    # Backslash is the default LIKE escape character in PostgreSQL. It is
    # escaped first so the escapes added for % and _ are not doubled.
    escaped = term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return f"%{escaped}%"


@app.get("/api/jobs")
async def list_jobs(
    search: Optional[str] = Query(None, description="General text search across all fields"),
    author: Optional[str] = Query(None),
    book: Optional[str] = Query(None),
    chapter: Optional[str] = Query(None),
    status: Optional[str] = Query(None, description="unreviewed | reviewed"),
    limit: int = Query(20, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """Search and list jobs. All filters are optional and combinable."""
    # Only fixed SQL fragments are concatenated below; every user-supplied
    # value travels as a bound parameter.
    conditions: List[str] = []
    params: List[Any] = []

    if search:
        conditions.append(
            "(author ILIKE %s OR book ILIKE %s OR chapter ILIKE %s "
            "OR page ILIKE %s OR ocr_text ILIKE %s OR reviewer_name ILIKE %s)"
        )
        params.extend([_ilike_contains(search)] * 6)

    # Per-column substring filters. Wildcard characters typed by the user are
    # escaped so "50%" or "a_b" match literally instead of acting as patterns.
    for column, value in (("author", author), ("book", book), ("chapter", chapter)):
        if value:
            conditions.append(f"{column} ILIKE %s")
            params.append(_ilike_contains(value))

    if status:
        conditions.append("status = %s")
        params.append(status)

    where = ("WHERE " + " AND ".join(conditions)) if conditions else ""

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # Total is counted with the same filters, before LIMIT/OFFSET.
                cur.execute(
                    f"SELECT COUNT(*) AS total FROM ocr_jobs {where}",
                    params,
                )
                total = cur.fetchone()["total"]

                # The listing leaves out the OCR text columns.
                cur.execute(
                    f"""
                    SELECT id, author, book, chapter, page, submitted_at, status,
                           reviewer_name, reviewed_at, mode, original_filename
                    FROM ocr_jobs {where}
                    ORDER BY submitted_at DESC
                    LIMIT %s OFFSET %s
                    """,
                    params + [limit, offset],
                )
                rows = [_job_row_to_dict(r) for r in cur.fetchall()]
    except Exception as exc:
        print(f"list_jobs DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    return JSONResponse({"total": total, "limit": limit, "offset": offset, "jobs": rows})


@app.get("/api/jobs/suggestions")
async def job_suggestions():
    """Return distinct values for author, book, chapter, and reviewer_name to power autocomplete."""
    try:
        with get_db() as conn:
            with conn.cursor() as cursor:
                # One pass over the table: each column is aggregated into a
                # sorted, de-duplicated array with NULLs removed.
                cursor.execute("""
                    SELECT
                        array_remove(array_agg(DISTINCT author ORDER BY author), NULL) AS authors,
                        array_remove(array_agg(DISTINCT book ORDER BY book), NULL) AS books,
                        array_remove(array_agg(DISTINCT chapter ORDER BY chapter), NULL) AS chapters,
                        array_remove(array_agg(DISTINCT reviewer_name ORDER BY reviewer_name), NULL) AS reviewers
                    FROM ocr_jobs
                """)
                aggregates = cursor.fetchone()
    except Exception as exc:
        print(f"suggestions DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    # A falsy aggregate (for example NULL on an empty table) becomes [].
    suggestions = {
        field: aggregates[field] or []
        for field in ("authors", "books", "chapters", "reviewers")
    }
    return JSONResponse(suggestions)


@app.get("/api/jobs/{job_id}")
async def get_job(job_id: str):
    """Retrieve full job record including OCR text."""
    # Reject a malformed id before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cursor:
            cursor.execute("SELECT * FROM ocr_jobs WHERE id = %s", (job_id,))
            record = cursor.fetchone()
    except Exception as exc:
        print(f"get_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if record:
        return JSONResponse(_job_row_to_dict(record))

    raise HTTPException(status_code=404, detail="Job not found.")


@app.get("/api/jobs/{job_id}/image")
async def get_job_image(job_id: str):
    """Serve the stored image for a job."""
    # Reject a malformed id before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cursor:
            cursor.execute("SELECT image_path FROM ocr_jobs WHERE id = %s", (job_id,))
            record = cursor.fetchone()
    except Exception as exc:
        print(f"get_job_image DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not record:
        raise HTTPException(status_code=404, detail="Job not found.")

    # The row can outlive its file, so check the disk before responding.
    stored_path = record["image_path"]
    if os.path.isfile(stored_path):
        return FileResponse(stored_path)

    raise HTTPException(status_code=404, detail="Image file not found on disk.")


@app.put("/api/jobs/{job_id}/review")
async def review_job(job_id: str, body: ReviewRequest):
    """Mark a job as reviewed with the corrected text and reviewer name."""
    # Reject a malformed id before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # NOTE(review): author/book/chapter/page are always
                # overwritten, so a request that omits them clears the stored
                # values. Confirm that clients always send all four.
                cur.execute(
                    """
                    UPDATE ocr_jobs
                    SET status = 'reviewed',
                        reviewed_text = %s,
                        reviewer_name = %s,
                        reviewed_at = NOW(),
                        author = %s,
                        book = %s,
                        chapter = %s,
                        page = %s
                    WHERE id = %s
                    RETURNING *
                    """,
                    (
                        body.reviewed_text,
                        body.reviewer_name,
                        body.author or None,
                        body.book or None,
                        body.chapter or None,
                        body.page or None,
                        job_id,
                    ),
                )
                row = cur.fetchone()
    except Exception as exc:
        # Editing author/chapter/page can collide with another job, just as
        # an insert can. Report the unique violation as 409 with the same
        # message commit_job uses, instead of a generic 500.
        if getattr(exc, 'pgcode', None) == '23505':
            raise HTTPException(
                status_code=409,
                detail="A job with this Author, Chapter, and Page already exists."
            )
        print(f"review_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not row:
        raise HTTPException(status_code=404, detail="Job not found.")

    return JSONResponse(_job_row_to_dict(row))


@app.delete("/api/jobs/{job_id}")
async def delete_job(job_id: str):
    """Delete a job record and its stored image."""
    # Reject a malformed id before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cursor:
            # RETURNING hands back the image path of the row just removed.
            cursor.execute(
                "DELETE FROM ocr_jobs WHERE id = %s RETURNING image_path",
                (job_id,),
            )
            removed = cursor.fetchone()
    except Exception as exc:
        print(f"delete_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not removed:
        raise HTTPException(status_code=404, detail="Job not found.")

    # The row is already gone; a failure to remove the file is ignored.
    try:
        stored_path = removed["image_path"]
        if stored_path and os.path.isfile(stored_path):
            os.remove(stored_path)
    except Exception:
        pass

    return JSONResponse({"deleted": job_id})


if __name__ == "__main__":
    # Direct-run entry point: bind address and port come from API_HOST and
    # API_PORT, defaulting to 0.0.0.0:8000.
    uvicorn.run(
        app,
        host=env_config("API_HOST", default="0.0.0.0"),
        port=env_config("API_PORT", default=8000, cast=int),
    )