Adds DELETE /api/jobs/{id} endpoint (removes DB record and image file),
and a two-step Delete / Confirm button on the review page that returns
to the job list on success.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
917 lines
32 KiB
Python
917 lines
32 KiB
Python
import os
|
|
import re
|
|
import uuid
|
|
import tempfile
|
|
import shutil
|
|
import base64
|
|
from typing import List, Dict, Any, Optional
|
|
from contextlib import asynccontextmanager
|
|
from datetime import datetime, timezone
|
|
|
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
|
|
from pydantic import BaseModel
|
|
import torch
|
|
from transformers import AutoModel, AutoTokenizer
|
|
from PIL import Image
|
|
import uvicorn
|
|
from decouple import config as env_config
|
|
|
|
# Import PDF and document conversion utilities
|
|
from pdf_utils import (
|
|
pdf_to_images_high_quality,
|
|
images_to_pdf,
|
|
extract_ref_patterns,
|
|
crop_images_from_refs,
|
|
clean_markdown_content
|
|
)
|
|
from format_converter import DocumentConverter
|
|
from database import init_db, get_db
|
|
|
|
# Directory that holds the image files saved for committed jobs.
OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images")

# -----------------------------
# Lifespan context for model loading
# -----------------------------
# Both stay None until lifespan() has loaded them; the routes check for None
# to answer 503 while the model is unavailable.
model, tokenizer = None, None
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare storage, database and the OCR model before serving requests.

    Everything before ``yield`` runs at startup and fills the module-level
    ``model`` / ``tokenizer`` globals; the code after ``yield`` runs at
    shutdown.
    """
    global model, tokenizer

    # Image storage directory
    os.makedirs(OCR_IMAGES_DIR, exist_ok=True)

    # Database: a failure here is only reported, startup continues.
    try:
        init_db()
    except Exception as exc:
        print(f"Warning: database initialization failed: {exc}")

    # Environment setup
    os.environ.pop("TRANSFORMERS_CACHE", None)
    model_name = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR")
    hf_home = env_config("HF_HOME", default="/models")
    os.makedirs(hf_home, exist_ok=True)

    # Load model
    print(f"🚀 Loading {model_name}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    loaded = AutoModel.from_pretrained(
        model_name,
        trust_remote_code=True,
        use_safetensors=True,
        attn_implementation="eager",
        torch_dtype=torch.bfloat16,
    )
    # The global is only assigned once the model is in eval mode on the GPU.
    model = loaded.eval().to("cuda")

    # Pad token setup: fall back to the EOS token when no pad token is
    # defined, then mirror the tokenizer's pad id into the model config.
    # Any failure here is deliberately ignored.
    try:
        if getattr(tokenizer, "pad_token_id", None) is None:
            if getattr(tokenizer, "eos_token_id", None) is not None:
                tokenizer.pad_token = tokenizer.eos_token
        if getattr(model.config, "pad_token_id", None) is None:
            if getattr(tokenizer, "pad_token_id", None) is not None:
                model.config.pad_token_id = tokenizer.pad_token_id
    except Exception:
        pass

    print("✅ Model loaded and ready!")

    yield

    # Cleanup
    print("🛑 Shutting down...")
|
|
|
|
# -----------------------------
|
|
# FastAPI app
|
|
# -----------------------------
|
|
app = FastAPI(
    title="DeepSeek-OCR API",
    description="Blazing fast OCR with DeepSeek-OCR model 🔥",
    version="2.0.0",
    lifespan=lifespan,
)

# CORS middleware for React frontend
# CORS_ORIGINS is read as a comma-separated list; blank entries are dropped.
CORS_ORIGINS = env_config("CORS_ORIGINS", default="").split(",")
CORS_ORIGINS = [o.strip() for o in CORS_ORIGINS if o.strip()]

app.add_middleware(
    CORSMiddleware,
    # Fall back to the local dev frontend when no origins are configured.
    allow_origins=CORS_ORIGINS if CORS_ORIGINS else ["http://localhost:3000"],
    allow_credentials=True,
    # PUT and DELETE must be allowed as well: this API exposes
    # PUT /api/jobs/{job_id}/review and DELETE /api/jobs/{job_id}, and a
    # browser's cross-origin preflight for them fails if the method is not
    # listed here.
    allow_methods=["GET", "POST", "PUT", "DELETE"],
    allow_headers=["*"],
)
|
|
|
|
# -----------------------------
|
|
# Prompt builder
|
|
# -----------------------------
|
|
def build_prompt(
    mode: str,
    user_prompt: str,
    grounding: bool,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Build the model prompt for the requested OCR mode.

    The prompt is a newline-joined sequence of ``<image>``, an optional
    ``<|grounding|>`` tag and one instruction.

    Args:
        mode: OCR mode name; unknown modes fall back to "OCR this image.".
        user_prompt: Instruction used verbatim (stripped) in ``freeform`` mode.
        grounding: Request the grounding tag explicitly. The ``find_ref``,
            ``layout_map`` and ``pii_redact`` modes always add it.
        find_term: Term to locate in ``find_ref`` mode (defaults to "Total").
        schema: JSON schema text for ``kv_json`` mode (defaults to "{}").
        include_caption: Append a request for a one-paragraph description
            (ignored in ``describe`` mode, which already describes the image).

    Returns:
        The assembled prompt string.
    """
    default_instruction = "OCR this image."

    # Modes whose instruction does not depend on any other argument.
    static_instructions = {
        "plain_ocr": "Free OCR.",
        "markdown": "Convert the document to markdown.",
        "tables_csv": (
            "Extract every table and output CSV only. "
            "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
        ),
        "tables_md": "Extract every table as GitHub-flavored Markdown tables. Output only the tables.",
        "figure_chart": (
            "Parse the figure. First extract any numeric series as a two-column table (x,y). "
            "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
        ),
        "layout_map": (
            'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
            '"box":[x1,y1,x2,y2]}. Do not include any text content.'
        ),
        "pii_redact": (
            'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
            'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
        ),
        "multilingual": "Free OCR. Detect the language automatically and output in the same script.",
        "describe": "Describe this image. Focus on visible key elements.",
    }

    parts: List[str] = ["<image>"]
    if grounding or mode in {"find_ref", "layout_map", "pii_redact"}:
        parts.append("<|grounding|>")

    # For the three user-supplied values, strip first and only then apply the
    # default, so a whitespace-only value behaves like a missing one instead
    # of producing an empty instruction.
    if mode == "kv_json":
        schema_text = (schema or "").strip() or "{}"
        instruction = (
            "Extract key fields and return strict JSON only. "
            f"Use this schema (fill the values): {schema_text}"
        )
    elif mode == "find_ref":
        key = (find_term or "").strip() or "Total"
        instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
    elif mode == "freeform":
        instruction = (user_prompt or "").strip() or default_instruction
    else:
        instruction = static_instructions.get(mode, default_instruction)

    if include_caption and mode != "describe":
        instruction = instruction + "\nThen add a one-paragraph description of the image."

    parts.append(instruction)
    return "\n".join(parts)
|
|
|
|
# -----------------------------
|
|
# Grounding parser
|
|
# -----------------------------
|
|
# Match a full detection block and capture the coordinates as the entire list expression
# Examples of captured coords (including outer brackets):
# - [[312, 339, 480, 681]]
# - [[504, 700, 625, 910], [771, 570, 996, 996]]
# - [[110, 310, 255, 800], [312, 343, 479, 680], ...]
# The bracket capture is greedy but restricted to characters other than '<', so it
# includes all inner lists up to the last ']' before this block's <|/det|> and can
# never run on into a following <|ref|>...<|det|> block. (An unrestricted greedy
# '.*' merged every detection in the text into one unparsable capture.)
DET_BLOCK = re.compile(
    r"<\|ref\|>(?P<label>.*?)<\|/ref\|>\s*<\|det\|>\s*(?P<coords>\[[^<]*\])\s*<\|/det\|>",
    re.DOTALL,
)
|
|
|
|
# One <|ref|>label<|/ref|><|det|>[...]<|/det|> block. The coordinate part may not
# contain '<', which keeps the match inside a single block even when the text
# holds several detections.
_GROUNDED_REF = re.compile(
    r"<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>\s*\[[^<]*\]\s*<\|/det\|>",
    re.DOTALL,
)
_GROUNDING_TAG = re.compile(r"<\|grounding\|>")


def clean_grounding_text(text: str) -> str:
    """Remove grounding tags from text for display, keeping labels.

    Each ``<|ref|>label<|/ref|><|det|>[...]<|/det|>`` block is replaced by its
    label, and standalone ``<|grounding|>`` tags are removed. Text between
    detection blocks is preserved.

    Args:
        text: Raw model output.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    # Replace every detection block (single or nested coordinate lists) with just its label.
    cleaned = _GROUNDED_REF.sub(r"\1", text)
    # Also remove any standalone grounding tags.
    cleaned = _GROUNDING_TAG.sub("", cleaned)
    return cleaned.strip()
|
|
|
|
def parse_detections(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
    """Extract labelled bounding boxes from grounded model output.

    Coordinates in the text are normalized to 0-999 and are scaled to the
    given image dimensions. Both forms are accepted:
    - Single: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2]]<|/det|>
    - Multiple: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]<|/det|>

    Returns:
        A list of ``{"label": str, "box": [x1, y1, x2, y2]}`` dicts in pixel
        coordinates. A detection whose coordinates cannot be parsed is
        reported on stdout and skipped.
    """
    import ast

    found: List[Dict[str, Any]] = []
    # Scale factor pairing for (x1, y1, x2, y2).
    dims = (image_width, image_height, image_width, image_height)

    for match in DET_BLOCK.finditer(text or ""):
        label = match.group("label").strip()
        coords_str = match.group("coords").strip()

        print(f"🔍 DEBUG: Found detection for '{label}'")
        print(f"📦 Raw coords string (with brackets): {coords_str}")

        try:
            # Parse the full bracket expression directly (handles single and multiple)
            parsed = ast.literal_eval(coords_str)
            if not isinstance(parsed, list):
                raise ValueError("Unsupported coords structure")

            # Normalize to a list of boxes: a flat [x1,y1,x2,y2] becomes one box.
            is_flat_box = len(parsed) == 4 and all(isinstance(n, (int, float)) for n in parsed)
            if is_flat_box:
                box_coords = [parsed]
                print("📦 Single box (flat list) detected")
            else:
                box_coords = parsed
                print(f"📦 Boxes detected: {len(box_coords)}")

            for number, box in enumerate(box_coords, start=1):
                if not (isinstance(box, (list, tuple)) and len(box) >= 4):
                    print(f" ⚠️ Skipping invalid box: {box}")
                    continue
                x1, y1, x2, y2 = (
                    int(float(value) / 999 * dim) for value, dim in zip(box[:4], dims)
                )
                print(f" Box {number}: {box} → [{x1}, {y1}, {x2}, {y2}]")
                found.append({"label": label, "box": [x1, y1, x2, y2]})
        except Exception as e:
            print(f"❌ Parsing failed: {e}")
            continue

    print(f"🎯 Total boxes parsed: {len(found)}")
    return found
|
|
|
|
# -----------------------------
|
|
# Routes
|
|
# -----------------------------
|
|
@app.get("/")
async def root():
    """Landing endpoint: report that the API is up and where the docs live."""
    payload = {"message": "DeepSeek-OCR API is running! 🚀", "docs": "/docs"}
    return payload
|
|
|
|
@app.get("/health")
async def health():
    """Health probe: always "healthy", plus whether the model global is set."""
    is_loaded = model is not None
    return {"status": "healthy", "model_loaded": is_loaded}
|
|
|
|
@app.post("/api/ocr")
async def ocr_inference(
    image: UploadFile = File(...),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    grounding: bool = Form(False),
    include_caption: bool = Form(False),
    find_term: Optional[str] = Form(None),
    schema: Optional[str] = Form(None),
    base_size: int = Form(1024),
    image_size: int = Form(640),
    crop_mode: bool = Form(True),
    test_compress: bool = Form(False),
):
    """
    Perform OCR inference on uploaded image

    - **image**: Image file to process
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
    - **grounding**: Enable grounding boxes
    - **include_caption**: Add image description
    - **find_term**: Term to find (for find_ref mode)
    - **schema**: JSON schema (for kv_json mode)
    - **base_size**: Base processing size
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode
    - **test_compress**: Test compression
    """
    # Responds 503 while the model is not loaded and 500 (generic message) on
    # any processing error; temporary files are always removed.
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Build prompt
    prompt_text = build_prompt(
        mode=mode,
        user_prompt=prompt,
        grounding=grounding,
        find_term=find_term,
        schema=schema,
        include_caption=include_caption,
    )

    tmp_img = None
    out_dir = None
    try:
        # Save uploaded file
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
            # Record the path before reading/writing: the file already exists
            # on disk, so the finally-block must know about it even if the
            # upload read or the write fails.
            tmp_img = tmp.name
            tmp.write(await image.read())

        # Get original dimensions (None when the upload is not a readable image)
        try:
            with Image.open(tmp_img) as im:
                orig_w, orig_h = im.size
        except Exception:
            orig_w = orig_h = None

        out_dir = tempfile.mkdtemp(prefix="dsocr_")

        # Run inference
        res = model.infer(
            tokenizer,
            prompt=prompt_text,
            image_file=tmp_img,
            output_path=out_dir,
            base_size=base_size,
            image_size=image_size,
            crop_mode=crop_mode,
            save_results=False,
            test_compress=test_compress,
            eval_mode=True,
        )

        # Normalize response: the result is handled as str, dict with "text",
        # or a sequence of parts; anything else counts as empty.
        if isinstance(res, str):
            text = res.strip()
        elif isinstance(res, dict) and "text" in res:
            text = str(res["text"]).strip()
        elif isinstance(res, (list, tuple)):
            text = "\n".join(map(str, res)).strip()
        else:
            text = ""

        # Fallback: check output file
        if not text:
            mmd = os.path.join(out_dir, "result.mmd")
            if os.path.exists(mmd):
                with open(mmd, "r", encoding="utf-8") as fh:
                    text = fh.read().strip()
        if not text:
            text = "No text returned by model."

        # Parse grounding boxes with proper coordinate scaling; unknown
        # dimensions fall back to 1 so scaling does not fail.
        has_detections = "<|det|>" in text or "<|ref|>" in text
        boxes = parse_detections(text, orig_w or 1, orig_h or 1) if has_detections else []

        # Clean grounding tags from display text, but keep the labels
        needs_cleaning = "<|ref|>" in text or "<|grounding|>" in text
        display_text = clean_grounding_text(text) if needs_cleaning else text

        # If display text is empty after cleaning but we have boxes, show the labels
        if not display_text and boxes:
            display_text = ", ".join(b["label"] for b in boxes)

        return JSONResponse({
            "success": True,
            "text": display_text,
            "raw_text": text,  # Include raw model output for debugging
            "boxes": boxes,
            "image_dims": {"w": orig_w, "h": orig_h},
            "metadata": {
                "mode": mode,
                "grounding": grounding or (mode in {"find_ref", "layout_map", "pii_redact"}),
                "base_size": base_size,
                "image_size": image_size,
                "crop_mode": crop_mode,
            },
        })

    except Exception as e:
        # Details go to the server log only; the client gets a generic message.
        print(f"OCR inference error: {type(e).__name__}: {str(e)}")
        raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")

    finally:
        if tmp_img:
            try:
                os.remove(tmp_img)
            except Exception:
                pass
        if out_dir:
            shutil.rmtree(out_dir, ignore_errors=True)
|
|
|
|
@app.post("/api/process-pdf")
async def process_pdf(
    pdf_file: UploadFile = File(...),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    output_format: str = Form("markdown"),  # markdown, html, docx, json
    grounding: bool = Form(False),
    include_caption: bool = Form(False),
    extract_images: bool = Form(True),
    dpi: int = Form(144),
    base_size: int = Form(1024),
    image_size: int = Form(640),
    crop_mode: bool = Form(True),
):
    """
    Process PDF document with OCR and convert to various formats

    - **pdf_file**: PDF file to process
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
    - **output_format**: Output format (markdown, html, docx, json)
    - **grounding**: Enable grounding boxes
    - **include_caption**: Add image descriptions
    - **extract_images**: Extract images from PDF
    - **dpi**: PDF rendering resolution (default: 144)
    - **base_size**: Base processing size
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode
    """
    import io  # only needed for the in-memory JPEG encoding of cropped images

    # Responds 503 while the model is not loaded, 400 for an unknown output
    # format and 500 (generic message) on any processing error.
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Validate output format
    if output_format not in ["markdown", "html", "docx", "json"]:
        raise HTTPException(status_code=400, detail="Invalid output format. Must be: markdown, html, docx, or json")

    try:
        # Read PDF file
        pdf_bytes = await pdf_file.read()

        # Convert PDF to images
        print(f"📄 Converting PDF to images (DPI: {dpi})...")
        images = pdf_to_images_high_quality(pdf_bytes, dpi=dpi)
        total_pages = len(images)
        print(f"✅ Converted {total_pages} pages")

        pages_content = []
        converter = DocumentConverter()

        # The prompt does not depend on the page, so build it once.
        prompt_text = build_prompt(
            mode=mode,
            user_prompt=prompt,
            grounding=grounding,
            find_term=None,
            schema=None,
            include_caption=include_caption,
        )

        # Process each page
        for page_idx, img in enumerate(images):
            print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")

            tmp_img = None
            out_dir = None
            try:
                # Save image temporarily
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    # Record the path first so the finally-block removes the
                    # file even when saving the page image fails.
                    tmp_img = tmp.name
                    img.save(tmp, format="PNG")

                orig_w, orig_h = img.size
                out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")

                # Run inference
                res = model.infer(
                    tokenizer,
                    prompt=prompt_text,
                    image_file=tmp_img,
                    output_path=out_dir,
                    base_size=base_size,
                    image_size=image_size,
                    crop_mode=crop_mode,
                    save_results=False,
                    test_compress=False,
                    eval_mode=True,
                )

                # Normalize response
                if isinstance(res, str):
                    text = res.strip()
                elif isinstance(res, dict) and "text" in res:
                    text = str(res["text"]).strip()
                elif isinstance(res, (list, tuple)):
                    text = "\n".join(map(str, res)).strip()
                else:
                    text = ""

                # Fallback: check output file
                if not text:
                    mmd = os.path.join(out_dir, "result.mmd")
                    if os.path.exists(mmd):
                        with open(mmd, "r", encoding="utf-8") as fh:
                            text = fh.read().strip()
                if not text:
                    text = f"No text returned for page {page_idx + 1}."

                # Extract images if requested
                page_images = []
                if extract_images:
                    matches, matches_image, matches_other = extract_ref_patterns(text)
                    if matches_image:
                        cropped = crop_images_from_refs(img, matches)
                        for cropped_img in cropped:
                            # Encode to base64 JPEG entirely in memory: no
                            # temp file to leak and no handle left open.
                            jpeg_buffer = io.BytesIO()
                            cropped_img.save(jpeg_buffer, format="JPEG", quality=95)
                            page_images.append(
                                base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
                            )

                    # Clean the text and add image placeholders. Each
                    # placeholder is prepended, so the highest index ends up
                    # first (kept exactly as before).
                    text = clean_markdown_content(text, matches_image, matches_other)
                    for img_idx in range(len(page_images)):
                        text = f"[IMAGE_{img_idx}]\n" + text

                # Parse grounding boxes
                has_detections = "<|det|>" in text or "<|ref|>" in text
                boxes = parse_detections(text, orig_w, orig_h) if has_detections else []

                # Clean grounding tags from display text
                needs_cleaning = "<|ref|>" in text or "<|grounding|>" in text
                display_text = clean_grounding_text(text) if needs_cleaning else text

                pages_content.append({
                    'page_number': page_idx + 1,
                    'text': display_text,
                    'raw_text': text,
                    'boxes': boxes,
                    'images': page_images,
                    'image_dims': {'w': orig_w, 'h': orig_h},
                })

            finally:
                if tmp_img:
                    try:
                        os.remove(tmp_img)
                    except Exception:
                        pass
                if out_dir:
                    shutil.rmtree(out_dir, ignore_errors=True)

        print(f"✅ Processed all {total_pages} pages")

        # Convert to requested format
        if output_format == "json":
            return JSONResponse({
                "success": True,
                "total_pages": total_pages,
                "pages": pages_content,
                "metadata": {
                    "mode": mode,
                    "grounding": grounding,
                    "extract_images": extract_images,
                    "dpi": dpi,
                },
            })
        elif output_format == "markdown":
            md_content = converter.to_markdown(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([md_content.encode('utf-8')]),
                media_type="text/markdown",
                headers={"Content-Disposition": "attachment; filename=ocr_result.md"},
            )
        elif output_format == "html":
            html_content = converter.to_html(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([html_content.encode('utf-8')]),
                media_type="text/html",
                headers={"Content-Disposition": "attachment; filename=ocr_result.html"},
            )
        elif output_format == "docx":
            docx_buffer = converter.to_docx(pages_content, include_images=extract_images)
            return StreamingResponse(
                docx_buffer,
                media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                headers={"Content-Disposition": "attachment; filename=ocr_result.docx"},
            )

    except Exception as e:
        import traceback
        # Full details go to the server log only; the client gets a generic message.
        print(f"Error processing PDF: {e}")
        print(traceback.format_exc())
        raise HTTPException(status_code=500, detail="An internal error occurred during PDF processing.")
|
|
|
|
# -----------------------------
|
|
# Job management routes
|
|
# -----------------------------
|
|
|
|
class ReviewRequest(BaseModel):
    """JSON body accepted by PUT /api/jobs/{job_id}/review."""

    # Required: the corrected text and the name of the person who reviewed it.
    reviewed_text: str
    reviewer_name: str

    # Optional bibliographic fields. review_job writes all four to the job
    # row, storing NULL for values that are omitted or empty.
    author: Optional[str] = None
    book: Optional[str] = None
    chapter: Optional[str] = None
    page: Optional[str] = None
|
|
|
|
|
|
def _job_row_to_dict(row) -> Dict[str, Any]:
|
|
"""Convert a DB row (RealDictRow) to a plain dict with serialisable values."""
|
|
d = dict(row)
|
|
for key, val in d.items():
|
|
if isinstance(val, datetime):
|
|
d[key] = val.isoformat()
|
|
elif val is not None and hasattr(val, '__str__') and type(val).__name__ == 'UUID':
|
|
d[key] = str(val)
|
|
return d
|
|
|
|
|
|
@app.post("/api/jobs")
async def commit_job(
    image: UploadFile = File(...),
    author: str = Form(""),
    book: str = Form(""),
    chapter: str = Form(""),
    page: str = Form(""),
    ocr_text: str = Form(""),
    mode: str = Form("plain_ocr"),
):
    """Commit an OCR job: save the image and insert a DB record."""
    # Responds 201 with the stored row, 409 when the unique key already
    # exists, and 500 when the image or the DB record cannot be saved.
    job_id = str(uuid.uuid4())

    # Determine file extension from original filename or content type.
    original_filename = image.filename or "image"
    ext = os.path.splitext(original_filename)[1].lower()
    # The filename is client-supplied: only a short alphanumeric extension is
    # allowed into the stored file name; anything else (missing, overlong,
    # odd characters) falls back to the content type.
    if not re.fullmatch(r"\.[a-z0-9]{1,10}", ext):
        ct = (image.content_type or "").lower()
        ext_map = {
            "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg",
            "image/webp": ".webp", "image/gif": ".gif", "image/bmp": ".bmp",
        }
        ext = ext_map.get(ct, ".png")

    image_path = os.path.join(OCR_IMAGES_DIR, f"{job_id}{ext}")

    try:
        content = await image.read()
        with open(image_path, "wb") as f:
            f.write(content)
    except Exception as exc:
        print(f"Job commit image save error: {exc}")
        # Do not leave a partially written file behind.
        try:
            os.remove(image_path)
        except OSError:
            pass
        raise HTTPException(status_code=500, detail="Failed to save image file.")

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # Empty form values are stored as NULL.
                cur.execute(
                    """
                    INSERT INTO ocr_jobs
                        (id, author, book, chapter, page, image_path, original_filename,
                         ocr_text, mode, status)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
                    RETURNING *
                    """,
                    (job_id, author or None, book or None, chapter or None,
                     page or None, image_path, original_filename,
                     ocr_text or None, mode),
                )
                row = cur.fetchone()
    except Exception as exc:
        # Clean up saved image if DB insert fails
        try:
            os.remove(image_path)
        except OSError:
            pass
        # Unique constraint violation (author + chapter + page already exists)
        if getattr(exc, 'pgcode', None) == '23505':
            raise HTTPException(
                status_code=409,
                detail="A job with this Author, Chapter, and Page already exists."
            )
        print(f"Job commit DB error: {exc}")
        raise HTTPException(status_code=500, detail="Failed to save job to database.")

    return JSONResponse(_job_row_to_dict(row), status_code=201)
|
|
|
|
|
|
@app.get("/api/jobs")
async def list_jobs(
    search: Optional[str] = Query(None, description="General text search across all fields"),
    author: Optional[str] = Query(None),
    book: Optional[str] = Query(None),
    chapter: Optional[str] = Query(None),
    status: Optional[str] = Query(None, description="unreviewed | reviewed"),
    limit: int = Query(20, ge=1, le=200),
    offset: int = Query(0, ge=0),
):
    """Search and list jobs. All filters are optional and combinable."""
    # Only fixed SQL fragments are assembled here; every user value is passed
    # as a bound parameter.
    clauses: List[str] = []
    params: List[Any] = []

    if search:
        # Free-text search: substring match on any of these columns.
        searched_columns = ("author", "book", "chapter", "page", "ocr_text", "reviewer_name")
        clauses.append("(" + " OR ".join(f"{col} ILIKE %s" for col in searched_columns) + ")")
        params.extend([f"%{search}%"] * len(searched_columns))

    # Per-field filters are case-insensitive substring matches.
    for column, value in (("author", author), ("book", book), ("chapter", chapter)):
        if value:
            clauses.append(f"{column} ILIKE %s")
            params.append(f"%{value}%")

    # Status is an exact match.
    if status:
        clauses.append("status = %s")
        params.append(status)

    where = f"WHERE {' AND '.join(clauses)}" if clauses else ""

    try:
        with get_db() as conn, conn.cursor() as cur:
            # Total number of matches, independent of paging.
            cur.execute(
                f"SELECT COUNT(*) AS total FROM ocr_jobs {where}",
                params,
            )
            total = cur.fetchone()["total"]

            # One page of summary rows, newest submissions first.
            cur.execute(
                f"""
                SELECT id, author, book, chapter, page, submitted_at, status,
                       reviewer_name, reviewed_at, mode, original_filename
                FROM ocr_jobs {where}
                ORDER BY submitted_at DESC
                LIMIT %s OFFSET %s
                """,
                params + [limit, offset],
            )
            rows = [_job_row_to_dict(r) for r in cur.fetchall()]
    except Exception as exc:
        print(f"list_jobs DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    return JSONResponse({"total": total, "limit": limit, "offset": offset, "jobs": rows})
|
|
|
|
|
|
@app.get("/api/jobs/suggestions")
async def job_suggestions():
    """Return distinct values for author, book, chapter, and reviewer_name to power autocomplete."""
    # A single query aggregates each column into a sorted array of distinct
    # values with NULLs removed.
    query = """
        SELECT
            array_remove(array_agg(DISTINCT author ORDER BY author), NULL) AS authors,
            array_remove(array_agg(DISTINCT book ORDER BY book), NULL) AS books,
            array_remove(array_agg(DISTINCT chapter ORDER BY chapter), NULL) AS chapters,
            array_remove(array_agg(DISTINCT reviewer_name ORDER BY reviewer_name), NULL) AS reviewers
        FROM ocr_jobs
    """
    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute(query)
            row = cur.fetchone()
    except Exception as exc:
        print(f"suggestions DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    # An aggregate that came back empty/NULL is reported as an empty list.
    result_keys = ("authors", "books", "chapters", "reviewers")
    return JSONResponse({key: row[key] or [] for key in result_keys})
|
|
|
|
|
|
@app.get("/api/jobs/{job_id}")
async def get_job(job_id: str):
    """Retrieve full job record including OCR text."""
    # Malformed IDs are rejected with 400 before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute("SELECT * FROM ocr_jobs WHERE id = %s", (job_id,))
            record = cur.fetchone()
    except Exception as exc:
        print(f"get_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not record:
        raise HTTPException(status_code=404, detail="Job not found.")

    return JSONResponse(_job_row_to_dict(record))
|
|
|
|
|
|
@app.get("/api/jobs/{job_id}/image")
async def get_job_image(job_id: str):
    """Serve the stored image for a job."""
    # Malformed IDs are rejected with 400 before touching the database.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn, conn.cursor() as cur:
            cur.execute("SELECT image_path FROM ocr_jobs WHERE id = %s", (job_id,))
            record = cur.fetchone()
    except Exception as exc:
        print(f"get_job_image DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not record:
        raise HTTPException(status_code=404, detail="Job not found.")

    # The record can outlive its file, so the missing-file case gets its own 404.
    stored_path = record["image_path"]
    if os.path.isfile(stored_path):
        return FileResponse(stored_path)
    raise HTTPException(status_code=404, detail="Image file not found on disk.")
|
|
|
|
|
|
@app.put("/api/jobs/{job_id}/review")
async def review_job(job_id: str, body: ReviewRequest):
    """Mark a job as reviewed with the corrected text and reviewer name."""
    # Responds 400 for a malformed ID, 404 for an unknown job, 409 when the
    # edited metadata collides with another job, and 500 on other DB errors.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # The metadata columns are overwritten together with the
                # review; empty or omitted values are stored as NULL.
                cur.execute(
                    """
                    UPDATE ocr_jobs
                    SET status = 'reviewed',
                        reviewed_text = %s,
                        reviewer_name = %s,
                        reviewed_at = NOW(),
                        author = %s,
                        book = %s,
                        chapter = %s,
                        page = %s
                    WHERE id = %s
                    RETURNING *
                    """,
                    (
                        body.reviewed_text,
                        body.reviewer_name,
                        body.author or None,
                        body.book or None,
                        body.chapter or None,
                        body.page or None,
                        job_id,
                    ),
                )
                row = cur.fetchone()
    except Exception as exc:
        # Changing author/chapter/page can hit the same unique constraint
        # that POST /api/jobs checks for; report it as 409 there and here
        # instead of a generic 500.
        if getattr(exc, 'pgcode', None) == '23505':
            raise HTTPException(
                status_code=409,
                detail="A job with this Author, Chapter, and Page already exists."
            )
        print(f"review_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not row:
        raise HTTPException(status_code=404, detail="Job not found.")

    return JSONResponse(_job_row_to_dict(row))
|
|
|
|
|
|
@app.delete("/api/jobs/{job_id}")
async def delete_job(job_id: str):
    """Delete a job record and its stored image."""
    # Responds 400 for a malformed ID, 404 for an unknown job and 500 on a
    # DB error. Removing the image file is best-effort and never fails the
    # request once the record is gone.
    try:
        uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job ID.")

    try:
        with get_db() as conn:
            with conn.cursor() as cur:
                # RETURNING hands back the image path of the deleted row.
                cur.execute(
                    "DELETE FROM ocr_jobs WHERE id = %s RETURNING image_path",
                    (job_id,),
                )
                row = cur.fetchone()
    except Exception as exc:
        print(f"delete_job DB error: {exc}")
        raise HTTPException(status_code=500, detail="Database error.")

    if not row:
        raise HTTPException(status_code=404, detail="Job not found.")

    # Best-effort removal of the stored image file
    image_path = row["image_path"]
    if image_path:
        try:
            os.remove(image_path)
        except FileNotFoundError:
            # Already gone: nothing left to clean up.
            pass
        except Exception as exc:
            # Still best-effort, but leave a trace so orphaned files can be found.
            print(f"delete_job: could not remove image {image_path}: {exc}")

    return JSONResponse({"deleted": job_id})
|
|
|
|
|
|
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn, taking host and port from
    # configuration (defaults: 0.0.0.0 and 8000).
    uvicorn.run(
        app,
        host=env_config("API_HOST", default="0.0.0.0"),
        port=env_config("API_PORT", default=8000, cast=int),
    )
|