Remove Freeform and Find from UI. Allow Description to be added to Reviewed job
This commit is contained in:
13
.env.example
13
.env.example
@@ -11,6 +11,19 @@ FRONTEND_PORT=3000
|
|||||||
MODEL_NAME=deepseek-ai/DeepSeek-OCR
|
MODEL_NAME=deepseek-ai/DeepSeek-OCR
|
||||||
HF_HOME=/models
|
HF_HOME=/models
|
||||||
|
|
||||||
|
# OCR model selection
|
||||||
|
# Register the local DeepSeek-OCR model (set to false for an Ollama-only deployment)
|
||||||
|
ENABLE_DEEPSEEK_LOCAL=true
|
||||||
|
# External Ollama host the backend should call (no trailing slash)
|
||||||
|
OLLAMA_BASE_URL=http://host.docker.internal:11434
|
||||||
|
# Comma-separated Ollama vision model tags to surface in the UI.
|
||||||
|
# Pull these on the Ollama host first, e.g. `ollama pull glm-ocr`.
|
||||||
|
OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl
|
||||||
|
# Default model id selected in the UI (deepseek-local or ollama:<tag>)
|
||||||
|
DEFAULT_OCR_MODEL=deepseek-local
|
||||||
|
# Per-request timeout (seconds) for Ollama calls
|
||||||
|
OLLAMA_TIMEOUT=300
|
||||||
|
|
||||||
# CORS Configuration (comma-separated origins, defaults to http://localhost:3000)
|
# CORS Configuration (comma-separated origins, defaults to http://localhost:3000)
|
||||||
CORS_ORIGINS=http://localhost:3000
|
CORS_ORIGINS=http://localhost:3000
|
||||||
|
|
||||||
|
|||||||
45
README.md
45
README.md
@@ -172,6 +172,13 @@ FRONTEND_PORT=3000
|
|||||||
MODEL_NAME=deepseek-ai/DeepSeek-OCR
|
MODEL_NAME=deepseek-ai/DeepSeek-OCR
|
||||||
HF_HOME=/models
|
HF_HOME=/models
|
||||||
|
|
||||||
|
# OCR model selection (DeepSeek + Ollama)
|
||||||
|
ENABLE_DEEPSEEK_LOCAL=true # register the local GPU model
|
||||||
|
OLLAMA_BASE_URL=http://host.docker.internal:11434 # external Ollama host
|
||||||
|
OLLAMA_MODELS=glm-ocr,llama3.2-vision,minicpm-v,qwen2.5vl
|
||||||
|
DEFAULT_OCR_MODEL=deepseek-local # deepseek-local or ollama:<tag>
|
||||||
|
OLLAMA_TIMEOUT=300 # per-request timeout (seconds)
|
||||||
|
|
||||||
# Upload Configuration
|
# Upload Configuration
|
||||||
MAX_UPLOAD_SIZE_MB=100 # Maximum file upload size
|
MAX_UPLOAD_SIZE_MB=100 # Maximum file upload size
|
||||||
|
|
||||||
@@ -186,13 +193,47 @@ CROP_MODE=true # Enable dynamic cropping for large images
|
|||||||
- `API_HOST`: Backend API host (default: 0.0.0.0)
|
- `API_HOST`: Backend API host (default: 0.0.0.0)
|
||||||
- `API_PORT`: Backend API port (default: 8000)
|
- `API_PORT`: Backend API port (default: 8000)
|
||||||
- `FRONTEND_PORT`: Frontend port (default: 3000)
|
- `FRONTEND_PORT`: Frontend port (default: 3000)
|
||||||
- `MODEL_NAME`: HuggingFace model identifier
|
- `MODEL_NAME`: HuggingFace model identifier for the local DeepSeek-OCR model
|
||||||
- `HF_HOME`: Model cache directory
|
- `HF_HOME`: Model cache directory
|
||||||
|
- `ENABLE_DEEPSEEK_LOCAL`: Register the local DeepSeek-OCR model (set `false` for an Ollama-only deployment with no GPU model loaded)
|
||||||
|
- `OLLAMA_BASE_URL`: URL of an external Ollama server the backend calls for non-DeepSeek models
|
||||||
|
- `OLLAMA_MODELS`: Comma-separated Ollama vision model tags to expose in the UI (pull them on the Ollama host first, e.g. `ollama pull glm-ocr`)
|
||||||
|
- `DEFAULT_OCR_MODEL`: Model id selected by default (`deepseek-local` or `ollama:<tag>`)
|
||||||
|
- `OLLAMA_TIMEOUT`: Per-request timeout in seconds for Ollama calls
|
||||||
- `MAX_UPLOAD_SIZE_MB`: Maximum file upload size in megabytes
|
- `MAX_UPLOAD_SIZE_MB`: Maximum file upload size in megabytes
|
||||||
- `BASE_SIZE`: Base image processing size (affects memory usage)
|
- `BASE_SIZE`: Base image processing size (affects memory usage)
|
||||||
- `IMAGE_SIZE`: Tile size for dynamic cropping
|
- `IMAGE_SIZE`: Tile size for dynamic cropping
|
||||||
- `CROP_MODE`: Enable/disable dynamic image cropping
|
- `CROP_MODE`: Enable/disable dynamic image cropping
|
||||||
|
|
||||||
|
### Choosing an OCR Model
|
||||||
|
|
||||||
|
The **Model** selector (next to the Mode selector) chooses which backend runs the OCR:
|
||||||
|
|
||||||
|
- **DeepSeek-OCR (local GPU)** — the default. Loaded lazily on first use. Supports
|
||||||
|
every mode including grounding/bounding-box modes (Find), plus the Advanced
|
||||||
|
Settings (base size, crop mode, etc.).
|
||||||
|
- **Ollama models** — any vision model pulled on your Ollama host and listed in
|
||||||
|
`OLLAMA_MODELS` (e.g. `glm-ocr`, `llama3.2-vision`). These run remotely on the
|
||||||
|
Ollama server. They return **plain text only**: bounding boxes are not produced,
|
||||||
|
so grounding modes (Find) are unavailable (the API rejects them with HTTP 400) and the
|
||||||
|
DeepSeek-specific Advanced Settings are ignored or disabled when an Ollama model is selected.
|
||||||
|
|
||||||
|
Setup for Ollama models:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# On the machine running Ollama
|
||||||
|
ollama pull glm-ocr
|
||||||
|
ollama pull llama3.2-vision
|
||||||
|
|
||||||
|
# Point the backend at it (in .env), then restart
|
||||||
|
OLLAMA_BASE_URL=http://host.docker.internal:11434
|
||||||
|
OLLAMA_MODELS=glm-ocr,llama3.2-vision
|
||||||
|
```
|
||||||
|
|
||||||
|
`GET /api/models` returns the registered models and their capabilities; the UI
|
||||||
|
populates the selector from it. The model used for each job is stored on the job
|
||||||
|
record (`ocr_model`) and shown in the Browse Jobs view.
|
||||||
|
|
||||||
## Tech Stack
|
## Tech Stack
|
||||||
|
|
||||||
### Frontend
|
### Frontend
|
||||||
@@ -377,6 +418,7 @@ For large images, the model uses dynamic cropping:
|
|||||||
|
|
||||||
**Parameters:**
|
**Parameters:**
|
||||||
- `image` (file, required) - Image file to process (up to 100MB)
|
- `image` (file, required) - Image file to process (up to 100MB)
|
||||||
|
- `model` (string) - OCR model id from `GET /api/models` (default: registry default). Grounding/Advanced settings apply to DeepSeek only.
|
||||||
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
|
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
|
||||||
- `prompt` (string) - Custom prompt for freeform mode
|
- `prompt` (string) - Custom prompt for freeform mode
|
||||||
- `grounding` (bool) - Enable bounding boxes (auto-enabled for find_ref)
|
- `grounding` (bool) - Enable bounding boxes (auto-enabled for find_ref)
|
||||||
@@ -416,6 +458,7 @@ Process PDF documents with OCR and export to various formats.
|
|||||||
|
|
||||||
**Parameters:**
|
**Parameters:**
|
||||||
- `pdf_file` (file, required) - PDF file to process (up to 100MB)
|
- `pdf_file` (file, required) - PDF file to process (up to 100MB)
|
||||||
|
- `model` (string) - OCR model id from `GET /api/models` (default: registry default)
|
||||||
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
|
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
|
||||||
- `prompt` (string) - Custom prompt for freeform mode
|
- `prompt` (string) - Custom prompt for freeform mode
|
||||||
- `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json`
|
- `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json`
|
||||||
|
|||||||
@@ -62,6 +62,11 @@ def init_db():
|
|||||||
ALTER TABLE ocr_jobs
|
ALTER TABLE ocr_jobs
|
||||||
ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ
|
ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ
|
||||||
""")
|
""")
|
||||||
|
# Which OCR model produced this job (e.g. "deepseek-local", "ollama:glm-ocr")
|
||||||
|
cur.execute("""
|
||||||
|
ALTER TABLE ocr_jobs
|
||||||
|
ADD COLUMN IF NOT EXISTS ocr_model TEXT
|
||||||
|
""")
|
||||||
# Trigger function: stamp updated_at on every row update
|
# Trigger function: stamp updated_at on every row update
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
CREATE OR REPLACE FUNCTION set_updated_at()
|
CREATE OR REPLACE FUNCTION set_updated_at()
|
||||||
|
|||||||
460
backend/main.py
460
backend/main.py
@@ -1,8 +1,6 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import uuid
|
import uuid
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
|
||||||
import base64
|
import base64
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
@@ -12,8 +10,6 @@ from fastapi import FastAPI, File, UploadFile, Form, HTTPException, Query
|
|||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
|
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
import torch
|
|
||||||
from transformers import AutoModel, AutoTokenizer
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from decouple import config as env_config
|
from decouple import config as env_config
|
||||||
@@ -28,19 +24,28 @@ from pdf_utils import (
|
|||||||
)
|
)
|
||||||
from format_converter import DocumentConverter
|
from format_converter import DocumentConverter
|
||||||
from database import init_db, get_db
|
from database import init_db, get_db
|
||||||
|
from providers import (
|
||||||
|
build_registry,
|
||||||
|
parse_detections,
|
||||||
|
clean_grounding_text,
|
||||||
|
ProviderError,
|
||||||
|
GROUNDING_MODES,
|
||||||
|
)
|
||||||
|
|
||||||
OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images")
|
OCR_IMAGES_DIR = env_config("OCR_IMAGES_DIR", default="/data/ocr_images")
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Lifespan context for model loading
|
# Lifespan context
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
model = None
|
# The model registry holds all available OCR providers. Local models (e.g.
|
||||||
tokenizer = None
|
# DeepSeek-OCR) are loaded lazily on first use so an Ollama-only deployment
|
||||||
|
# starts instantly and never touches the GPU.
|
||||||
|
registry = None
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
"""Load model on startup, cleanup on shutdown"""
|
"""Build the model registry on startup."""
|
||||||
global model, tokenizer
|
global registry
|
||||||
|
|
||||||
# Image storage directory
|
# Image storage directory
|
||||||
os.makedirs(OCR_IMAGES_DIR, exist_ok=True)
|
os.makedirs(OCR_IMAGES_DIR, exist_ok=True)
|
||||||
@@ -51,39 +56,8 @@ async def lifespan(app: FastAPI):
|
|||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
print(f"Warning: database initialization failed: {exc}")
|
print(f"Warning: database initialization failed: {exc}")
|
||||||
|
|
||||||
# Environment setup
|
# OCR model registry (providers load their models lazily)
|
||||||
os.environ.pop("TRANSFORMERS_CACHE", None)
|
registry = build_registry()
|
||||||
MODEL_NAME = env_config("MODEL_NAME", default="deepseek-ai/DeepSeek-OCR")
|
|
||||||
HF_HOME = env_config("HF_HOME", default="/models")
|
|
||||||
os.makedirs(HF_HOME, exist_ok=True)
|
|
||||||
|
|
||||||
# Load model
|
|
||||||
print(f"🚀 Loading {MODEL_NAME}...")
|
|
||||||
torch_dtype = torch.bfloat16
|
|
||||||
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
|
||||||
MODEL_NAME,
|
|
||||||
trust_remote_code=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
model = AutoModel.from_pretrained(
|
|
||||||
MODEL_NAME,
|
|
||||||
trust_remote_code=True,
|
|
||||||
use_safetensors=True,
|
|
||||||
attn_implementation="eager",
|
|
||||||
torch_dtype=torch_dtype,
|
|
||||||
).eval().to("cuda")
|
|
||||||
|
|
||||||
# Pad token setup
|
|
||||||
try:
|
|
||||||
if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None:
|
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
|
||||||
if getattr(model.config, "pad_token_id", None) is None and getattr(tokenizer, "pad_token_id", None) is not None:
|
|
||||||
model.config.pad_token_id = tokenizer.pad_token_id
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
print("✅ Model loaded and ready!")
|
|
||||||
|
|
||||||
yield
|
yield
|
||||||
|
|
||||||
@@ -112,155 +86,6 @@ app.add_middleware(
|
|||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Prompt builder
|
|
||||||
# -----------------------------
|
|
||||||
def build_prompt(
|
|
||||||
mode: str,
|
|
||||||
user_prompt: str,
|
|
||||||
grounding: bool,
|
|
||||||
find_term: Optional[str],
|
|
||||||
schema: Optional[str],
|
|
||||||
include_caption: bool,
|
|
||||||
) -> str:
|
|
||||||
"""Build the prompt based on mode"""
|
|
||||||
parts: List[str] = ["<image>"]
|
|
||||||
mode_requires_grounding = mode in {"find_ref", "layout_map", "pii_redact"}
|
|
||||||
if grounding or mode_requires_grounding:
|
|
||||||
parts.append("<|grounding|>")
|
|
||||||
|
|
||||||
instruction = ""
|
|
||||||
if mode == "plain_ocr":
|
|
||||||
instruction = "Free OCR."
|
|
||||||
elif mode == "markdown":
|
|
||||||
instruction = "Convert the document to markdown."
|
|
||||||
elif mode == "tables_csv":
|
|
||||||
instruction = (
|
|
||||||
"Extract every table and output CSV only. "
|
|
||||||
"Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
|
|
||||||
)
|
|
||||||
elif mode == "tables_md":
|
|
||||||
instruction = "Extract every table as GitHub-flavored Markdown tables. Output only the tables."
|
|
||||||
elif mode == "kv_json":
|
|
||||||
schema_text = schema.strip() if schema else "{}"
|
|
||||||
instruction = (
|
|
||||||
"Extract key fields and return strict JSON only. "
|
|
||||||
f"Use this schema (fill the values): {schema_text}"
|
|
||||||
)
|
|
||||||
elif mode == "figure_chart":
|
|
||||||
instruction = (
|
|
||||||
"Parse the figure. First extract any numeric series as a two-column table (x,y). "
|
|
||||||
"Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
|
|
||||||
)
|
|
||||||
elif mode == "find_ref":
|
|
||||||
key = (find_term or "").strip() or "Total"
|
|
||||||
instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
|
|
||||||
elif mode == "layout_map":
|
|
||||||
instruction = (
|
|
||||||
'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
|
|
||||||
'"box":[x1,y1,x2,y2]}. Do not include any text content.'
|
|
||||||
)
|
|
||||||
elif mode == "pii_redact":
|
|
||||||
instruction = (
|
|
||||||
'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
|
|
||||||
'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
|
|
||||||
)
|
|
||||||
elif mode == "multilingual":
|
|
||||||
instruction = "Free OCR. Detect the language automatically and output in the same script."
|
|
||||||
elif mode == "describe":
|
|
||||||
instruction = "Describe this image. Focus on visible key elements."
|
|
||||||
elif mode == "freeform":
|
|
||||||
instruction = user_prompt.strip() if user_prompt else "OCR this image."
|
|
||||||
else:
|
|
||||||
instruction = "OCR this image."
|
|
||||||
|
|
||||||
if include_caption and mode not in {"describe"}:
|
|
||||||
instruction = instruction + "\nThen add a one-paragraph description of the image."
|
|
||||||
|
|
||||||
parts.append(instruction)
|
|
||||||
return "\n".join(parts)
|
|
||||||
|
|
||||||
# -----------------------------
|
|
||||||
# Grounding parser
|
|
||||||
# -----------------------------
|
|
||||||
# Match a full detection block and capture the coordinates as the entire list expression
|
|
||||||
# Examples of captured coords (including outer brackets):
|
|
||||||
# - [[312, 339, 480, 681]]
|
|
||||||
# - [[504, 700, 625, 910], [771, 570, 996, 996]]
|
|
||||||
# - [[110, 310, 255, 800], [312, 343, 479, 680], ...]
|
|
||||||
# Using a greedy bracket capture ensures we include all inner lists up to the last ']' before <|/det|>
|
|
||||||
DET_BLOCK = re.compile(
|
|
||||||
r"<\|ref\|>(?P<label>.*?)<\|/ref\|>\s*<\|det\|>\s*(?P<coords>\[.*\])\s*<\|/det\|>",
|
|
||||||
re.DOTALL,
|
|
||||||
)
|
|
||||||
|
|
||||||
def clean_grounding_text(text: str) -> str:
|
|
||||||
"""Remove grounding tags from text for display, keeping labels"""
|
|
||||||
# Replace <|ref|>label<|/ref|><|det|>[...any nested lists...]<|/det|> with just the label
|
|
||||||
cleaned = re.sub(
|
|
||||||
r"<\|ref\|>(.*?)<\|/ref\|>\s*<\|det\|>\s*\[.*\]\s*<\|/det\|>",
|
|
||||||
r"\1",
|
|
||||||
text,
|
|
||||||
flags=re.DOTALL,
|
|
||||||
)
|
|
||||||
# Also remove any standalone grounding tags
|
|
||||||
cleaned = re.sub(r"<\|grounding\|>", "", cleaned)
|
|
||||||
return cleaned.strip()
|
|
||||||
|
|
||||||
def parse_detections(text: str, image_width: int, image_height: int) -> List[Dict[str, Any]]:
|
|
||||||
"""Parse grounding boxes from text and scale from 0-999 normalized coords to actual image dimensions
|
|
||||||
|
|
||||||
Handles both single and multiple bounding boxes:
|
|
||||||
- Single: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2]]<|/det|>
|
|
||||||
- Multiple: <|ref|>label<|/ref|><|det|>[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]<|/det|>
|
|
||||||
"""
|
|
||||||
boxes: List[Dict[str, Any]] = []
|
|
||||||
for m in DET_BLOCK.finditer(text or ""):
|
|
||||||
label = m.group("label").strip()
|
|
||||||
coords_str = m.group("coords").strip()
|
|
||||||
|
|
||||||
print(f"🔍 DEBUG: Found detection for '{label}'")
|
|
||||||
print(f"📦 Raw coords string (with brackets): {coords_str}")
|
|
||||||
|
|
||||||
try:
|
|
||||||
import ast
|
|
||||||
|
|
||||||
# Parse the full bracket expression directly (handles single and multiple)
|
|
||||||
parsed = ast.literal_eval(coords_str)
|
|
||||||
|
|
||||||
# Normalize to a list of lists
|
|
||||||
if (
|
|
||||||
isinstance(parsed, list)
|
|
||||||
and len(parsed) == 4
|
|
||||||
and all(isinstance(n, (int, float)) for n in parsed)
|
|
||||||
):
|
|
||||||
# Single box provided as [x1,y1,x2,y2]
|
|
||||||
box_coords = [parsed]
|
|
||||||
print("📦 Single box (flat list) detected")
|
|
||||||
elif isinstance(parsed, list):
|
|
||||||
box_coords = parsed
|
|
||||||
print(f"📦 Boxes detected: {len(box_coords)}")
|
|
||||||
else:
|
|
||||||
raise ValueError("Unsupported coords structure")
|
|
||||||
|
|
||||||
# Process each box
|
|
||||||
for idx, box in enumerate(box_coords):
|
|
||||||
if isinstance(box, (list, tuple)) and len(box) >= 4:
|
|
||||||
x1 = int(float(box[0]) / 999 * image_width)
|
|
||||||
y1 = int(float(box[1]) / 999 * image_height)
|
|
||||||
x2 = int(float(box[2]) / 999 * image_width)
|
|
||||||
y2 = int(float(box[3]) / 999 * image_height)
|
|
||||||
print(f" Box {idx+1}: {box} → [{x1}, {y1}, {x2}, {y2}]")
|
|
||||||
boxes.append({"label": label, "box": [x1, y1, x2, y2]})
|
|
||||||
else:
|
|
||||||
print(f" ⚠️ Skipping invalid box: {box}")
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Parsing failed: {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f"🎯 Total boxes parsed: {len(boxes)}")
|
|
||||||
return boxes
|
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Routes
|
# Routes
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -270,11 +95,38 @@ async def root():
|
|||||||
|
|
||||||
@app.get("/health")
|
@app.get("/health")
|
||||||
async def health():
|
async def health():
|
||||||
return {"status": "healthy", "model_loaded": model is not None}
|
return {"status": "healthy", "models": registry.list_models() if registry else []}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/models")
|
||||||
|
async def list_models():
|
||||||
|
"""List the OCR models available for selection in the UI."""
|
||||||
|
if registry is None:
|
||||||
|
raise HTTPException(status_code=503, detail="Model registry not ready.")
|
||||||
|
return JSONResponse({"models": registry.list_models()})
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_provider(model_id: Optional[str], mode: str):
|
||||||
|
"""Look up the provider and reject capability mismatches (e.g. grounding)."""
|
||||||
|
if registry is None:
|
||||||
|
raise HTTPException(status_code=503, detail="Model registry not ready.")
|
||||||
|
try:
|
||||||
|
provider = registry.get(model_id)
|
||||||
|
except ProviderError as exc:
|
||||||
|
raise HTTPException(status_code=400, detail=str(exc))
|
||||||
|
|
||||||
|
if mode in GROUNDING_MODES and not provider.capabilities.get("grounding"):
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Model '{provider.label}' does not support grounding modes (e.g. {mode}).",
|
||||||
|
)
|
||||||
|
return provider
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/ocr")
|
@app.post("/api/ocr")
|
||||||
async def ocr_inference(
|
async def ocr_inference(
|
||||||
image: UploadFile = File(...),
|
image: UploadFile = File(...),
|
||||||
|
model: Optional[str] = Form(None),
|
||||||
mode: str = Form("plain_ocr"),
|
mode: str = Form("plain_ocr"),
|
||||||
prompt: str = Form(""),
|
prompt: str = Form(""),
|
||||||
grounding: bool = Form(False),
|
grounding: bool = Form(False),
|
||||||
@@ -290,32 +142,18 @@ async def ocr_inference(
|
|||||||
Perform OCR inference on uploaded image
|
Perform OCR inference on uploaded image
|
||||||
|
|
||||||
- **image**: Image file to process
|
- **image**: Image file to process
|
||||||
|
- **model**: OCR model id (see GET /api/models); defaults to the registry default
|
||||||
- **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
|
- **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
|
||||||
- **prompt**: Custom prompt for freeform mode
|
- **prompt**: Custom prompt for freeform mode
|
||||||
- **grounding**: Enable grounding boxes
|
- **grounding**: Enable grounding boxes (DeepSeek only)
|
||||||
- **include_caption**: Add image description
|
- **include_caption**: Add image description
|
||||||
- **find_term**: Term to find (for find_ref mode)
|
- **find_term**: Term to find (for find_ref mode)
|
||||||
- **schema**: JSON schema (for kv_json mode)
|
- **schema**: JSON schema (for kv_json mode)
|
||||||
- **base_size**: Base processing size
|
- **base_size/image_size/crop_mode/test_compress**: DeepSeek processing options
|
||||||
- **image_size**: Image size parameter
|
|
||||||
- **crop_mode**: Enable crop mode
|
|
||||||
- **test_compress**: Test compression
|
|
||||||
"""
|
"""
|
||||||
if model is None or tokenizer is None:
|
provider = _resolve_provider(model, mode)
|
||||||
raise HTTPException(status_code=503, detail="Model not loaded yet")
|
|
||||||
|
|
||||||
# Build prompt
|
|
||||||
prompt_text = build_prompt(
|
|
||||||
mode=mode,
|
|
||||||
user_prompt=prompt,
|
|
||||||
grounding=grounding,
|
|
||||||
find_term=find_term,
|
|
||||||
schema=schema,
|
|
||||||
include_caption=include_caption,
|
|
||||||
)
|
|
||||||
|
|
||||||
tmp_img = None
|
tmp_img = None
|
||||||
out_dir = None
|
|
||||||
try:
|
try:
|
||||||
# Save uploaded file
|
# Save uploaded file
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
|
||||||
@@ -330,42 +168,27 @@ async def ocr_inference(
|
|||||||
except Exception:
|
except Exception:
|
||||||
orig_w = orig_h = None
|
orig_w = orig_h = None
|
||||||
|
|
||||||
out_dir = tempfile.mkdtemp(prefix="dsocr_")
|
# Run inference through the selected provider
|
||||||
|
text = provider.run(
|
||||||
# Run inference
|
tmp_img,
|
||||||
res = model.infer(
|
mode=mode,
|
||||||
tokenizer,
|
prompt=prompt,
|
||||||
prompt=prompt_text,
|
grounding=grounding,
|
||||||
image_file=tmp_img,
|
find_term=find_term,
|
||||||
output_path=out_dir,
|
schema=schema,
|
||||||
base_size=base_size,
|
include_caption=include_caption,
|
||||||
image_size=image_size,
|
options={
|
||||||
crop_mode=crop_mode,
|
"base_size": base_size,
|
||||||
save_results=False,
|
"image_size": image_size,
|
||||||
test_compress=test_compress,
|
"crop_mode": crop_mode,
|
||||||
eval_mode=True,
|
"test_compress": test_compress,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Normalize response
|
|
||||||
if isinstance(res, str):
|
|
||||||
text = res.strip()
|
|
||||||
elif isinstance(res, dict) and "text" in res:
|
|
||||||
text = str(res["text"]).strip()
|
|
||||||
elif isinstance(res, (list, tuple)):
|
|
||||||
text = "\n".join(map(str, res)).strip()
|
|
||||||
else:
|
|
||||||
text = ""
|
|
||||||
|
|
||||||
# Fallback: check output file
|
|
||||||
if not text:
|
|
||||||
mmd = os.path.join(out_dir, "result.mmd")
|
|
||||||
if os.path.exists(mmd):
|
|
||||||
with open(mmd, "r", encoding="utf-8") as fh:
|
|
||||||
text = fh.read().strip()
|
|
||||||
if not text:
|
if not text:
|
||||||
text = "No text returned by model."
|
text = "No text returned by model."
|
||||||
|
|
||||||
# Parse grounding boxes with proper coordinate scaling
|
# Parse grounding boxes (no-op for providers/text without grounding tokens)
|
||||||
boxes = parse_detections(text, orig_w or 1, orig_h or 1) if ("<|det|>" in text or "<|ref|>" in text) else []
|
boxes = parse_detections(text, orig_w or 1, orig_h or 1) if ("<|det|>" in text or "<|ref|>" in text) else []
|
||||||
|
|
||||||
# Clean grounding tags from display text, but keep the labels
|
# Clean grounding tags from display text, but keep the labels
|
||||||
@@ -382,14 +205,21 @@ async def ocr_inference(
|
|||||||
"boxes": boxes,
|
"boxes": boxes,
|
||||||
"image_dims": {"w": orig_w, "h": orig_h},
|
"image_dims": {"w": orig_w, "h": orig_h},
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"model": provider.id,
|
||||||
|
"model_label": provider.label,
|
||||||
"mode": mode,
|
"mode": mode,
|
||||||
"grounding": grounding or (mode in {"find_ref","layout_map","pii_redact"}),
|
"grounding": grounding or (mode in GROUNDING_MODES),
|
||||||
"base_size": base_size,
|
"base_size": base_size,
|
||||||
"image_size": image_size,
|
"image_size": image_size,
|
||||||
"crop_mode": crop_mode
|
"crop_mode": crop_mode
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
except ProviderError as e:
|
||||||
|
print(f"OCR provider error: {e}")
|
||||||
|
raise HTTPException(status_code=502, detail=str(e))
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"OCR inference error: {type(e).__name__}: {str(e)}")
|
print(f"OCR inference error: {type(e).__name__}: {str(e)}")
|
||||||
raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")
|
raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")
|
||||||
@@ -400,12 +230,11 @@ async def ocr_inference(
|
|||||||
os.remove(tmp_img)
|
os.remove(tmp_img)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if out_dir:
|
|
||||||
shutil.rmtree(out_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
@app.post("/api/process-pdf")
|
@app.post("/api/process-pdf")
|
||||||
async def process_pdf(
|
async def process_pdf(
|
||||||
pdf_file: UploadFile = File(...),
|
pdf_file: UploadFile = File(...),
|
||||||
|
model: Optional[str] = Form(None),
|
||||||
mode: str = Form("plain_ocr"),
|
mode: str = Form("plain_ocr"),
|
||||||
prompt: str = Form(""),
|
prompt: str = Form(""),
|
||||||
output_format: str = Form("markdown"), # markdown, html, docx, json
|
output_format: str = Form("markdown"), # markdown, html, docx, json
|
||||||
@@ -432,8 +261,7 @@ async def process_pdf(
|
|||||||
- **image_size**: Image size parameter
|
- **image_size**: Image size parameter
|
||||||
- **crop_mode**: Enable crop mode
|
- **crop_mode**: Enable crop mode
|
||||||
"""
|
"""
|
||||||
if model is None or tokenizer is None:
|
provider = _resolve_provider(model, mode)
|
||||||
raise HTTPException(status_code=503, detail="Model not loaded yet")
|
|
||||||
|
|
||||||
# Validate output format
|
# Validate output format
|
||||||
if output_format not in ["markdown", "html", "docx", "json"]:
|
if output_format not in ["markdown", "html", "docx", "json"]:
|
||||||
@@ -456,56 +284,32 @@ async def process_pdf(
|
|||||||
for page_idx, img in enumerate(images):
|
for page_idx, img in enumerate(images):
|
||||||
print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")
|
print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")
|
||||||
|
|
||||||
# Build prompt for this page
|
|
||||||
prompt_text = build_prompt(
|
|
||||||
mode=mode,
|
|
||||||
user_prompt=prompt,
|
|
||||||
grounding=grounding,
|
|
||||||
find_term=None,
|
|
||||||
schema=None,
|
|
||||||
include_caption=include_caption,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Save image temporarily
|
# Save image temporarily
|
||||||
tmp_img = None
|
tmp_img = None
|
||||||
out_dir = None
|
|
||||||
try:
|
try:
|
||||||
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
|
||||||
img.save(tmp, format="PNG")
|
img.save(tmp, format="PNG")
|
||||||
tmp_img = tmp.name
|
tmp_img = tmp.name
|
||||||
|
|
||||||
orig_w, orig_h = img.size
|
orig_w, orig_h = img.size
|
||||||
out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")
|
|
||||||
|
|
||||||
# Run inference
|
# Run inference through the selected provider
|
||||||
res = model.infer(
|
text = provider.run(
|
||||||
tokenizer,
|
tmp_img,
|
||||||
prompt=prompt_text,
|
mode=mode,
|
||||||
image_file=tmp_img,
|
prompt=prompt,
|
||||||
output_path=out_dir,
|
grounding=grounding,
|
||||||
base_size=base_size,
|
find_term=None,
|
||||||
image_size=image_size,
|
schema=None,
|
||||||
crop_mode=crop_mode,
|
include_caption=include_caption,
|
||||||
save_results=False,
|
options={
|
||||||
test_compress=False,
|
"base_size": base_size,
|
||||||
eval_mode=True,
|
"image_size": image_size,
|
||||||
|
"crop_mode": crop_mode,
|
||||||
|
"test_compress": False,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Normalize response
|
|
||||||
if isinstance(res, str):
|
|
||||||
text = res.strip()
|
|
||||||
elif isinstance(res, dict) and "text" in res:
|
|
||||||
text = str(res["text"]).strip()
|
|
||||||
elif isinstance(res, (list, tuple)):
|
|
||||||
text = "\n".join(map(str, res)).strip()
|
|
||||||
else:
|
|
||||||
text = ""
|
|
||||||
|
|
||||||
if not text:
|
|
||||||
mmd = os.path.join(out_dir, "result.mmd")
|
|
||||||
if os.path.exists(mmd):
|
|
||||||
with open(mmd, "r", encoding="utf-8") as fh:
|
|
||||||
text = fh.read().strip()
|
|
||||||
if not text:
|
if not text:
|
||||||
text = f"No text returned for page {page_idx + 1}."
|
text = f"No text returned for page {page_idx + 1}."
|
||||||
|
|
||||||
@@ -550,8 +354,6 @@ async def process_pdf(
|
|||||||
os.remove(tmp_img)
|
os.remove(tmp_img)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
if out_dir:
|
|
||||||
shutil.rmtree(out_dir, ignore_errors=True)
|
|
||||||
|
|
||||||
print(f"✅ Processed all {total_pages} pages")
|
print(f"✅ Processed all {total_pages} pages")
|
||||||
|
|
||||||
@@ -562,6 +364,8 @@ async def process_pdf(
|
|||||||
"total_pages": total_pages,
|
"total_pages": total_pages,
|
||||||
"pages": pages_content,
|
"pages": pages_content,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
"model": provider.id,
|
||||||
|
"model_label": provider.label,
|
||||||
"mode": mode,
|
"mode": mode,
|
||||||
"grounding": grounding,
|
"grounding": grounding,
|
||||||
"extract_images": extract_images,
|
"extract_images": extract_images,
|
||||||
@@ -590,6 +394,9 @@ async def process_pdf(
|
|||||||
headers={"Content-Disposition": f"attachment; filename=ocr_result.docx"}
|
headers={"Content-Disposition": f"attachment; filename=ocr_result.docx"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
except ProviderError as e:
|
||||||
|
print(f"PDF provider error: {e}")
|
||||||
|
raise HTTPException(status_code=502, detail=str(e))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import traceback
|
import traceback
|
||||||
print(f"Error processing PDF: {e}")
|
print(f"Error processing PDF: {e}")
|
||||||
@@ -633,6 +440,7 @@ async def commit_job(
|
|||||||
describe_text: str = Form(""),
|
describe_text: str = Form(""),
|
||||||
freeform_text: str = Form(""),
|
freeform_text: str = Form(""),
|
||||||
mode: str = Form("plain_ocr"),
|
mode: str = Form("plain_ocr"),
|
||||||
|
ocr_model: str = Form(""),
|
||||||
):
|
):
|
||||||
"""Commit an OCR job: save the image and insert a DB record."""
|
"""Commit an OCR job: save the image and insert a DB record."""
|
||||||
job_id = str(uuid.uuid4())
|
job_id = str(uuid.uuid4())
|
||||||
@@ -664,13 +472,14 @@ async def commit_job(
|
|||||||
"""
|
"""
|
||||||
INSERT INTO ocr_jobs
|
INSERT INTO ocr_jobs
|
||||||
(id, author, book, chapter, page, image_path, original_filename,
|
(id, author, book, chapter, page, image_path, original_filename,
|
||||||
ocr_text, describe_text, freeform_text, mode, status)
|
ocr_text, describe_text, freeform_text, mode, ocr_model, status)
|
||||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'unreviewed')
|
||||||
RETURNING *
|
RETURNING *
|
||||||
""",
|
""",
|
||||||
(job_id, author or None, book or None, chapter or None,
|
(job_id, author or None, book or None, chapter or None,
|
||||||
page or None, image_path, original_filename,
|
page or None, image_path, original_filename,
|
||||||
ocr_text or None, describe_text or None, freeform_text or None, mode),
|
ocr_text or None, describe_text or None, freeform_text or None,
|
||||||
|
mode, ocr_model or None),
|
||||||
)
|
)
|
||||||
row = cur.fetchone()
|
row = cur.fetchone()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
@@ -743,7 +552,7 @@ async def list_jobs(
|
|||||||
cur.execute(
|
cur.execute(
|
||||||
f"""
|
f"""
|
||||||
SELECT id, author, book, chapter, page, submitted_at, status,
|
SELECT id, author, book, chapter, page, submitted_at, status,
|
||||||
reviewer_name, reviewed_at, mode, original_filename
|
reviewer_name, reviewed_at, mode, ocr_model, original_filename
|
||||||
FROM ocr_jobs {where}
|
FROM ocr_jobs {where}
|
||||||
ORDER BY submitted_at DESC
|
ORDER BY submitted_at DESC
|
||||||
LIMIT %s OFFSET %s
|
LIMIT %s OFFSET %s
|
||||||
@@ -945,6 +754,75 @@ async def set_job_status(job_id: str, body: StatusRequest):
|
|||||||
return JSONResponse(_job_row_to_dict(row))
|
return JSONResponse(_job_row_to_dict(row))
|
||||||
|
|
||||||
|
|
||||||
|
class JobDescribeRequest(BaseModel):
|
||||||
|
model: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/jobs/{job_id}/describe")
|
||||||
|
async def describe_job(job_id: str, body: JobDescribeRequest):
|
||||||
|
"""Run Describe mode on a job's stored image and save the result to describe_text."""
|
||||||
|
try:
|
||||||
|
uuid.UUID(job_id)
|
||||||
|
except ValueError:
|
||||||
|
raise HTTPException(status_code=400, detail="Invalid job ID.")
|
||||||
|
|
||||||
|
# Look up the stored image for this job
|
||||||
|
try:
|
||||||
|
with get_db() as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute("SELECT image_path FROM ocr_jobs WHERE id = %s", (job_id,))
|
||||||
|
row = cur.fetchone()
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"describe_job lookup DB error: {exc}")
|
||||||
|
raise HTTPException(status_code=500, detail="Database error.")
|
||||||
|
|
||||||
|
if not row:
|
||||||
|
raise HTTPException(status_code=404, detail="Job not found.")
|
||||||
|
image_path = row["image_path"]
|
||||||
|
if not image_path or not os.path.isfile(image_path):
|
||||||
|
raise HTTPException(status_code=404, detail="Image file not found on disk.")
|
||||||
|
|
||||||
|
provider = _resolve_provider(body.model, "describe")
|
||||||
|
|
||||||
|
try:
|
||||||
|
text = provider.run(
|
||||||
|
image_path,
|
||||||
|
mode="describe",
|
||||||
|
prompt="",
|
||||||
|
grounding=False,
|
||||||
|
find_term=None,
|
||||||
|
schema=None,
|
||||||
|
include_caption=False,
|
||||||
|
options={"base_size": 1024, "image_size": 640, "crop_mode": True, "test_compress": False},
|
||||||
|
)
|
||||||
|
except ProviderError as e:
|
||||||
|
print(f"describe_job provider error: {e}")
|
||||||
|
raise HTTPException(status_code=502, detail=str(e))
|
||||||
|
except Exception as e:
|
||||||
|
print(f"describe_job inference error: {type(e).__name__}: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail="An internal error occurred during description.")
|
||||||
|
|
||||||
|
display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text
|
||||||
|
|
||||||
|
# Persist the generated description on the job
|
||||||
|
try:
|
||||||
|
with get_db() as conn:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
cur.execute(
|
||||||
|
"UPDATE ocr_jobs SET describe_text = %s WHERE id = %s RETURNING *",
|
||||||
|
(display_text, job_id),
|
||||||
|
)
|
||||||
|
updated = cur.fetchone()
|
||||||
|
except Exception as exc:
|
||||||
|
print(f"describe_job save DB error: {exc}")
|
||||||
|
raise HTTPException(status_code=500, detail="Database error.")
|
||||||
|
|
||||||
|
if not updated:
|
||||||
|
raise HTTPException(status_code=404, detail="Job not found.")
|
||||||
|
|
||||||
|
return JSONResponse(_job_row_to_dict(updated))
|
||||||
|
|
||||||
|
|
||||||
@app.delete("/api/jobs/{job_id}")
|
@app.delete("/api/jobs/{job_id}")
|
||||||
async def delete_job(job_id: str):
|
async def delete_job(job_id: str):
|
||||||
"""Delete a job record and its stored image."""
|
"""Delete a job record and its stored image."""
|
||||||
|
|||||||
@@ -16,3 +16,4 @@ img2pdf>=0.5.0
|
|||||||
python-docx>=1.1.0
|
python-docx>=1.1.0
|
||||||
markdown>=3.5.0
|
markdown>=3.5.0
|
||||||
psycopg2-binary>=2.9.0
|
psycopg2-binary>=2.9.0
|
||||||
|
httpx>=0.27.0
|
||||||
|
|||||||
@@ -27,6 +27,15 @@ services:
|
|||||||
MAX_UPLOAD_SIZE_MB: ${MAX_UPLOAD_SIZE_MB:-100}
|
MAX_UPLOAD_SIZE_MB: ${MAX_UPLOAD_SIZE_MB:-100}
|
||||||
DATABASE_URL: ${DATABASE_URL:-postgresql://ocr_user:ocr_password@postgres:5432/ocr_db}
|
DATABASE_URL: ${DATABASE_URL:-postgresql://ocr_user:ocr_password@postgres:5432/ocr_db}
|
||||||
OCR_IMAGES_DIR: ${OCR_IMAGES_DIR:-/data/ocr_images}
|
OCR_IMAGES_DIR: ${OCR_IMAGES_DIR:-/data/ocr_images}
|
||||||
|
ENABLE_DEEPSEEK_LOCAL: ${ENABLE_DEEPSEEK_LOCAL:-true}
|
||||||
|
OLLAMA_BASE_URL: ${OLLAMA_BASE_URL:-http://host.docker.internal:11434}
|
||||||
|
OLLAMA_MODELS: ${OLLAMA_MODELS:-}
|
||||||
|
DEFAULT_OCR_MODEL: ${DEFAULT_OCR_MODEL:-deepseek-local}
|
||||||
|
OLLAMA_TIMEOUT: ${OLLAMA_TIMEOUT:-300}
|
||||||
|
# Lets the container reach an Ollama server running on the Docker host
|
||||||
|
# (works out of the box on Docker Desktop; required for Linux engines).
|
||||||
|
extra_hosts:
|
||||||
|
- "host.docker.internal:host-gateway"
|
||||||
volumes:
|
volumes:
|
||||||
- ./models:/models
|
- ./models:/models
|
||||||
- ./ocr_images:/data/ocr_images
|
- ./ocr_images:/data/ocr_images
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
import { useState, useCallback } from 'react'
|
import { useState, useCallback, useEffect } from 'react'
|
||||||
import { useSuggestions } from './hooks/useSuggestions'
|
import { useSuggestions } from './hooks/useSuggestions'
|
||||||
|
import { useModels } from './hooks/useModels'
|
||||||
import { motion, AnimatePresence } from 'framer-motion'
|
import { motion, AnimatePresence } from 'framer-motion'
|
||||||
import {
|
import {
|
||||||
Sparkles, Zap, Loader2, Settings, Image as ImageIcon, FileText,
|
Sparkles, Zap, Loader2, Settings, Image as ImageIcon, FileText,
|
||||||
@@ -7,6 +8,7 @@ import {
|
|||||||
} from 'lucide-react'
|
} from 'lucide-react'
|
||||||
import ImageUpload from './components/ImageUpload'
|
import ImageUpload from './components/ImageUpload'
|
||||||
import ModeSelector from './components/ModeSelector'
|
import ModeSelector from './components/ModeSelector'
|
||||||
|
import ModelSelector from './components/ModelSelector'
|
||||||
import ResultPanel from './components/ResultPanel'
|
import ResultPanel from './components/ResultPanel'
|
||||||
import AdvancedSettings from './components/AdvancedSettings'
|
import AdvancedSettings from './components/AdvancedSettings'
|
||||||
import PDFProcessor from './components/PDFProcessor'
|
import PDFProcessor from './components/PDFProcessor'
|
||||||
@@ -24,6 +26,8 @@ function App() {
|
|||||||
const [view, setView] = useState('new_job')
|
const [view, setView] = useState('new_job')
|
||||||
|
|
||||||
// OCR state
|
// OCR state
|
||||||
|
const { models, loading: modelsLoading } = useModels()
|
||||||
|
const [model, setModel] = useState(null)
|
||||||
const [mode, setMode] = useState('plain_ocr')
|
const [mode, setMode] = useState('plain_ocr')
|
||||||
const [fileType, setFileType] = useState('image')
|
const [fileType, setFileType] = useState('image')
|
||||||
const [image, setImage] = useState(null)
|
const [image, setImage] = useState(null)
|
||||||
@@ -51,8 +55,15 @@ function App() {
|
|||||||
const [commitResult, setCommitResult] = useState(null)
|
const [commitResult, setCommitResult] = useState(null)
|
||||||
|
|
||||||
// Modes that produce editable text output and can be committed to the DB
|
// Modes that produce editable text output and can be committed to the DB
|
||||||
const COMMITTABLE_MODES = new Set(['plain_ocr', 'describe', 'freeform'])
|
const COMMITTABLE_MODES = new Set(['plain_ocr', 'describe'])
|
||||||
const MODE_LABELS = { plain_ocr: 'OCR Text', describe: 'Description', freeform: 'Freeform' }
|
const MODE_LABELS = { plain_ocr: 'OCR Text', describe: 'Description' }
|
||||||
|
|
||||||
|
// Pick the default model once the list loads
|
||||||
|
useEffect(() => {
|
||||||
|
if (!model && models.length > 0) {
|
||||||
|
setModel((models.find(m => m.default) || models[0]).id)
|
||||||
|
}
|
||||||
|
}, [models, model])
|
||||||
|
|
||||||
// Show the full-screen result view once at least one committable mode has a result
|
// Show the full-screen result view once at least one committable mode has a result
|
||||||
const showResultView = view === 'new_job' && Object.keys(modeResults).length > 0
|
const showResultView = view === 'new_job' && Object.keys(modeResults).length > 0
|
||||||
@@ -97,6 +108,7 @@ function App() {
|
|||||||
try {
|
try {
|
||||||
const formData = new FormData()
|
const formData = new FormData()
|
||||||
formData.append('image', image)
|
formData.append('image', image)
|
||||||
|
if (model) formData.append('model', model)
|
||||||
formData.append('mode', mode)
|
formData.append('mode', mode)
|
||||||
formData.append('prompt', prompt)
|
formData.append('prompt', prompt)
|
||||||
formData.append('grounding', mode === 'find_ref')
|
formData.append('grounding', mode === 'find_ref')
|
||||||
@@ -149,6 +161,7 @@ function App() {
|
|||||||
formData.append('describe_text', editedResults.describe || '')
|
formData.append('describe_text', editedResults.describe || '')
|
||||||
formData.append('freeform_text', editedResults.freeform || '')
|
formData.append('freeform_text', editedResults.freeform || '')
|
||||||
formData.append('mode', mode)
|
formData.append('mode', mode)
|
||||||
|
if (model) formData.append('ocr_model', model)
|
||||||
|
|
||||||
const response = await axios.post(`${API_BASE}/jobs`, formData, {
|
const response = await axios.post(`${API_BASE}/jobs`, formData, {
|
||||||
headers: { 'Content-Type': 'multipart/form-data' },
|
headers: { 'Content-Type': 'multipart/form-data' },
|
||||||
@@ -159,7 +172,7 @@ function App() {
|
|||||||
} finally {
|
} finally {
|
||||||
setCommitLoading(false)
|
setCommitLoading(false)
|
||||||
}
|
}
|
||||||
}, [image, editedResults, metadata, mode])
|
}, [image, editedResults, metadata, mode, model])
|
||||||
|
|
||||||
const handleCopy = useCallback(() => {
|
const handleCopy = useCallback(() => {
|
||||||
const text = (activeResultMode && editedResults[activeResultMode]) || result?.text
|
const text = (activeResultMode && editedResults[activeResultMode]) || result?.text
|
||||||
@@ -263,11 +276,12 @@ function App() {
|
|||||||
>
|
>
|
||||||
{/* Run additional modes */}
|
{/* Run additional modes */}
|
||||||
<div className="glass p-4 rounded-2xl flex-shrink-0">
|
<div className="glass p-4 rounded-2xl flex-shrink-0">
|
||||||
<ModeSelector
|
<div className="mb-3">
|
||||||
mode={mode} onModeChange={setMode}
|
<ModelSelector
|
||||||
prompt={prompt} onPromptChange={setPrompt}
|
models={models} value={model} onChange={setModel} loading={modelsLoading}
|
||||||
findTerm={findTerm} onFindTermChange={setFindTerm}
|
/>
|
||||||
/>
|
</div>
|
||||||
|
<ModeSelector mode={mode} onModeChange={setMode} />
|
||||||
<div className="flex items-center gap-3 mt-3">
|
<div className="flex items-center gap-3 mt-3">
|
||||||
<motion.button
|
<motion.button
|
||||||
onClick={handleSubmit}
|
onClick={handleSubmit}
|
||||||
@@ -462,12 +476,12 @@ function App() {
|
|||||||
|
|
||||||
<MetadataForm metadata={metadata} onChange={setMetadata} suggestions={suggestions} />
|
<MetadataForm metadata={metadata} onChange={setMetadata} suggestions={suggestions} />
|
||||||
|
|
||||||
<ModeSelector
|
<ModelSelector
|
||||||
mode={mode} onModeChange={setMode}
|
models={models} value={model} onChange={setModel} loading={modelsLoading}
|
||||||
prompt={prompt} onPromptChange={setPrompt}
|
|
||||||
findTerm={findTerm} onFindTermChange={setFindTerm}
|
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
<ModeSelector mode={mode} onModeChange={setMode} />
|
||||||
|
|
||||||
<ImageUpload onImageSelect={handleImageSelect} preview={imagePreview} fileType={fileType} />
|
<ImageUpload onImageSelect={handleImageSelect} preview={imagePreview} fileType={fileType} />
|
||||||
|
|
||||||
<motion.button
|
<motion.button
|
||||||
@@ -497,7 +511,7 @@ function App() {
|
|||||||
|
|
||||||
{fileType === 'pdf' ? (
|
{fileType === 'pdf' ? (
|
||||||
<PDFProcessor
|
<PDFProcessor
|
||||||
pdfFile={image} mode={mode} prompt={prompt}
|
pdfFile={image} mode={mode} prompt={prompt} model={model}
|
||||||
advancedSettings={advancedSettings} includeCaption={includeCaption}
|
advancedSettings={advancedSettings} includeCaption={includeCaption}
|
||||||
/>
|
/>
|
||||||
) : (
|
) : (
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
import { useState, useEffect, useCallback } from 'react'
|
import { useState, useEffect, useCallback } from 'react'
|
||||||
import { useSuggestions } from '../hooks/useSuggestions'
|
import { useSuggestions } from '../hooks/useSuggestions'
|
||||||
|
import { useModels } from '../hooks/useModels'
|
||||||
import { motion, AnimatePresence } from 'framer-motion'
|
import { motion, AnimatePresence } from 'framer-motion'
|
||||||
import {
|
import {
|
||||||
Search, ChevronLeft, ChevronRight, CheckCircle2, Clock,
|
Search, ChevronLeft, ChevronRight, CheckCircle2, Clock,
|
||||||
FileText, Loader2, Save, RefreshCw, Trash2,
|
FileText, Loader2, Save, RefreshCw, Trash2, Sparkles,
|
||||||
} from 'lucide-react'
|
} from 'lucide-react'
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
|
|
||||||
@@ -32,10 +33,14 @@ function StatusBadge({ status }) {
|
|||||||
// Full-screen Job Detail
|
// Full-screen Job Detail
|
||||||
// ─────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────
|
||||||
function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} }) {
|
function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} }) {
|
||||||
|
const { models } = useModels()
|
||||||
const [job, setJob] = useState(null)
|
const [job, setJob] = useState(null)
|
||||||
const [loading, setLoading] = useState(true)
|
const [loading, setLoading] = useState(true)
|
||||||
const [error, setError] = useState(null)
|
const [error, setError] = useState(null)
|
||||||
|
|
||||||
|
const [describeModel, setDescribeModel] = useState('')
|
||||||
|
const [generatingDescribe, setGeneratingDescribe] = useState(false)
|
||||||
|
|
||||||
const [editedText, setEditedText] = useState('')
|
const [editedText, setEditedText] = useState('')
|
||||||
const [editDescribeText, setEditDescribeText] = useState('')
|
const [editDescribeText, setEditDescribeText] = useState('')
|
||||||
const [editFreeformText, setEditFreeformText] = useState('')
|
const [editFreeformText, setEditFreeformText] = useState('')
|
||||||
@@ -71,10 +76,9 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
setEditChapter(d.chapter || '')
|
setEditChapter(d.chapter || '')
|
||||||
setEditPage(d.page || '')
|
setEditPage(d.page || '')
|
||||||
setReviewerName(d.reviewer_name || '')
|
setReviewerName(d.reviewer_name || '')
|
||||||
// Default to first tab that has content
|
// Default to the OCR tab when there's OCR text, otherwise Description
|
||||||
if (d.reviewed_text || d.ocr_text) setActiveTab('ocr')
|
if (d.reviewed_text || d.ocr_text) setActiveTab('ocr')
|
||||||
else if (d.describe_text) setActiveTab('describe')
|
else setActiveTab('describe')
|
||||||
else if (d.freeform_text) setActiveTab('freeform')
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.catch(err => {
|
.catch(err => {
|
||||||
@@ -85,6 +89,32 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
return () => { cancelled = true }
|
return () => { cancelled = true }
|
||||||
}, [jobId])
|
}, [jobId])
|
||||||
|
|
||||||
|
// Default the Describe model to the job's original model (if available) or the registry default
|
||||||
|
useEffect(() => {
|
||||||
|
if (!describeModel && models.length > 0) {
|
||||||
|
const def = models.find(m => m.default) || models[0]
|
||||||
|
const fromJob = job?.ocr_model && models.some(m => m.id === job.ocr_model) ? job.ocr_model : null
|
||||||
|
setDescribeModel(fromJob || def.id)
|
||||||
|
}
|
||||||
|
}, [models, job, describeModel])
|
||||||
|
|
||||||
|
const handleGenerateDescribe = async () => {
|
||||||
|
setGeneratingDescribe(true)
|
||||||
|
setSaveResult(null)
|
||||||
|
try {
|
||||||
|
const res = await axios.post(`${API_BASE}/jobs/${jobId}/describe`, {
|
||||||
|
model: describeModel || null,
|
||||||
|
})
|
||||||
|
setJob(res.data)
|
||||||
|
setEditDescribeText(res.data.describe_text || '')
|
||||||
|
onReviewed(res.data)
|
||||||
|
} catch (err) {
|
||||||
|
setSaveResult({ success: false, error: err.response?.data?.detail || err.message })
|
||||||
|
} finally {
|
||||||
|
setGeneratingDescribe(false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const handleSave = async () => {
|
const handleSave = async () => {
|
||||||
if (!reviewerName.trim()) {
|
if (!reviewerName.trim()) {
|
||||||
setSaveResult({ success: false, error: 'Reviewer name is required.' })
|
setSaveResult({ success: false, error: 'Reviewer name is required.' })
|
||||||
@@ -114,16 +144,24 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
}
|
}
|
||||||
|
|
||||||
const handleToggleStatus = async () => {
|
const handleToggleStatus = async () => {
|
||||||
const next = isReviewed ? 'unreviewed' : 'reviewed'
|
// Marking reviewed accepts BOTH the reviewed document text and the description,
|
||||||
if (next === 'reviewed' && !reviewerName.trim()) {
|
// so it goes through the full review save (not a status-only flip).
|
||||||
setSaveResult({ success: false, error: 'Reviewer name is required to mark reviewed.' })
|
if (!isReviewed) {
|
||||||
|
setTogglingStatus(true)
|
||||||
|
try {
|
||||||
|
await handleSave()
|
||||||
|
} finally {
|
||||||
|
setTogglingStatus(false)
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reverting to unreviewed preserves the saved reviewed text and description.
|
||||||
setTogglingStatus(true)
|
setTogglingStatus(true)
|
||||||
setSaveResult(null)
|
setSaveResult(null)
|
||||||
try {
|
try {
|
||||||
const res = await axios.put(`${API_BASE}/jobs/${jobId}/status`, {
|
const res = await axios.put(`${API_BASE}/jobs/${jobId}/status`, {
|
||||||
status: next,
|
status: 'unreviewed',
|
||||||
reviewer_name: reviewerName.trim() || null,
|
reviewer_name: reviewerName.trim() || null,
|
||||||
})
|
})
|
||||||
setJob(res.data)
|
setJob(res.data)
|
||||||
@@ -259,8 +297,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
{(() => {
|
{(() => {
|
||||||
const tabs = [
|
const tabs = [
|
||||||
job.ocr_text || job.reviewed_text ? { id: 'ocr', label: 'OCR Text' } : null,
|
job.ocr_text || job.reviewed_text ? { id: 'ocr', label: 'OCR Text' } : null,
|
||||||
job.describe_text != null ? { id: 'describe', label: 'Description' } : null,
|
{ id: 'describe', label: 'Description' },
|
||||||
job.freeform_text != null ? { id: 'freeform', label: 'Freeform' } : null,
|
|
||||||
].filter(Boolean)
|
].filter(Boolean)
|
||||||
return tabs.length > 1 ? (
|
return tabs.length > 1 ? (
|
||||||
<div className="flex gap-1 mb-3 flex-shrink-0">
|
<div className="flex gap-1 mb-3 flex-shrink-0">
|
||||||
@@ -282,7 +319,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
})()}
|
})()}
|
||||||
|
|
||||||
<p className="text-xs text-gray-400 mb-2 flex-shrink-0">
|
<p className="text-xs text-gray-400 mb-2 flex-shrink-0">
|
||||||
{{ ocr: isReviewed ? 'Reviewed Text' : 'OCR Text', describe: 'Description', freeform: 'Freeform' }[activeTab]}
|
{{ ocr: isReviewed ? 'Reviewed Text' : 'OCR Text', describe: 'Description' }[activeTab]}
|
||||||
<span className="text-purple-400 ml-1">(editable)</span>
|
<span className="text-purple-400 ml-1">(editable)</span>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
@@ -307,20 +344,43 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
{activeTab === 'describe' && (
|
{activeTab === 'describe' && (
|
||||||
<textarea
|
<>
|
||||||
value={editDescribeText}
|
<div className="flex items-center gap-2 mb-2 flex-shrink-0">
|
||||||
onChange={e => setEditDescribeText(e.target.value)}
|
<select
|
||||||
className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
|
value={describeModel}
|
||||||
placeholder="Description text..."
|
onChange={e => setDescribeModel(e.target.value)}
|
||||||
/>
|
disabled={generatingDescribe || models.length === 0}
|
||||||
)}
|
className="bg-white/5 border border-white/10 rounded-lg px-2 py-1.5 text-xs text-gray-200 focus:outline-none focus:border-purple-500/50"
|
||||||
{activeTab === 'freeform' && (
|
>
|
||||||
<textarea
|
{models.length === 0 && <option value="">No models</option>}
|
||||||
value={editFreeformText}
|
{models.map(m => (
|
||||||
onChange={e => setEditFreeformText(e.target.value)}
|
<option key={m.id} value={m.id}>{m.label}{m.default ? ' (default)' : ''}</option>
|
||||||
className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
|
))}
|
||||||
placeholder="Freeform result..."
|
</select>
|
||||||
/>
|
<motion.button
|
||||||
|
onClick={handleGenerateDescribe}
|
||||||
|
disabled={generatingDescribe || !describeModel}
|
||||||
|
className={`flex items-center gap-1.5 px-3 py-1.5 rounded-lg text-xs font-medium transition-all ${
|
||||||
|
generatingDescribe || !describeModel
|
||||||
|
? 'opacity-50 cursor-not-allowed bg-white/5'
|
||||||
|
: 'bg-gradient-to-r from-violet-600 to-purple-600 hover:from-violet-500 hover:to-purple-500'
|
||||||
|
}`}
|
||||||
|
whileHover={!generatingDescribe && describeModel ? { scale: 1.02 } : {}}
|
||||||
|
whileTap={!generatingDescribe && describeModel ? { scale: 0.98 } : {}}
|
||||||
|
title="Run Describe on this job's image and save it"
|
||||||
|
>
|
||||||
|
{generatingDescribe
|
||||||
|
? <><Loader2 className="w-3.5 h-3.5 animate-spin" /> Generating…</>
|
||||||
|
: <><Sparkles className="w-3.5 h-3.5" /> Generate Description</>}
|
||||||
|
</motion.button>
|
||||||
|
</div>
|
||||||
|
<textarea
|
||||||
|
value={editDescribeText}
|
||||||
|
onChange={e => setEditDescribeText(e.target.value)}
|
||||||
|
className="flex-1 w-full bg-transparent text-sm text-gray-200 font-mono resize-none focus:outline-none min-h-0"
|
||||||
|
placeholder="No description yet — pick a model and click Generate Description, or type one here."
|
||||||
|
/>
|
||||||
|
</>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -385,6 +445,12 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{!isReviewed && (
|
||||||
|
<p className="text-xs text-gray-500 mt-2">
|
||||||
|
Marking reviewed accepts both the reviewed document text and the description.
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
|
|
||||||
{saveResult && (
|
{saveResult && (
|
||||||
<motion.div
|
<motion.div
|
||||||
initial={{ opacity: 0, y: -4 }} animate={{ opacity: 1, y: 0 }}
|
initial={{ opacity: 0, y: -4 }} animate={{ opacity: 1, y: 0 }}
|
||||||
@@ -405,6 +471,7 @@ function JobDetail({ jobId, onClose, onReviewed, onDeleted, suggestions = {} })
|
|||||||
<span className="text-xs text-gray-500">Last reviewed: {new Date(job.reviewed_at).toLocaleString()}</span>
|
<span className="text-xs text-gray-500">Last reviewed: {new Date(job.reviewed_at).toLocaleString()}</span>
|
||||||
)}
|
)}
|
||||||
{job.mode && <span className="text-xs text-gray-500">Mode: {job.mode}</span>}
|
{job.mode && <span className="text-xs text-gray-500">Mode: {job.mode}</span>}
|
||||||
|
{job.ocr_model && <span className="text-xs text-gray-500">Model: {job.ocr_model}</span>}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</>
|
</>
|
||||||
@@ -573,7 +640,10 @@ export default function JobsPanel() {
|
|||||||
{job.page && <span className="text-xs text-gray-500">p. {job.page}</span>}
|
{job.page && <span className="text-xs text-gray-500">p. {job.page}</span>}
|
||||||
</div>
|
</div>
|
||||||
{job.author && <p className="text-xs text-gray-400 mt-1">{job.author}</p>}
|
{job.author && <p className="text-xs text-gray-400 mt-1">{job.author}</p>}
|
||||||
<p className="text-xs text-gray-600 mt-2 font-mono">{new Date(job.submitted_at).toLocaleDateString()}</p>
|
<div className="flex items-center justify-between mt-2">
|
||||||
|
<p className="text-xs text-gray-600 font-mono">{new Date(job.submitted_at).toLocaleDateString()}</p>
|
||||||
|
{job.ocr_model && <span className="text-[10px] text-gray-500 truncate ml-2">{job.ocr_model}</span>}
|
||||||
|
</div>
|
||||||
</motion.button>
|
</motion.button>
|
||||||
))}
|
))}
|
||||||
</AnimatePresence>
|
</AnimatePresence>
|
||||||
|
|||||||
@@ -1,29 +1,17 @@
|
|||||||
import { motion } from 'framer-motion'
|
import { motion } from 'framer-motion'
|
||||||
import { FileText, Eye, Search, Wand2 } from 'lucide-react'
|
import { FileText, Eye } from 'lucide-react'
|
||||||
|
|
||||||
const modes = [
|
const modes = [
|
||||||
{ id: 'plain_ocr', name: 'Plain OCR', icon: FileText, color: 'from-blue-500 to-cyan-500', desc: 'Extract raw text', needsInput: false },
|
{ id: 'plain_ocr', name: 'Plain OCR', icon: FileText, color: 'from-blue-500 to-cyan-500', desc: 'Extract raw text' },
|
||||||
{ id: 'describe', name: 'Describe', icon: Eye, color: 'from-violet-500 to-purple-500', desc: 'Image description', needsInput: false },
|
{ id: 'describe', name: 'Describe', icon: Eye, color: 'from-violet-500 to-purple-500', desc: 'Image description' },
|
||||||
{ id: 'find_ref', name: 'Find', icon: Search, color: 'from-yellow-500 to-orange-500', desc: 'Locate specific terms', needsInput: 'findTerm' },
|
|
||||||
{ id: 'freeform', name: 'Freeform', icon: Wand2, color: 'from-fuchsia-500 to-pink-500', desc: 'Custom prompt', needsInput: 'prompt' },
|
|
||||||
]
|
]
|
||||||
|
|
||||||
export default function ModeSelector({
|
export default function ModeSelector({ mode, onModeChange }) {
|
||||||
mode,
|
|
||||||
onModeChange,
|
|
||||||
prompt,
|
|
||||||
onPromptChange,
|
|
||||||
findTerm,
|
|
||||||
onFindTermChange
|
|
||||||
}) {
|
|
||||||
const selectedMode = modes.find(m => m.id === mode)
|
|
||||||
const needsInput = selectedMode?.needsInput
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<div className="glass p-4 rounded-2xl space-y-3">
|
<div className="glass p-4 rounded-2xl space-y-3">
|
||||||
<h3 className="text-sm font-semibold text-gray-200">Mode</h3>
|
<h3 className="text-sm font-semibold text-gray-200">Mode</h3>
|
||||||
|
|
||||||
<div className="grid grid-cols-4 gap-2">
|
<div className="grid grid-cols-2 gap-2">
|
||||||
{modes.map((m) => {
|
{modes.map((m) => {
|
||||||
const Icon = m.icon
|
const Icon = m.icon
|
||||||
const isSelected = mode === m.id
|
const isSelected = mode === m.id
|
||||||
@@ -32,6 +20,7 @@ export default function ModeSelector({
|
|||||||
<motion.button
|
<motion.button
|
||||||
key={m.id}
|
key={m.id}
|
||||||
onClick={() => onModeChange(m.id)}
|
onClick={() => onModeChange(m.id)}
|
||||||
|
title={m.desc}
|
||||||
className={`
|
className={`
|
||||||
relative p-2 rounded-xl text-center transition-all
|
relative p-2 rounded-xl text-center transition-all
|
||||||
${isSelected
|
${isSelected
|
||||||
@@ -68,38 +57,6 @@ export default function ModeSelector({
|
|||||||
)
|
)
|
||||||
})}
|
})}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{needsInput === 'findTerm' && (
|
|
||||||
<motion.div
|
|
||||||
initial={{ opacity: 0, height: 0 }}
|
|
||||||
animate={{ opacity: 1, height: 'auto' }}
|
|
||||||
exit={{ opacity: 0, height: 0 }}
|
|
||||||
>
|
|
||||||
<input
|
|
||||||
type="text"
|
|
||||||
value={findTerm}
|
|
||||||
onChange={(e) => onFindTermChange(e.target.value)}
|
|
||||||
placeholder="Enter term to find (e.g., Total, Invoice #)"
|
|
||||||
className="w-full bg-white/5 border border-white/10 rounded-xl px-3 py-2 text-sm focus:outline-none focus:border-purple-500 transition-colors"
|
|
||||||
/>
|
|
||||||
</motion.div>
|
|
||||||
)}
|
|
||||||
|
|
||||||
{needsInput === 'prompt' && (
|
|
||||||
<motion.div
|
|
||||||
initial={{ opacity: 0, height: 0 }}
|
|
||||||
animate={{ opacity: 1, height: 'auto' }}
|
|
||||||
exit={{ opacity: 0, height: 0 }}
|
|
||||||
>
|
|
||||||
<textarea
|
|
||||||
value={prompt}
|
|
||||||
onChange={(e) => onPromptChange(e.target.value)}
|
|
||||||
placeholder="Enter your custom prompt..."
|
|
||||||
className="w-full bg-white/5 border border-white/10 rounded-xl px-3 py-2 text-sm focus:outline-none focus:border-purple-500 transition-colors resize-none"
|
|
||||||
rows={2}
|
|
||||||
/>
|
|
||||||
</motion.div>
|
|
||||||
)}
|
|
||||||
</div>
|
</div>
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ import axios from 'axios'
|
|||||||
|
|
||||||
const API_BASE = import.meta.env.VITE_API_URL || '/api'
|
const API_BASE = import.meta.env.VITE_API_URL || '/api'
|
||||||
|
|
||||||
function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption }) {
|
function PDFProcessor({ pdfFile, mode, prompt, model, advancedSettings, includeCaption }) {
|
||||||
const [processing, setProcessing] = useState(false)
|
const [processing, setProcessing] = useState(false)
|
||||||
const [progress, setProgress] = useState(0)
|
const [progress, setProgress] = useState(0)
|
||||||
const [result, setResult] = useState(null)
|
const [result, setResult] = useState(null)
|
||||||
@@ -29,6 +29,7 @@ function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption
|
|||||||
try {
|
try {
|
||||||
const formData = new FormData()
|
const formData = new FormData()
|
||||||
formData.append('pdf_file', pdfFile)
|
formData.append('pdf_file', pdfFile)
|
||||||
|
if (model) formData.append('model', model)
|
||||||
formData.append('mode', mode)
|
formData.append('mode', mode)
|
||||||
formData.append('prompt', prompt)
|
formData.append('prompt', prompt)
|
||||||
formData.append('output_format', outputFormat)
|
formData.append('output_format', outputFormat)
|
||||||
@@ -80,7 +81,7 @@ function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption
|
|||||||
} finally {
|
} finally {
|
||||||
setProcessing(false)
|
setProcessing(false)
|
||||||
}
|
}
|
||||||
}, [pdfFile, mode, prompt, outputFormat, includeCaption, advancedSettings])
|
}, [pdfFile, mode, prompt, model, outputFormat, includeCaption, advancedSettings])
|
||||||
|
|
||||||
const handleDownloadJSON = useCallback(() => {
|
const handleDownloadJSON = useCallback(() => {
|
||||||
if (!result || outputFormat !== 'json') return
|
if (!result || outputFormat !== 'json') return
|
||||||
|
|||||||
Reference in New Issue
Block a user