From 02185bef46ccc0e594f62cac226de73005e09748 Mon Sep 17 00:00:00 2001 From: Aaron Roberts Date: Tue, 30 Jun 2026 12:16:16 +0100 Subject: [PATCH] Adding missed files --- backend/providers.py | 489 ++++++++++++++++++++++ frontend/src/components/ModelSelector.jsx | 33 ++ frontend/src/hooks/useModels.js | 24 ++ 3 files changed, 546 insertions(+) create mode 100644 backend/providers.py create mode 100644 frontend/src/components/ModelSelector.jsx create mode 100644 frontend/src/hooks/useModels.js diff --git a/backend/providers.py b/backend/providers.py new file mode 100644 index 0000000..34be178 --- /dev/null +++ b/backend/providers.py @@ -0,0 +1,489 @@ +""" +OCR provider abstraction. + +Each provider knows how to turn an image + a semantic OCR request (mode, prompt, +options) into raw model text. DeepSeek-specific prompt tokens and grounding-box +parsing live here too so the FastAPI routes stay model-agnostic. + +Two providers ship today: + - DeepSeekLocalProvider -> the local HF transformers DeepSeek-OCR model (GPU) + - OllamaProvider -> any vision model served by an external Ollama host + +The registry is built from environment variables at startup (see build_registry()). +""" + +import os +import re +import base64 +import tempfile +import shutil +from abc import ABC, abstractmethod +from typing import List, Dict, Any, Optional + +from decouple import config as env_config + +# httpx is only needed when an Ollama model is actually used; import lazily so the +# backend can run DeepSeek-only without the dependency installed. 
try:
    import httpx
except Exception:  # pragma: no cover - exercised only when httpx is missing
    httpx = None


# =============================================================================
# Prompt builders
# =============================================================================

# First line of every DeepSeek prompt.
# NOTE(review): the extracted source showed an empty string here, and the
# `_instruction_for_mode` docstring had a dangling "/" in front of
# `<|grounding|>`. Both look like a stripped `<image>` token, which the
# published DeepSeek-OCR prompts start with - confirm against the original file.
_IMAGE_TOKEN = "<image>"

# Token appended when grounded output is requested.
_GROUNDING_TOKEN = "<|grounding|>"

# Modes that always get the grounding token, whatever the caller's flag says.
_GROUNDING_MODES = frozenset({"find_ref", "layout_map", "pii_redact"})

# Added to every instruction except "describe" when a caption is requested.
_CAPTION_SUFFIX = "\nThen add a one-paragraph description of the image."

# Fixed DeepSeek instruction text for modes that take no user-supplied values.
_DEEPSEEK_INSTRUCTIONS: Dict[str, str] = {
    "plain_ocr": "Free OCR.",
    "markdown": "Convert the document to markdown.",
    "tables_csv": (
        "Extract every table and output CSV only. "
        "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
    ),
    "tables_md": "Extract every table as GitHub-flavored Markdown tables. Output only the tables.",
    "figure_chart": (
        "Parse the figure. First extract any numeric series as a two-column table (x,y). "
        "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
    ),
    "layout_map": (
        'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
        '"box":[x1,y1,x2,y2]}. Do not include any text content.'
    ),
    "pii_redact": (
        'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
        'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
    ),
    "multilingual": "Free OCR. Detect the language automatically and output in the same script.",
    "describe": "Describe this image. Focus on visible key elements.",
}

# Fixed plain-language instruction text for generic (Ollama) vision models.
_OLLAMA_INSTRUCTIONS: Dict[str, str] = {
    "plain_ocr": (
        "Transcribe all of the text in this image exactly as it appears, "
        "preserving line breaks and reading order. Output only the transcribed "
        "text with no commentary."
    ),
    "markdown": (
        "Convert this document image to clean GitHub-flavored Markdown, "
        "preserving headings, lists, and tables. Output only the Markdown."
    ),
    "tables_csv": (
        "Extract every table in this image and output CSV only. Use commas with "
        "minimal quoting. If there are multiple tables, separate them with a line "
        "containing '---'. Output only the CSV."
    ),
    "tables_md": (
        "Extract every table in this image as GitHub-flavored Markdown tables. "
        "Output only the tables."
    ),
    "figure_chart": (
        "Parse the figure in this image. First extract any numeric series as a "
        "two-column table (x,y). Then add a line containing '---' followed by a "
        "two-sentence summary of the chart."
    ),
    "layout_map": (
        'Identify the layout blocks in this image and return a JSON array of '
        'objects {"type": one of ["title","paragraph","table","figure"]}. '
        "Do not include the text content."
    ),
    "pii_redact": (
        "Find all emails, phone numbers, postal addresses, and IBANs in this image. "
        'Return a JSON array of objects {"label", "text"}.'
    ),
    "multilingual": (
        "Transcribe all of the text in this image exactly, detecting the language "
        "automatically and preserving the original script. Output only the text."
    ),
    "describe": "Describe this image, focusing on the key visible elements.",
}


def _clean(value: Optional[str], fallback: str) -> str:
    """Return *value* stripped of surrounding whitespace, or *fallback* if nothing is left.

    ``None``, the empty string and whitespace-only strings all yield *fallback*.
    """
    return (value or "").strip() or fallback


def _with_caption(instruction: str, mode: str, include_caption: bool) -> str:
    """Append the caption request unless the mode is already "describe"."""
    if include_caption and mode != "describe":
        return instruction + _CAPTION_SUFFIX
    return instruction


def build_prompt(
    mode: str,
    user_prompt: str,
    grounding: bool,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Build the DeepSeek-OCR prompt (with its special tokens) based on mode.

    Args:
        mode: OCR mode name; unknown modes fall back to a generic OCR instruction.
        user_prompt: Instruction text used by the "freeform" mode.
        grounding: Request the grounding token even if the mode does not force it.
        find_term: Term to locate in "find_ref" mode.
        schema: Schema text embedded in the "kv_json" instruction.
        include_caption: Also ask for a one-paragraph image description.

    Returns:
        The prompt parts (image token, optional grounding token, instruction)
        joined with newlines.
    """
    parts: List[str] = [_IMAGE_TOKEN]
    if grounding or mode in _GROUNDING_MODES:
        parts.append(_GROUNDING_TOKEN)

    parts.append(_instruction_for_mode(mode, user_prompt, find_term, schema, include_caption))
    return "\n".join(parts)


def build_ollama_prompt(
    mode: str,
    user_prompt: str,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Build a plain natural-language prompt for a generic vision model.

    No DeepSeek grounding tokens - Ollama vision models receive the image
    separately and respond in plain text.

    Args:
        mode: OCR mode name; unknown modes fall back to a generic transcription request.
        user_prompt: Instruction text used by the "freeform" mode.
        find_term: Term to search for in "find_ref" mode (defaults to "Total").
        schema: Schema text embedded in the "kv_json" instruction (defaults to "{}").
        include_caption: Also ask for a one-paragraph image description.

    Returns:
        The instruction text for the requested mode.
    """
    fallback = "Transcribe the text in this image."

    if mode == "kv_json":
        schema_text = _clean(schema, "{}")
        instruction = (
            "Extract the key fields from this image and return strict JSON only "
            f"(no prose). Use this schema, filling in the values: {schema_text}"
        )
    elif mode == "find_ref":
        key = _clean(find_term, "Total")
        instruction = (
            f"Find every occurrence of '{key}' in this image and quote the surrounding "
            "text for each match. If it does not appear, say so."
        )
    elif mode == "freeform":
        # A blank or whitespace-only prompt would send the model an empty
        # instruction, so it falls back like a missing one.
        instruction = _clean(user_prompt, fallback)
    else:
        instruction = _OLLAMA_INSTRUCTIONS.get(mode, fallback)

    return _with_caption(instruction, mode, include_caption)


def _instruction_for_mode(
    mode: str,
    user_prompt: str,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """The DeepSeek instruction text (without the <image>/<|grounding|> prefix tokens).

    Args:
        mode: OCR mode name; unknown modes fall back to "OCR this image.".
        user_prompt: Instruction text used by the "freeform" mode.
        find_term: Term wrapped in <|ref|> tokens for "find_ref" (defaults to "Total").
        schema: Schema text embedded in the "kv_json" instruction (defaults to "{}").
        include_caption: Also ask for a one-paragraph image description.

    Returns:
        The instruction text for the requested mode.
    """
    fallback = "OCR this image."

    if mode == "kv_json":
        schema_text = _clean(schema, "{}")
        instruction = (
            "Extract key fields and return strict JSON only. "
            f"Use this schema (fill the values): {schema_text}"
        )
    elif mode == "find_ref":
        key = _clean(find_term, "Total")
        instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
    elif mode == "freeform":
        # Same blank-prompt fallback as build_ollama_prompt.
        instruction = _clean(user_prompt, fallback)
    else:
        instruction = _DEEPSEEK_INSTRUCTIONS.get(mode, fallback)

    return _with_caption(instruction, mode, include_caption)


# =============================================================================
# Grounding parser (DeepSeek-specific; no-op on plain text)
# =============================================================================
# NOTE(review): the visible source is cut off partway through the pattern that
# starts here:
#     DET_BLOCK = re.compile(r"<\|ref\|>(?P
# The rest of that definition is not visible, so it is neither completed nor
# changed; restore it from the original file.