"""
OCR provider abstraction.

Each provider knows how to turn an image + a semantic OCR request (mode, prompt,
options) into raw model text. DeepSeek-specific prompt tokens and grounding-box
parsing live here too so the FastAPI routes stay model-agnostic.

Two providers ship today:
  - DeepSeekLocalProvider -> the local HF transformers DeepSeek-OCR model (GPU)
  - OllamaProvider -> any vision model served by an external Ollama host

The registry is built from environment variables at startup (see build_registry()).
"""

import os
import re
import base64
import tempfile
import shutil
from abc import ABC, abstractmethod
from typing import List, Dict, Any, Optional

from decouple import config as env_config

# httpx is only needed when an Ollama model is actually used; import lazily so the
# backend can run DeepSeek-only without the dependency installed.
try:
    import httpx
except Exception:  # pragma: no cover - exercised only when httpx is missing
    httpx = None


# =============================================================================
# Prompt builders
# =============================================================================

# Modes that always get the <|grounding|> token, regardless of the caller's flag.
_GROUNDING_MODES = frozenset({"find_ref", "layout_map", "pii_redact"})

# Appended to every instruction except "describe" when a caption is requested.
_CAPTION_SUFFIX = "\nThen add a one-paragraph description of the image."


def build_prompt(
    mode: str,
    user_prompt: str,
    grounding: bool,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Build the DeepSeek-OCR prompt (with its special tokens) based on mode.

    The prompt is the ``<image>`` placeholder, an optional ``<|grounding|>``
    token, and the mode instruction, joined with newlines.

    Args:
        mode: Semantic OCR mode name (e.g. ``"plain_ocr"``, ``"find_ref"``).
        user_prompt: Free-text prompt; only used by the ``"freeform"`` mode.
        grounding: Request the grounding token explicitly. It is always added
            for ``find_ref``, ``layout_map`` and ``pii_redact``.
        find_term: Term to locate in ``find_ref`` mode.
        schema: Schema text embedded in the ``kv_json`` instruction.
        include_caption: Ask for an extra descriptive paragraph.

    Returns:
        The full prompt string.
    """
    # The image placeholder token must come first; an empty string here would
    # produce a prompt that starts with a bare newline and no image slot.
    parts: List[str] = ["<image>"]
    if grounding or mode in _GROUNDING_MODES:
        parts.append("<|grounding|>")
    parts.append(_instruction_for_mode(mode, user_prompt, find_term, schema, include_caption))
    return "\n".join(parts)


def build_ollama_prompt(
    mode: str,
    user_prompt: str,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """Build a plain natural-language prompt for a generic vision model.

    No DeepSeek grounding tokens — Ollama vision models receive the image separately
    and respond in plain text.

    Args:
        mode: Semantic OCR mode name; unknown modes fall back to a plain
            transcription request.
        user_prompt: Free-text prompt; only used by the ``"freeform"`` mode.
        find_term: Term to search for in ``find_ref`` mode (defaults to "Total").
        schema: Schema text embedded in the ``kv_json`` instruction (defaults
            to "{}").
        include_caption: Append a request for a one-paragraph description
            (ignored for ``describe``, which already is a description).

    Returns:
        The instruction text to send alongside the image.
    """
    default_instruction = "Transcribe the text in this image."

    if mode == "plain_ocr":
        instruction = (
            "Transcribe all of the text in this image exactly as it appears, "
            "preserving line breaks and reading order. Output only the transcribed "
            "text with no commentary."
        )
    elif mode == "markdown":
        instruction = (
            "Convert this document image to clean GitHub-flavored Markdown, "
            "preserving headings, lists, and tables. Output only the Markdown."
        )
    elif mode == "tables_csv":
        instruction = (
            "Extract every table in this image and output CSV only. Use commas with "
            "minimal quoting. If there are multiple tables, separate them with a line "
            "containing '---'. Output only the CSV."
        )
    elif mode == "tables_md":
        instruction = (
            "Extract every table in this image as GitHub-flavored Markdown tables. "
            "Output only the tables."
        )
    elif mode == "kv_json":
        # Treat a missing or whitespace-only schema the same way: empty object.
        schema_text = (schema or "").strip() or "{}"
        instruction = (
            "Extract the key fields from this image and return strict JSON only "
            f"(no prose). Use this schema, filling in the values: {schema_text}"
        )
    elif mode == "figure_chart":
        instruction = (
            "Parse the figure in this image. First extract any numeric series as a "
            "two-column table (x,y). Then add a line containing '---' followed by a "
            "two-sentence summary of the chart."
        )
    elif mode == "find_ref":
        key = (find_term or "").strip() or "Total"
        instruction = (
            f"Find every occurrence of '{key}' in this image and quote the surrounding "
            "text for each match. If it does not appear, say so."
        )
    elif mode == "layout_map":
        instruction = (
            'Identify the layout blocks in this image and return a JSON array of '
            'objects {"type": one of ["title","paragraph","table","figure"]}. '
            "Do not include the text content."
        )
    elif mode == "pii_redact":
        instruction = (
            "Find all emails, phone numbers, postal addresses, and IBANs in this image. "
            'Return a JSON array of objects {"label", "text"}.'
        )
    elif mode == "multilingual":
        instruction = (
            "Transcribe all of the text in this image exactly, detecting the language "
            "automatically and preserving the original script. Output only the text."
        )
    elif mode == "describe":
        instruction = "Describe this image, focusing on the key visible elements."
    elif mode == "freeform":
        # A blank or whitespace-only prompt must not produce an empty instruction.
        instruction = (user_prompt or "").strip() or default_instruction
    else:
        instruction = default_instruction

    if include_caption and mode != "describe":
        instruction += _CAPTION_SUFFIX
    return instruction


def _instruction_for_mode(
    mode: str,
    user_prompt: str,
    find_term: Optional[str],
    schema: Optional[str],
    include_caption: bool,
) -> str:
    """The DeepSeek instruction text (without the <image>/<|grounding|> prefix tokens).

    Args:
        mode: Semantic OCR mode name; unknown modes fall back to "OCR this image."
        user_prompt: Free-text prompt; only used by the ``"freeform"`` mode.
        find_term: Term wrapped in ``<|ref|>`` tokens for ``find_ref`` mode
            (defaults to "Total").
        schema: Schema text embedded in the ``kv_json`` instruction (defaults
            to "{}").
        include_caption: Append a request for a one-paragraph description
            (ignored for ``describe``).

    Returns:
        The instruction text for the requested mode.
    """
    default_instruction = "OCR this image."

    if mode == "plain_ocr":
        instruction = "Free OCR."
    elif mode == "markdown":
        instruction = "Convert the document to markdown."
    elif mode == "tables_csv":
        instruction = (
            "Extract every table and output CSV only. "
            "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'."
        )
    elif mode == "tables_md":
        instruction = "Extract every table as GitHub-flavored Markdown tables. Output only the tables."
    elif mode == "kv_json":
        # Treat a missing or whitespace-only schema the same way: empty object.
        schema_text = (schema or "").strip() or "{}"
        instruction = (
            "Extract key fields and return strict JSON only. "
            f"Use this schema (fill the values): {schema_text}"
        )
    elif mode == "figure_chart":
        instruction = (
            "Parse the figure. First extract any numeric series as a two-column table (x,y). "
            "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary."
        )
    elif mode == "find_ref":
        key = (find_term or "").strip() or "Total"
        instruction = f"Locate <|ref|>{key}<|/ref|> in the image."
    elif mode == "layout_map":
        instruction = (
            'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],'
            '"box":[x1,y1,x2,y2]}. Do not include any text content.'
        )
    elif mode == "pii_redact":
        instruction = (
            'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. '
            'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.'
        )
    elif mode == "multilingual":
        instruction = "Free OCR. Detect the language automatically and output in the same script."
    elif mode == "describe":
        instruction = "Describe this image. Focus on visible key elements."
    elif mode == "freeform":
        # A blank or whitespace-only prompt must not produce an empty instruction.
        instruction = (user_prompt or "").strip() or default_instruction
    else:
        instruction = default_instruction

    if include_caption and mode != "describe":
        instruction = instruction + _CAPTION_SUFFIX
    return instruction


# =============================================================================
# Grounding parser (DeepSeek-specific; no-op on plain text)
# =============================================================================

# NOTE(review): the reviewed text stops in the middle of the DET_BLOCK pattern.
# The fragment is kept verbatim below (commented out so this section parses)
# rather than reconstructed; restore the complete definition before use.
# DET_BLOCK = re.compile( r"<\|ref\|>(?P