From aec04f6eb4e3d07fa524f9d56986f64d8c5d4753 Mon Sep 17 00:00:00 2001 From: Ray Dumasia Date: Tue, 21 Oct 2025 01:32:09 +0100 Subject: [PATCH] Initial commit --- .gitignore | 102 ++++++ README.md | 138 ++++++++ backend/Dockerfile | 20 ++ backend/main.py | 329 +++++++++++++++++++ backend/requirements.txt | 12 + docker-compose.yml | 35 ++ frontend/.gitignore | 24 ++ frontend/Dockerfile | 29 ++ frontend/README.md | 73 ++++ frontend/eslint.config.js | 23 ++ frontend/index.html | 16 + frontend/nginx.conf | 40 +++ frontend/package.json | 29 ++ frontend/postcss.config.js | 6 + frontend/public/vite.svg | 1 + frontend/src/App.css | 42 +++ frontend/src/App.jsx | 251 ++++++++++++++ frontend/src/App.tsx | 35 ++ frontend/src/assets/react.svg | 1 + frontend/src/components/AdvancedSettings.jsx | 83 +++++ frontend/src/components/ImageUpload.jsx | 99 ++++++ frontend/src/components/ModeSelector.jsx | 105 ++++++ frontend/src/components/ResultPanel.jsx | 302 +++++++++++++++++ frontend/src/index.css | 50 +++ frontend/src/main.jsx | 10 + frontend/src/main.tsx | 10 + frontend/tailwind.config.js | 48 +++ frontend/tsconfig.app.json | 28 ++ frontend/tsconfig.json | 7 + frontend/tsconfig.node.json | 26 ++ frontend/vite.config.js | 20 ++ frontend/vite.config.ts | 7 + 32 files changed, 2001 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 backend/Dockerfile create mode 100644 backend/main.py create mode 100644 backend/requirements.txt create mode 100644 docker-compose.yml create mode 100644 frontend/.gitignore create mode 100644 frontend/Dockerfile create mode 100644 frontend/README.md create mode 100644 frontend/eslint.config.js create mode 100644 frontend/index.html create mode 100644 frontend/nginx.conf create mode 100644 frontend/package.json create mode 100644 frontend/postcss.config.js create mode 100644 frontend/public/vite.svg create mode 100644 frontend/src/App.css create mode 100644 frontend/src/App.jsx create mode 100644 
frontend/src/App.tsx create mode 100644 frontend/src/assets/react.svg create mode 100644 frontend/src/components/AdvancedSettings.jsx create mode 100644 frontend/src/components/ImageUpload.jsx create mode 100644 frontend/src/components/ModeSelector.jsx create mode 100644 frontend/src/components/ResultPanel.jsx create mode 100644 frontend/src/index.css create mode 100644 frontend/src/main.jsx create mode 100644 frontend/src/main.tsx create mode 100644 frontend/tailwind.config.js create mode 100644 frontend/tsconfig.app.json create mode 100644 frontend/tsconfig.json create mode 100644 frontend/tsconfig.node.json create mode 100644 frontend/vite.config.js create mode 100644 frontend/vite.config.ts diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..705ff1b --- /dev/null +++ b/.gitignore @@ -0,0 +1,102 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +*.pyc +.pytest_cache/ +.coverage +htmlcov/ +.tox/ +.hypothesis/ +venv/ +env/ +ENV/ +.venv + +# Node +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +.pnpm-store/ +dist/ +dist-ssr/ +*.local +package-lock.json +yarn.lock +pnpm-lock.yaml + +# Environment +.env +.env.local +.env.development.local +.env.test.local +.env.production.local + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store +*.sublime-project +*.sublime-workspace +.project +.classpath +.settings/ + +# Logs +logs/ +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* +lerna-debug.log* + +# Docker +.dockerignore + +# Models (keep structure, ignore downloads) +models/hub/models--*/blobs/* +models/hub/models--*/snapshots/*/pytorch_model*.bin +models/hub/models--*/snapshots/*/model*.safetensors +models/hub/models--*/snapshots/*/*.msgpack +*.bin +*.safetensors +*.msgpack +*.h5 +*.onnx + +# Temporary files +tmp/ +temp/ +*.tmp 
+*.bak +*.swp +*~ + +# OS +.DS_Store +Thumbs.db +Desktop.ini diff --git a/README.md b/README.md new file mode 100644 index 0000000..093560c --- /dev/null +++ b/README.md @@ -0,0 +1,138 @@ +# 🚀 DeepSeek OCR - React + FastAPI + +Modern OCR web application powered by DeepSeek-OCR with a stunning React frontend and FastAPI backend. + +> **Note**: This was a quickly vibe-coded project to test out DeepSeek-OCR! It basically works quite nicely on an RTX 5090. The "Find" mode grounding boxes aren't quite working yet - probably my fault in not interpreting the dimensions correctly, but the core OCR functionality is pretty nice so far. + +## Quick Start + +```bash +docker compose up --build +``` + +Then open: +- **Frontend**: http://localhost:3000 +- **Backend API**: http://localhost:8000 +- **API Docs**: http://localhost:8000/docs + +## Features + +### 4 OCR Modes +- **Plain OCR** - Raw text extraction +- **Describe** - Generate image descriptions +- **Find** - Locate specific terms (grounding boxes WIP) +- **Freeform** - Custom prompts for anything + +### UI Features +- 🎨 Glass morphism design with animated gradients +- 🎯 Drag & drop file upload +- 📦 Grounding box visualization (WIP - dimensions need fixing) +- ✨ Smooth animations (Framer Motion) +- 📋 Copy/Download results +- 🎛️ Advanced settings dropdown +- 📝 Markdown rendering for formatted output + +## Tech Stack + +- **Frontend**: React 18 + Vite 5 + TailwindCSS 3 + Framer Motion 11 +- **Backend**: FastAPI + PyTorch + Transformers 4.46 + DeepSeek-OCR +- **Server**: Nginx (reverse proxy) +- **Container**: Docker + Docker Compose with multi-stage builds +- **GPU**: NVIDIA CUDA support (tested on RTX 3090) + +## Project Structure + +``` +deepseek-ocr/ +├── backend/ # FastAPI backend +│ ├── main.py +│ ├── requirements.txt +│ └── Dockerfile +├── frontend/ # React frontend +│ ├── src/ +│ │ ├── components/ +│ │ ├── App.jsx +│ │ └── main.jsx +│ ├── package.json +│ ├── nginx.conf +│ └── Dockerfile +├── models/ # Model cache +└── 
docker-compose.yml +``` + +## Development + +### Backend +```bash +cd backend +pip install -r requirements.txt +uvicorn main:app --reload --host 0.0.0.0 --port 8000 +``` + +### Frontend +```bash +cd frontend +npm install +npm run dev +``` + +## Requirements + +- Docker & Docker Compose +- NVIDIA GPU with CUDA support (tested on RTX 3090) +- nvidia-docker runtime +- ~8-12GB VRAM for model + +## Known Issues + +- 📦 **Find mode grounding boxes**: Not rendering correctly - likely dimension scaling issue in the canvas overlay logic. Boxes are detected and returned by the backend, but the frontend visualization needs work. + +## API Usage + +### POST /api/ocr + +**Parameters:** +- `image` (file, required) +- `mode` (string): plain_ocr | describe | find_ref | freeform +- `prompt` (string): Custom prompt for freeform mode +- `grounding` (bool): Enable bounding boxes (auto-enabled for find_ref) +- `find_term` (string): Term to locate in find_ref mode +- `base_size` (int): Base processing size (default: 1024) +- `image_size` (int): Image size (default: 640) +- `crop_mode` (bool): Enable crop mode (default: true) + +**Response:** +```json +{ + "success": true, + "text": "Extracted text...", + "boxes": [{"label": "field", "box": [x1, y1, x2, y2]}], + "image_dims": {"w": 1920, "h": 1080}, + "metadata": {...} +} +``` + +## Troubleshooting + +### GPU not detected +```bash +nvidia-smi +docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu22.04 nvidia-smi +``` + +### Port conflicts +```bash +sudo lsof -i :3000 +sudo lsof -i :8000 +``` + +### Frontend build issues +```bash +cd frontend +rm -rf node_modules package-lock.json +docker compose build frontend +``` + +## License + +This project uses the DeepSeek-OCR model. Refer to the model's license terms. 
diff --git a/backend/Dockerfile b/backend/Dockerfile new file mode 100644 index 0000000..7b95536 --- /dev/null +++ b/backend/Dockerfile @@ -0,0 +1,20 @@ +# Backend Dockerfile - FastAPI + DeepSeek-OCR +FROM nvcr.io/nvidia/pytorch:25.09-py3 + +ENV PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 \ + HF_HOME=/models + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --upgrade pip && pip install -r requirements.txt + +# Copy backend code +COPY main.py . + +EXPOSE 8000 + +# Use uvicorn with reasonable workers for GPU workload +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "1"] diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..2b69bdb --- /dev/null +++ b/backend/main.py @@ -0,0 +1,329 @@ +import os +import re +import tempfile +import shutil +from typing import List, Dict, Any, Optional +from contextlib import asynccontextmanager + +from fastapi import FastAPI, File, UploadFile, Form, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse +import torch +from transformers import AutoModel, AutoTokenizer +from PIL import Image +import uvicorn + +# ----------------------------- +# Lifespan context for model loading +# ----------------------------- +model = None +tokenizer = None + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Load model on startup, cleanup on shutdown""" + global model, tokenizer + + # Environment setup + os.environ.pop("TRANSFORMERS_CACHE", None) + MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-ai/DeepSeek-OCR") + HF_HOME = os.environ.get("HF_HOME", "/models") + os.makedirs(HF_HOME, exist_ok=True) + + # Load model + print(f"🚀 Loading {MODEL_NAME}...") + torch_dtype = torch.bfloat16 + + tokenizer = AutoTokenizer.from_pretrained( + MODEL_NAME, + trust_remote_code=True, + ) + + model = AutoModel.from_pretrained( + MODEL_NAME, + trust_remote_code=True, + use_safetensors=True, + 
attn_implementation="eager", + torch_dtype=torch_dtype, + ).eval().to("cuda") + + # Pad token setup + try: + if getattr(tokenizer, "pad_token_id", None) is None and getattr(tokenizer, "eos_token_id", None) is not None: + tokenizer.pad_token = tokenizer.eos_token + if getattr(model.config, "pad_token_id", None) is None and getattr(tokenizer, "pad_token_id", None) is not None: + model.config.pad_token_id = tokenizer.pad_token_id + except Exception: + pass + + print("✅ Model loaded and ready!") + + yield + + # Cleanup + print("🛑 Shutting down...") + +# ----------------------------- +# FastAPI app +# ----------------------------- +app = FastAPI( + title="DeepSeek-OCR API", + description="Blazing fast OCR with DeepSeek-OCR model 🔥", + version="2.0.0", + lifespan=lifespan +) + +# CORS middleware for React frontend +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# ----------------------------- +# Prompt builder +# ----------------------------- +def build_prompt( + mode: str, + user_prompt: str, + grounding: bool, + find_term: Optional[str], + schema: Optional[str], + include_caption: bool, +) -> str: + """Build the prompt based on mode""" + parts: List[str] = [""] + mode_requires_grounding = mode in {"find_ref", "layout_map", "pii_redact"} + if grounding or mode_requires_grounding: + parts.append("<|grounding|>") + + instruction = "" + if mode == "plain_ocr": + instruction = "Free OCR. Only output the raw text." + elif mode == "markdown": + instruction = "Convert the document to markdown." + elif mode == "tables_csv": + instruction = ( + "Extract every table and output CSV only. " + "Use commas, minimal quoting. If multiple tables, separate with a line containing '---'." + ) + elif mode == "tables_md": + instruction = "Extract every table as GitHub-flavored Markdown tables. Output only the tables." 
+ elif mode == "kv_json": + schema_text = schema.strip() if schema else "{}" + instruction = ( + "Extract key fields and return strict JSON only. " + f"Use this schema (fill the values): {schema_text}" + ) + elif mode == "figure_chart": + instruction = ( + "Parse the figure. First extract any numeric series as a two-column table (x,y). " + "Then summarize the chart in 2 sentences. Output the table, then a line '---', then the summary." + ) + elif mode == "find_ref": + key = (find_term or "").strip() or "Total" + instruction = f"Locate <|ref|>{key}<|/ref|> in the image." + elif mode == "layout_map": + instruction = ( + 'Return a JSON array of blocks with fields {"type":["title","paragraph","table","figure"],' + '"box":[x1,y1,x2,y2]}. Do not include any text content.' + ) + elif mode == "pii_redact": + instruction = ( + 'Find all occurrences of emails, phone numbers, postal addresses, and IBANs. ' + 'Return a JSON array of objects {label, text, box:[x1,y1,x2,y2]}.' + ) + elif mode == "multilingual": + instruction = "Free OCR. Detect the language automatically and output in the same script." + elif mode == "describe": + instruction = "Describe this image concisely in 2-3 sentences. Focus on visible key elements." + elif mode == "freeform": + instruction = user_prompt.strip() if user_prompt else "OCR this image." + else: + instruction = "OCR this image." + + if include_caption and mode not in {"describe"}: + instruction = instruction + "\nThen add a one-paragraph description of the image." + + parts.append(instruction) + return "\n".join(parts) + +# ----------------------------- +# Grounding parser +# ----------------------------- +DET_BLOCK = re.compile( + r"<\|ref\|>(?P