Compare commits
12 Commits
f28320a23d
...
3dac0741b1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3dac0741b1 | ||
|
|
e24f064042 | ||
|
|
e82cd2abf0 | ||
|
|
7b7d368c94 | ||
|
|
efa2bd265b | ||
|
|
e33e9be75a | ||
|
|
e578276d3e | ||
|
|
5ba45f7db2 | ||
|
|
fd063c0e71 | ||
|
|
0fb5760b11 | ||
|
|
23bbd1fc8d | ||
|
|
225655d02c |
@@ -11,6 +11,9 @@ FRONTEND_PORT=3000
|
|||||||
MODEL_NAME=deepseek-ai/DeepSeek-OCR
|
MODEL_NAME=deepseek-ai/DeepSeek-OCR
|
||||||
HF_HOME=/models
|
HF_HOME=/models
|
||||||
|
|
||||||
|
# CORS Configuration (comma-separated origins, defaults to http://localhost:3000)
|
||||||
|
CORS_ORIGINS=http://localhost:3000
|
||||||
|
|
||||||
# Upload Configuration
|
# Upload Configuration
|
||||||
MAX_UPLOAD_SIZE_MB=100
|
MAX_UPLOAD_SIZE_MB=100
|
||||||
|
|
||||||
|
|||||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 rdumasia303
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
||||||
212
README.md
212
README.md
@@ -1,10 +1,54 @@
|
|||||||
# 🚀 DeepSeek OCR - React + FastAPI
|
# 🚀 DeepSeek OCR - React + FastAPI
|
||||||
|
|
||||||
Modern OCR web application powered by DeepSeek-OCR with a stunning React frontend and FastAPI backend.
|
Modern OCR web application powered by DeepSeek-OCR with a stunning React frontend and FastAPI backend. **Now with PDF processing and multi-format document conversion!**
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
> **Recent Updates (v2.1.1)**
|
## ✨ What's New in v2.2.0 - PDF Processing & Document Conversion
|
||||||
|
|
||||||
|
We've added powerful PDF processing capabilities based on community feedback! Here's what you can do now:
|
||||||
|
|
||||||
|
### 📄 Process Entire PDF Documents
|
||||||
|
- Upload PDF files up to 100MB
|
||||||
|
- Automatic multi-page OCR processing
|
||||||
|
- Real-time progress tracking for large documents
|
||||||
|
- Extract text from scanned PDFs or image-based documents
|
||||||
|
|
||||||
|
### 🔄 Convert to Multiple Formats
|
||||||
|
Export your OCR results in the format you need:
|
||||||
|
- **Markdown (.md)** - Clean, structured text perfect for documentation
|
||||||
|
- **HTML (.html)** - Styled documents with embedded images and tables
|
||||||
|
- **Word (.docx)** - Professional documents with formatting, tables, and images
|
||||||
|
- **JSON** - Structured data for programmatic access
|
||||||
|
|
||||||
|
### 🖼️ Automatic Image Extraction
|
||||||
|
- Detects and extracts images from PDF pages
|
||||||
|
- Embeds images in exported documents
|
||||||
|
- Preserves image placement and context
|
||||||
|
|
||||||
|
### 📐 Formula & Formatting Preservation
|
||||||
|
- Maintains mathematical formulas (LaTeX syntax)
|
||||||
|
- Preserves tables, headings, and document structure
|
||||||
|
- Cleans up special characters while keeping formatting intact
|
||||||
|
|
||||||
|
### 🎯 Use Cases
|
||||||
|
- **Document Digitization** - Convert scanned PDFs to editable formats
|
||||||
|
- **Data Extraction** - Pull structured data from forms and invoices
|
||||||
|
- **Content Migration** - Convert PDFs to Markdown for wikis/documentation
|
||||||
|
- **Academic Papers** - Extract text and formulas from research papers
|
||||||
|
- **Business Documents** - Convert reports to Word for editing
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
> **Latest Updates (v2.2.0)** - November 2025
|
||||||
|
> - 🎉 **NEW: PDF Processing** - Upload PDFs and extract text from all pages
|
||||||
|
> - 🎉 **NEW: Multi-Format Export** - Convert to Markdown, HTML, DOCX, or JSON
|
||||||
|
> - 🎉 **NEW: Automatic Image Extraction** - Extract and preserve images from PDFs
|
||||||
|
> - 🎉 **NEW: Progress Tracking** - Real-time progress for multi-page documents
|
||||||
|
> - ✅ Dual mode: Image OCR + PDF Processing with format conversion
|
||||||
|
> - ✅ Enhanced document processing with formula and formatting preservation
|
||||||
|
>
|
||||||
|
> **Previous Updates (v2.1.1)**
|
||||||
> - ✅ Fixed image removal button - now properly clears and allows re-upload
|
> - ✅ Fixed image removal button - now properly clears and allows re-upload
|
||||||
> - ✅ Fixed multiple bounding boxes parsing - handles `[[x1,y1,x2,y2], [x1,y1,x2,y2]]` format
|
> - ✅ Fixed multiple bounding boxes parsing - handles `[[x1,y1,x2,y2], [x1,y1,x2,y2]]` format
|
||||||
> - ✅ Simplified to 4 core working modes for better stability
|
> - ✅ Simplified to 4 core working modes for better stability
|
||||||
@@ -37,24 +81,80 @@ Modern OCR web application powered by DeepSeek-OCR with a stunning React fronten
|
|||||||
- **Backend API**: http://localhost:8000 (or your configured API_PORT)
|
- **Backend API**: http://localhost:8000 (or your configured API_PORT)
|
||||||
- **API Docs**: http://localhost:8000/docs
|
- **API Docs**: http://localhost:8000/docs
|
||||||
|
|
||||||
|
## 🎓 How to Use
|
||||||
|
|
||||||
|
### Processing Images (Single Image OCR)
|
||||||
|
|
||||||
|
1. Select **"Image OCR"** mode in the toggle
|
||||||
|
2. Upload an image (PNG, JPG, WEBP, etc.)
|
||||||
|
3. Choose your OCR mode:
|
||||||
|
- **Plain OCR** - Extract all text
|
||||||
|
- **Describe** - Get image description
|
||||||
|
- **Find** - Locate specific terms
|
||||||
|
- **Freeform** - Use custom prompts
|
||||||
|
4. Click **"Analyze Image"**
|
||||||
|
5. View results with bounding boxes (if enabled)
|
||||||
|
6. Copy or download the extracted text
|
||||||
|
|
||||||
|
### Processing PDFs (Multi-Page Documents) - NEW!
|
||||||
|
|
||||||
|
1. Select **"PDF Processing"** mode in the toggle
|
||||||
|
2. Upload a PDF file (up to 100MB)
|
||||||
|
3. Choose your OCR mode (same as above)
|
||||||
|
4. Select **output format**:
|
||||||
|
- 📝 **Markdown** - For documentation, wikis, GitHub
|
||||||
|
- 🌐 **HTML** - For web publishing, styled viewing
|
||||||
|
- 📄 **DOCX** - For Word editing, professional documents
|
||||||
|
- 📊 **JSON** - For programmatic access, data extraction
|
||||||
|
5. Click **"Process PDF"**
|
||||||
|
6. Watch the progress bar as pages are processed
|
||||||
|
7. Your file downloads automatically when complete!
|
||||||
|
|
||||||
|
### Tips for Best Results
|
||||||
|
|
||||||
|
- **For scanned documents**: Use higher DPI (144-300) in advanced settings
|
||||||
|
- **For tables**: The model excels at extracting structured data
|
||||||
|
- **For formulas**: Mathematical notation is preserved in output
|
||||||
|
- **For images in PDFs**: Enable "Extract Images" to include them in output
|
||||||
|
- **For large PDFs**: JSON format is fastest; DOCX takes longer due to formatting
|
||||||
|
|
||||||
|
### Output Format Comparison
|
||||||
|
|
||||||
|
| Format | Best For | Features | File Size |
|
||||||
|
|--------|----------|----------|-----------|
|
||||||
|
| **Markdown** | Documentation, GitHub, wikis | Clean text, tables, code blocks | Smallest |
|
||||||
|
| **HTML** | Web viewing, sharing | Styled output, embedded images, tables | Medium |
|
||||||
|
| **DOCX** | Editing, professional docs | Full formatting, images, tables | Largest |
|
||||||
|
| **JSON** | Data processing, APIs | Structured data, metadata, page info | Small |
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
### 4 Core OCR Modes
|
### Dual Processing Modes
|
||||||
|
#### 📸 **Image OCR** (4 Core Modes)
|
||||||
- **Plain OCR** - Raw text extraction from any image
|
- **Plain OCR** - Raw text extraction from any image
|
||||||
- **Describe** - Generate intelligent image descriptions
|
- **Describe** - Generate intelligent image descriptions
|
||||||
- **Find** - Locate specific terms with visual bounding boxes
|
- **Find** - Locate specific terms with visual bounding boxes
|
||||||
- **Freeform** - Custom prompts for specialized tasks
|
- **Freeform** - Custom prompts for specialized tasks
|
||||||
|
|
||||||
|
#### 📄 **PDF Processing** (NEW!)
|
||||||
|
- **Multi-Page Processing** - Process entire PDF documents page by page
|
||||||
|
- **Format Conversion** - Export to Markdown, HTML, DOCX, or JSON
|
||||||
|
- **Image Extraction** - Automatically extract and preserve embedded images
|
||||||
|
- **Formula Preservation** - Maintain mathematical formulas and special formatting
|
||||||
|
- **Progress Tracking** - Real-time progress updates for large documents
|
||||||
|
|
||||||
### UI Features
|
### UI Features
|
||||||
- 🎨 Glass morphism design with animated gradients
|
- 🎨 Glass morphism design with animated gradients
|
||||||
- 🎯 Drag & drop file upload (up to 100MB by default)
|
- 🎯 Drag & drop file upload (Images up to 10MB, PDFs up to 100MB)
|
||||||
- 🗑️ Easy image removal and re-upload
|
- 🔄 Easy file removal and re-upload
|
||||||
- 📦 Grounding box visualization with proper coordinate scaling
|
- 📦 Grounding box visualization with proper coordinate scaling
|
||||||
- ✨ Smooth animations (Framer Motion)
|
- ✨ Smooth animations (Framer Motion)
|
||||||
- 📋 Copy/Download results
|
- 📋 Copy/Download results in multiple formats
|
||||||
- 🎛️ Advanced settings dropdown
|
- 🎛️ Advanced settings dropdown
|
||||||
- 📝 HTML and Markdown rendering for formatted output
|
- 📝 HTML and Markdown rendering for formatted output
|
||||||
- 🔍 Multiple bounding box support (handles multiple instances of found terms)
|
- 🔍 Multiple bounding box support (handles multiple instances of found terms)
|
||||||
|
- 📊 Progress bars for multi-page PDF processing
|
||||||
|
- 💾 Direct download for converted documents (MD, HTML, DOCX)
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
@@ -95,10 +195,25 @@ CROP_MODE=true # Enable dynamic cropping for large images
|
|||||||
|
|
||||||
## Tech Stack
|
## Tech Stack
|
||||||
|
|
||||||
- **Frontend**: React 18 + Vite 5 + TailwindCSS 3 + Framer Motion 11
|
### Frontend
|
||||||
- **Backend**: FastAPI + PyTorch + Transformers 4.46 + DeepSeek-OCR
|
- **Framework**: React 18 + Vite 5
|
||||||
|
- **Styling**: TailwindCSS 3 + Custom Glass Morphism
|
||||||
|
- **Animations**: Framer Motion 11
|
||||||
|
- **HTTP Client**: Axios
|
||||||
|
- **File Upload**: React Dropzone
|
||||||
|
|
||||||
|
### Backend
|
||||||
|
- **API Framework**: FastAPI (async Python web framework)
|
||||||
|
- **ML/AI**: PyTorch + Transformers 4.46 + DeepSeek-OCR
|
||||||
|
- **PDF Processing**: PyMuPDF (fitz) + img2pdf
|
||||||
|
- **Document Conversion**:
|
||||||
|
- python-docx (Word documents)
|
||||||
|
- markdown (Markdown processing)
|
||||||
|
- Custom HTML generator
|
||||||
- **Configuration**: python-decouple for environment management
|
- **Configuration**: python-decouple for environment management
|
||||||
- **Server**: Nginx (reverse proxy)
|
|
||||||
|
### Infrastructure
|
||||||
|
- **Server**: Nginx (reverse proxy & static file serving)
|
||||||
- **Container**: Docker + Docker Compose with multi-stage builds
|
- **Container**: Docker + Docker Compose with multi-stage builds
|
||||||
- **GPU**: NVIDIA CUDA support (tested on RTX 3090, RTX 5090)
|
- **GPU**: NVIDIA CUDA support (tested on RTX 3090, RTX 5090)
|
||||||
|
|
||||||
@@ -106,19 +221,26 @@ CROP_MODE=true # Enable dynamic cropping for large images
|
|||||||
|
|
||||||
```
|
```
|
||||||
deepseek-ocr/
|
deepseek-ocr/
|
||||||
├── backend/ # FastAPI backend
|
├── backend/ # FastAPI backend
|
||||||
│ ├── main.py
|
│ ├── main.py # Main API with OCR and PDF endpoints
|
||||||
|
│ ├── pdf_utils.py # PDF processing utilities (NEW)
|
||||||
|
│ ├── format_converter.py # Document format conversion (NEW)
|
||||||
│ ├── requirements.txt
|
│ ├── requirements.txt
|
||||||
│ └── Dockerfile
|
│ └── Dockerfile
|
||||||
├── frontend/ # React frontend
|
├── frontend/ # React frontend
|
||||||
│ ├── src/
|
│ ├── src/
|
||||||
│ │ ├── components/
|
│ │ ├── components/
|
||||||
│ │ ├── App.jsx
|
│ │ │ ├── ImageUpload.jsx # File upload (images & PDFs)
|
||||||
|
│ │ │ ├── PDFProcessor.jsx # PDF processing UI (NEW)
|
||||||
|
│ │ │ ├── ModeSelector.jsx
|
||||||
|
│ │ │ ├── ResultPanel.jsx
|
||||||
|
│ │ │ └── AdvancedSettings.jsx
|
||||||
|
│ │ ├── App.jsx # Main app with dual mode support
|
||||||
│ │ └── main.jsx
|
│ │ └── main.jsx
|
||||||
│ ├── package.json
|
│ ├── package.json
|
||||||
│ ├── nginx.conf
|
│ ├── nginx.conf
|
||||||
│ └── Dockerfile
|
│ └── Dockerfile
|
||||||
├── models/ # Model cache
|
├── models/ # Model cache
|
||||||
└── docker-compose.yml
|
└── docker-compose.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -288,6 +410,63 @@ For large images, the model uses dynamic cropping:
|
|||||||
- **Supports multiple boxes**: When finding multiple instances, format is `[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]`
|
- **Supports multiple boxes**: When finding multiple instances, format is `[[x1,y1,x2,y2], [x1,y1,x2,y2], ...]`
|
||||||
- Frontend automatically displays all boxes overlaid on the image with unique colors
|
- Frontend automatically displays all boxes overlaid on the image with unique colors
|
||||||
|
|
||||||
|
### POST /api/process-pdf (NEW!)
|
||||||
|
|
||||||
|
Process PDF documents with OCR and export to various formats.
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
- `pdf_file` (file, required) - PDF file to process (up to 100MB)
|
||||||
|
- `mode` (string) - OCR mode: `plain_ocr` | `describe` | `find_ref` | `freeform`
|
||||||
|
- `prompt` (string) - Custom prompt for freeform mode
|
||||||
|
- `output_format` (string) - Output format: `markdown` | `html` | `docx` | `json`
|
||||||
|
- `grounding` (bool) - Enable bounding boxes (default: false)
|
||||||
|
- `include_caption` (bool) - Add image descriptions (default: false)
|
||||||
|
- `extract_images` (bool) - Extract embedded images from PDF (default: true)
|
||||||
|
- `dpi` (int) - PDF rendering resolution (default: 144)
|
||||||
|
- `base_size` (int) - Base processing size (default: 1024)
|
||||||
|
- `image_size` (int) - Tile size for cropping (default: 640)
|
||||||
|
- `crop_mode` (bool) - Enable dynamic cropping (default: true)
|
||||||
|
|
||||||
|
**Response Formats:**
|
||||||
|
|
||||||
|
**JSON Format** (`output_format=json`):
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"success": true,
|
||||||
|
"total_pages": 5,
|
||||||
|
"pages": [
|
||||||
|
{
|
||||||
|
"page_number": 1,
|
||||||
|
"text": "Extracted and cleaned text...",
|
||||||
|
"raw_text": "Raw model output with tags...",
|
||||||
|
"boxes": [{"label": "field", "box": [x1, y1, x2, y2]}],
|
||||||
|
"images": ["base64_encoded_image_data..."],
|
||||||
|
"image_dims": {"w": 1920, "h": 1080}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"mode": "plain_ocr",
|
||||||
|
"grounding": false,
|
||||||
|
"extract_images": true,
|
||||||
|
"dpi": 144
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**File Downloads** (`output_format=markdown|html|docx`):
|
||||||
|
- Returns the document as a downloadable file
|
||||||
|
- Markdown: `.md` file with preserved formatting
|
||||||
|
- HTML: `.html` file with embedded styling and images
|
||||||
|
- DOCX: `.docx` Word document with tables and formatting
|
||||||
|
|
||||||
|
**Features:**
|
||||||
|
- 📄 Multi-page processing with progress tracking
|
||||||
|
- 🖼️ Automatic image extraction and embedding
|
||||||
|
- 📐 Formula and formatting preservation
|
||||||
|
- 🎨 Styled HTML output with tables and code blocks
|
||||||
|
- 📝 Clean Markdown with proper structure
|
||||||
|
- 📋 Professional DOCX with headings and tables
|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
Here are some example images showcasing different OCR capabilities:
|
Here are some example images showcasing different OCR capabilities:
|
||||||
@@ -325,3 +504,8 @@ docker-compose build frontend
|
|||||||
## License
|
## License
|
||||||
|
|
||||||
This project uses the DeepSeek-OCR model. Refer to the model's license terms.
|
This project uses the DeepSeek-OCR model. Refer to the model's license terms.
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Small note and direct link to license at the bottom -->
|
||||||
|
<!-- MIT License: this repository is licensed under the MIT License. See the full text in the LICENSE file. -->
|
||||||
|
Note: Licensed under the MIT License. View the full license: [LICENSE](./LICENSE)
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ COPY requirements.txt .
|
|||||||
RUN pip install --upgrade pip && pip install -r requirements.txt
|
RUN pip install --upgrade pip && pip install -r requirements.txt
|
||||||
|
|
||||||
# Copy backend code
|
# Copy backend code
|
||||||
COPY main.py .
|
COPY *.py .
|
||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
|
|||||||
326
backend/format_converter.py
Normal file
326
backend/format_converter.py
Normal file
@@ -0,0 +1,326 @@
|
|||||||
|
"""
|
||||||
|
Document Format Conversion Utilities
|
||||||
|
Handles conversion to Markdown, HTML, DOCX while preserving formatting
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from io import BytesIO
|
||||||
|
from docx import Document
|
||||||
|
from docx.shared import Pt, Inches, RGBColor
|
||||||
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
||||||
|
import markdown
|
||||||
|
import base64
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentConverter:
    """Convert per-page OCR results into Markdown, HTML, or DOCX documents.

    Every converter takes a list of page dicts. The 'text' entry of a page is
    the page body; the optional 'images' entry is a list of base64-encoded
    images, where image ``n`` stands in for the placeholder ``[IMAGE_n]``
    inside the text.
    """

    # Static document head for to_html(); the per-page <div>s follow it.
    _HTML_HEAD = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>OCR Results</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            max-width: 900px;
            margin: 40px auto;
            padding: 20px;
            line-height: 1.6;
            background-color: #f5f5f5;
        }
        .page {
            background: white;
            padding: 40px;
            margin-bottom: 30px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border-radius: 8px;
        }
        .page-header {
            color: #333;
            border-bottom: 2px solid #4CAF50;
            padding-bottom: 10px;
            margin-bottom: 20px;
        }
        table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 12px;
            text-align: left;
        }
        th {
            background-color: #4CAF50;
            color: white;
        }
        tr:nth-child(even) {
            background-color: #f9f9f9;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 15px 0;
            border-radius: 4px;
        }
        code {
            background-color: #f4f4f4;
            padding: 2px 6px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
        }
        pre {
            background-color: #f4f4f4;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
        }
    </style>
</head>
<body>
    <h1>DeepSeek OCR Results</h1>
"""

    _HTML_TAIL = """
</body>
</html>
"""

    def __init__(self):
        self.page_separator = '<--- Page Split --->'

    @staticmethod
    def _page_images(page: Dict[str, Any]) -> list:
        """Return the page's image list, treating a missing or None entry as empty."""
        return page.get('images') or []

    def to_markdown(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str:
        """
        Convert OCR results to Markdown format

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to replace "[IMAGE_n]" placeholders with
                inline Markdown images; when False the placeholders are left
                untouched

        Returns:
            Markdown formatted string
        """
        md_content = []

        for idx, page in enumerate(pages_content):
            md_content.append(f"# Page {idx + 1}\n")

            text = page.get('text', '')

            if include_images:
                for img_idx, img_data in enumerate(self._page_images(page)):
                    placeholder = f"[IMAGE_{img_idx}]"
                    # Inline the image as a data URI, mirroring to_html().
                    # NOTE(review): assumes the payload is JPEG, as to_html() does - confirm.
                    img_ref = f"![Image {img_idx + 1}](data:image/jpeg;base64,{img_data})"
                    text = text.replace(placeholder, img_ref)

            md_content.append(text)
            md_content.append("\n\n---\n\n")  # Page separator

        return "\n".join(md_content)

    def to_html(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str:
        """
        Convert OCR results to HTML format

        Text that looks like Markdown is rendered through the ``markdown``
        package; text that already contains markup ('<') is passed through
        unchanged; anything else is escaped and wrapped in a paragraph.

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to replace "[IMAGE_n]" placeholders with
                <img> tags carrying the image as a data URI

        Returns:
            HTML formatted string
        """
        from html import escape as escape_html  # stdlib; only needed here

        html_parts = [self._HTML_HEAD]

        for idx, page in enumerate(pages_content):
            html_parts.append('    <div class="page">')
            html_parts.append(f'        <h2 class="page-header">Page {idx + 1}</h2>')

            text = page.get('text', '')

            if include_images:
                for img_idx, img_data in enumerate(self._page_images(page)):
                    placeholder = f"[IMAGE_{img_idx}]"
                    img_tag = f'<img src="data:image/jpeg;base64,{img_data}" alt="Image {img_idx + 1}" />'
                    text = text.replace(placeholder, img_tag)

            if self._is_markdown(text):
                html_content = markdown.markdown(text, extensions=['tables', 'fenced_code'])
            elif '<' in text:
                # Already contains markup (model-produced HTML or the <img>
                # tags inserted above): keep it verbatim.
                html_content = text
            else:
                # Plain text: escape '&' and '>' so it cannot be misread as
                # markup, then keep the line breaks.
                escaped = escape_html(text, quote=False)
                html_content = f'<p>{escaped.replace(chr(10), "<br>")}</p>'

            html_parts.append(f'        {html_content}')
            html_parts.append('    </div>')

        html_parts.append(self._HTML_TAIL)

        return "\n".join(html_parts)

    def to_docx(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> BytesIO:
        """
        Convert OCR results to DOCX format

        Each page gets a level-1 heading, then its images (when requested),
        then its text; pages are separated by page breaks.

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to embed the page images

        Returns:
            BytesIO object containing the DOCX file, positioned at offset 0
        """
        doc = Document()

        # Set default font
        normal_font = doc.styles['Normal'].font
        normal_font.name = 'Calibri'
        normal_font.size = Pt(11)

        title = doc.add_heading('DeepSeek OCR Results', 0)
        title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

        last_index = len(pages_content) - 1
        for idx, page in enumerate(pages_content):
            page_heading = doc.add_heading(f'Page {idx + 1}', level=1)
            page_heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

            text = page.get('text', '')

            if include_images:
                for img_idx, img_data in enumerate(self._page_images(page)):
                    placeholder = f"[IMAGE_{img_idx}]"
                    try:
                        img_stream = BytesIO(base64.b64decode(img_data))
                        doc.add_picture(img_stream, width=Inches(5))
                        # Drop the placeholder only once the picture is in the
                        # document; on failure it stays visible in the text.
                        text = text.replace(placeholder, '')
                    except Exception as e:
                        # Best effort: one bad image must not abort the export.
                        print(f"Error adding image to DOCX: {e}")

            self._add_formatted_text_to_doc(doc, text)

            # Add page break (except for last page)
            if idx < last_index:
                doc.add_page_break()

        docx_buffer = BytesIO()
        doc.save(docx_buffer)
        docx_buffer.seek(0)

        return docx_buffer

    def _is_markdown(self, text: str) -> bool:
        """Check if text appears to be markdown formatted"""
        markdown_patterns = [
            r'^#+\s',           # Headers
            r'\*\*.*\*\*',      # Bold
            r'\*.*\*',          # Italic
            r'^\*\s',           # Lists
            r'^\d+\.\s',        # Numbered lists
            r'\[.*\]\(.*\)',    # Links
            r'```',             # Code blocks
        ]
        # MULTILINE so the '^' anchors match at the start of every line.
        return any(re.search(pattern, text, re.MULTILINE) for pattern in markdown_patterns)

    @staticmethod
    def _strip_code_fence(block: str) -> str:
        """Return the code inside a block that starts with a ``` fence."""
        lines = block.split('\n')
        if len(lines) == 1:
            # Single-line form such as ```code```.
            return block.strip('`').strip()
        body = lines[1:]  # drop the opening fence and any language tag on it
        if body and body[-1].strip().startswith('```'):
            body = body[:-1]
        return '\n'.join(body).strip('\n')

    def _add_formatted_text_to_doc(self, doc: "Document", text: str):
        """
        Add formatted text to document, preserving structure

        The text is split on blank lines; each chunk becomes a heading
        ('#', '##', '###'), a table (more than two '|'), a monospace code
        paragraph (leading ```), or a regular paragraph. Because of the
        blank-line split, a fenced code block that itself contains blank
        lines is only recognized up to its first blank line.

        Args:
            doc: Document object
            text: Text to add
        """
        for para in text.split('\n\n'):
            block = para.strip()
            if not block:
                continue

            heading = re.match(r'(#{1,3}) ', block)
            if heading:
                # Remove only the leading marker; '# ' occurring later in the
                # heading text (e.g. "C# is") must survive.
                doc.add_heading(block[heading.end():].strip(), level=len(heading.group(1)))
            elif block.count('|') > 2:
                # Simple table detection
                self._add_table_to_doc(doc, block)
            elif block.startswith('```'):
                p = doc.add_paragraph()
                run = p.add_run(self._strip_code_fence(block))
                run.font.name = 'Courier New'
                run.font.size = Pt(10)
            else:
                doc.add_paragraph(block)

    @staticmethod
    def _parse_table_rows(table_text: str) -> List[List[str]]:
        """Parse markdown-style table text into rows of stripped cell strings.

        Separator rows (only pipes, dashes, colons, whitespace) are skipped.
        Empty cells inside a row are kept so columns stay aligned.
        """
        parsed = []
        for line in table_text.split('\n'):
            line = line.strip()
            if not line or re.match(r'^[\|\s\-:]+$', line):
                continue
            # Only the outer border pipes are decoration; inner empty cells
            # are real cells.
            if line.startswith('|'):
                line = line[1:]
            if line.endswith('|'):
                line = line[:-1]
            cells = [cell.strip() for cell in line.split('|')]
            if any(cells):
                parsed.append(cells)
        return parsed

    def _add_table_to_doc(self, doc: "Document", table_text: str):
        """
        Add a table to the document from markdown-style table text

        The first data row is treated as the header and rendered bold.
        Nothing is added when the text contains no data rows.

        Args:
            doc: Document object
            table_text: Table in markdown format
        """
        table_data = self._parse_table_rows(table_text)
        if not table_data:
            return

        max_cols = max(len(row) for row in table_data)
        table = doc.add_table(rows=len(table_data), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(table_data):
            row_cells = table.rows[i].cells  # fetch once per row
            for j, cell_text in enumerate(row_data):
                cell = row_cells[j]
                cell.text = cell_text

                # Make header row bold
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.font.bold = True
|
||||||
218
backend/main.py
218
backend/main.py
@@ -2,18 +2,29 @@ import os
|
|||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
import shutil
|
import shutil
|
||||||
|
import base64
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
from fastapi import FastAPI, File, UploadFile, Form, HTTPException
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
import torch
|
import torch
|
||||||
from transformers import AutoModel, AutoTokenizer
|
from transformers import AutoModel, AutoTokenizer
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from decouple import config as env_config
|
from decouple import config as env_config
|
||||||
|
|
||||||
|
# Import PDF and document conversion utilities
|
||||||
|
from pdf_utils import (
|
||||||
|
pdf_to_images_high_quality,
|
||||||
|
images_to_pdf,
|
||||||
|
extract_ref_patterns,
|
||||||
|
crop_images_from_refs,
|
||||||
|
clean_markdown_content
|
||||||
|
)
|
||||||
|
from format_converter import DocumentConverter
|
||||||
|
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
# Lifespan context for model loading
|
# Lifespan context for model loading
|
||||||
# -----------------------------
|
# -----------------------------
|
||||||
@@ -75,11 +86,14 @@ app = FastAPI(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# CORS middleware for React frontend
|
# CORS middleware for React frontend
|
||||||
|
CORS_ORIGINS = env_config("CORS_ORIGINS", default="").split(",")
|
||||||
|
CORS_ORIGINS = [o.strip() for o in CORS_ORIGINS if o.strip()]
|
||||||
|
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
allow_origins=["*"],
|
allow_origins=CORS_ORIGINS if CORS_ORIGINS else ["http://localhost:3000"],
|
||||||
allow_credentials=True,
|
allow_credentials=True,
|
||||||
allow_methods=["*"],
|
allow_methods=["GET", "POST"],
|
||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -102,7 +116,7 @@ def build_prompt(
|
|||||||
|
|
||||||
instruction = ""
|
instruction = ""
|
||||||
if mode == "plain_ocr":
|
if mode == "plain_ocr":
|
||||||
instruction = "Free OCR. Only output the raw text."
|
instruction = "Free OCR."
|
||||||
elif mode == "markdown":
|
elif mode == "markdown":
|
||||||
instruction = "Convert the document to markdown."
|
instruction = "Convert the document to markdown."
|
||||||
elif mode == "tables_csv":
|
elif mode == "tables_csv":
|
||||||
@@ -362,7 +376,8 @@ async def ocr_inference(
|
|||||||
})
|
})
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise HTTPException(status_code=500, detail=f"{type(e).__name__}: {str(e)}")
|
print(f"OCR inference error: {type(e).__name__}: {str(e)}")
|
||||||
|
raise HTTPException(status_code=500, detail="An internal error occurred during OCR processing.")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
if tmp_img:
|
if tmp_img:
|
||||||
@@ -373,6 +388,199 @@ async def ocr_inference(
|
|||||||
if out_dir:
|
if out_dir:
|
||||||
shutil.rmtree(out_dir, ignore_errors=True)
|
shutil.rmtree(out_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
@app.post("/api/process-pdf")
async def process_pdf(
    pdf_file: UploadFile = File(...),
    mode: str = Form("plain_ocr"),
    prompt: str = Form(""),
    output_format: str = Form("markdown"),  # markdown, html, docx, json
    grounding: bool = Form(False),
    include_caption: bool = Form(False),
    extract_images: bool = Form(True),
    dpi: int = Form(144),
    base_size: int = Form(1024),
    image_size: int = Form(640),
    crop_mode: bool = Form(True),
):
    """
    Process PDF document with OCR and convert to various formats

    - **pdf_file**: PDF file to process
    - **mode**: OCR mode (plain_ocr, markdown, tables_csv, etc.)
    - **prompt**: Custom prompt for freeform mode
    - **output_format**: Output format (markdown, html, docx, json)
    - **grounding**: Enable grounding boxes
    - **include_caption**: Add image descriptions
    - **extract_images**: Extract images from PDF
    - **dpi**: PDF rendering resolution (default: 144, allowed: 1-1200)
    - **base_size**: Base processing size
    - **image_size**: Image size parameter
    - **crop_mode**: Enable crop mode

    Responds with a JSON document when `output_format` is `json`, otherwise
    with a file download (markdown, HTML or DOCX). Returns 503 while the model
    is not loaded, 400 for an invalid `output_format` or `dpi`, and 500 for
    any failure during processing.
    """
    if model is None or tokenizer is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet")

    # Validate output format
    if output_format not in ["markdown", "html", "docx", "json"]:
        raise HTTPException(status_code=400, detail="Invalid output format. Must be: markdown, html, docx, or json")

    # dpi comes straight from the form; an unbounded value lets a client
    # request arbitrarily large page renders, and zero/negative values only
    # fail later inside the renderer as an opaque 500.
    if not 1 <= dpi <= 1200:
        raise HTTPException(status_code=400, detail="Invalid dpi. Must be between 1 and 1200")

    try:
        # Read PDF file
        pdf_bytes = await pdf_file.read()

        # Convert PDF to images
        print(f"📄 Converting PDF to images (DPI: {dpi})...")
        images = pdf_to_images_high_quality(pdf_bytes, dpi=dpi)
        total_pages = len(images)
        print(f"✅ Converted {total_pages} pages")

        # Process each page
        pages_content = []
        converter = DocumentConverter()

        for page_idx, img in enumerate(images):
            print(f"🔍 Processing page {page_idx + 1}/{total_pages}...")

            # Build prompt for this page
            prompt_text = build_prompt(
                mode=mode,
                user_prompt=prompt,
                grounding=grounding,
                find_term=None,
                schema=None,
                include_caption=include_caption,
            )

            # Save image temporarily; both temp artifacts are removed in the
            # finally block below even when inference fails.
            tmp_img = None
            out_dir = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
                    img.save(tmp, format="PNG")
                    tmp_img = tmp.name

                orig_w, orig_h = img.size
                out_dir = tempfile.mkdtemp(prefix="dsocr_pdf_")

                # Run inference
                res = model.infer(
                    tokenizer,
                    prompt=prompt_text,
                    image_file=tmp_img,
                    output_path=out_dir,
                    base_size=base_size,
                    image_size=image_size,
                    crop_mode=crop_mode,
                    save_results=False,
                    test_compress=False,
                    eval_mode=True,
                )

                # Normalize response
                if isinstance(res, str):
                    text = res.strip()
                elif isinstance(res, dict) and "text" in res:
                    text = str(res["text"]).strip()
                elif isinstance(res, (list, tuple)):
                    text = "\n".join(map(str, res)).strip()
                else:
                    text = ""

                # Fall back to the result file, then to a placeholder message
                if not text:
                    mmd = os.path.join(out_dir, "result.mmd")
                    if os.path.exists(mmd):
                        with open(mmd, "r", encoding="utf-8") as fh:
                            text = fh.read().strip()
                if not text:
                    text = f"No text returned for page {page_idx + 1}."

                # Parse grounding boxes from the untouched model output. This
                # must happen before image extraction: cleaning strips the
                # ref/det tags the parser looks for, which previously left
                # `boxes` empty whenever extract_images was enabled.
                boxes = parse_detections(text, orig_w, orig_h) if ("<|det|>" in text or "<|ref|>" in text) else []

                # Extract images if requested
                page_images = []
                if extract_images:
                    matches, matches_image, matches_other = extract_ref_patterns(text)
                    if matches_image:
                        for cropped_img in crop_images_from_refs(img, matches):
                            # Convert to base64 via a temp JPEG. The handle is
                            # closed immediately and the file is removed even
                            # if encoding fails.
                            with tempfile.NamedTemporaryFile(delete=False, suffix=".jpg") as img_buffer:
                                jpg_path = img_buffer.name
                            try:
                                cropped_img.save(jpg_path, format="JPEG", quality=95)
                                with open(jpg_path, "rb") as f:
                                    img_b64 = base64.b64encode(f.read()).decode('utf-8')
                            finally:
                                os.remove(jpg_path)
                            page_images.append(img_b64)

                    # Clean the text and add image placeholders
                    text = clean_markdown_content(text, matches_image, matches_other)
                    for img_idx in range(len(page_images)):
                        text = f"[IMAGE_{img_idx}]\n" + text

                # Clean grounding tags from display text
                display_text = clean_grounding_text(text) if ("<|ref|>" in text or "<|grounding|>" in text) else text

                pages_content.append({
                    'page_number': page_idx + 1,
                    'text': display_text,
                    'raw_text': text,
                    'boxes': boxes,
                    'images': page_images,
                    'image_dims': {'w': orig_w, 'h': orig_h}
                })

            finally:
                if tmp_img:
                    try:
                        os.remove(tmp_img)
                    except Exception:
                        # Best-effort cleanup; a leftover temp file must not
                        # fail the request.
                        pass
                if out_dir:
                    shutil.rmtree(out_dir, ignore_errors=True)

        print(f"✅ Processed all {total_pages} pages")

        # Convert to requested format
        if output_format == "json":
            return JSONResponse({
                "success": True,
                "total_pages": total_pages,
                "pages": pages_content,
                "metadata": {
                    "mode": mode,
                    "grounding": grounding,
                    "extract_images": extract_images,
                    "dpi": dpi
                }
            })
        elif output_format == "markdown":
            md_content = converter.to_markdown(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([md_content.encode('utf-8')]),
                media_type="text/markdown",
                headers={"Content-Disposition": "attachment; filename=ocr_result.md"}
            )
        elif output_format == "html":
            html_content = converter.to_html(pages_content, include_images=extract_images)
            return StreamingResponse(
                iter([html_content.encode('utf-8')]),
                media_type="text/html",
                headers={"Content-Disposition": "attachment; filename=ocr_result.html"}
            )
        elif output_format == "docx":
            docx_buffer = converter.to_docx(pages_content, include_images=extract_images)
            return StreamingResponse(
                docx_buffer,
                media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                headers={"Content-Disposition": "attachment; filename=ocr_result.docx"}
            )

    except Exception as e:
        # Log the details server-side only; the client gets a generic message
        # so internal paths and exception text are not leaked.
        import traceback
        print(f"Error processing PDF: {e}")
        print(traceback.format_exc())
        raise HTTPException(status_code=500, detail="An internal error occurred during PDF processing.")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
host = env_config("API_HOST", default="0.0.0.0")
|
host = env_config("API_HOST", default="0.0.0.0")
|
||||||
port = env_config("API_PORT", default=8000, cast=int)
|
port = env_config("API_PORT", default=8000, cast=int)
|
||||||
|
|||||||
215
backend/pdf_utils.py
Normal file
215
backend/pdf_utils.py
Normal file
@@ -0,0 +1,215 @@
|
|||||||
|
"""
|
||||||
|
PDF Processing Utilities for DeepSeek OCR
|
||||||
|
Handles PDF to image conversion and batch processing
|
||||||
|
"""
|
||||||
|
|
||||||
|
import ast
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
from typing import List, Tuple, Dict, Any
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
import img2pdf
|
||||||
|
from PIL import Image
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def pdf_to_images_high_quality(pdf_bytes: bytes, dpi: int = 144) -> List[Image.Image]:
    """
    Convert PDF pages to high-quality PIL images

    Args:
        pdf_bytes: PDF file as bytes
        dpi: Resolution for rendering (default: 144)

    Returns:
        List of RGB PIL Image objects, one per page

    Raises:
        Whatever PyMuPDF or Pillow raise for unreadable input; the document
        handle is closed before the exception propagates.
    """
    images = []

    # Allow reasonably large images (200 megapixels) but not decompression bombs.
    # This is a process-wide Pillow setting, so it is set once rather than per page.
    Image.MAX_IMAGE_PIXELS = 200_000_000

    # Calculate zoom factor from DPI (PDF user space is 72 units per inch)
    zoom = dpi / 72.0
    matrix = fitz.Matrix(zoom, zoom)

    # Open PDF from bytes
    pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        # Process each page
        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]

            # Render page to pixmap
            pixmap = page.get_pixmap(matrix=matrix, alpha=False)

            # Convert to PIL Image
            img_data = pixmap.tobytes("png")
            img = Image.open(io.BytesIO(img_data))

            # Ensure RGB mode
            if img.mode in ('RGBA', 'LA'):
                # Flatten onto white, using the alpha band (always the last
                # band for both RGBA and LA) as the paste mask.
                background = Image.new('RGB', img.size, (255, 255, 255))
                background.paste(img, mask=img.split()[-1])
                img = background
            elif img.mode != 'RGB':
                img = img.convert('RGB')

            images.append(img)
    finally:
        # Release the document even when rendering a page fails.
        pdf_document.close()

    return images
|
||||||
|
|
||||||
|
|
||||||
|
def images_to_pdf(pil_images: List[Image.Image]) -> bytes:
    """
    Build a PDF from a sequence of PIL images.

    Args:
        pil_images: List of PIL Image objects, one per output page

    Returns:
        PDF file as bytes; empty bytes when no images are given
    """
    if not pil_images:
        return b''

    def _as_jpeg(picture):
        # JPEG has no alpha/palette support here, so normalise to RGB first.
        rgb = picture if picture.mode == 'RGB' else picture.convert('RGB')
        sink = io.BytesIO()
        rgb.save(sink, format='JPEG', quality=95)
        return sink.getvalue()

    # Encode every page, then hand the JPEG payloads to img2pdf in one call.
    return img2pdf.convert([_as_jpeg(picture) for picture in pil_images])
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ref_patterns(text: str) -> Tuple[List[Tuple], List[str], List[str]]:
    """
    Find every ref/det tag pair in OCR output and split them by kind.

    Args:
        text: OCR output text with reference tags

    Returns:
        Tuple of (all_matches, image_matches, other_matches), where
        all_matches holds (full_tag, label, coordinates) tuples and the other
        two lists hold only the full tag strings.
    """
    tag_pair = re.compile(r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)', re.DOTALL)
    found = tag_pair.findall(text)

    image_marker = '<|ref|>image<|/ref|>'
    full_tags = [hit[0] for hit in found]

    # Partition the full tag strings, keeping document order in each list.
    image_tags = [tag for tag in full_tags if image_marker in tag]
    other_tags = [tag for tag in full_tags if image_marker not in tag]

    return found, image_tags, other_tags
|
||||||
|
|
||||||
|
|
||||||
|
def parse_coordinates(ref_text: Tuple, image_width: int, image_height: int) -> Dict[str, Any]:
    """
    Turn one reference match into pixel-space bounding boxes.

    Args:
        ref_text: Tuple of (full_match, label, coordinates)
        image_width: Image width in pixels
        image_height: Image height in pixels

    Returns:
        Dictionary with 'label' and 'boxes' (coordinates scaled from the
        0-999 grid to pixels), or None when the coordinates cannot be parsed.
    """
    try:
        label = ref_text[1]
        # literal_eval only accepts Python literals, so model output is never executed.
        raw_boxes = ast.literal_eval(ref_text[2])

        pixel_boxes = [
            [
                int(left / 999 * image_width),
                int(top / 999 * image_height),
                int(right / 999 * image_width),
                int(bottom / 999 * image_height),
            ]
            for left, top, right, bottom in raw_boxes
        ]

        return {'label': label, 'boxes': pixel_boxes}
    except Exception as e:
        print(f"Error parsing coordinates: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def crop_images_from_refs(image: Image.Image, refs: List[Tuple]) -> List[Image.Image]:
    """
    Crop images based on reference bounding boxes

    Only references labelled 'image' are cropped. Boxes are clamped to the
    image bounds, and boxes with no area after clamping are skipped: an empty
    crop cannot be encoded as JPEG by callers and would otherwise abort the
    whole page.

    Args:
        image: Source PIL Image
        refs: List of reference tuples of (full_match, label, coordinates)

    Returns:
        List of cropped PIL Images (possibly empty)
    """
    cropped_images = []
    image_width, image_height = image.size

    for ref in refs:
        coord_data = parse_coordinates(ref, image_width, image_height)
        if not coord_data or coord_data['label'] != 'image':
            continue

        for x1, y1, x2, y2 in coord_data['boxes']:
            # Clamp to the image so out-of-range model output is not padded
            # with black pixels.
            left = max(0, min(x1, image_width))
            top = max(0, min(y1, image_height))
            right = max(0, min(x2, image_width))
            bottom = max(0, min(y2, image_height))

            if right <= left or bottom <= top:
                print(f"Skipping degenerate crop box: {(x1, y1, x2, y2)}")
                continue

            try:
                cropped_images.append(image.crop((left, top, right, bottom)))
            except Exception as e:
                # Best effort: one bad box must not discard the other crops.
                print(f"Error cropping image: {e}")
                continue

    return cropped_images
|
||||||
|
|
||||||
|
|
||||||
|
def clean_markdown_content(content: str, image_refs: List[str], other_refs: List[str]) -> str:
    """
    Clean markdown content by removing reference tags

    Args:
        content: Raw OCR output with tags
        image_refs: List of image reference tags
        other_refs: List of other reference tags

    Returns:
        Cleaned markdown content with the given tags removed, two LaTeX
        macros replaced by plain-text equivalents, and every run of three or
        more newlines collapsed to a single blank line.
    """
    cleaned = content

    # Remove image reference tags first, then the remaining reference tags.
    for ref in (*image_refs, *other_refs):
        cleaned = cleaned.replace(ref, '')

    # Replace LaTeX macros that markdown renderers commonly lack.
    cleaned = cleaned.replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')

    # Collapse blank-line runs of any length. A fixed chain of replaces leaves
    # extra blank lines behind for runs of six or more newlines.
    return re.sub(r'\n{3,}', '\n\n', cleaned)
|
||||||
@@ -11,3 +11,7 @@ pillow
|
|||||||
safetensors
|
safetensors
|
||||||
torch
|
torch
|
||||||
python-decouple>=3.8
|
python-decouple>=3.8
|
||||||
|
PyMuPDF>=1.23.0
|
||||||
|
img2pdf>=0.5.0
|
||||||
|
python-docx>=1.1.0
|
||||||
|
markdown>=3.5.0
|
||||||
|
|||||||
150
backend/test_security.py
Normal file
150
backend/test_security.py
Normal file
@@ -0,0 +1,150 @@
|
|||||||
|
"""
|
||||||
|
Security regression tests for the eval() RCE vulnerability (OX Security disclosure).
|
||||||
|
|
||||||
|
The vulnerability allowed arbitrary code execution via crafted OCR output
|
||||||
|
that was passed to eval() in parse_coordinates(). The fix uses ast.literal_eval()
|
||||||
|
which only allows literal data structures.
|
||||||
|
|
||||||
|
This test is self-contained and does not require backend dependencies.
|
||||||
|
|
||||||
|
Run: python test_security.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import ast
|
||||||
|
|
||||||
|
|
||||||
|
def parse_coordinates(ref_text, image_width, image_height):
    """
    Stand-alone copy of the patched pdf_utils.parse_coordinates.

    Parses the coordinate string with ast.literal_eval() (never eval()), scales
    each box from the 0-999 grid to pixels, and returns None for anything that
    is not a literal list of 4-number boxes.
    """
    def _to_pixels(box):
        x1, y1, x2, y2 = box
        return [
            int(x1 / 999 * image_width),
            int(y1 / 999 * image_height),
            int(x2 / 999 * image_width),
            int(y2 / 999 * image_height),
        ]

    try:
        label = ref_text[1]
        literal_boxes = ast.literal_eval(ref_text[2])
        return {'label': label, 'boxes': [_to_pixels(box) for box in literal_boxes]}
    except Exception as e:
        print(f" [Blocked] {type(e).__name__}: {e}")
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def test_legitimate_coordinates():
    """A well-formed single-box reference must still be parsed."""
    parsed = parse_coordinates(("full_match", "text", "[[312, 339, 480, 681]]"), 1000, 1000)

    assert parsed is not None, "Legitimate coordinates should parse successfully"
    assert parsed['label'] == 'text'
    assert len(parsed['boxes']) == 1
    print("PASS: Legitimate coordinates parse correctly")
|
||||||
|
|
||||||
|
|
||||||
|
def test_multiple_boxes():
    """Two boxes in one reference must both survive parsing."""
    two_boxes = "[[100, 200, 300, 400], [500, 600, 700, 800]]"
    parsed = parse_coordinates(("full_match", "image", two_boxes), 1000, 1000)

    assert parsed is not None, "Multiple boxes should parse successfully"
    assert len(parsed['boxes']) == 2
    print("PASS: Multiple bounding boxes parse correctly")
|
||||||
|
|
||||||
|
|
||||||
|
def test_rce_blocked_import_os():
    """Regression for the reported exploit: an __import__('os').system(...) payload."""
    payload = "__import__('os').system('echo HACKED')"
    outcome = parse_coordinates(("full_match", "exploit", payload), 1000, 1000)

    assert outcome is None, "Code execution payload should be rejected"
    print("PASS: __import__('os').system() payload is blocked")
|
||||||
|
|
||||||
|
|
||||||
|
def test_rce_blocked_exec():
    """A payload wrapped in exec() must be rejected, not run."""
    payload = "exec('import os; os.system(\"echo HACKED\")')"
    outcome = parse_coordinates(("full_match", "exploit", payload), 1000, 1000)

    assert outcome is None, "exec() payload should be rejected"
    print("PASS: exec() payload is blocked")
|
||||||
|
|
||||||
|
|
||||||
|
def test_rce_blocked_eval():
    """A payload that itself calls eval() must be rejected."""
    payload = "eval('__import__(\"os\").popen(\"id\").read()')"
    outcome = parse_coordinates(("full_match", "exploit", payload), 1000, 1000)

    assert outcome is None, "Nested eval() payload should be rejected"
    print("PASS: Nested eval() payload is blocked")
|
||||||
|
|
||||||
|
|
||||||
|
def test_rce_blocked_lambda():
    """An immediately-invoked lambda payload must be rejected."""
    payload = "(lambda: __import__('os').system('echo HACKED'))()"
    outcome = parse_coordinates(("full_match", "exploit", payload), 1000, 1000)

    assert outcome is None, "Lambda payload should be rejected"
    print("PASS: Lambda payload is blocked")
|
||||||
|
|
||||||
|
|
||||||
|
def test_rce_blocked_comprehension():
    """A list comprehension that hides a call must be rejected."""
    payload = "[__import__('os').system('echo HACKED') for x in [1]]"
    outcome = parse_coordinates(("full_match", "exploit", payload), 1000, 1000)

    assert outcome is None, "List comprehension payload should be rejected"
    print("PASS: List comprehension payload is blocked")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    import sys

    print("=" * 60)
    print("Security Regression Tests (OX Security RCE disclosure)")
    print("=" * 60)
    print()

    tests = [
        test_legitimate_coordinates,
        test_multiple_boxes,
        test_rce_blocked_import_os,
        test_rce_blocked_exec,
        test_rce_blocked_eval,
        test_rce_blocked_lambda,
        test_rce_blocked_comprehension,
    ]

    # Run every test even after a failure so the summary covers all of them.
    passed = 0
    failed = 0
    for test in tests:
        try:
            test()
            passed += 1
        except AssertionError as e:
            print(f"FAIL: {test.__name__}: {e}")
            failed += 1
        except Exception as e:
            print(f"ERROR: {test.__name__}: {e}")
            failed += 1

    print()
    print(f"Results: {passed} passed, {failed} failed out of {len(tests)} tests")
    if failed == 0:
        print("All security tests passed - RCE vulnerability is patched.")
    else:
        print("WARNING: Some tests failed!")
        # Exit non-zero so CI and shell callers can detect the regression.
        sys.exit(1)
|
||||||
@@ -10,6 +10,7 @@
|
|||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"axios": "^1.6.5",
|
"axios": "^1.6.5",
|
||||||
|
"dompurify": "^3.3.3",
|
||||||
"framer-motion": "^11.0.0",
|
"framer-motion": "^11.0.0",
|
||||||
"lucide-react": "^0.344.0",
|
"lucide-react": "^0.344.0",
|
||||||
"react": "^18.3.1",
|
"react": "^18.3.1",
|
||||||
|
|||||||
@@ -1,20 +1,25 @@
|
|||||||
import { useState, useCallback } from 'react'
|
import { useState, useCallback } from 'react'
|
||||||
import { motion, AnimatePresence } from 'framer-motion'
|
import { motion, AnimatePresence } from 'framer-motion'
|
||||||
import { Sparkles, Zap, Loader2 } from 'lucide-react'
|
import { Sparkles, Zap, Loader2, Settings, Image as ImageIcon, FileText } from 'lucide-react'
|
||||||
import ImageUpload from './components/ImageUpload'
|
import ImageUpload from './components/ImageUpload'
|
||||||
import ModeSelector from './components/ModeSelector'
|
import ModeSelector from './components/ModeSelector'
|
||||||
import ResultPanel from './components/ResultPanel'
|
import ResultPanel from './components/ResultPanel'
|
||||||
|
import AdvancedSettings from './components/AdvancedSettings'
|
||||||
|
import PDFProcessor from './components/PDFProcessor'
|
||||||
import axios from 'axios'
|
import axios from 'axios'
|
||||||
|
|
||||||
const API_BASE = import.meta.env.VITE_API_URL || '/api'
|
const API_BASE = import.meta.env.VITE_API_URL || '/api'
|
||||||
|
|
||||||
function App() {
|
function App() {
|
||||||
const [mode, setMode] = useState('plain_ocr')
|
const [mode, setMode] = useState('plain_ocr')
|
||||||
|
const [fileType, setFileType] = useState('image') // 'image' or 'pdf'
|
||||||
const [image, setImage] = useState(null)
|
const [image, setImage] = useState(null)
|
||||||
const [imagePreview, setImagePreview] = useState(null)
|
const [imagePreview, setImagePreview] = useState(null)
|
||||||
const [result, setResult] = useState(null)
|
const [result, setResult] = useState(null)
|
||||||
const [loading, setLoading] = useState(false)
|
const [loading, setLoading] = useState(false)
|
||||||
const [error, setError] = useState(null)
|
const [error, setError] = useState(null)
|
||||||
|
const [showAdvanced, setShowAdvanced] = useState(false)
|
||||||
|
const [includeCaption, setIncludeCaption] = useState(false)
|
||||||
|
|
||||||
// Form state
|
// Form state
|
||||||
const [prompt, setPrompt] = useState('')
|
const [prompt, setPrompt] = useState('')
|
||||||
@@ -26,11 +31,23 @@ function App() {
|
|||||||
test_compress: false
|
test_compress: false
|
||||||
})
|
})
|
||||||
|
|
||||||
|
// Switches between image and PDF mode, discarding the currently selected file
// together with any result or error that belonged to it.
const handleFileTypeChange = useCallback((newType) => {
  // Clear current file when switching types
  setImage(null)
  // In PDF mode the preview holds the File itself, not a blob URL, so only
  // revoke when the preview really is a URL string.
  if (typeof imagePreview === 'string') {
    URL.revokeObjectURL(imagePreview)
  }
  setImagePreview(null)
  setError(null)
  setResult(null)
  setFileType(newType)
}, [imagePreview])
|
||||||
|
|
||||||
const handleImageSelect = useCallback((file) => {
|
const handleImageSelect = useCallback((file) => {
|
||||||
if (file === null) {
|
if (file === null) {
|
||||||
// Clear everything when removing image
|
// Clear everything when removing image
|
||||||
setImage(null)
|
setImage(null)
|
||||||
if (imagePreview) {
|
if (imagePreview && fileType === 'image') {
|
||||||
URL.revokeObjectURL(imagePreview)
|
URL.revokeObjectURL(imagePreview)
|
||||||
}
|
}
|
||||||
setImagePreview(null)
|
setImagePreview(null)
|
||||||
@@ -38,11 +55,16 @@ function App() {
|
|||||||
setResult(null)
|
setResult(null)
|
||||||
} else {
|
} else {
|
||||||
setImage(file)
|
setImage(file)
|
||||||
setImagePreview(URL.createObjectURL(file))
|
// Only create preview URL for images, not PDFs
|
||||||
|
if (fileType === 'image') {
|
||||||
|
setImagePreview(URL.createObjectURL(file))
|
||||||
|
} else {
|
||||||
|
setImagePreview(file) // Just store the file for PDFs
|
||||||
|
}
|
||||||
setError(null)
|
setError(null)
|
||||||
setResult(null)
|
setResult(null)
|
||||||
}
|
}
|
||||||
}, [imagePreview])
|
}, [imagePreview, fileType])
|
||||||
|
|
||||||
const handleSubmit = async () => {
|
const handleSubmit = async () => {
|
||||||
if (!image) {
|
if (!image) {
|
||||||
@@ -60,7 +82,7 @@ function App() {
|
|||||||
formData.append('prompt', prompt)
|
formData.append('prompt', prompt)
|
||||||
// Enable grounding only for find mode
|
// Enable grounding only for find mode
|
||||||
formData.append('grounding', mode === 'find_ref')
|
formData.append('grounding', mode === 'find_ref')
|
||||||
formData.append('include_caption', false)
|
formData.append('include_caption', includeCaption)
|
||||||
formData.append('find_term', findTerm)
|
formData.append('find_term', findTerm)
|
||||||
formData.append('schema', '')
|
formData.append('schema', '')
|
||||||
formData.append('base_size', advancedSettings.base_size)
|
formData.append('base_size', advancedSettings.base_size)
|
||||||
@@ -174,9 +196,41 @@ function App() {
|
|||||||
transition={{ delay: 0.1 }}
|
transition={{ delay: 0.1 }}
|
||||||
className="space-y-6"
|
className="space-y-6"
|
||||||
>
|
>
|
||||||
|
{/* File Type Toggle */}
|
||||||
|
<div className="glass p-4 rounded-2xl">
|
||||||
|
<div className="grid grid-cols-2 gap-2">
|
||||||
|
<motion.button
|
||||||
|
onClick={() => handleFileTypeChange('image')}
|
||||||
|
className={`p-3 rounded-xl text-sm font-medium transition-all flex items-center justify-center gap-2 ${
|
||||||
|
fileType === 'image'
|
||||||
|
? 'bg-gradient-to-r from-purple-600 to-cyan-600 text-white'
|
||||||
|
: 'glass text-gray-400 hover:bg-white/5'
|
||||||
|
}`}
|
||||||
|
whileHover={{ scale: 1.02 }}
|
||||||
|
whileTap={{ scale: 0.98 }}
|
||||||
|
>
|
||||||
|
<ImageIcon className="w-4 h-4" />
|
||||||
|
Image OCR
|
||||||
|
</motion.button>
|
||||||
|
<motion.button
|
||||||
|
onClick={() => handleFileTypeChange('pdf')}
|
||||||
|
className={`p-3 rounded-xl text-sm font-medium transition-all flex items-center justify-center gap-2 ${
|
||||||
|
fileType === 'pdf'
|
||||||
|
? 'bg-gradient-to-r from-purple-600 to-cyan-600 text-white'
|
||||||
|
: 'glass text-gray-400 hover:bg-white/5'
|
||||||
|
}`}
|
||||||
|
whileHover={{ scale: 1.02 }}
|
||||||
|
whileTap={{ scale: 0.98 }}
|
||||||
|
>
|
||||||
|
<FileText className="w-4 h-4" />
|
||||||
|
PDF Processing
|
||||||
|
</motion.button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
{/* Mode Selector with integrated inputs */}
|
{/* Mode Selector with integrated inputs */}
|
||||||
<ModeSelector
|
<ModeSelector
|
||||||
mode={mode}
|
mode={mode}
|
||||||
onModeChange={setMode}
|
onModeChange={setMode}
|
||||||
prompt={prompt}
|
prompt={prompt}
|
||||||
onPromptChange={setPrompt}
|
onPromptChange={setPrompt}
|
||||||
@@ -184,46 +238,92 @@ function App() {
|
|||||||
onFindTermChange={setFindTerm}
|
onFindTermChange={setFindTerm}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
{/* Image Upload */}
|
{/* Image/PDF Upload */}
|
||||||
<ImageUpload
|
<ImageUpload
|
||||||
onImageSelect={handleImageSelect}
|
onImageSelect={handleImageSelect}
|
||||||
preview={imagePreview}
|
preview={imagePreview}
|
||||||
|
fileType={fileType}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
{/* Action Button */}
|
{/* Advanced Settings Toggle */}
|
||||||
<motion.button
|
<motion.button
|
||||||
onClick={handleSubmit}
|
onClick={() => setShowAdvanced(!showAdvanced)}
|
||||||
disabled={!image || loading}
|
className="w-full glass px-4 py-3 rounded-2xl flex items-center justify-between hover:bg-white/5 transition-colors"
|
||||||
className={`w-full relative overflow-hidden rounded-2xl p-[2px] ${
|
whileHover={{ scale: 1.01 }}
|
||||||
!image || loading ? 'opacity-50 cursor-not-allowed' : ''
|
whileTap={{ scale: 0.99 }}
|
||||||
}`}
|
|
||||||
whileHover={!loading && image ? { scale: 1.02 } : {}}
|
|
||||||
whileTap={!loading && image ? { scale: 0.98 } : {}}
|
|
||||||
>
|
>
|
||||||
<div className="absolute inset-0 bg-gradient-to-r from-purple-600 via-pink-600 to-cyan-600 animate-gradient" />
|
<div className="flex items-center gap-2">
|
||||||
<div className="relative bg-dark-100 px-8 py-4 rounded-2xl flex items-center justify-center gap-3">
|
<Settings className="w-4 h-4 text-purple-400" />
|
||||||
{loading ? (
|
<span className="text-sm font-medium text-gray-300">Advanced Settings</span>
|
||||||
<>
|
|
||||||
<Loader2 className="w-5 h-5 animate-spin" />
|
|
||||||
<span className="font-semibold">Processing Magic...</span>
|
|
||||||
</>
|
|
||||||
) : (
|
|
||||||
<>
|
|
||||||
<Zap className="w-5 h-5" />
|
|
||||||
<span className="font-semibold">Analyze Image</span>
|
|
||||||
</>
|
|
||||||
)}
|
|
||||||
</div>
|
</div>
|
||||||
|
<motion.div
|
||||||
|
animate={{ rotate: showAdvanced ? 180 : 0 }}
|
||||||
|
transition={{ duration: 0.3 }}
|
||||||
|
>
|
||||||
|
<svg className="w-4 h-4 text-gray-400" fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||||
|
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M19 9l-7 7-7-7" />
|
||||||
|
</svg>
|
||||||
|
</motion.div>
|
||||||
</motion.button>
|
</motion.button>
|
||||||
|
|
||||||
{error && (
|
{/* Advanced Settings Panel */}
|
||||||
<motion.div
|
<AnimatePresence>
|
||||||
initial={{ opacity: 0, y: -10 }}
|
{showAdvanced && (
|
||||||
animate={{ opacity: 1, y: 0 }}
|
<AdvancedSettings
|
||||||
className="glass p-4 rounded-2xl border-red-500/50 bg-red-500/10"
|
settings={advancedSettings}
|
||||||
>
|
onSettingsChange={setAdvancedSettings}
|
||||||
<p className="text-sm text-red-400">{error}</p>
|
includeCaption={includeCaption}
|
||||||
</motion.div>
|
onIncludeCaptionChange={setIncludeCaption}
|
||||||
|
/>
|
||||||
|
)}
|
||||||
|
</AnimatePresence>
|
||||||
|
|
||||||
|
{/* Action Button / PDF Processor */}
|
||||||
|
{fileType === 'pdf' ? (
|
||||||
|
<PDFProcessor
|
||||||
|
pdfFile={image}
|
||||||
|
mode={mode}
|
||||||
|
prompt={prompt}
|
||||||
|
advancedSettings={advancedSettings}
|
||||||
|
includeCaption={includeCaption}
|
||||||
|
/>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
<motion.button
|
||||||
|
onClick={handleSubmit}
|
||||||
|
disabled={!image || loading}
|
||||||
|
className={`w-full relative overflow-hidden rounded-2xl p-[2px] ${
|
||||||
|
!image || loading ? 'opacity-50 cursor-not-allowed' : ''
|
||||||
|
}`}
|
||||||
|
whileHover={!loading && image ? { scale: 1.02 } : {}}
|
||||||
|
whileTap={!loading && image ? { scale: 0.98 } : {}}
|
||||||
|
>
|
||||||
|
<div className="absolute inset-0 bg-gradient-to-r from-purple-600 via-pink-600 to-cyan-600 animate-gradient" />
|
||||||
|
<div className="relative bg-dark-100 px-8 py-4 rounded-2xl flex items-center justify-center gap-3">
|
||||||
|
{loading ? (
|
||||||
|
<>
|
||||||
|
<Loader2 className="w-5 h-5 animate-spin" />
|
||||||
|
<span className="font-semibold">Processing Magic...</span>
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
<Zap className="w-5 h-5" />
|
||||||
|
<span className="font-semibold">Analyze Image</span>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</motion.button>
|
||||||
|
|
||||||
|
{error && (
|
||||||
|
<motion.div
|
||||||
|
initial={{ opacity: 0, y: -10 }}
|
||||||
|
animate={{ opacity: 1, y: 0 }}
|
||||||
|
className="glass p-4 rounded-2xl border-red-500/50 bg-red-500/10"
|
||||||
|
>
|
||||||
|
<p className="text-sm text-red-400">{error}</p>
|
||||||
|
</motion.div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
)}
|
)}
|
||||||
</motion.div>
|
</motion.div>
|
||||||
|
|
||||||
@@ -246,11 +346,14 @@ function App() {
|
|||||||
|
|
||||||
{/* Footer */}
|
{/* Footer */}
|
||||||
<footer className="mt-20 border-t border-white/10 glass">
|
<footer className="mt-20 border-t border-white/10 glass">
|
||||||
<div className="max-w-7xl mx-auto px-6 py-8 text-center">
|
<div className="max-w-7xl mx-auto px-6 py-8 text-center space-y-2">
|
||||||
<p className="text-sm text-gray-400">
|
<p className="text-sm text-gray-400">
|
||||||
Powered by <span className="gradient-text font-semibold">DeepSeek-OCR</span> •
|
Powered by <span className="gradient-text font-semibold">DeepSeek-OCR</span> •
|
||||||
Built with <span className="text-pink-400">♥</span> using React + FastAPI
|
Built with <span className="text-pink-400">♥</span> using React + FastAPI
|
||||||
</p>
|
</p>
|
||||||
|
<p className="text-xs text-gray-500">
|
||||||
|
Thanks to <a href="https://github.com/p-xiexin" target="_blank" rel="noopener noreferrer" className="text-purple-400 hover:text-purple-300 transition-colors">@p-xiexin</a> for the clipboard paste idea!
|
||||||
|
</p>
|
||||||
</div>
|
</div>
|
||||||
</footer>
|
</footer>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,18 +1,54 @@
|
|||||||
import { useCallback } from 'react'
|
import { useCallback, useEffect } from 'react'
|
||||||
import { motion } from 'framer-motion'
|
import { motion } from 'framer-motion'
|
||||||
import { useDropzone } from 'react-dropzone'
|
import { useDropzone } from 'react-dropzone'
|
||||||
import { Upload, Image as ImageIcon, X } from 'lucide-react'
|
import { Upload, Image as ImageIcon, X, FileText, Clipboard } from 'lucide-react'
|
||||||
|
|
||||||
export default function ImageUpload({ onImageSelect, preview }) {
|
export default function ImageUpload({ onImageSelect, preview, fileType = 'image' }) {
|
||||||
const onDrop = useCallback((acceptedFiles) => {
|
const onDrop = useCallback((acceptedFiles) => {
|
||||||
if (acceptedFiles?.[0]) {
|
if (acceptedFiles?.[0]) {
|
||||||
onImageSelect(acceptedFiles[0])
|
onImageSelect(acceptedFiles[0])
|
||||||
}
|
}
|
||||||
}, [onImageSelect])
|
}, [onImageSelect])
|
||||||
|
|
||||||
|
const isPDF = fileType === 'pdf'
|
||||||
|
|
||||||
|
// Handle clipboard paste
|
||||||
|
useEffect(() => {
|
||||||
|
// Only enable paste for images, not PDFs
|
||||||
|
if (isPDF) return
|
||||||
|
|
||||||
|
const handlePaste = async (e) => {
|
||||||
|
const items = e.clipboardData?.items
|
||||||
|
if (!items) return
|
||||||
|
|
||||||
|
for (let i = 0; i < items.length; i++) {
|
||||||
|
const item = items[i]
|
||||||
|
|
||||||
|
if (item.type.indexOf('image') !== -1) {
|
||||||
|
e.preventDefault()
|
||||||
|
const blob = item.getAsFile()
|
||||||
|
|
||||||
|
if (blob) {
|
||||||
|
// Create a File object with a proper name
|
||||||
|
const file = new File([blob], `pasted-image-${Date.now()}.png`, {
|
||||||
|
type: blob.type,
|
||||||
|
})
|
||||||
|
onImageSelect(file)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
document.addEventListener('paste', handlePaste)
|
||||||
|
return () => document.removeEventListener('paste', handlePaste)
|
||||||
|
}, [onImageSelect, isPDF])
|
||||||
|
|
||||||
const { getRootProps, getInputProps, isDragActive } = useDropzone({
|
const { getRootProps, getInputProps, isDragActive } = useDropzone({
|
||||||
onDrop,
|
onDrop,
|
||||||
accept: {
|
accept: isPDF ? {
|
||||||
|
'application/pdf': ['.pdf']
|
||||||
|
} : {
|
||||||
'image/*': ['.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp']
|
'image/*': ['.png', '.jpg', '.jpeg', '.webp', '.gif', '.bmp']
|
||||||
},
|
},
|
||||||
multiple: false
|
multiple: false
|
||||||
@@ -21,8 +57,14 @@ export default function ImageUpload({ onImageSelect, preview }) {
|
|||||||
return (
|
return (
|
||||||
<div className="glass p-6 rounded-2xl space-y-4">
|
<div className="glass p-6 rounded-2xl space-y-4">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<h3 className="font-semibold text-gray-200">Upload Image</h3>
|
<h3 className="font-semibold text-gray-200">
|
||||||
<ImageIcon className="w-5 h-5 text-purple-400" />
|
{isPDF ? 'Upload PDF' : 'Upload Image'}
|
||||||
|
</h3>
|
||||||
|
{isPDF ? (
|
||||||
|
<FileText className="w-5 h-5 text-purple-400" />
|
||||||
|
) : (
|
||||||
|
<ImageIcon className="w-5 h-5 text-purple-400" />
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{!preview ? (
|
{!preview ? (
|
||||||
@@ -59,11 +101,25 @@ export default function ImageUpload({ onImageSelect, preview }) {
|
|||||||
|
|
||||||
<div>
|
<div>
|
||||||
<p className="text-lg font-medium text-gray-200">
|
<p className="text-lg font-medium text-gray-200">
|
||||||
{isDragActive ? 'Drop it like it\'s hot! 🔥' : 'Drag & drop your image'}
|
{isDragActive
|
||||||
|
? 'Drop it like it\'s hot! 🔥'
|
||||||
|
: isPDF
|
||||||
|
? 'Drag & drop your PDF'
|
||||||
|
: 'Drag & drop your image'
|
||||||
|
}
|
||||||
</p>
|
</p>
|
||||||
<p className="text-sm text-gray-400 mt-1">
|
<p className="text-sm text-gray-400 mt-1">
|
||||||
or click to browse • PNG, JPG, WEBP up to 10MB
|
{isPDF
|
||||||
|
? 'or click to browse • PDF files up to 100MB'
|
||||||
|
: 'or click to browse • PNG, JPG, WEBP up to 10MB'
|
||||||
|
}
|
||||||
</p>
|
</p>
|
||||||
|
{!isPDF && (
|
||||||
|
<p className="text-xs text-purple-400 mt-2 flex items-center justify-center gap-1.5">
|
||||||
|
<Clipboard className="w-3.5 h-3.5" />
|
||||||
|
<span>Press Ctrl+V to paste from clipboard</span>
|
||||||
|
</p>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</motion.div>
|
</motion.div>
|
||||||
@@ -73,11 +129,21 @@ export default function ImageUpload({ onImageSelect, preview }) {
|
|||||||
animate={{ opacity: 1, scale: 1 }}
|
animate={{ opacity: 1, scale: 1 }}
|
||||||
className="relative group rounded-2xl overflow-hidden"
|
className="relative group rounded-2xl overflow-hidden"
|
||||||
>
|
>
|
||||||
<img
|
{isPDF ? (
|
||||||
src={preview}
|
<div className="flex items-center justify-center p-12 bg-white/5 border border-white/10 rounded-2xl">
|
||||||
alt="Preview"
|
<div className="text-center">
|
||||||
className="w-full rounded-2xl border border-white/10"
|
<FileText className="w-16 h-16 mx-auto mb-3 text-purple-400" />
|
||||||
/>
|
<p className="text-sm text-gray-300 font-medium">PDF Ready</p>
|
||||||
|
<p className="text-xs text-gray-500 mt-1">{preview?.name || 'Document loaded'}</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<img
|
||||||
|
src={preview}
|
||||||
|
alt="Preview"
|
||||||
|
className="w-full rounded-2xl border border-white/10"
|
||||||
|
/>
|
||||||
|
)}
|
||||||
<div className="absolute top-3 right-3 flex gap-2">
|
<div className="absolute top-3 right-3 flex gap-2">
|
||||||
<motion.button
|
<motion.button
|
||||||
onClick={(e) => {
|
onClick={(e) => {
|
||||||
@@ -87,7 +153,7 @@ export default function ImageUpload({ onImageSelect, preview }) {
|
|||||||
className="bg-red-500/90 backdrop-blur-sm px-3 py-2 rounded-full opacity-100 hover:bg-red-600 transition-colors flex items-center gap-2 shadow-lg"
|
className="bg-red-500/90 backdrop-blur-sm px-3 py-2 rounded-full opacity-100 hover:bg-red-600 transition-colors flex items-center gap-2 shadow-lg"
|
||||||
whileHover={{ scale: 1.05 }}
|
whileHover={{ scale: 1.05 }}
|
||||||
whileTap={{ scale: 0.95 }}
|
whileTap={{ scale: 0.95 }}
|
||||||
title="Remove image"
|
title={isPDF ? "Remove PDF" : "Remove image"}
|
||||||
>
|
>
|
||||||
<X className="w-4 h-4" />
|
<X className="w-4 h-4" />
|
||||||
<span className="text-sm font-medium">Remove</span>
|
<span className="text-sm font-medium">Remove</span>
|
||||||
|
|||||||
233
frontend/src/components/PDFProcessor.jsx
Normal file
233
frontend/src/components/PDFProcessor.jsx
Normal file
@@ -0,0 +1,233 @@
|
|||||||
|
import { useState, useCallback } from 'react'
|
||||||
|
import { motion, AnimatePresence } from 'framer-motion'
|
||||||
|
import { FileText, Download, Loader2, CheckCircle2, AlertCircle } from 'lucide-react'
|
||||||
|
import axios from 'axios'
|
||||||
|
|
||||||
|
// Base URL prefixed to backend request paths; taken from the VITE_API_URL
// env value when that is set to a non-empty string, otherwise '/api'.
const API_BASE = import.meta.env.VITE_API_URL || '/api'
|
||||||
|
|
||||||
|
function PDFProcessor({ pdfFile, mode, prompt, advancedSettings, includeCaption }) {
|
||||||
|
const [processing, setProcessing] = useState(false)
|
||||||
|
const [progress, setProgress] = useState(0)
|
||||||
|
const [result, setResult] = useState(null)
|
||||||
|
const [error, setError] = useState(null)
|
||||||
|
const [outputFormat, setOutputFormat] = useState('markdown')
|
||||||
|
|
||||||
|
const formats = [
|
||||||
|
{ value: 'markdown', label: 'Markdown', ext: 'md', icon: '📝' },
|
||||||
|
{ value: 'html', label: 'HTML', ext: 'html', icon: '🌐' },
|
||||||
|
{ value: 'docx', label: 'Word', ext: 'docx', icon: '📄' },
|
||||||
|
{ value: 'json', label: 'JSON', ext: 'json', icon: '📊' }
|
||||||
|
]
|
||||||
|
|
||||||
|
const handleProcess = useCallback(async () => {
|
||||||
|
if (!pdfFile) return
|
||||||
|
|
||||||
|
setProcessing(true)
|
||||||
|
setError(null)
|
||||||
|
setProgress(0)
|
||||||
|
|
||||||
|
try {
|
||||||
|
const formData = new FormData()
|
||||||
|
formData.append('pdf_file', pdfFile)
|
||||||
|
formData.append('mode', mode)
|
||||||
|
formData.append('prompt', prompt)
|
||||||
|
formData.append('output_format', outputFormat)
|
||||||
|
formData.append('grounding', mode === 'find_ref')
|
||||||
|
formData.append('include_caption', includeCaption)
|
||||||
|
formData.append('extract_images', true)
|
||||||
|
formData.append('dpi', 144)
|
||||||
|
formData.append('base_size', advancedSettings.base_size)
|
||||||
|
formData.append('image_size', advancedSettings.image_size)
|
||||||
|
formData.append('crop_mode', advancedSettings.crop_mode)
|
||||||
|
|
||||||
|
const response = await axios.post(`${API_BASE}/process-pdf`, formData, {
|
||||||
|
headers: {
|
||||||
|
'Content-Type': 'multipart/form-data',
|
||||||
|
},
|
||||||
|
responseType: outputFormat === 'json' ? 'json' : 'blob',
|
||||||
|
onUploadProgress: (progressEvent) => {
|
||||||
|
const percentCompleted = Math.round((progressEvent.loaded * 100) / progressEvent.total)
|
||||||
|
setProgress(percentCompleted)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
|
if (outputFormat === 'json') {
|
||||||
|
setResult(response.data)
|
||||||
|
} else {
|
||||||
|
// For file downloads (markdown, html, docx)
|
||||||
|
const format = formats.find(f => f.value === outputFormat)
|
||||||
|
const blob = new Blob([response.data], {
|
||||||
|
type: response.headers['content-type']
|
||||||
|
})
|
||||||
|
const url = URL.createObjectURL(blob)
|
||||||
|
const a = document.createElement('a')
|
||||||
|
a.href = url
|
||||||
|
a.download = `ocr_result.${format.ext}`
|
||||||
|
a.click()
|
||||||
|
URL.revokeObjectURL(url)
|
||||||
|
|
||||||
|
setResult({
|
||||||
|
success: true,
|
||||||
|
message: `Document downloaded as ${format.label}`,
|
||||||
|
format: outputFormat
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
setProgress(100)
|
||||||
|
} catch (err) {
|
||||||
|
console.error('PDF processing error:', err)
|
||||||
|
setError(err.response?.data?.detail || err.message || 'Failed to process PDF')
|
||||||
|
} finally {
|
||||||
|
setProcessing(false)
|
||||||
|
}
|
||||||
|
}, [pdfFile, mode, prompt, outputFormat, includeCaption, advancedSettings])
|
||||||
|
|
||||||
|
const handleDownloadJSON = useCallback(() => {
|
||||||
|
if (!result || outputFormat !== 'json') return
|
||||||
|
|
||||||
|
const blob = new Blob([JSON.stringify(result, null, 2)], { type: 'application/json' })
|
||||||
|
const url = URL.createObjectURL(blob)
|
||||||
|
const a = document.createElement('a')
|
||||||
|
a.href = url
|
||||||
|
a.download = 'ocr_result.json'
|
||||||
|
a.click()
|
||||||
|
URL.revokeObjectURL(url)
|
||||||
|
}, [result, outputFormat])
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="space-y-4">
|
||||||
|
{/* Format Selector */}
|
||||||
|
<div className="glass p-6 rounded-2xl space-y-3">
|
||||||
|
<label className="block text-sm font-medium text-gray-300 mb-3">
|
||||||
|
Output Format
|
||||||
|
</label>
|
||||||
|
<div className="grid grid-cols-2 gap-2">
|
||||||
|
{formats.map((format) => (
|
||||||
|
<motion.button
|
||||||
|
key={format.value}
|
||||||
|
onClick={() => setOutputFormat(format.value)}
|
||||||
|
className={`p-3 rounded-xl text-sm font-medium transition-all ${
|
||||||
|
outputFormat === format.value
|
||||||
|
? 'bg-gradient-to-r from-purple-600 to-cyan-600 text-white'
|
||||||
|
: 'glass text-gray-400 hover:bg-white/5'
|
||||||
|
}`}
|
||||||
|
whileHover={{ scale: 1.02 }}
|
||||||
|
whileTap={{ scale: 0.98 }}
|
||||||
|
>
|
||||||
|
<span className="mr-2">{format.icon}</span>
|
||||||
|
{format.label}
|
||||||
|
</motion.button>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Process Button */}
|
||||||
|
<motion.button
|
||||||
|
onClick={handleProcess}
|
||||||
|
disabled={!pdfFile || processing}
|
||||||
|
className={`w-full relative overflow-hidden rounded-2xl p-[2px] ${
|
||||||
|
!pdfFile || processing ? 'opacity-50 cursor-not-allowed' : ''
|
||||||
|
}`}
|
||||||
|
whileHover={!processing && pdfFile ? { scale: 1.02 } : {}}
|
||||||
|
whileTap={!processing && pdfFile ? { scale: 0.98 } : {}}
|
||||||
|
>
|
||||||
|
<div className="absolute inset-0 bg-gradient-to-r from-purple-600 via-pink-600 to-cyan-600 animate-gradient" />
|
||||||
|
<div className="relative bg-dark-100 px-8 py-4 rounded-2xl flex items-center justify-center gap-3">
|
||||||
|
{processing ? (
|
||||||
|
<>
|
||||||
|
<Loader2 className="w-5 h-5 animate-spin" />
|
||||||
|
<span className="font-semibold">Processing PDF...</span>
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
<FileText className="w-5 h-5" />
|
||||||
|
<span className="font-semibold">Process PDF</span>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</motion.button>
|
||||||
|
|
||||||
|
{/* Progress Bar */}
|
||||||
|
<AnimatePresence>
|
||||||
|
{processing && progress > 0 && (
|
||||||
|
<motion.div
|
||||||
|
initial={{ opacity: 0, height: 0 }}
|
||||||
|
animate={{ opacity: 1, height: 'auto' }}
|
||||||
|
exit={{ opacity: 0, height: 0 }}
|
||||||
|
className="glass p-4 rounded-2xl"
|
||||||
|
>
|
||||||
|
<div className="flex items-center justify-between mb-2">
|
||||||
|
<span className="text-sm text-gray-400">Processing...</span>
|
||||||
|
<span className="text-sm font-medium text-purple-400">{progress}%</span>
|
||||||
|
</div>
|
||||||
|
<div className="h-2 bg-dark-200 rounded-full overflow-hidden">
|
||||||
|
<motion.div
|
||||||
|
className="h-full bg-gradient-to-r from-purple-600 to-cyan-600"
|
||||||
|
initial={{ width: 0 }}
|
||||||
|
animate={{ width: `${progress}%` }}
|
||||||
|
transition={{ duration: 0.3 }}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</motion.div>
|
||||||
|
)}
|
||||||
|
</AnimatePresence>
|
||||||
|
|
||||||
|
{/* Error Display */}
|
||||||
|
<AnimatePresence>
|
||||||
|
{error && (
|
||||||
|
<motion.div
|
||||||
|
initial={{ opacity: 0, y: -10 }}
|
||||||
|
animate={{ opacity: 1, y: 0 }}
|
||||||
|
exit={{ opacity: 0, y: -10 }}
|
||||||
|
className="glass p-4 rounded-2xl border-red-500/50 bg-red-500/10 flex items-start gap-3"
|
||||||
|
>
|
||||||
|
<AlertCircle className="w-5 h-5 text-red-400 flex-shrink-0 mt-0.5" />
|
||||||
|
<div>
|
||||||
|
<p className="text-sm font-medium text-red-400">Processing Failed</p>
|
||||||
|
<p className="text-xs text-red-300 mt-1">{error}</p>
|
||||||
|
</div>
|
||||||
|
</motion.div>
|
||||||
|
)}
|
||||||
|
</AnimatePresence>
|
||||||
|
|
||||||
|
{/* Success Display */}
|
||||||
|
<AnimatePresence>
|
||||||
|
{result && !error && (
|
||||||
|
<motion.div
|
||||||
|
initial={{ opacity: 0, y: -10 }}
|
||||||
|
animate={{ opacity: 1, y: 0 }}
|
||||||
|
exit={{ opacity: 0, y: -10 }}
|
||||||
|
className="glass p-6 rounded-2xl border-green-500/50 bg-green-500/10"
|
||||||
|
>
|
||||||
|
<div className="flex items-start gap-3">
|
||||||
|
<CheckCircle2 className="w-5 h-5 text-green-400 flex-shrink-0 mt-0.5" />
|
||||||
|
<div className="flex-1">
|
||||||
|
<p className="text-sm font-medium text-green-400">
|
||||||
|
{result.message || 'PDF processed successfully!'}
|
||||||
|
</p>
|
||||||
|
{outputFormat === 'json' && result.pages && (
|
||||||
|
<div className="mt-3 space-y-2">
|
||||||
|
<p className="text-xs text-gray-400">
|
||||||
|
Processed {result.total_pages} page{result.total_pages > 1 ? 's' : ''}
|
||||||
|
</p>
|
||||||
|
<motion.button
|
||||||
|
onClick={handleDownloadJSON}
|
||||||
|
className="glass px-4 py-2 rounded-xl text-sm font-medium hover:bg-white/5 transition-colors flex items-center gap-2"
|
||||||
|
whileHover={{ scale: 1.02 }}
|
||||||
|
whileTap={{ scale: 0.98 }}
|
||||||
|
>
|
||||||
|
<Download className="w-4 h-4" />
|
||||||
|
Download JSON
|
||||||
|
</motion.button>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</motion.div>
|
||||||
|
)}
|
||||||
|
</AnimatePresence>
|
||||||
|
</div>
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
export default PDFProcessor
|
||||||
@@ -2,6 +2,7 @@ import { useEffect, useRef, useState, useCallback } from 'react'
|
|||||||
import { motion, AnimatePresence } from 'framer-motion'
|
import { motion, AnimatePresence } from 'framer-motion'
|
||||||
import { Copy, Download, Sparkles, Loader2, CheckCircle2, ChevronDown } from 'lucide-react'
|
import { Copy, Download, Sparkles, Loader2, CheckCircle2, ChevronDown } from 'lucide-react'
|
||||||
import ReactMarkdown from 'react-markdown'
|
import ReactMarkdown from 'react-markdown'
|
||||||
|
import DOMPurify from 'dompurify'
|
||||||
|
|
||||||
export default function ResultPanel({ result, loading, imagePreview, onCopy, onDownload }) {
|
export default function ResultPanel({ result, loading, imagePreview, onCopy, onDownload }) {
|
||||||
const canvasRef = useRef(null)
|
const canvasRef = useRef(null)
|
||||||
@@ -230,7 +231,7 @@ export default function ResultPanel({ result, loading, imagePreview, onCopy, onD
|
|||||||
{isHTML ? (
|
{isHTML ? (
|
||||||
<div
|
<div
|
||||||
className="prose prose-invert prose-sm max-w-none"
|
className="prose prose-invert prose-sm max-w-none"
|
||||||
dangerouslySetInnerHTML={{ __html: result.text }}
|
dangerouslySetInnerHTML={{ __html: DOMPurify.sanitize(result.text) }}
|
||||||
style={{
|
style={{
|
||||||
color: '#e5e7eb',
|
color: '#e5e7eb',
|
||||||
}}
|
}}
|
||||||
|
|||||||
Reference in New Issue
Block a user