Features added:
- PDF to image conversion with configurable DPI
- Multi-page PDF processing with OCR
- Export to Markdown, HTML, DOCX, and JSON formats
- Automatic image extraction from PDFs
- Formula and formatting preservation
- Real-time progress tracking for multi-page documents

Backend changes:
- New /api/process-pdf endpoint for PDF processing
- pdf_utils.py: PDF conversion and image extraction utilities
- format_converter.py: Document format conversion (MD, HTML, DOCX)
- Updated dependencies: PyMuPDF, img2pdf, python-docx, markdown

Frontend changes:
- File type toggle (Image OCR / PDF Processing)
- PDFProcessor component with format selection
- Updated ImageUpload to support both images and PDFs
- Progress bars for multi-page processing
- Download options for converted documents

Documentation:
- Updated README with PDF processing features
- Added API documentation for the /api/process-pdf endpoint
- Added format conversion examples
327 lines
10 KiB
Python
327 lines
10 KiB
Python
"""
|
|
Document Format Conversion Utilities
|
|
Handles conversion to Markdown, HTML, DOCX while preserving formatting
|
|
"""
|
|
|
|
import re
|
|
from typing import List, Dict, Any
|
|
from io import BytesIO
|
|
from docx import Document
|
|
from docx.shared import Pt, Inches, RGBColor
|
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
|
import markdown
|
|
import base64
|
|
from PIL import Image
|
|
|
|
|
|
class DocumentConverter:
    """Convert per-page OCR results into Markdown, HTML, or DOCX documents.

    Every converter takes ``pages_content``: a list of page dicts. The keys
    read here are ``'text'`` (the OCR output for the page) and ``'images'``
    (a list of base64-encoded image payloads). The image at list index ``i``
    stands in for the ``[IMAGE_i]`` placeholder inside the page text.
    """

    # Inline-image templates; both embed the payload as a JPEG data URI.
    _MARKDOWN_IMAGE = "![Image {number}](data:image/jpeg;base64,{data})"
    _HTML_IMAGE = '<img src="data:image/jpeg;base64,{data}" alt="Image {number}" />'

    # Heuristics used by _is_markdown; compiled once instead of on every call.
    _MARKDOWN_PATTERNS = tuple(
        re.compile(pattern, re.MULTILINE)
        for pattern in (
            r'^#+\s',                                 # Headers
            r'\*\*.*\*\*',                            # Bold
            r'\*.*\*',                                # Italic
            r'^\*\s',                                 # Lists
            r'^\d+\.\s',                              # Numbered lists
            r'\[.*\]\(.*\)',                          # Links
            r'```',                                   # Code blocks
            r'^[ \t]*\|?[ \t]*:?-+:?[ \t]*\|',        # Table delimiter row
        )
    )

    _FENCED_CODE = re.compile(r'(```.*?```)', re.DOTALL)
    _HEADING = re.compile(r'(#{1,6}) +(.*)')
    _TABLE_SEPARATOR = re.compile(r'^[\|\s\-:]+$')
    _CODE_INFO_STRING = re.compile(r'[\w+#.\-]*')

    _HTML_HEADER = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>OCR Results</title>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            max-width: 900px;
            margin: 40px auto;
            padding: 20px;
            line-height: 1.6;
            background-color: #f5f5f5;
        }
        .page {
            background: white;
            padding: 40px;
            margin-bottom: 30px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border-radius: 8px;
        }
        .page-header {
            color: #333;
            border-bottom: 2px solid #4CAF50;
            padding-bottom: 10px;
            margin-bottom: 20px;
        }
        table {
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 12px;
            text-align: left;
        }
        th {
            background-color: #4CAF50;
            color: white;
        }
        tr:nth-child(even) {
            background-color: #f9f9f9;
        }
        img {
            max-width: 100%;
            height: auto;
            margin: 15px 0;
            border-radius: 4px;
        }
        code {
            background-color: #f4f4f4;
            padding: 2px 6px;
            border-radius: 3px;
            font-family: 'Courier New', monospace;
        }
        pre {
            background-color: #f4f4f4;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
        }
    </style>
</head>
<body>
    <h1>DeepSeek OCR Results</h1>
"""

    _HTML_FOOTER = """
</body>
</html>
"""

    def __init__(self):
        self.page_separator = '<--- Page Split --->'

    def to_markdown(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str:
        """
        Convert OCR results to Markdown format

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to replace ``[IMAGE_i]`` placeholders with
                inline Markdown images; when False the placeholders are left
                in the text untouched

        Returns:
            Markdown formatted string (empty when there are no pages)
        """
        md_content = []

        for idx, page in enumerate(pages_content):
            md_content.append(f"# Page {idx + 1}\n")

            # ``or`` also covers a key that is present but set to None
            text = page.get('text') or ''
            if include_images:
                text = self._embed_images(text, page.get('images') or [], self._MARKDOWN_IMAGE)

            md_content.append(text)
            md_content.append("\n\n---\n\n")  # Page separator

        return "\n".join(md_content)

    def to_html(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str:
        """
        Convert OCR results to HTML format

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to replace ``[IMAGE_i]`` placeholders with
                inline ``<img>`` tags

        Returns:
            HTML formatted string (a complete document, one ``div.page`` per page)
        """
        from html import escape

        html_parts = [self._HTML_HEADER]

        for idx, page in enumerate(pages_content):
            html_parts.append('    <div class="page">')
            html_parts.append(f'        <h2 class="page-header">Page {idx + 1}</h2>')

            text = page.get('text') or ''
            if include_images:
                text = self._embed_images(text, page.get('images') or [], self._HTML_IMAGE)

            if self._is_markdown(text):
                html_content = markdown.markdown(text, extensions=['tables', 'fenced_code'])
            elif '<' in text:
                # Text that already carries markup (OCR tables, embedded <img>)
                # is passed through unchanged.
                html_content = text
            else:
                # Plain text: escape it so '&' and '>' stay literal characters.
                escaped = escape(text, quote=False).replace('\n', '<br>')
                html_content = f'<p>{escaped}</p>'

            html_parts.append(f'        {html_content}')
            html_parts.append('    </div>')

        html_parts.append(self._HTML_FOOTER)

        return "\n".join(html_parts)

    def to_docx(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> BytesIO:
        """
        Convert OCR results to DOCX format

        Images of a page are inserted ahead of that page's text, and the
        placeholder of each successfully inserted image is removed from the
        text. An image that cannot be decoded or embedded is reported on
        stdout and skipped; its placeholder stays in the text.

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to include images

        Returns:
            BytesIO object containing the DOCX file, positioned at offset 0
        """
        doc = Document()

        # Set default font
        font = doc.styles['Normal'].font
        font.name = 'Calibri'
        font.size = Pt(11)

        # Add title
        title = doc.add_heading('DeepSeek OCR Results', 0)
        title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

        last_index = len(pages_content) - 1
        for idx, page in enumerate(pages_content):
            page_heading = doc.add_heading(f'Page {idx + 1}', level=1)
            page_heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

            text = page.get('text') or ''

            if include_images:
                for img_idx, img_data in enumerate(page.get('images') or []):
                    # Best effort: one bad image must not abort the export.
                    try:
                        img_stream = BytesIO(base64.b64decode(img_data))
                        doc.add_picture(img_stream, width=Inches(5))
                    except Exception as e:
                        print(f"Error adding image to DOCX: {e}")
                    else:
                        text = text.replace(f"[IMAGE_{img_idx}]", '')

            self._add_formatted_text_to_doc(doc, text)

            # Add page break (except for last page)
            if idx < last_index:
                doc.add_page_break()

        docx_buffer = BytesIO()
        doc.save(docx_buffer)
        docx_buffer.seek(0)

        return docx_buffer

    @staticmethod
    def _embed_images(text: str, images: List[str], template: str) -> str:
        """Replace each ``[IMAGE_i]`` placeholder with ``template`` filled in.

        Args:
            text: Page text containing the placeholders
            images: Base64 payloads; list index ``i`` matches ``[IMAGE_i]``
            template: Format string with ``{number}`` (1-based) and ``{data}``

        Returns:
            The text with all matching placeholders substituted
        """
        for img_idx, img_data in enumerate(images):
            rendered = template.format(number=img_idx + 1, data=img_data)
            text = text.replace(f"[IMAGE_{img_idx}]", rendered)
        return text

    def _is_markdown(self, text: str) -> bool:
        """Check if text appears to be markdown formatted"""
        return any(pattern.search(text) for pattern in self._MARKDOWN_PATTERNS)

    def _add_formatted_text_to_doc(self, doc: "Document", text: str):
        """
        Add formatted text to document, preserving structure

        Recognises fenced code blocks, ``#``-style headings (levels 1-6),
        pipe tables, and plain paragraphs separated by blank lines.

        Args:
            doc: Document object
            text: Text to add
        """
        # Fenced code is cut out first so that blank lines or pipes inside a
        # fence cannot split it or make it look like a table. With a single
        # capture group, re.split alternates plain text (even positions) and
        # fenced code (odd positions).
        for position, chunk in enumerate(self._FENCED_CODE.split(text)):
            if position % 2:
                self._add_code_block_to_doc(doc, chunk)
                continue

            for para in chunk.split('\n\n'):
                para = para.strip()
                if not para:
                    continue

                heading = self._HEADING.match(para)
                if heading:
                    # Only the first line is the heading; the number of '#'
                    # characters is the heading level.
                    doc.add_heading(heading.group(2).strip(), level=len(heading.group(1)))
                    remainder = para[heading.end():]
                    if remainder.strip():
                        self._add_formatted_text_to_doc(doc, remainder)
                # Check for tables (simple detection)
                elif para.count('|') > 2:
                    self._add_table_to_doc(doc, para)
                # An opening fence that was never closed
                elif para.startswith('```'):
                    self._add_code_block_to_doc(doc, para)
                else:
                    doc.add_paragraph(para)

    @classmethod
    def _strip_code_fence(cls, block: str) -> str:
        """Return the code inside a fenced block, without fences or language tag."""
        body = block.strip()
        if body.startswith('```'):
            body = body[3:]
        if body.endswith('```'):
            body = body[:-3]

        first_line, newline, rest = body.partition('\n')
        # A bare word right after the opening fence (e.g. "python") is the
        # language tag, not code.
        if newline and cls._CODE_INFO_STRING.fullmatch(first_line.strip()):
            body = rest

        return body.strip('\n').rstrip()

    def _add_code_block_to_doc(self, doc: "Document", block: str):
        """Add a fenced code block to the document as a monospaced paragraph."""
        code_text = self._strip_code_fence(block)
        if not code_text:
            return

        run = doc.add_paragraph().add_run(code_text)
        run.font.name = 'Courier New'
        run.font.size = Pt(10)

    @classmethod
    def _parse_table_rows(cls, table_text: str) -> List[List[str]]:
        """Split markdown-style table text into rows of stripped cell strings.

        Delimiter rows (only pipes, dashes, colons, whitespace) and rows whose
        cells are all empty are skipped. Empty cells inside a row are kept so
        the remaining cells stay in their columns.
        """
        table_data = []
        for line in table_text.split('\n'):
            line = line.strip()
            if not line or cls._TABLE_SEPARATOR.match(line):
                continue

            # Drop only the outer border pipes.
            if line.startswith('|'):
                line = line[1:]
            if line.endswith('|'):
                line = line[:-1]

            cells = [cell.strip() for cell in line.split('|')]
            if any(cells):
                table_data.append(cells)

        return table_data

    def _add_table_to_doc(self, doc: "Document", table_text: str):
        """
        Add a table to the document from markdown-style table text

        The first data row is treated as the header and rendered in bold.
        Nothing is added when the text contains no data rows.

        Args:
            doc: Document object
            table_text: Table in markdown format
        """
        table_data = self._parse_table_rows(table_text)
        if not table_data:
            return

        max_cols = max(len(row) for row in table_data)
        table = doc.add_table(rows=len(table_data), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(table_data):
            # Look the row's cells up once rather than once per column.
            cells = table.rows[i].cells
            for j, cell_text in enumerate(row_data):
                cell = cells[j]
                cell.text = cell_text

                # Make header row bold
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.font.bold = True
|