"""
Document Format Conversion Utilities
Handles conversion to Markdown, HTML, DOCX while preserving formatting
"""
import re
from typing import List, Dict, Any
from io import BytesIO
from docx import Document
from docx.shared import Pt, Inches, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import markdown
import base64
from PIL import Image
class DocumentConverter:
    """Convert per-page OCR results into Markdown, HTML or DOCX documents.

    Every public method takes ``pages_content``, a list of page dictionaries.
    Two keys are read from each page:

    * ``'text'``   -- the page text; it may contain ``[IMAGE_<n>]`` placeholders.
    * ``'images'`` -- optional list whose n-th entry belongs to ``[IMAGE_<n>]``.
      ``to_docx`` base64-decodes each entry, so entries are expected to be
      base64-encoded image data.
    """

    # Heuristic patterns used by ``_is_markdown``; compiled once for the class.
    _MARKDOWN_PATTERNS = tuple(
        re.compile(pattern, re.MULTILINE)
        for pattern in (
            r'^#+\s',          # Headers
            r'\*\*.*\*\*',     # Bold
            r'\*.*\*',         # Italic
            r'^\*\s',          # Lists
            r'^\d+\.\s',       # Numbered lists
            r'\[.*\]\(.*\)',   # Links
            r'```',            # Code blocks
        )
    )

    def __init__(self):
        # Page-boundary marker exposed for callers; the methods of this class
        # do not read it.
        self.page_separator = '<--- Page Split --->'

    def to_markdown(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str:
        """
        Convert OCR results to Markdown format

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to replace ``[IMAGE_<n>]`` placeholders
                with inline Markdown images

        Returns:
            Markdown formatted string; every page is preceded by a
            ``# Page <n>`` header and followed by a ``---`` rule
        """
        md_content = []
        for idx, page in enumerate(pages_content):
            md_content.append(f"# Page {idx + 1}\n")
            # ``or`` also covers keys that are present but set to None.
            text = page.get('text') or ''

            if include_images:
                for img_idx, img_data in enumerate(page.get('images') or []):
                    placeholder = f"[IMAGE_{img_idx}]"
                    # Embed the image inline as a data URI.
                    # NOTE(review): the MIME type is assumed to be PNG --
                    # confirm against whatever produces ``images``.
                    img_ref = f"![Image {img_idx}](data:image/png;base64,{img_data})"
                    text = text.replace(placeholder, img_ref)

            md_content.append(text)
            md_content.append("\n\n---\n\n")  # Page separator

        return "\n".join(md_content)

    def to_html(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> str:
        """
        Convert OCR results to HTML format

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to replace ``[IMAGE_<n>]`` placeholders
                with inline ``<img>`` tags

        Returns:
            HTML formatted string (a complete document, one ``<div>`` per page)
        """
        # Function-scope import: only the plain-text branch below needs it.
        from html import escape

        # NOTE(review): the original template literals were lost; this is a
        # minimal reconstruction -- confirm title, class names and styling.
        html_parts = [
            '<!DOCTYPE html>',
            '<html>',
            '<head>',
            '    <meta charset="utf-8">',
            '    <title>DeepSeek OCR Results</title>',
            '    <style>',
            '        body { font-family: Arial, sans-serif; margin: 2em; }',
            '        .page { margin-bottom: 2em; page-break-after: always; }',
            '        .page-header { border-bottom: 1px solid #ccc; }',
            '        table { border-collapse: collapse; }',
            '        th, td { border: 1px solid #999; padding: 4px 8px; }',
            '        img { max-width: 100%; }',
            '    </style>',
            '</head>',
            '<body>',
        ]

        for idx, page in enumerate(pages_content):
            html_parts.append('<div class="page">')
            html_parts.append(f'    <h2 class="page-header">Page {idx + 1}</h2>')
            text = page.get('text') or ''

            # Handle images if present
            if include_images:
                for img_idx, img_data in enumerate(page.get('images') or []):
                    placeholder = f"[IMAGE_{img_idx}]"
                    # NOTE(review): MIME type assumed to be PNG -- confirm.
                    img_tag = (
                        '<div class="image">'
                        f'<img src="data:image/png;base64,{img_data}" alt="Image {img_idx}">'
                        '</div>'
                    )
                    text = text.replace(placeholder, img_tag)

            if self._is_markdown(text):
                # Text looks like Markdown: render it.
                html_content = markdown.markdown(text, extensions=['tables', 'fenced_code'])
            elif '<' in text:
                # Text already carries markup (e.g. an inserted <img>): keep it.
                html_content = text
            else:
                # Plain text: escape '&' and '>' so it is valid HTML, keep line breaks.
                html_content = '<p>' + escape(text, quote=False).replace('\n', '<br>') + '</p>'

            html_parts.append(f'    {html_content}')
            html_parts.append('</div>')

        # HTML footer
        html_parts.append('</body>')
        html_parts.append('</html>')
        return "\n".join(html_parts)

    def to_docx(self, pages_content: List[Dict[str, Any]], include_images: bool = True) -> BytesIO:
        """
        Convert OCR results to DOCX format

        Images of a page are inserted before that page's text; their
        placeholders are removed from the text.

        Args:
            pages_content: List of page dictionaries with text and metadata
            include_images: Whether to include images

        Returns:
            BytesIO object containing the DOCX file, positioned at offset 0
        """
        doc = Document()

        # Set default font
        font = doc.styles['Normal'].font
        font.name = 'Calibri'
        font.size = Pt(11)

        # Add title
        title = doc.add_heading('DeepSeek OCR Results', 0)
        title.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

        last_index = len(pages_content) - 1
        for idx, page in enumerate(pages_content):
            page_heading = doc.add_heading(f'Page {idx + 1}', level=1)
            page_heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            text = page.get('text') or ''

            if include_images:
                for img_idx, img_data in enumerate(page.get('images') or []):
                    placeholder = f"[IMAGE_{img_idx}]"
                    try:
                        img_stream = BytesIO(base64.b64decode(img_data))
                        doc.add_picture(img_stream, width=Inches(5))
                        text = text.replace(placeholder, '')
                    except Exception as e:
                        # Deliberately best-effort: one undecodable image must
                        # not abort the export. Its placeholder stays in the text.
                        print(f"Error adding image to DOCX: {e}")

            # Process text content
            self._add_formatted_text_to_doc(doc, text)

            # Add page break (except for last page)
            if idx < last_index:
                doc.add_page_break()

        # Save to BytesIO and rewind so callers can read from the start
        docx_buffer = BytesIO()
        doc.save(docx_buffer)
        docx_buffer.seek(0)
        return docx_buffer

    def _is_markdown(self, text: str) -> bool:
        """Check if text appears to be markdown formatted (heuristic)."""
        return any(pattern.search(text) for pattern in self._MARKDOWN_PATTERNS)

    def _add_formatted_text_to_doc(self, doc: "Document", text: str):
        """
        Add formatted text to document, preserving structure

        The text is split on blank lines; each chunk becomes a heading
        (``#``/``##``/``###``), a monospace code paragraph, a table or a
        regular paragraph.

        Args:
            doc: Document object
            text: Text to add
        """
        for para in text.split('\n\n'):
            # Strip first so a stray leading newline does not hide a heading.
            para = para.strip()
            if not para:
                continue

            heading_level = next(
                (level for level in (1, 2, 3) if para.startswith('#' * level + ' ')),
                None,
            )
            if heading_level is not None:
                # Remove only the leading marker; '#' characters inside the
                # heading text (e.g. "C# ...") must survive.
                doc.add_heading(para[heading_level + 1:].strip(), level=heading_level)
            elif para.startswith('```'):
                # Checked before tables so fenced code containing pipes is
                # not turned into a table. strip('`') removes the fences.
                code_text = para.strip('`').strip()
                run = doc.add_paragraph().add_run(code_text)
                run.font.name = 'Courier New'
                run.font.size = Pt(10)
            elif para.count('|') > 2:
                # Simple table detection: more than two pipe characters.
                self._add_table_to_doc(doc, para)
            else:
                # Regular paragraph
                doc.add_paragraph(para)

    def _add_table_to_doc(self, doc: "Document", table_text: str):
        """
        Add a table to the document from markdown-style table text

        Args:
            doc: Document object
            table_text: Table in markdown format
        """
        rows = [line.strip() for line in table_text.split('\n') if line.strip()]

        # Filter out separator rows such as |---|:--:|
        data_rows = [row for row in rows if not re.match(r'^[\|\s\-:]+$', row)]

        # Parse table data
        table_data = []
        for row in data_rows:
            # Drop only the outer pipes; empty interior cells are kept so the
            # remaining cells stay in their columns.
            if row.startswith('|'):
                row = row[1:]
            if row.endswith('|'):
                row = row[:-1]
            cells = [cell.strip() for cell in row.split('|')]
            if any(cells):
                table_data.append(cells)

        if not table_data:
            return

        # Create table wide enough for the longest row
        max_cols = max(len(row) for row in table_data)
        table = doc.add_table(rows=len(table_data), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        # Populate table
        for i, row_data in enumerate(table_data):
            # Fetch the row's cells once instead of once per column.
            row_cells = table.rows[i].cells
            for j, cell_text in enumerate(row_data):
                # j is always in range: the table has max_cols columns.
                cell = row_cells[j]
                cell.text = cell_text
                # Make header row bold
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.font.bold = True