116 lines
3.9 KiB
Python
116 lines
3.9 KiB
Python
import os
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
from contextlib import contextmanager
|
|
from decouple import config as env_config
|
|
|
|
# Fallback connection string, used only when no DATABASE_URL setting is found.
# NOTE(review): this default embeds a username and password; presumably it is
# meant for local/containerised development only -- confirm before relying on
# it anywhere else.
_DEFAULT_DATABASE_URL = "postgresql://ocr_user:ocr_password@postgres:5432/ocr_db"

# PostgreSQL connection string used by every helper in this module.
DATABASE_URL = env_config("DATABASE_URL", default=_DEFAULT_DATABASE_URL)
|
|
|
|
|
|
def _get_conn():
    """Open and return a new psycopg2 connection to ``DATABASE_URL``.

    The connection is created with ``RealDictCursor`` as its cursor factory,
    so cursors obtained from it use that class unless another is requested.
    The caller owns the connection and is responsible for closing it.
    """
    dict_cursor_cls = psycopg2.extras.RealDictCursor
    return psycopg2.connect(DATABASE_URL, cursor_factory=dict_cursor_cls)
|
|
|
|
|
|
def init_db():
    """Create the ``ocr_jobs`` table and its supporting objects if missing.

    Every statement is written to be re-runnable (``IF NOT EXISTS`` /
    ``OR REPLACE``). All statements run on a single connection and are
    committed together once they have all succeeded.

    Raises:
        Exception: any error raised while connecting or executing is printed,
            the transaction is rolled back (when a connection was opened) and
            the original exception is re-raised.
    """
    # Ordered DDL: later statements depend on the table (and columns) created
    # by earlier ones, so the order must be preserved.
    schema_statements = (
        # Base table.
        """
            CREATE TABLE IF NOT EXISTS ocr_jobs (
                id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
                author TEXT,
                book TEXT,
                chapter TEXT,
                page TEXT,
                submitted_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
                image_path TEXT NOT NULL,
                original_filename TEXT,
                ocr_text TEXT,
                status TEXT NOT NULL DEFAULT 'unreviewed',
                reviewed_text TEXT,
                reviewer_name TEXT,
                reviewed_at TIMESTAMPTZ,
                mode TEXT
            )
        """,
        # Plain indexes on status and on submitted_at (newest first).
        """
            CREATE INDEX IF NOT EXISTS ocr_jobs_status_idx ON ocr_jobs(status)
        """,
        """
            CREATE INDEX IF NOT EXISTS ocr_jobs_submitted_at_idx ON ocr_jobs(submitted_at DESC)
        """,
        # Columns added after the initial schema; safe to run repeatedly.
        """
            ALTER TABLE ocr_jobs
            ADD COLUMN IF NOT EXISTS describe_text TEXT
        """,
        """
            ALTER TABLE ocr_jobs
            ADD COLUMN IF NOT EXISTS freeform_text TEXT
        """,
        """
            ALTER TABLE ocr_jobs
            ADD COLUMN IF NOT EXISTS qdrant_synced_at TIMESTAMPTZ
        """,
        """
            ALTER TABLE ocr_jobs
            ADD COLUMN IF NOT EXISTS updated_at TIMESTAMPTZ
        """,
        # Which OCR model produced the job (e.g. "deepseek-local",
        # "ollama:glm-ocr").
        """
            ALTER TABLE ocr_jobs
            ADD COLUMN IF NOT EXISTS ocr_model TEXT
        """,
        # Trigger function that stamps updated_at with NOW() on the new row.
        """
            CREATE OR REPLACE FUNCTION set_updated_at()
            RETURNS TRIGGER AS $$
            BEGIN
                NEW.updated_at = NOW();
                RETURN NEW;
            END;
            $$ LANGUAGE plpgsql
        """,
        # Attach the function so it fires before every row update.
        # NOTE(review): CREATE OR REPLACE TRIGGER needs a PostgreSQL version
        # that supports OR REPLACE for triggers -- confirm the server version.
        """
            CREATE OR REPLACE TRIGGER ocr_jobs_set_updated_at
            BEFORE UPDATE ON ocr_jobs
            FOR EACH ROW EXECUTE FUNCTION set_updated_at()
        """,
        # Partial unique index: rejects duplicate (author, chapter, page)
        # rows, but only when all three fields are non-null.
        """
            CREATE UNIQUE INDEX IF NOT EXISTS ocr_jobs_author_chapter_page_unique
            ON ocr_jobs (author, chapter, page)
            WHERE author IS NOT NULL AND chapter IS NOT NULL AND page IS NOT NULL
        """,
    )

    # Stays None if the connection attempt itself fails, so the handlers
    # below know there is nothing to roll back or close.
    db = None
    try:
        db = _get_conn()
        with db.cursor() as cursor:
            for ddl in schema_statements:
                cursor.execute(ddl)
        db.commit()
        print("Database initialized.")
    except Exception as exc:
        print(f"Database init failed: {exc}")
        if db is not None:
            db.rollback()
        raise
    finally:
        if db is not None:
            db.close()
|
|
|
|
|
|
@contextmanager
def get_db():
    """Context manager that yields a fresh database connection.

    The transaction is committed when the ``with`` body finishes without
    raising. If the body (or the commit itself) raises an ``Exception``, the
    transaction is rolled back and the exception propagates. The connection
    is closed in every case.

    Yields:
        The connection returned by ``_get_conn()``.
    """
    db = _get_conn()
    try:
        yield db
        # Reached only when the caller's block completed without raising.
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        db.close()
|