Data Model¶
Complete PostgreSQL schema and entity relationships for OpenComplai.
Entity Relationship Diagram¶
┌──────────────┐
│ users │
├──────────────┤
│ id (PK) │◄─────────┐
│ email │ │
│ password_hash│ │
│ created_at │ │
└──────────────┘ │
│
┌─────────────┐
│ documents │
├─────────────┤
│ id (PK) │
│ user_id (FK)├─────────┤
│ title │ │
│ content │ │
│ status │ │
│ created_at │ │
└─────────────┘ │
│ │
┌────────────────┼───────────────┘
│ │
▼ ▼
┌──────────────┐ ┌─────────────────┐
│processing_ │ │audit_logs │
│results │ ├─────────────────┤
├──────────────┤ │id (PK) │
│id (PK) │ │user_id (FK) │
│document_id │ │action │
│extracted │ │resource_type │
│analysis │ │resource_id │
│created_at │ │changes (JSONB) │
└──────────────┘ │created_at │
└─────────────────┘
Core Tables¶
users¶
Stores user account information and authentication.
CREATE TABLE users (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
email TEXT UNIQUE NOT NULL,
password_hash TEXT NOT NULL,
full_name TEXT,
organization TEXT,
avatar_url TEXT,
-- Account status
is_active BOOLEAN DEFAULT true,
email_verified BOOLEAN DEFAULT false,
email_verified_at TIMESTAMP,
-- Subscription
tier VARCHAR(50) DEFAULT 'free', -- free, pro, enterprise
subscription_expires_at TIMESTAMP,
-- Timestamps
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
deleted_at TIMESTAMP, -- Soft delete
-- Indexes
CONSTRAINT email_not_empty CHECK (email != '')
);
CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_users_is_active ON users(is_active);
CREATE INDEX idx_users_created_at ON users(created_at DESC);
documents¶
Stores uploaded documents and their metadata.
CREATE TABLE documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
-- Document content
title TEXT NOT NULL,
description TEXT,
content TEXT, -- Full text stored here
file_type VARCHAR(50), -- pdf, docx, txt, etc.
file_size_bytes INTEGER,
-- Processing status
processing_status VARCHAR(50) DEFAULT 'pending', -- pending, processing, completed, failed
processing_error TEXT,
processing_started_at TIMESTAMP,
processing_completed_at TIMESTAMP,
-- Metadata
source VARCHAR(100), -- web, api, upload, etc.
external_id TEXT, -- For linking to external systems
metadata JSONB DEFAULT '{}', -- Custom fields
-- Full-text search
search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(content, ''))) STORED,
-- Timestamps
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
deleted_at TIMESTAMP, -- Soft delete
CONSTRAINT title_not_empty CHECK (title != '')
);
CREATE INDEX idx_documents_user_id ON documents(user_id);
CREATE INDEX idx_documents_status ON documents(processing_status);
CREATE INDEX idx_documents_created_at ON documents(created_at DESC);
CREATE INDEX idx_documents_search ON documents USING GIN(search_vector);
processing_results¶
Stores extraction and analysis results from document processing.
CREATE TABLE processing_results (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
-- Extraction results
extracted_text TEXT,
extracted_metadata JSONB, -- title, author, created_date, etc.
extracted_entities JSONB, -- Named entities found
extracted_tables JSONB, -- Structured table data
extracted_images JSONB, -- Image metadata and URLs
-- AI Analysis
analysis_results JSONB, -- Custom analysis results
sentiment_score NUMERIC(3,2), -- 0.0 to 1.0
language_detected VARCHAR(10), -- en, es, fr, etc.
-- Processing metrics
processing_time_ms INTEGER,
token_count INTEGER,
model_version VARCHAR(50),
-- Quality metrics
confidence_score NUMERIC(3,2), -- 0.0 to 1.0
quality_notes TEXT,
-- Timestamps
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX idx_processing_results_document_id ON processing_results(document_id);
CREATE INDEX idx_processing_results_created_at ON processing_results(created_at DESC);
audit_logs¶
Tracks all user actions for compliance and debugging.
CREATE TABLE audit_logs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID REFERENCES users(id),
-- Action details
action VARCHAR(100) NOT NULL, -- create, read, update, delete
resource_type VARCHAR(100) NOT NULL, -- document, user, etc.
resource_id UUID,
-- What changed
old_values JSONB, -- Previous values
new_values JSONB, -- New values
-- Context
ip_address INET,
user_agent TEXT,
request_id UUID, -- Correlation ID
-- Timestamp
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT action_not_empty CHECK (action != '')
);
CREATE INDEX idx_audit_logs_user_id ON audit_logs(user_id);
CREATE INDEX idx_audit_logs_created_at ON audit_logs(created_at DESC);
CREATE INDEX idx_audit_logs_resource ON audit_logs(resource_type, resource_id);
api_keys¶
Stores API keys for programmatic access.
CREATE TABLE api_keys (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,
-- Key info
name TEXT NOT NULL,
key_hash TEXT UNIQUE NOT NULL, -- Never store plaintext
prefix VARCHAR(10), -- sk_live_ for example
-- Permissions
permissions TEXT[] DEFAULT '{}', -- ['document.read', 'document.create']
-- Usage tracking
last_used_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
expires_at TIMESTAMP,
revoked_at TIMESTAMP
);
CREATE INDEX idx_api_keys_user_id ON api_keys(user_id);
CREATE INDEX idx_api_keys_key_hash ON api_keys(key_hash);
Indexing Strategy¶
Full-Text Search¶
-- Documents have auto-generated search vector
SELECT * FROM documents
WHERE search_vector @@ plainto_tsquery('english', 'machine learning')
ORDER BY ts_rank_cd(search_vector, query) DESC;
Time-Series Queries¶
-- Get documents created in last 7 days
SELECT * FROM documents
WHERE created_at >= CURRENT_DATE - INTERVAL '7 days'
ORDER BY created_at DESC;
User Data¶
-- Get all documents for a user
SELECT * FROM documents
WHERE user_id = $1 AND deleted_at IS NULL
ORDER BY created_at DESC;
Data Constraints¶
Foreign Keys¶
documents.user_id→users.id(CASCADE on delete)processing_results.document_id→documents.id(CASCADE on delete)api_keys.user_id→users.id(CASCADE on delete)
Unique Constraints¶
users.email- Email must be uniqueapi_keys.key_hash- API key hash must be unique
Check Constraints¶
- Non-empty required text fields
- Valid email format (application-level)
- Valid processing status values
Data Types¶
| Field Type | PostgreSQL Type | Purpose |
|---|---|---|
| ID | UUID | Globally unique identifiers |
| Timestamps | TIMESTAMP | When records created/updated |
| Soft Delete | TIMESTAMP | Mark deleted without removing |
| JSON | JSONB | Semi-structured data (queryable) |
| Full-Text | tsvector | Search optimization |
| Large Text | TEXT | Unlimited length content |
| Numeric | NUMERIC(p,s) | Precise decimal values |
Migrations¶
Example migration to add new field:
-- Version: 001_initial_schema.sql
-- Applied: 2026-05-15
ALTER TABLE documents ADD COLUMN language_code VARCHAR(10);
ALTER TABLE documents ADD CONSTRAINT language_code_valid
CHECK (language_code ~ '^[a-z]{2}(-[A-Z]{2})?$');
CREATE INDEX idx_documents_language ON documents(language_code);
Performance Considerations¶
Query Optimization¶
-- ❌ Slow - full table scan
SELECT * FROM documents WHERE processing_status = 'completed';
-- ✅ Fast - uses index
SELECT * FROM documents
WHERE processing_status = 'completed'
AND user_id = $1; -- Filter by indexed column first
Partitioning (for scale)¶
When documents table > 10M rows, consider:
-- Partition by date
CREATE TABLE documents_2026_05 PARTITION OF documents
FOR VALUES FROM ('2026-05-01') TO ('2026-06-01');
Connection Pooling¶
Use pgBouncer or application-level pooling:
Backup & Recovery¶
Daily Backups¶
# Full dump (production-safe)
pg_dump -Fc opencomplai > backup_$(date +%Y%m%d).dump
# Point-in-time recovery
pg_restore -d opencomplai backup_20260515.dump
WAL Archiving¶
Related Documentation¶
- System Design - How components interact
- Design Decisions - Why PostgreSQL was chosen
- API Reference - How to query this data