Skip to content

Data Model

Complete PostgreSQL schema and entity relationships for OpenComplai.

Entity Relationship Diagram

┌──────────────┐
│ users        │
├──────────────┤
│ id (PK)      │◄─────────┐
│ email        │          │
│ password_hash│          │
│ created_at   │          │
└──────────────┘          │
                   ┌─────────────┐
                   │ documents   │
                   ├─────────────┤
                   │ id (PK)     │
                   │ user_id (FK)├─────────┤
                   │ title       │         │
                   │ content     │         │
                   │ status      │         │
                   │ created_at  │         │
                   └─────────────┘         │
                          │               │
         ┌────────────────┼───────────────┘
         │                │
         ▼                ▼
   ┌──────────────┐   ┌─────────────────┐
   │processing_   │   │audit_logs       │
   │results       │   ├─────────────────┤
   ├──────────────┤   │id (PK)          │
   │id (PK)       │   │user_id (FK)     │
   │document_id   │   │action           │
   │extracted     │   │resource_type    │
   │analysis      │   │resource_id      │
   │created_at    │   │changes (JSONB)  │
   └──────────────┘   │created_at       │
                      └─────────────────┘

Core Tables

users

Stores user account information and authentication.

CREATE TABLE users (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  email TEXT UNIQUE NOT NULL,
  password_hash TEXT NOT NULL,
  full_name TEXT,
  organization TEXT,
  avatar_url TEXT,

  -- Account status
  is_active BOOLEAN DEFAULT true,
  email_verified BOOLEAN DEFAULT false,
  email_verified_at TIMESTAMP,

  -- Subscription
  tier VARCHAR(50) DEFAULT 'free',  -- free, pro, enterprise
  subscription_expires_at TIMESTAMP,

  -- Timestamps
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  deleted_at TIMESTAMP,  -- Soft delete

  -- Indexes
  CONSTRAINT email_not_empty CHECK (email != '')
);

CREATE INDEX idx_users_email ON users(email);
CREATE INDEX idx_users_is_active ON users(is_active);
CREATE INDEX idx_users_created_at ON users(created_at DESC);

documents

Stores uploaded documents and their metadata.

CREATE TABLE documents (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,

  -- Document content
  title TEXT NOT NULL,
  description TEXT,
  content TEXT,  -- Full text stored here
  file_type VARCHAR(50),  -- pdf, docx, txt, etc.
  file_size_bytes INTEGER,

  -- Processing status
  processing_status VARCHAR(50) DEFAULT 'pending',  -- pending, processing, completed, failed
  processing_error TEXT,
  processing_started_at TIMESTAMP,
  processing_completed_at TIMESTAMP,

  -- Metadata
  source VARCHAR(100),  -- web, api, upload, etc.
  external_id TEXT,  -- For linking to external systems
  metadata JSONB DEFAULT '{}',  -- Custom fields

  -- Full-text search
  search_vector tsvector GENERATED ALWAYS AS (to_tsvector('english', COALESCE(title, '') || ' ' || COALESCE(content, ''))) STORED,

  -- Timestamps
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  deleted_at TIMESTAMP,  -- Soft delete

  CONSTRAINT title_not_empty CHECK (title != '')
);

CREATE INDEX idx_documents_user_id ON documents(user_id);
CREATE INDEX idx_documents_status ON documents(processing_status);
CREATE INDEX idx_documents_created_at ON documents(created_at DESC);
CREATE INDEX idx_documents_search ON documents USING GIN(search_vector);

processing_results

Stores extraction and analysis results from document processing.

CREATE TABLE processing_results (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  document_id UUID NOT NULL REFERENCES documents(id) ON DELETE CASCADE,

  -- Extraction results
  extracted_text TEXT,
  extracted_metadata JSONB,  -- title, author, created_date, etc.
  extracted_entities JSONB,  -- Named entities found
  extracted_tables JSONB,    -- Structured table data
  extracted_images JSONB,    -- Image metadata and URLs

  -- AI Analysis
  analysis_results JSONB,    -- Custom analysis results
  sentiment_score NUMERIC(3,2),  -- 0.0 to 1.0
  language_detected VARCHAR(10),  -- en, es, fr, etc.

  -- Processing metrics
  processing_time_ms INTEGER,
  token_count INTEGER,
  model_version VARCHAR(50),

  -- Quality metrics
  confidence_score NUMERIC(3,2),  -- 0.0 to 1.0
  quality_notes TEXT,

  -- Timestamps
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX idx_processing_results_document_id ON processing_results(document_id);
CREATE INDEX idx_processing_results_created_at ON processing_results(created_at DESC);

audit_logs

Tracks all user actions for compliance and debugging.

CREATE TABLE audit_logs (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id UUID REFERENCES users(id),

  -- Action details
  action VARCHAR(100) NOT NULL,  -- create, read, update, delete
  resource_type VARCHAR(100) NOT NULL,  -- document, user, etc.
  resource_id UUID,

  -- What changed
  old_values JSONB,  -- Previous values
  new_values JSONB,  -- New values

  -- Context
  ip_address INET,
  user_agent TEXT,
  request_id UUID,  -- Correlation ID

  -- Timestamp
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,

  CONSTRAINT action_not_empty CHECK (action != '')
);

CREATE INDEX idx_audit_logs_user_id ON audit_logs(user_id);
CREATE INDEX idx_audit_logs_created_at ON audit_logs(created_at DESC);
CREATE INDEX idx_audit_logs_resource ON audit_logs(resource_type, resource_id);

api_keys

Stores API keys for programmatic access.

CREATE TABLE api_keys (
  id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  user_id UUID NOT NULL REFERENCES users(id) ON DELETE CASCADE,

  -- Key info
  name TEXT NOT NULL,
  key_hash TEXT UNIQUE NOT NULL,  -- Never store plaintext
  prefix VARCHAR(10),  -- sk_live_ for example

  -- Permissions
  permissions TEXT[] DEFAULT '{}',  -- ['document.read', 'document.create']

  -- Usage tracking
  last_used_at TIMESTAMP,
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  expires_at TIMESTAMP,
  revoked_at TIMESTAMP
);

CREATE INDEX idx_api_keys_user_id ON api_keys(user_id);
CREATE INDEX idx_api_keys_key_hash ON api_keys(key_hash);

Indexing Strategy

-- Documents have auto-generated search vector
SELECT * FROM documents 
WHERE search_vector @@ plainto_tsquery('english', 'machine learning')
ORDER BY ts_rank_cd(search_vector, query) DESC;

Time-Series Queries

-- Get documents created in last 7 days
SELECT * FROM documents 
WHERE created_at >= CURRENT_DATE - INTERVAL '7 days'
ORDER BY created_at DESC;

User Data

-- Get all documents for a user
SELECT * FROM documents 
WHERE user_id = $1 AND deleted_at IS NULL
ORDER BY created_at DESC;

Data Constraints

Foreign Keys

  • documents.user_idusers.id (CASCADE on delete)
  • processing_results.document_iddocuments.id (CASCADE on delete)
  • api_keys.user_idusers.id (CASCADE on delete)

Unique Constraints

  • users.email - Email must be unique
  • api_keys.key_hash - API key hash must be unique

Check Constraints

  • Non-empty required text fields
  • Valid email format (application-level)
  • Valid processing status values

Data Types

Field Type PostgreSQL Type Purpose
ID UUID Globally unique identifiers
Timestamps TIMESTAMP When records created/updated
Soft Delete TIMESTAMP Mark deleted without removing
JSON JSONB Semi-structured data (queryable)
Full-Text tsvector Search optimization
Large Text TEXT Unlimited length content
Numeric NUMERIC(p,s) Precise decimal values

Migrations

Example migration to add new field:

-- Version: 001_initial_schema.sql
-- Applied: 2026-05-15

ALTER TABLE documents ADD COLUMN language_code VARCHAR(10);
ALTER TABLE documents ADD CONSTRAINT language_code_valid 
  CHECK (language_code ~ '^[a-z]{2}(-[A-Z]{2})?$');

CREATE INDEX idx_documents_language ON documents(language_code);

Performance Considerations

Query Optimization

-- ❌ Slow - full table scan
SELECT * FROM documents WHERE processing_status = 'completed';

-- ✅ Fast - uses index
SELECT * FROM documents 
WHERE processing_status = 'completed'
AND user_id = $1;  -- Filter by indexed column first

Partitioning (for scale)

When documents table > 10M rows, consider:

-- Partition by date
CREATE TABLE documents_2026_05 PARTITION OF documents
  FOR VALUES FROM ('2026-05-01') TO ('2026-06-01');

Connection Pooling

Use pgBouncer or application-level pooling:

Min connections: 5
Max connections: 20
Idle timeout: 10 minutes

Backup & Recovery

Daily Backups

# Full dump (production-safe)
pg_dump -Fc opencomplai > backup_$(date +%Y%m%d).dump

# Point-in-time recovery
pg_restore -d opencomplai backup_20260515.dump

WAL Archiving

wal_level = replica
archive_mode = on
archive_command = 'aws s3 cp %p s3://backups/wal/%f'