From 321d4c1ddda742327717ad64fdd2fc6ff150e3ba Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 9 Dec 2025 11:29:09 +0000 Subject: [PATCH 01/49] feat: Add bulk embedding and ingest mode Co-authored-by: jdavid10001 --- README.md | 27 +- dev/helpers/embed.ts | 36 +++ dev/payload.config.ts | 3 + dev/specs/bulkEmbed.spec.ts | 191 +++++++++++++ dev/specs/config.spec.ts | 24 +- src/admin/components/EmbedAllButton.tsx | 54 ++++ src/collections/bulkEmbeddingsRuns.ts | 111 ++++++++ src/collections/embeddings.ts | 8 + src/endpoints/bulkEmbed.ts | 61 ++++ src/exports/client.ts | 1 + src/exports/rsc.ts | 2 + src/index.ts | 35 ++- src/tasks/bulkEmbedAll.ts | 357 ++++++++++++++++++++++++ src/types.ts | 85 ++++++ 14 files changed, 989 insertions(+), 6 deletions(-) create mode 100644 dev/specs/bulkEmbed.spec.ts create mode 100644 src/admin/components/EmbedAllButton.tsx create mode 100644 src/collections/bulkEmbeddingsRuns.ts create mode 100644 src/endpoints/bulkEmbed.ts create mode 100644 src/exports/client.ts create mode 100644 src/exports/rsc.ts create mode 100644 src/tasks/bulkEmbedAll.ts diff --git a/README.md b/README.md index fd617c8..ec36941 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ A Payload CMS plugin that adds vector search capabilities to your collections us - 🔍 **Semantic Search**: Vectorize any collection for intelligent content discovery - 🚀 **Automatic**: Documents are automatically vectorized when created or updated, and vectors are deleted as soon as the document is deleted. +- 🧵 **Bulk embedding**: Run “Embed all” batches that backfill only documents missing the current `embeddingVersion`. - 📊 **PostgreSQL Integration**: Built on pgvector for high-performance vector operations - ⚡ **Background Processing**: Uses Payload's job system for non-blocking vectorization - 🎯 **Flexible Chunking**: Drive chunk creation yourself with `toKnowledgePool` functions so you can combine any fields or content types @@ -189,6 +190,8 @@ The embeddings collection name will be the same as the knowledge pool name. - `embedQuery`: `EmbedQueryFn` - Function to embed search queries - `embeddingVersion`: `string` - Version string for tracking model changes - `extensionFields?`: `Field[]` - Optional fields to extend the embeddings collection schema +- `ingestMode?`: `'realtime' | 'bulk'` - Default `realtime` queues embeddings immediately. `bulk` skips realtime embedding, deletes stale vectors on updates, and relies on the bulk job to backfill. +- `bulkEmbeddings?`: Provider-specific callbacks for batch embedding (`prepareBulkEmbeddings`, `pollBulkEmbeddings`, `completeBulkEmbeddings`). If omitted, the plugin falls back to using `embedDocs` in-process. #### CollectionVectorizeOption @@ -299,6 +302,27 @@ Search for similar content using vector similarity. } ``` +### Bulk embedding (Embed all) + +- Each knowledge pool’s embeddings list shows an **Embed all** admin button that queues a `payloadcms-vectorize:bulk-embed-all` job. +- Bulk runs only include documents that are missing embeddings for the pool’s current `embeddingVersion`. +- Progress is recorded in the `vector-bulk-embeddings-runs` collection (fields: `pool`, `embeddingVersion`, `providerBatchId`, `status`, counts, timestamps, `error`). 
+- Endpoint: **POST** `/api/vector-bulk-embed` + +```jsonc +{ + "knowledgePool": "main" +} +``` + +Bulk callbacks are provider-agnostic: + +- `prepareBulkEmbeddings({ payload, knowledgePool, embeddingVersion, inputs })` +- `pollBulkEmbeddings({ payload, knowledgePool, providerBatchId })` +- `completeBulkEmbeddings({ payload, knowledgePool, providerBatchId })` + +If `bulkEmbeddings` is not provided, the plugin falls back to running `embedDocs` locally. + ## Changelog See [CHANGELOG.md](./CHANGELOG.md) for release history, migration notes, and upgrade guides. @@ -339,13 +363,12 @@ Thank you for the stars! The following updates have been completed: - **Multiple Knowledge Pools**: You can create separate knowledge pools with independent configurations (dims, ivfflatLists, embedding functions) and needs. Each pool operates independently, allowing you to organize your vectorized content by domain, use case, or any other criteria that makes sense for your application. - **More expressive queries**: Added ability to change query limit, search on certain collections or certain fields +- **Bulk embed all**: Batch backfills with admin button, provider callbacks, and run tracking. The following features are planned for future releases based on community interest and stars: - **Migrations for vector dimensions**: Easy migration tools for changing vector dimensions and/or ivfflatLists after initial setup - **MongoDB support**: Extend vector search capabilities to MongoDB databases - **Vercel support**: Optimized deployment and configuration for Vercel hosting -- **Batch embedding**: More efficient bulk embedding operations for large datasets -- **'Embed all' button**: Admin UI button to re-embed all content after embeddingVersion changes **Want to see these features sooner?** Star this repository and open issues for the features you need most! diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index 18ac59d..d70b87e 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -1,5 +1,6 @@ import { voyage } from 'voyage-ai-provider' import { embed, embedMany } from 'ai' +import type { BulkEmbeddingsCallbacks } from 'payloadcms-vectorize' export const voyageEmbedDocs = async (texts: string[]): Promise => { const embedResult = await embedMany({ @@ -54,3 +55,38 @@ export function makeDummyEmbedDocs(dims: number) { } } export const testEmbeddingVersion = 'test-v1' + +export function makeLocalBulkEmbeddingsCallbacks(dims: number): BulkEmbeddingsCallbacks { + const pendingInputs = new Map>() + const embedDocs = makeDummyEmbedDocs(dims) + return { + prepareBulkEmbeddings: async ({ inputs }) => { + const providerBatchId = `local-${dims}-${Date.now()}` + pendingInputs.set(providerBatchId, inputs) + return { + providerBatchId, + status: 'queued', + counts: { inputs: inputs.length }, + } + }, + pollBulkEmbeddings: async ({ providerBatchId }) => { + if (!pendingInputs.has(providerBatchId)) { + return { status: 'failed', error: 'unknown batch' } + } + return { status: 'succeeded' } + }, + completeBulkEmbeddings: async ({ providerBatchId }) => { + const inputs = pendingInputs.get(providerBatchId) || [] + const embeddings = await embedDocs(inputs.map((i) => i.text)) + pendingInputs.delete(providerBatchId) + return { + status: 'succeeded', + outputs: embeddings.map((vector, idx) => ({ + id: inputs[idx]?.id ?? 
String(idx), + embedding: vector, + })), + counts: { inputs: inputs.length, succeeded: embeddings.length, failed: 0 }, + } + }, + } +} diff --git a/dev/payload.config.ts b/dev/payload.config.ts index 74ea031..2b376ce 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -9,6 +9,7 @@ import { voyageEmbedDocs, voyageEmbedQuery, makeDummyEmbedQuery, + makeLocalBulkEmbeddingsCallbacks, } from './helpers/embed.js' import sharp from 'sharp' import { fileURLToPath } from 'url' @@ -122,6 +123,8 @@ const buildConfigWithPostgres = async () => { embedDocs, embedQuery, embeddingVersion: testEmbeddingVersion, + ingestMode: 'realtime', + bulkEmbeddings: makeLocalBulkEmbeddingsCallbacks(dims), }, }, }), diff --git a/dev/specs/bulkEmbed.spec.ts b/dev/specs/bulkEmbed.spec.ts new file mode 100644 index 0000000..386d319 --- /dev/null +++ b/dev/specs/bulkEmbed.spec.ts @@ -0,0 +1,191 @@ +import type { Payload, SanitizedConfig } from 'payload' + +import { buildConfig, getPayload } from 'payload' +import { beforeAll, describe, expect, test } from 'vitest' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { lexicalEditor } from '@payloadcms/richtext-lexical' +import { createVectorizeIntegration } from 'payloadcms-vectorize' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' +import { createBulkEmbedAllTask } from '../../src/tasks/bulkEmbedAll.js' +import { createTestDb } from './utils.js' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, makeLocalBulkEmbeddingsCallbacks, testEmbeddingVersion } from 'helpers/embed.js' + +const DIMS = 8 + +describe('Bulk embed ingest mode', () => { + let payload: Payload + let config: SanitizedConfig + const dbName = 'bulk_embed_test' + + const integration = createVectorizeIntegration({ + default: { + dims: DIMS, + ivfflatLists: 1, + }, + }) + + const pluginOptions = { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embedDocs: makeDummyEmbedDocs(DIMS), + embedQuery: makeDummyEmbedQuery(DIMS), + embeddingVersion: testEmbeddingVersion, + ingestMode: 'bulk' as const, + bulkEmbeddings: makeLocalBulkEmbeddingsCallbacks(DIMS), + }, + }, + } + + beforeAll(async () => { + await createTestDb({ dbName }) + config = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [integration.payloadcmsVectorize(pluginOptions)], + jobs: { tasks: [] }, + }) + + payload = await getPayload({ config }) + }) + + test('queues no realtime embeddings and bulk job backfills missing docs', async () => { + const post = await payload.create({ + collection: 'posts', + data: { title: 'Bulk Mode Title' } as any, + }) + + const initialEmbeds = await payload.find({ + collection: 'default', + where: { + and: [ + { sourceCollection: { equals: 'posts' } }, + { docId: { equals: String(post.id) } }, + ], + }, + }) + expect(initialEmbeds.totalDocs).toBe(0) + + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'queued', + }, + }) + + const bulkTask = createBulkEmbedAllTask({ + knowledgePools: pluginOptions.knowledgePools, + }) + + await 
bulkTask.handler({ + input: { runId: String(run.id) }, + req: { payload } as any, + }) + + const embeds = await payload.find({ + collection: 'default', + where: { + and: [ + { sourceCollection: { equals: 'posts' } }, + { docId: { equals: String(post.id) } }, + ], + }, + }) + expect(embeds.totalDocs).toBeGreaterThan(0) + expect(embeds.docs[0]?.chunkText).toContain('Bulk Mode Title') + + const runDoc = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + expect(runDoc.status).toBe('succeeded') + expect(runDoc.inputs).toBeGreaterThan(0) + }) + + test('document updates clear stale embeddings and rerun populates new chunks', async () => { + const post = await payload.create({ + collection: 'posts', + data: { title: 'Original' } as any, + }) + + // First run to embed + const firstRun = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'queued', + }, + }) + const bulkTask = createBulkEmbedAllTask({ + knowledgePools: pluginOptions.knowledgePools, + }) + await bulkTask.handler({ + input: { runId: String(firstRun.id) }, + req: { payload } as any, + }) + + // Update document - should delete embeddings in bulk mode + await payload.update({ + collection: 'posts', + id: post.id, + data: { title: 'Updated Title' } as any, + }) + + const afterUpdateEmbeds = await payload.find({ + collection: 'default', + where: { + and: [ + { sourceCollection: { equals: 'posts' } }, + { docId: { equals: String(post.id) } }, + ], + }, + }) + expect(afterUpdateEmbeds.totalDocs).toBe(0) + + // Run again to backfill + const secondRun = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'queued', + }, + }) + await bulkTask.handler({ + input: { runId: String(secondRun.id) }, + req: { payload } as any, + }) + + const embedsAfterRerun = await payload.find({ + collection: 'default', + where: { + and: [ + { sourceCollection: { equals: 'posts' } }, + { docId: { equals: String(post.id) } }, + ], + }, + }) + expect(embedsAfterRerun.totalDocs).toBeGreaterThan(0) + expect(embedsAfterRerun.docs[0]?.chunkText).toContain('Updated Title') + }) +}) diff --git a/dev/specs/config.spec.ts b/dev/specs/config.spec.ts index f6457f6..0a467f2 100644 --- a/dev/specs/config.spec.ts +++ b/dev/specs/config.spec.ts @@ -6,9 +6,12 @@ describe('jobs.tasks merging', () => { const cfg = await buildDummyConfig({ jobs: { tasks: [] } }) const tasks = cfg.jobs?.tasks expect(Array.isArray(tasks)).toBe(true) - expect(tasks).toEqual([ - { slug: 'payloadcms-vectorize:vectorize', handler: expect.any(Function) }, - ]) + expect(tasks).toEqual( + expect.arrayContaining([ + { slug: 'payloadcms-vectorize:vectorize', handler: expect.any(Function) }, + { slug: 'payloadcms-vectorize:bulk-embed-all', handler: expect.any(Function) }, + ]), + ) }) }) @@ -24,6 +27,11 @@ describe('/vector-search endpoint', () => { method: 'post', handler: expect.any(Function), }), + expect.objectContaining({ + path: '/vector-bulk-embed', + method: 'post', + handler: expect.any(Function), + }), ]), ) }) @@ -40,6 +48,11 @@ describe('/vector-search endpoint', () => { method: 'post', handler: expect.any(Function), }), + expect.objectContaining({ + path: '/vector-bulk-embed', + method: 'post', + handler: expect.any(Function), + }), ]), ) }) @@ -56,6 +69,11 @@ describe('/vector-search endpoint', () => { method: 'post', handler: expect.any(Function), }), + expect.objectContaining({ + 
path: '/vector-bulk-embed', + method: 'post', + handler: expect.any(Function), + }), ]), ) }) diff --git a/src/admin/components/EmbedAllButton.tsx b/src/admin/components/EmbedAllButton.tsx new file mode 100644 index 0000000..666f2de --- /dev/null +++ b/src/admin/components/EmbedAllButton.tsx @@ -0,0 +1,54 @@ +'use client' + +import React, { useState } from 'react' + +type EmbedAllButtonProps = { + collectionSlug: string + hasCreatePermission?: boolean + newDocumentURL?: string +} + +export const EmbedAllButton: React.FC = ({ collectionSlug }) => { + const [isSubmitting, setIsSubmitting] = useState(false) + const [message, setMessage] = useState(null) + + const handleClick = async () => { + setIsSubmitting(true) + setMessage(null) + try { + const res = await fetch('/api/vector-bulk-embed', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ knowledgePool: collectionSlug }), + }) + const data = await res.json() + if (!res.ok) { + setMessage(data?.error || 'Failed to queue bulk embed run') + return + } + setMessage(`Queued bulk embed run ${data.runId}`) + } catch (error: any) { + setMessage(error?.message || 'Failed to queue bulk embed run') + } finally { + setIsSubmitting(false) + } + } + + return ( +
+    <div>
+      <button type="button" onClick={handleClick} disabled={isSubmitting}>
+        Embed all
+      </button>
+      {message ? <span>{message}</span> : null}
+    </div>
+ ) +} + +export default EmbedAllButton diff --git a/src/collections/bulkEmbeddingsRuns.ts b/src/collections/bulkEmbeddingsRuns.ts new file mode 100644 index 0000000..ea8dbe3 --- /dev/null +++ b/src/collections/bulkEmbeddingsRuns.ts @@ -0,0 +1,111 @@ +import type { CollectionConfig } from 'payload' +import type { BulkEmbeddingRunStatus } from '../types.js' + +export const BULK_EMBEDDINGS_RUNS_SLUG = 'vector-bulk-embeddings-runs' + +const statusOptions: BulkEmbeddingRunStatus[] = [ + 'queued', + 'running', + 'succeeded', + 'failed', + 'canceled', +] + +export const createBulkEmbeddingsRunsCollection = (): CollectionConfig => ({ + slug: BULK_EMBEDDINGS_RUNS_SLUG, + admin: { + useAsTitle: 'pool', + description: + 'Bulk embedding run records. Created automatically when the Embed all action is triggered.', + defaultColumns: ['pool', 'status', 'inputs', 'succeeded', 'failed', 'submittedAt'], + }, + access: { + read: () => true, + create: () => true, + update: () => true, + delete: () => false, + }, + fields: [ + { + name: 'pool', + type: 'text', + required: true, + admin: { + description: 'Knowledge pool slug', + }, + }, + { + name: 'embeddingVersion', + type: 'text', + required: true, + admin: { + description: 'Embedding version at submission time', + }, + }, + { + name: 'inputFileRef', + type: 'text', + admin: { + description: 'Provider file or input reference used for the batch', + }, + }, + { + name: 'providerBatchId', + type: 'text', + admin: { + description: 'Provider batch identifier', + }, + }, + { + name: 'status', + type: 'select', + options: statusOptions.map((value) => ({ value, label: value })), + required: true, + defaultValue: 'queued', + }, + { + name: 'inputs', + type: 'number', + defaultValue: 0, + }, + { + name: 'succeeded', + type: 'number', + defaultValue: 0, + }, + { + name: 'failed', + type: 'number', + defaultValue: 0, + }, + { + name: 'submittedAt', + type: 'date', + admin: { description: 'Timestamp when the batch was submitted' }, + }, + { + name: 'completedAt', + type: 'date', + admin: { description: 'Timestamp when the batch finished' }, + }, + { + name: 'error', + type: 'textarea', + admin: { + description: 'Failure reason if the run ended in error', + }, + }, + ], + timestamps: true, + indexes: [ + { + fields: ['pool'], + }, + { + fields: ['providerBatchId'], + }, + { + fields: ['status'], + }, + ], +}) diff --git a/src/collections/embeddings.ts b/src/collections/embeddings.ts index 2b02bd7..5637b50 100644 --- a/src/collections/embeddings.ts +++ b/src/collections/embeddings.ts @@ -25,6 +25,14 @@ export const createEmbeddingsCollection = ( admin: { description: 'Vector embeddings for search and similarity queries. Created by the payloadcms-vectorize plugin. Embeddings cannot be added or modified, only deleted, through the admin panel. 
No other restrictions enforced.', + components: { + beforeList: [ + { + path: 'payloadcms-vectorize/client#EmbedAllButton', + exportName: 'EmbedAllButton', + }, + ], + }, }, access: { create: () => false, // Cannot add new embeddings through admin panel diff --git a/src/endpoints/bulkEmbed.ts b/src/endpoints/bulkEmbed.ts new file mode 100644 index 0000000..edd5b31 --- /dev/null +++ b/src/endpoints/bulkEmbed.ts @@ -0,0 +1,61 @@ +import type { PayloadHandler } from 'payload' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' +import type { KnowledgePoolDynamicConfig, KnowledgePoolName } from '../types.js' + +export const createBulkEmbedHandler = ( + knowledgePools: Record, + queueName?: string, +): PayloadHandler => { + const handler: PayloadHandler = async (req) => { + if (!req || !req.json) { + return Response.json({ error: 'Request is required' }, { status: 400 }) + } + try { + const body = await req.json() + const knowledgePool = body?.knowledgePool as KnowledgePoolName + if (!knowledgePool) { + return Response.json( + { error: 'knowledgePool is required and must be a string' }, + { status: 400 }, + ) + } + const poolConfig = knowledgePools[knowledgePool] + if (!poolConfig) { + return Response.json( + { error: `Knowledge pool "${knowledgePool}" not found` }, + { status: 400 }, + ) + } + + const payload = req.payload + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: knowledgePool, + embeddingVersion: poolConfig.embeddingVersion, + status: 'queued', + }, + }) + + await payload.jobs.queue<'payloadcms-vectorize:bulk-embed-all'>({ + task: 'payloadcms-vectorize:bulk-embed-all', + input: { + runId: String(run.id), + }, + req, + ...(queueName ? { queue: queueName } : {}), + }) + + return Response.json( + { + runId: String(run.id), + status: 'queued', + }, + { status: 202 }, + ) + } catch (error) { + return Response.json({ error: 'Failed to queue bulk embed run' }, { status: 500 }) + } + } + return handler +} diff --git a/src/exports/client.ts b/src/exports/client.ts new file mode 100644 index 0000000..eaa8a1d --- /dev/null +++ b/src/exports/client.ts @@ -0,0 +1 @@ +export { EmbedAllButton } from '../admin/components/EmbedAllButton.js' diff --git a/src/exports/rsc.ts b/src/exports/rsc.ts new file mode 100644 index 0000000..e9a98cc --- /dev/null +++ b/src/exports/rsc.ts @@ -0,0 +1,2 @@ +// Placeholder RSC export; no server-specific components yet. 
+export {} diff --git a/src/index.ts b/src/index.ts index 68a3b3f..15ec71d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,6 +14,9 @@ import type { PostgresAdapterArgs } from '@payloadcms/db-postgres' import { createVectorizeTask } from './tasks/vectorize.js' import { createVectorSearchHandler } from './endpoints/vectorSearch.js' import { clearEmbeddingsTables, registerEmbeddingsTable } from './drizzle/tables.js' +import { createBulkEmbeddingsRunsCollection, BULK_EMBEDDINGS_RUNS_SLUG } from './collections/bulkEmbeddingsRuns.js' +import { createBulkEmbedAllTask } from './tasks/bulkEmbedAll.js' +import { createBulkEmbedHandler } from './endpoints/bulkEmbed.js' export type * from './types.js' @@ -119,6 +122,12 @@ export const createVectorizeIntegration = // Ensure collections array exists config.collections = [...(config.collections || [])] + // Ensure bulk runs collection exists once + const bulkRunsCollection = createBulkEmbeddingsRunsCollection() + if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_RUNS_SLUG)) { + config.collections.push(bulkRunsCollection) + } + // Validate static/dynamic configs share the same pool names for (const poolName in pluginOptions.knowledgePools) { if (!staticConfigs[poolName]) { @@ -182,6 +191,10 @@ export const createVectorizeIntegration = knowledgePools: pluginOptions.knowledgePools, }) tasks.push(vectorizeTask) + const bulkEmbedTask = createBulkEmbedAllTask({ + knowledgePools: pluginOptions.knowledgePools, + }) + tasks.push(bulkEmbedTask) config.jobs = { ...incomingJobs, @@ -208,6 +221,20 @@ export const createVectorizeIntegration = const collectionConfig = dynamic.collections[collectionSlug] if (!collectionConfig) continue + if ((dynamic.ingestMode || 'realtime') === 'bulk') { + // In bulk mode, clear stale embeddings and let the bulk job recreate them + await payload.delete({ + collection: pool, + where: { + and: [ + { sourceCollection: { equals: collectionSlug } }, + { docId: { equals: String(doc.id) } }, + ], + }, + }) + continue + } + await payload.jobs.queue<'payloadcms-vectorize:vectorize'>({ task: 'payloadcms-vectorize:vectorize', input: { @@ -270,14 +297,20 @@ export const createVectorizeIntegration = if (pluginOptions.endpointOverrides?.enabled !== false) { const path = pluginOptions.endpointOverrides?.path || '/vector-search' const inputEndpoints = config.endpoints || [] - config.endpoints = [ + const endpoints = [ ...inputEndpoints, { path, method: 'post', handler: createVectorSearchHandler(pluginOptions.knowledgePools), }, + { + path: '/vector-bulk-embed', + method: 'post', + handler: createBulkEmbedHandler(pluginOptions.knowledgePools, pluginOptions.queueName), + }, ] + config.endpoints = endpoints } return config diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts new file mode 100644 index 0000000..4a35cb2 --- /dev/null +++ b/src/tasks/bulkEmbedAll.ts @@ -0,0 +1,357 @@ +import { Payload, TaskConfig, TaskHandlerResult } from 'payload' +import { + BulkEmbeddingInput, + BulkEmbeddingsCallbacks, + KnowledgePoolDynamicConfig, + KnowledgePoolName, +} from '../types.js' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' +import { isPostgresPayload, PostgresPayload } from '../types.js' + +type BulkEmbedAllTaskInput = { + runId: string +} + +type BulkEmbedAllTaskOutput = { + runId: string + status: string +} + +const TERMINAL_STATUSES = new Set(['succeeded', 'failed', 'canceled']) +const fallbackInputsCache = new Map() + +export function createFallbackBulkCallbacks( + dynamicConfig: 
KnowledgePoolDynamicConfig, +): BulkEmbeddingsCallbacks { + return { + prepareBulkEmbeddings: async ({ inputs }) => { + const providerBatchId = `local-${Date.now()}-${Math.random().toString(16).slice(2)}` + fallbackInputsCache.set(providerBatchId, inputs) + return { + providerBatchId, + status: 'queued', + counts: { inputs: inputs.length }, + } + }, + pollBulkEmbeddings: async ({ providerBatchId }) => { + if (!fallbackInputsCache.has(providerBatchId)) { + return { status: 'failed', error: 'Unknown local batch' } + } + return { status: 'succeeded', counts: { inputs: fallbackInputsCache.get(providerBatchId)?.length } } + }, + completeBulkEmbeddings: async ({ providerBatchId }) => { + const inputs = fallbackInputsCache.get(providerBatchId) || [] + const embeddings = await dynamicConfig.embedDocs(inputs.map((i) => i.text)) + const outputs = embeddings.map((vector, idx) => { + const input = inputs[idx] + return { + id: input?.id ?? String(idx), + embedding: Array.isArray(vector) ? vector : Array.from(vector), + } + }) + fallbackInputsCache.delete(providerBatchId) + return { + status: 'succeeded', + outputs, + counts: { inputs: inputs.length, succeeded: outputs.length, failed: inputs.length - outputs.length }, + } + }, + } +} + +export const createBulkEmbedAllTask = ({ + knowledgePools, +}: { + knowledgePools: Record +}): TaskConfig => { + const task: TaskConfig = { + slug: 'payloadcms-vectorize:bulk-embed-all', + handler: async ({ input, req }): Promise> => { + if (!input?.runId) { + throw new Error('[payloadcms-vectorize] bulk embed runId is required') + } + const payload = req.payload + const run = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: input.runId, + }) + const poolName = (run as any)?.pool as KnowledgePoolName + if (!poolName) { + throw new Error(`[payloadcms-vectorize] bulk embed run ${input.runId} missing pool`) + } + const dynamicConfig = knowledgePools[poolName] + if (!dynamicConfig) { + throw new Error( + `[payloadcms-vectorize] knowledgePool "${poolName}" not found for bulk embed run ${input.runId}`, + ) + } + + const callbacks = dynamicConfig.bulkEmbeddings || createFallbackBulkCallbacks(dynamicConfig) + const embeddingVersion = dynamicConfig.embeddingVersion + + const inputs = await collectMissingEmbeddings({ + payload, + poolName, + dynamicConfig, + embeddingVersion, + }) + + const inputsCount = inputs.length + if (inputsCount === 0) { + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: 'succeeded', + inputs: 0, + succeeded: 0, + failed: 0, + completedAt: new Date().toISOString(), + }, + }) + return { output: { runId: input.runId, status: 'succeeded' } } + } + + const prepare = (await callbacks.prepareBulkEmbeddings({ + payload, + knowledgePool: poolName, + embeddingVersion, + inputs, + })) || { providerBatchId: `local-${Date.now()}` } + + const providerBatchId = prepare.providerBatchId + let status = prepare.status ?? 'running' + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + providerBatchId, + inputFileRef: prepare.inputFileRef, + status, + inputs: prepare.counts?.inputs ?? 
inputsCount, + submittedAt: new Date().toISOString(), + }, + }) + + // Poll until terminal + let pollResult: any = null + const maxPolls = 10 + let polls = 0 + while (!TERMINAL_STATUSES.has(status) && polls < maxPolls) { + pollResult = await callbacks.pollBulkEmbeddings({ + payload, + knowledgePool: poolName, + providerBatchId, + }) + status = pollResult.status + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status, + inputs: pollResult.counts?.inputs ?? inputsCount, + succeeded: pollResult.counts?.succeeded, + failed: pollResult.counts?.failed, + error: pollResult.error, + }, + }) + if (TERMINAL_STATUSES.has(status)) break + polls += 1 + const delay = pollResult.nextPollMs ?? 1000 + await new Promise((resolve) => setTimeout(resolve, delay)) + } + + if (status !== 'succeeded') { + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status, + error: pollResult?.error, + completedAt: new Date().toISOString(), + }, + }) + return { output: { runId: input.runId, status } } + } + + const completion = + (await callbacks.completeBulkEmbeddings({ + payload, + knowledgePool: poolName, + providerBatchId, + })) || { status, outputs: [] } + + const outputs = completion.outputs || [] + const inputsById = new Map(inputs.map((input) => [input.id, input])) + const successfulOutputs = outputs.filter((o) => !o.error && o.embedding) + const failedCount = completion.counts?.failed ?? inputsCount - successfulOutputs.length + + // Remove existing embeddings for successful doc ids before writing new vectors + const docKeys = new Set() + for (const output of successfulOutputs) { + const inputMeta = inputsById.get(output.id)?.metadata + if (!inputMeta) continue + docKeys.add(`${inputMeta.sourceCollection}:${inputMeta.docId}`) + } + for (const key of docKeys) { + const [sourceCollection, docId] = key.split(':') + await payload.delete({ + collection: poolName, + where: { + and: [ + { sourceCollection: { equals: sourceCollection } }, + { docId: { equals: String(docId) } }, + ], + }, + }) + } + + for (const output of successfulOutputs) { + const input = inputsById.get(output.id) + if (!input || !output.embedding) continue + + const embeddingArray = Array.isArray(output.embedding) + ? output.embedding + : Array.from(output.embedding) + + const { chunkIndex, sourceCollection, docId, embeddingVersion: version, ...rest } = + input.metadata + const chunkText = input.text + + const created = await payload.create({ + collection: poolName, + data: { + sourceCollection, + docId: String(docId), + chunkIndex, + chunkText, + embeddingVersion: version, + ...rest, + embedding: embeddingArray, + } as any, + }) + await persistVectorColumn({ + payload, + poolName, + vector: embeddingArray, + id: String((created as any)?.id ?? ''), + }) + } + + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: completion.status ?? 'succeeded', + inputs: completion.counts?.inputs ?? inputsCount, + succeeded: completion.counts?.succeeded ?? successfulOutputs.length, + failed: failedCount, + error: completion.error, + completedAt: new Date().toISOString(), + }, + }) + + return { + output: { + runId: input.runId, + status: completion.status ?? 
'succeeded', + }, + } + }, + } + + return task +} + +async function persistVectorColumn(args: { + payload: Payload + poolName: KnowledgePoolName + vector: number[] | Float32Array + id: string +}) { + const { payload, poolName, vector, id } = args + if (!isPostgresPayload(payload)) { + throw new Error('[payloadcms-vectorize] Bulk embeddings require the Postgres adapter') + } + const postgresPayload = payload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + const literal = `[${Array.from(vector).join(',')}]` + const sql = `UPDATE "${schemaName}"."${poolName}" SET embedding = $1 WHERE id = $2` + const runSQL = async (statement: string, params?: any[]) => { + if (postgresPayload.db.pool?.query) return postgresPayload.db.pool.query(statement, params) + if (postgresPayload.db.drizzle?.execute) return postgresPayload.db.drizzle.execute(statement) + throw new Error('[payloadcms-vectorize] Failed to persist vector column') + } + try { + await runSQL(sql, [literal, id]) + } catch (e) { + payload.logger.error('[payloadcms-vectorize] Failed to persist vector column', e as Error) + throw e + } +} + +async function collectMissingEmbeddings(args: { + payload: Payload + poolName: KnowledgePoolName + dynamicConfig: KnowledgePoolDynamicConfig + embeddingVersion: string +}): Promise { + const { payload, poolName, dynamicConfig, embeddingVersion } = args + const inputs: BulkEmbeddingInput[] = [] + + for (const collectionSlug of Object.keys(dynamicConfig.collections)) { + const collectionConfig = dynamicConfig.collections[collectionSlug] + if (!collectionConfig) continue + const toKnowledgePool = collectionConfig.toKnowledgePool + let page = 1 + const limit = 50 + + // Paginate through source collection docs + while (true) { + const res = await payload.find({ + collection: collectionSlug, + page, + limit, + }) + const docs = (res as any)?.docs || [] + if (!docs.length) break + const totalPages = (res as any)?.totalPages ?? 
page + + for (const doc of docs) { + const existing = await payload.find({ + collection: poolName, + where: { + and: [ + { sourceCollection: { equals: collectionSlug } }, + { docId: { equals: String(doc.id) } }, + { embeddingVersion: { equals: embeddingVersion } }, + ], + }, + limit: 1, + }) + if (existing.totalDocs > 0) continue + + const chunkData = await toKnowledgePool(doc, payload) + chunkData.forEach((chunkEntry, idx) => { + if (!chunkEntry?.chunk) return + const { chunk, ...extensionFields } = chunkEntry + inputs.push({ + id: `${collectionSlug}:${doc.id}:${idx}`, + text: chunk, + metadata: { + sourceCollection: collectionSlug, + docId: String(doc.id), + chunkIndex: idx, + embeddingVersion, + ...extensionFields, + }, + }) + }) + } + page += 1 + if (page > totalPages) break + } + } + + return inputs +} diff --git a/src/types.ts b/src/types.ts index 5e2fff6..79a706d 100644 --- a/src/types.ts +++ b/src/types.ts @@ -13,6 +13,8 @@ export type CollectionVectorizeOption = { toKnowledgePool: ToKnowledgePoolFn } +export type IngestMode = 'realtime' | 'bulk' + /** Knowledge pool name identifier */ export type KnowledgePoolName = string @@ -38,6 +40,89 @@ export type KnowledgePoolDynamicConfig = { embeddingVersion: string /** Optional fields to extend the knowledge pool collection schema */ extensionFields?: Field[] + /** Controls whether docs embed immediately or are staged for bulk runs */ + ingestMode?: IngestMode + /** Provider-specific bulk embedding callbacks */ + bulkEmbeddings?: BulkEmbeddingsCallbacks +} + +export type BulkEmbeddingRunStatus = 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled' + +export type BulkEmbeddingInput = { + /** Stable identifier for correlating outputs (should be unique per chunk) */ + id: string + /** Raw text to embed */ + text: string + metadata: { + sourceCollection: string + docId: string + chunkIndex: number + embeddingVersion: string + [key: string]: any + } +} + +export type BulkEmbeddingOutput = { + id: string + embedding?: number[] | Float32Array + error?: string | null +} + +export type BulkEmbeddingCounts = { + inputs?: number + succeeded?: number + failed?: number +} + +export type PrepareBulkEmbeddingsArgs = { + payload: Payload + knowledgePool: KnowledgePoolName + embeddingVersion: string + inputs: BulkEmbeddingInput[] +} + +export type PrepareBulkEmbeddingsResult = { + providerBatchId: string + inputFileRef?: string + status?: BulkEmbeddingRunStatus + counts?: BulkEmbeddingCounts +} + +export type PollBulkEmbeddingsArgs = { + payload: Payload + knowledgePool: KnowledgePoolName + providerBatchId: string +} + +export type PollBulkEmbeddingsResult = { + status: BulkEmbeddingRunStatus + counts?: BulkEmbeddingCounts + error?: string + /** Optional delay hint in ms before the next poll */ + nextPollMs?: number +} + +export type CompleteBulkEmbeddingsArgs = { + payload: Payload + knowledgePool: KnowledgePoolName + providerBatchId: string +} + +export type CompleteBulkEmbeddingsResult = { + status: BulkEmbeddingRunStatus + outputs: BulkEmbeddingOutput[] + counts?: BulkEmbeddingCounts + error?: string +} + +export type BulkEmbeddingsCallbacks = { + prepareBulkEmbeddings: ( + args: PrepareBulkEmbeddingsArgs, + ) => Promise + pollBulkEmbeddings: (args: PollBulkEmbeddingsArgs) => Promise + completeBulkEmbeddings: ( + args: CompleteBulkEmbeddingsArgs, + ) => Promise } export type PayloadcmsVectorizeConfig = { From f71a67402fb3b4ddb05ffc204f7e5281b8e14144 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Tue, 
16 Dec 2025 14:50:50 +0700 Subject: [PATCH 02/49] Better API --- README.md | 12 ++++++++---- dev/helpers/embed.ts | 4 ++-- dev/payload.config.ts | 6 ++++-- dev/specs/bulkEmbed.spec.ts | 33 ++++++++++++++------------------- src/index.ts | 7 +++++-- src/tasks/bulkEmbedAll.ts | 35 +++++++++++++++++++++++------------ src/types.ts | 10 +++++----- 7 files changed, 61 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index ec36941..c891446 100644 --- a/README.md +++ b/README.md @@ -190,8 +190,12 @@ The embeddings collection name will be the same as the knowledge pool name. - `embedQuery`: `EmbedQueryFn` - Function to embed search queries - `embeddingVersion`: `string` - Version string for tracking model changes - `extensionFields?`: `Field[]` - Optional fields to extend the embeddings collection schema -- `ingestMode?`: `'realtime' | 'bulk'` - Default `realtime` queues embeddings immediately. `bulk` skips realtime embedding, deletes stale vectors on updates, and relies on the bulk job to backfill. -- `bulkEmbeddings?`: Provider-specific callbacks for batch embedding (`prepareBulkEmbeddings`, `pollBulkEmbeddings`, `completeBulkEmbeddings`). If omitted, the plugin falls back to using `embedDocs` in-process. +- `bulkEmbeddings?`: Configuration for bulk embedding operations: + - `ingestMode?`: `'realtime' | 'bulk'` - Default `realtime` queues embeddings immediately. `bulk` skips realtime embedding, deletes stale vectors on updates, and relies on the bulk job to backfill. + - `prepareBulkEmbeddings`: Callback to prepare a bulk embedding batch + - `pollBulkEmbeddings`: Callback to poll the status of a bulk embedding batch + - `completeBulkEmbeddings`: Callback to retrieve completed embeddings from a batch + If `bulkEmbeddings` is omitted, the plugin falls back to using `embedDocs` in-process. #### CollectionVectorizeOption @@ -218,7 +222,7 @@ Because you control the output, you can mix different field types, discard empty ## PostgreSQL Custom Schema Support -The plugin reads the `schemaName` configuration from your Postgres adapter within the Payload config. +The plugin reads the `schemaName` configuration from your Postgres adapter within the Payload config. When you configure a custom schema via `postgresAdapter({ schemaName: 'custom' })`, all plugin SQL queries (for vector columns, indexes, and embeddings) are qualified with that schema name. This is useful for multi-tenant setups or when content tables live in a dedicated schema. @@ -311,7 +315,7 @@ Search for similar content using vector similarity. 
```jsonc { - "knowledgePool": "main" + "knowledgePool": "main", } ``` diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index d70b87e..540e56c 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -1,6 +1,6 @@ import { voyage } from 'voyage-ai-provider' import { embed, embedMany } from 'ai' -import type { BulkEmbeddingsCallbacks } from 'payloadcms-vectorize' +import type { BulkEmbeddingsConfig } from 'payloadcms-vectorize' export const voyageEmbedDocs = async (texts: string[]): Promise => { const embedResult = await embedMany({ @@ -56,7 +56,7 @@ export function makeDummyEmbedDocs(dims: number) { } export const testEmbeddingVersion = 'test-v1' -export function makeLocalBulkEmbeddingsCallbacks(dims: number): BulkEmbeddingsCallbacks { +export function makeLocalBulkEmbeddingsCallbacks(dims: number): BulkEmbeddingsConfig { const pendingInputs = new Map>() const embedDocs = makeDummyEmbedDocs(dims) return { diff --git a/dev/payload.config.ts b/dev/payload.config.ts index 2b376ce..0350447 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -123,8 +123,10 @@ const buildConfigWithPostgres = async () => { embedDocs, embedQuery, embeddingVersion: testEmbeddingVersion, - ingestMode: 'realtime', - bulkEmbeddings: makeLocalBulkEmbeddingsCallbacks(dims), + bulkEmbeddings: { + ...makeLocalBulkEmbeddingsCallbacks(dims), + ingestMode: 'realtime', + }, }, }, }), diff --git a/dev/specs/bulkEmbed.spec.ts b/dev/specs/bulkEmbed.spec.ts index 386d319..7f46bde 100644 --- a/dev/specs/bulkEmbed.spec.ts +++ b/dev/specs/bulkEmbed.spec.ts @@ -8,7 +8,12 @@ import { createVectorizeIntegration } from 'payloadcms-vectorize' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' import { createBulkEmbedAllTask } from '../../src/tasks/bulkEmbedAll.js' import { createTestDb } from './utils.js' -import { makeDummyEmbedDocs, makeDummyEmbedQuery, makeLocalBulkEmbeddingsCallbacks, testEmbeddingVersion } from 'helpers/embed.js' +import { + makeDummyEmbedDocs, + makeDummyEmbedQuery, + makeLocalBulkEmbeddingsCallbacks, + testEmbeddingVersion, +} from 'helpers/embed.js' const DIMS = 8 @@ -35,8 +40,10 @@ describe('Bulk embed ingest mode', () => { embedDocs: makeDummyEmbedDocs(DIMS), embedQuery: makeDummyEmbedQuery(DIMS), embeddingVersion: testEmbeddingVersion, - ingestMode: 'bulk' as const, - bulkEmbeddings: makeLocalBulkEmbeddingsCallbacks(DIMS), + bulkEmbeddings: { + ...makeLocalBulkEmbeddingsCallbacks(DIMS), + ingestMode: 'bulk' as const, + }, }, }, } @@ -75,10 +82,7 @@ describe('Bulk embed ingest mode', () => { const initialEmbeds = await payload.find({ collection: 'default', where: { - and: [ - { sourceCollection: { equals: 'posts' } }, - { docId: { equals: String(post.id) } }, - ], + and: [{ sourceCollection: { equals: 'posts' } }, { docId: { equals: String(post.id) } }], }, }) expect(initialEmbeds.totalDocs).toBe(0) @@ -104,10 +108,7 @@ describe('Bulk embed ingest mode', () => { const embeds = await payload.find({ collection: 'default', where: { - and: [ - { sourceCollection: { equals: 'posts' } }, - { docId: { equals: String(post.id) } }, - ], + and: [{ sourceCollection: { equals: 'posts' } }, { docId: { equals: String(post.id) } }], }, }) expect(embeds.totalDocs).toBeGreaterThan(0) @@ -154,10 +155,7 @@ describe('Bulk embed ingest mode', () => { const afterUpdateEmbeds = await payload.find({ collection: 'default', where: { - and: [ - { sourceCollection: { equals: 'posts' } }, - { docId: { equals: String(post.id) } }, - ], + and: [{ sourceCollection: { equals: 
'posts' } }, { docId: { equals: String(post.id) } }], }, }) expect(afterUpdateEmbeds.totalDocs).toBe(0) @@ -179,10 +177,7 @@ describe('Bulk embed ingest mode', () => { const embedsAfterRerun = await payload.find({ collection: 'default', where: { - and: [ - { sourceCollection: { equals: 'posts' } }, - { docId: { equals: String(post.id) } }, - ], + and: [{ sourceCollection: { equals: 'posts' } }, { docId: { equals: String(post.id) } }], }, }) expect(embedsAfterRerun.totalDocs).toBeGreaterThan(0) diff --git a/src/index.ts b/src/index.ts index 15ec71d..bcf6cc1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,7 +14,10 @@ import type { PostgresAdapterArgs } from '@payloadcms/db-postgres' import { createVectorizeTask } from './tasks/vectorize.js' import { createVectorSearchHandler } from './endpoints/vectorSearch.js' import { clearEmbeddingsTables, registerEmbeddingsTable } from './drizzle/tables.js' -import { createBulkEmbeddingsRunsCollection, BULK_EMBEDDINGS_RUNS_SLUG } from './collections/bulkEmbeddingsRuns.js' +import { + createBulkEmbeddingsRunsCollection, + BULK_EMBEDDINGS_RUNS_SLUG, +} from './collections/bulkEmbeddingsRuns.js' import { createBulkEmbedAllTask } from './tasks/bulkEmbedAll.js' import { createBulkEmbedHandler } from './endpoints/bulkEmbed.js' @@ -221,7 +224,7 @@ export const createVectorizeIntegration = const collectionConfig = dynamic.collections[collectionSlug] if (!collectionConfig) continue - if ((dynamic.ingestMode || 'realtime') === 'bulk') { + if ((dynamic.bulkEmbeddings?.ingestMode || 'realtime') === 'bulk') { // In bulk mode, clear stale embeddings and let the bulk job recreate them await payload.delete({ collection: pool, diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index 4a35cb2..0bc4cca 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -1,7 +1,7 @@ import { Payload, TaskConfig, TaskHandlerResult } from 'payload' import { BulkEmbeddingInput, - BulkEmbeddingsCallbacks, + BulkEmbeddingsConfig, KnowledgePoolDynamicConfig, KnowledgePoolName, } from '../types.js' @@ -22,7 +22,7 @@ const fallbackInputsCache = new Map() export function createFallbackBulkCallbacks( dynamicConfig: KnowledgePoolDynamicConfig, -): BulkEmbeddingsCallbacks { +): BulkEmbeddingsConfig { return { prepareBulkEmbeddings: async ({ inputs }) => { const providerBatchId = `local-${Date.now()}-${Math.random().toString(16).slice(2)}` @@ -37,7 +37,10 @@ export function createFallbackBulkCallbacks( if (!fallbackInputsCache.has(providerBatchId)) { return { status: 'failed', error: 'Unknown local batch' } } - return { status: 'succeeded', counts: { inputs: fallbackInputsCache.get(providerBatchId)?.length } } + return { + status: 'succeeded', + counts: { inputs: fallbackInputsCache.get(providerBatchId)?.length }, + } }, completeBulkEmbeddings: async ({ providerBatchId }) => { const inputs = fallbackInputsCache.get(providerBatchId) || [] @@ -53,7 +56,11 @@ export function createFallbackBulkCallbacks( return { status: 'succeeded', outputs, - counts: { inputs: inputs.length, succeeded: outputs.length, failed: inputs.length - outputs.length }, + counts: { + inputs: inputs.length, + succeeded: outputs.length, + failed: inputs.length - outputs.length, + }, } }, } @@ -174,12 +181,11 @@ export const createBulkEmbedAllTask = ({ return { output: { runId: input.runId, status } } } - const completion = - (await callbacks.completeBulkEmbeddings({ - payload, - knowledgePool: poolName, - providerBatchId, - })) || { status, outputs: [] } + const completion = (await 
callbacks.completeBulkEmbeddings({ + payload, + knowledgePool: poolName, + providerBatchId, + })) || { status, outputs: [] } const outputs = completion.outputs || [] const inputsById = new Map(inputs.map((input) => [input.id, input])) @@ -214,8 +220,13 @@ export const createBulkEmbedAllTask = ({ ? output.embedding : Array.from(output.embedding) - const { chunkIndex, sourceCollection, docId, embeddingVersion: version, ...rest } = - input.metadata + const { + chunkIndex, + sourceCollection, + docId, + embeddingVersion: version, + ...rest + } = input.metadata const chunkText = input.text const created = await payload.create({ diff --git a/src/types.ts b/src/types.ts index 79a706d..d2147ac 100644 --- a/src/types.ts +++ b/src/types.ts @@ -40,10 +40,8 @@ export type KnowledgePoolDynamicConfig = { embeddingVersion: string /** Optional fields to extend the knowledge pool collection schema */ extensionFields?: Field[] - /** Controls whether docs embed immediately or are staged for bulk runs */ - ingestMode?: IngestMode - /** Provider-specific bulk embedding callbacks */ - bulkEmbeddings?: BulkEmbeddingsCallbacks + /** User provided bulk embedding configuration */ + bulkEmbeddings?: BulkEmbeddingsConfig } export type BulkEmbeddingRunStatus = 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled' @@ -115,7 +113,9 @@ export type CompleteBulkEmbeddingsResult = { error?: string } -export type BulkEmbeddingsCallbacks = { +export type BulkEmbeddingsConfig = { + /** Controls whether docs embed immediately or are staged for bulk runs */ + ingestMode?: IngestMode prepareBulkEmbeddings: ( args: PrepareBulkEmbeddingsArgs, ) => Promise From 3dc508df209fd9562bf7448954df7e6d968968ca Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:03:16 +0700 Subject: [PATCH 03/49] WIP --- CHANGELOG.md | 22 ++ README.md | 53 +++- dev/helpers/embed.ts | 241 ++++++++++++++-- dev/payload.config.ts | 19 +- dev/specs/bulkEmbed.spec.ts | 363 ++++++++++++++++++++++-- package.json | 12 +- src/admin/components/EmbedAllButton.tsx | 29 +- src/collections/embeddings.ts | 16 ++ src/endpoints/bulkEmbed.ts | 4 +- src/exports/rsc.ts | 2 - src/index.ts | 43 ++- src/tasks/bulkEmbedAll.ts | 255 ++++++++++------- src/types.ts | 11 +- 13 files changed, 881 insertions(+), 189 deletions(-) delete mode 100644 src/exports/rsc.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e51c31..081b342 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,28 @@ All notable changes to this project will be documented in this file. +## 0.5.0 - 2025-12-17 + +### Breaking Changes + +- **`queueName` renamed to `realtimeQueueName`**: The plugin option `queueName` has been renamed to `realtimeQueueName` to clarify that it only affects realtime vectorization jobs. + +### New Features + +- **`bulkQueueName` option**: New plugin option to isolate bulk embedding workloads to a dedicated queue. Required when any knowledge pool uses bulk ingest mode (`bulkEmbeddings.ingestMode === 'bulk'`). +- **Non-blocking bulk polling**: Bulk jobs now use separate, short-lived tasks that can safely handle long-running providers (hours/days) without blocking worker processes. 
+- **Improved admin UX**: The "Embed all" button now: + - Disables when bulk embeddings are not configured for a pool + - Links to the latest bulk run for easy status tracking +- **Enhanced bulk provider support**: Added real Voyage AI Batch API integration in dev environment, demonstrating production-ready bulk embedding with file uploads and async polling. + +### Tests & Reliability + +- Added comprehensive tests for realtime vs bulk ingest behavior +- Added tests for bulk polling error conditions (`failed`, `canceled` statuses) +- Added tests for bulk fan-in behavior (multiple documents processed in single run) +- Improved test coverage for edge cases in bulk embedding workflow + ## 0.4.1 - 2025-12-02 ### Added diff --git a/README.md b/README.md index c891446..41c446d 100644 --- a/README.md +++ b/README.md @@ -165,12 +165,13 @@ const { results } = await response.json() ### Plugin Options -| Option | Type | Required | Description | -| ------------------- | --------------------------------------------------- | -------- | ---------------------------------------- | -| `knowledgePools` | `Record` | ✅ | Knowledge pools and their configurations | -| `queueName` | `string` | ❌ | Custom queue name for background jobs | -| `endpointOverrides` | `object` | ❌ | Customize the search endpoint | -| `disabled` | `boolean` | ❌ | Disable plugin while keeping schema | +| Option | Type | Required | Description | +| ------------------- | --------------------------------------------------- | -------- | -------------------------------------------------------------------------- | +| `knowledgePools` | `Record` | ✅ | Knowledge pools and their configurations | +| `realtimeQueueName` | `string` | ❌ | Custom queue name for realtime vectorization jobs | +| `bulkQueueName` | `string` | ❌ | Queue name for bulk embedding jobs (required if any pool uses bulk ingest) | +| `endpointOverrides` | `object` | ❌ | Customize the search endpoint | +| `disabled` | `boolean` | ❌ | Disable plugin while keeping schema | ### Knowledge Pool Config @@ -191,11 +192,41 @@ The embeddings collection name will be the same as the knowledge pool name. - `embeddingVersion`: `string` - Version string for tracking model changes - `extensionFields?`: `Field[]` - Optional fields to extend the embeddings collection schema - `bulkEmbeddings?`: Configuration for bulk embedding operations: - - `ingestMode?`: `'realtime' | 'bulk'` - Default `realtime` queues embeddings immediately. `bulk` skips realtime embedding, deletes stale vectors on updates, and relies on the bulk job to backfill. - - `prepareBulkEmbeddings`: Callback to prepare a bulk embedding batch - - `pollBulkEmbeddings`: Callback to poll the status of a bulk embedding batch - - `completeBulkEmbeddings`: Callback to retrieve completed embeddings from a batch - If `bulkEmbeddings` is omitted, the plugin falls back to using `embedDocs` in-process. + - `ingestMode?`: `'realtime' | 'bulk'` - Default `'realtime'` queues embeddings immediately. `'bulk'` skips realtime embedding, deletes stale vectors on updates, and relies on the bulk job to backfill. + - `prepareBulkEmbeddings(args)`: Callback to prepare a bulk embedding batch + - `pollBulkEmbeddings(args)`: Callback to poll the status of a bulk embedding batch + - `completeBulkEmbeddings(args)`: Callback to retrieve completed embeddings from a batch + If `bulkEmbeddings` is omitted for a pool, the "Embed all" button is disabled and bulk is not available. 
+ +### Bulk Task Model + +When bulk ingest mode is enabled, the plugin uses separate Payload jobs for reliability with long-running providers: + +- **`prepare-bulk-embedding`**: One-shot task that collects missing embeddings and submits them to the provider. Short-lived. +- **`poll-or-complete-bulk-embedding`**: Polls the provider status and completes embedding ingestion when ready. Can requeue itself until completion. + +### Queue Configuration + +For production deployments with bulk embedding: + +```typescript +// Recommended production setup +plugins: [ + payloadcmsVectorize({ + knowledgePools: { /* ... */ }, + realtimeQueueName: 'vectorize-realtime', // Separate realtime jobs (Optional) + bulkQueueName: 'vectorize-bulk', // Isolate bulk workloads (Required if any knowledge pool uses bulk ingestion of any kind) + }), +] + +// Configure Payload queues +jobs: { + queues: { + 'vectorize-realtime': { concurrency: 5 }, + 'vectorize-bulk': { concurrency: 2 }, + }, +} +``` #### CollectionVectorizeOption diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index 540e56c..29fc53b 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -1,6 +1,11 @@ import { voyage } from 'voyage-ai-provider' import { embed, embedMany } from 'ai' -import type { BulkEmbeddingsConfig } from 'payloadcms-vectorize' +import type { + BulkEmbeddingInput, + BulkEmbeddingOutput, + BulkEmbeddingRunStatus, + BulkEmbeddingsConfig, +} from 'payloadcms-vectorize' export const voyageEmbedDocs = async (texts: string[]): Promise => { const embedResult = await embedMany({ @@ -56,36 +61,226 @@ export function makeDummyEmbedDocs(dims: number) { } export const testEmbeddingVersion = 'test-v1' -export function makeLocalBulkEmbeddingsCallbacks(dims: number): BulkEmbeddingsConfig { - const pendingInputs = new Map>() - const embedDocs = makeDummyEmbedDocs(dims) +// Real Voyage Batch API implementation +export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsConfig { + // Store batch state in memory for dev purposes + const batchState = new Map< + string, + { + inputs: BulkEmbeddingInput[] + batchId: string + outputFileId?: string + } + >() + return { + ingestMode: 'bulk', prepareBulkEmbeddings: async ({ inputs }) => { - const providerBatchId = `local-${dims}-${Date.now()}` - pendingInputs.set(providerBatchId, inputs) - return { - providerBatchId, - status: 'queued', - counts: { inputs: inputs.length }, + try { + // Create JSONL content for Voyage batch + const jsonlLines = inputs.map((input) => { + return JSON.stringify({ + custom_id: input.id, + body: { + input: [input.text], + model: 'voyage-3.5-lite', + input_type: 'document', + }, + }) + }) + const jsonlContent = jsonlLines.join('\n') + + // Upload file to Voyage Files API using FormData + const formData = new FormData() + const blob = new Blob([jsonlContent], { type: 'application/jsonl' }) + formData.append('file', blob, 'batch-input.jsonl') + formData.append('purpose', 'batch') + + const uploadResponse = await fetch('https://api.voyageai.com/v1/files', { + method: 'POST', + headers: { + Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, + }, + body: formData, + }) + + if (!uploadResponse.ok) { + const error = await uploadResponse.text() + throw new Error(`Voyage file upload failed: ${error}`) + } + + const fileData = await uploadResponse.json() + const fileId = fileData.id + + // Create batch + const batchResponse = await fetch('https://api.voyageai.com/v1/batches', { + method: 'POST', + headers: { + Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, + 
'Content-Type': 'application/json', + }, + body: JSON.stringify({ + input_file_id: fileId, + endpoint: '/v1/embeddings', + completion_window: '24h', + }), + }) + + if (!batchResponse.ok) { + const error = await batchResponse.text() + throw new Error(`Voyage batch creation failed: ${error}`) + } + + const batchData = await batchResponse.json() + const batchId = batchData.id + + // Store state for later retrieval + batchState.set(batchId, { + inputs, + batchId, + }) + + return { + providerBatchId: batchId, + status: batchData.status || 'queued', + counts: { inputs: inputs.length }, + } + } catch (error) { + console.error('Voyage prepareBulkEmbeddings error:', error) + throw error } }, + pollBulkEmbeddings: async ({ providerBatchId }) => { - if (!pendingInputs.has(providerBatchId)) { - return { status: 'failed', error: 'unknown batch' } + try { + const response = await fetch(`https://api.voyageai.com/v1/batches/${providerBatchId}`, { + headers: { + Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, + }, + }) + + if (!response.ok) { + const error = await response.text() + return { status: 'failed', error: `Voyage API error: ${error}` } + } + + const batchData = await response.json() + + // Map Voyage status to our status + let status: BulkEmbeddingRunStatus + switch (batchData.status) { + case 'queued': + case 'validating': + status = 'queued' + break + case 'running': + case 'finalizing': + status = 'running' + break + case 'completed': + status = 'succeeded' + break + case 'failed': + case 'cancelled': + case 'expired': + status = batchData.status === 'cancelled' ? 'canceled' : 'failed' + break + default: + status = 'running' + } + + // Store output file ID if available + if (batchData.output_file_id) { + const state = batchState.get(providerBatchId) + if (state) { + state.outputFileId = batchData.output_file_id + } + } + + return { + status, + counts: batchData.request_counts + ? { + inputs: batchData.request_counts.total || 0, + succeeded: batchData.request_counts.completed || 0, + failed: batchData.request_counts.failed || 0, + } + : undefined, + nextPollMs: status === 'running' || status === 'queued' ? 10000 : undefined, // Poll every 10s if not terminal + } + } catch (error) { + console.error('Voyage pollBulkEmbeddings error:', error) + return { status: 'failed', error: 'Failed to poll batch status' } } - return { status: 'succeeded' } }, + completeBulkEmbeddings: async ({ providerBatchId }) => { - const inputs = pendingInputs.get(providerBatchId) || [] - const embeddings = await embedDocs(inputs.map((i) => i.text)) - pendingInputs.delete(providerBatchId) - return { - status: 'succeeded', - outputs: embeddings.map((vector, idx) => ({ - id: inputs[idx]?.id ?? 
String(idx), - embedding: vector, - })), - counts: { inputs: inputs.length, succeeded: embeddings.length, failed: 0 }, + try { + const state = batchState.get(providerBatchId) + if (!state?.outputFileId) { + throw new Error('No output file available for batch') + } + + // Download output file + const response = await fetch( + `https://api.voyageai.com/v1/files/${state.outputFileId}/content`, + { + headers: { + Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, + }, + }, + ) + + if (!response.ok) { + const error = await response.text() + throw new Error(`Failed to download output file: ${error}`) + } + + const jsonlContent = await response.text() + const lines = jsonlContent.trim().split('\n') + + const outputs: BulkEmbeddingOutput[] = [] + let succeeded = 0 + let failed = 0 + + for (const line of lines) { + if (!line.trim()) continue + try { + const result = JSON.parse(line) + if (result.error) { + outputs.push({ + id: result.custom_id, + error: result.error.message || 'Unknown error', + }) + failed++ + } else { + outputs.push({ + id: result.custom_id, + embedding: result.response.body.data[0].embedding, + }) + succeeded++ + } + } catch (parseError) { + console.error('Failed to parse output line:', line, parseError) + failed++ + } + } + + // Clean up state + batchState.delete(providerBatchId) + + return { + status: 'succeeded', + outputs, + counts: { + inputs: state.inputs.length, + succeeded, + failed, + }, + } + } catch (error) { + console.error('Voyage completeBulkEmbeddings error:', error) + throw error } }, } diff --git a/dev/payload.config.ts b/dev/payload.config.ts index 0350447..056681c 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -9,7 +9,7 @@ import { voyageEmbedDocs, voyageEmbedQuery, makeDummyEmbedQuery, - makeLocalBulkEmbeddingsCallbacks, + makeVoyageBulkEmbeddingsConfig, } from './helpers/embed.js' import sharp from 'sharp' import { fileURLToPath } from 'url' @@ -79,12 +79,22 @@ const buildConfigWithPostgres = async () => { email: testEmailAdapter, jobs: { tasks: [], + queues: { + 'vectorize-bulk': { + concurrency: 2, + }, + }, autoRun: [ { cron: '*/5 * * * * *', // Run every 5 seconds in development limit: 10, queue: 'default', }, + { + cron: '*/10 * * * * *', // Run every 10 seconds for bulk jobs + limit: 5, + queue: 'vectorize-bulk', + }, ], jobsCollectionOverrides: ({ defaultJobsCollection }) => { // Make jobs collection visible in admin for debugging @@ -123,12 +133,11 @@ const buildConfigWithPostgres = async () => { embedDocs, embedQuery, embeddingVersion: testEmbeddingVersion, - bulkEmbeddings: { - ...makeLocalBulkEmbeddingsCallbacks(dims), - ingestMode: 'realtime', - }, + bulkEmbeddings: makeVoyageBulkEmbeddingsConfig(), }, }, + realtimeQueueName: 'vectorize-realtime', + bulkQueueName: 'vectorize-bulk', }), ], secret: process.env.PAYLOAD_SECRET || 'test-secret_key', diff --git a/dev/specs/bulkEmbed.spec.ts b/dev/specs/bulkEmbed.spec.ts index 7f46bde..868dd7b 100644 --- a/dev/specs/bulkEmbed.spec.ts +++ b/dev/specs/bulkEmbed.spec.ts @@ -6,17 +6,49 @@ import { postgresAdapter } from '@payloadcms/db-postgres' import { lexicalEditor } from '@payloadcms/richtext-lexical' import { createVectorizeIntegration } from 'payloadcms-vectorize' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' -import { createBulkEmbedAllTask } from '../../src/tasks/bulkEmbedAll.js' -import { createTestDb } from './utils.js' import { - makeDummyEmbedDocs, - makeDummyEmbedQuery, - makeLocalBulkEmbeddingsCallbacks, - testEmbeddingVersion, 
-} from 'helpers/embed.js' + createPrepareBulkEmbeddingTask, + createPollOrCompleteBulkEmbeddingTask, +} from '../../src/tasks/bulkEmbedAll.js' +import { createTestDb } from './utils.js' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import type { BulkEmbeddingsConfig, BulkEmbeddingRunStatus } from '../../src/types.js' const DIMS = 8 +// Mock bulk embeddings configs for testing +function createMockBulkEmbeddings(statusSequence: BulkEmbeddingRunStatus[]): BulkEmbeddingsConfig { + let callCount = 0 + const embeddings = makeDummyEmbedDocs(DIMS) + + return { + ingestMode: 'bulk', + prepareBulkEmbeddings: async ({ inputs }) => { + return { + providerBatchId: `mock-${Date.now()}`, + status: 'queued', + counts: { inputs: inputs.length }, + } + }, + pollBulkEmbeddings: async () => { + const status = statusSequence[Math.min(callCount++, statusSequence.length - 1)] + return { + status, + counts: status === 'succeeded' ? { inputs: 1, succeeded: 1, failed: 0 } : undefined, + } + }, + completeBulkEmbeddings: async ({ providerBatchId }) => { + const inputs = [{ id: 'test-1', text: 'test text', metadata: {} }] + const vectors = await embeddings([inputs[0].text]) + return { + status: 'succeeded', + outputs: [{ id: inputs[0].id, embedding: vectors[0] }], + counts: { inputs: 1, succeeded: 1, failed: 0 }, + } + }, + } +} + describe('Bulk embed ingest mode', () => { let payload: Payload let config: SanitizedConfig @@ -40,12 +72,10 @@ describe('Bulk embed ingest mode', () => { embedDocs: makeDummyEmbedDocs(DIMS), embedQuery: makeDummyEmbedQuery(DIMS), embeddingVersion: testEmbeddingVersion, - bulkEmbeddings: { - ...makeLocalBulkEmbeddingsCallbacks(DIMS), - ingestMode: 'bulk' as const, - }, + bulkEmbeddings: createMockBulkEmbeddings(['succeeded']), }, }, + bulkQueueName: 'vectorize-bulk', } beforeAll(async () => { @@ -73,7 +103,7 @@ describe('Bulk embed ingest mode', () => { payload = await getPayload({ config }) }) - test('queues no realtime embeddings and bulk job backfills missing docs', async () => { + test('bulk ingest mode queues no realtime embeddings and bulk job backfills missing docs', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Bulk Mode Title' } as any, @@ -96,11 +126,22 @@ describe('Bulk embed ingest mode', () => { }, }) - const bulkTask = createBulkEmbedAllTask({ + // Run prepare task + const prepareTask = createPrepareBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, + bulkQueueName: pluginOptions.bulkQueueName, + }) + await prepareTask.handler({ + input: { runId: String(run.id) }, + req: { payload } as any, }) - await bulkTask.handler({ + // Run poll/complete task + const pollTask = createPollOrCompleteBulkEmbeddingTask({ + knowledgePools: pluginOptions.knowledgePools, + bulkQueueName: pluginOptions.bulkQueueName, + }) + await pollTask.handler({ input: { runId: String(run.id) }, req: { payload } as any, }) @@ -122,7 +163,7 @@ describe('Bulk embed ingest mode', () => { expect(runDoc.inputs).toBeGreaterThan(0) }) - test('document updates clear stale embeddings and rerun populates new chunks', async () => { + test('bulk ingest mode clears stale embeddings on document updates and rerun populates new chunks', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Original' } as any, @@ -137,14 +178,19 @@ describe('Bulk embed ingest mode', () => { status: 'queued', }, }) - const bulkTask = createBulkEmbedAllTask({ + + const prepareTask = 
createPrepareBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, + bulkQueueName: pluginOptions.bulkQueueName, }) - await bulkTask.handler({ - input: { runId: String(firstRun.id) }, - req: { payload } as any, + const pollTask = createPollOrCompleteBulkEmbeddingTask({ + knowledgePools: pluginOptions.knowledgePools, + bulkQueueName: pluginOptions.bulkQueueName, }) + await prepareTask.handler({ input: { runId: String(firstRun.id) }, req: { payload } as any }) + await pollTask.handler({ input: { runId: String(firstRun.id) }, req: { payload } as any }) + // Update document - should delete embeddings in bulk mode await payload.update({ collection: 'posts', @@ -169,10 +215,8 @@ describe('Bulk embed ingest mode', () => { status: 'queued', }, }) - await bulkTask.handler({ - input: { runId: String(secondRun.id) }, - req: { payload } as any, - }) + await prepareTask.handler({ input: { runId: String(secondRun.id) }, req: { payload } as any }) + await pollTask.handler({ input: { runId: String(secondRun.id) }, req: { payload } as any }) const embedsAfterRerun = await payload.find({ collection: 'default', @@ -183,4 +227,277 @@ describe('Bulk embed ingest mode', () => { expect(embedsAfterRerun.totalDocs).toBeGreaterThan(0) expect(embedsAfterRerun.docs[0]?.chunkText).toContain('Updated Title') }) + + test('realtime ingest mode queues vectorize jobs on document creation', async () => { + const realtimePluginOptions = { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embedDocs: makeDummyEmbedDocs(DIMS), + embedQuery: makeDummyEmbedQuery(DIMS), + embeddingVersion: testEmbeddingVersion, + // No bulkEmbeddings - should default to realtime + }, + }, + } + + const realtimeConfig = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [integration.payloadcmsVectorize(realtimePluginOptions)], + jobs: { tasks: [] }, + }) + + const realtimePayload = await getPayload({ config: realtimeConfig }) + + // Create a document - should trigger realtime vectorization + const post = await realtimePayload.create({ + collection: 'posts', + data: { title: 'Realtime Test' } as any, + }) + + // Check that embeddings were created immediately + const embeds = await realtimePayload.find({ + collection: 'default', + where: { + and: [{ sourceCollection: { equals: 'posts' } }, { docId: { equals: String(post.id) } }], + }, + }) + expect(embeds.totalDocs).toBeGreaterThan(0) + expect(embeds.docs[0]?.chunkText).toBe('Realtime Test') + }) + + test('bulk polling handles failed status correctly', async () => { + const failedBulkOptions = { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embedDocs: makeDummyEmbedDocs(DIMS), + embedQuery: makeDummyEmbedQuery(DIMS), + embeddingVersion: testEmbeddingVersion, + bulkEmbeddings: createMockBulkEmbeddings(['failed']), + }, + }, + bulkQueueName: 'vectorize-bulk', + } + + const failedConfig = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: 
['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [integration.payloadcmsVectorize(failedBulkOptions)], + jobs: { tasks: [] }, + }) + + const failedPayload = await getPayload({ config: failedConfig }) + + const post = await failedPayload.create({ + collection: 'posts', + data: { title: 'Failed Test' } as any, + }) + + const run = await failedPayload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'queued', + }, + }) + + const prepareTask = createPrepareBulkEmbeddingTask({ + knowledgePools: failedBulkOptions.knowledgePools, + bulkQueueName: failedBulkOptions.bulkQueueName, + }) + const pollTask = createPollOrCompleteBulkEmbeddingTask({ + knowledgePools: failedBulkOptions.knowledgePools, + bulkQueueName: failedBulkOptions.bulkQueueName, + }) + + await prepareTask.handler({ + input: { runId: String(run.id) }, + req: { payload: failedPayload } as any, + }) + await pollTask.handler({ + input: { runId: String(run.id) }, + req: { payload: failedPayload } as any, + }) + + const runDoc = await failedPayload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + expect(runDoc.status).toBe('failed') + // Should not call completeBulkEmbeddings, so no embeddings created + const embeds = await failedPayload.find({ + collection: 'default', + where: { + and: [{ sourceCollection: { equals: 'posts' } }, { docId: { equals: String(post.id) } }], + }, + }) + expect(embeds.totalDocs).toBe(0) + }) + + test('bulk polling handles canceled status correctly', async () => { + const canceledBulkOptions = { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embedDocs: makeDummyEmbedDocs(DIMS), + embedQuery: makeDummyEmbedQuery(DIMS), + embeddingVersion: testEmbeddingVersion, + bulkEmbeddings: createMockBulkEmbeddings(['canceled']), + }, + }, + bulkQueueName: 'vectorize-bulk', + } + + const canceledConfig = await buildConfig({ + secret: 'test-secret', + editor: lexicalEditor(), + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [integration.payloadcmsVectorize(canceledBulkOptions)], + jobs: { tasks: [] }, + }) + + const canceledPayload = await getPayload({ config: canceledConfig }) + + const run = await canceledPayload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'queued', + }, + }) + + const prepareTask = createPrepareBulkEmbeddingTask({ + knowledgePools: canceledBulkOptions.knowledgePools, + bulkQueueName: canceledBulkOptions.bulkQueueName, + }) + const pollTask = createPollOrCompleteBulkEmbeddingTask({ + knowledgePools: canceledBulkOptions.knowledgePools, + bulkQueueName: canceledBulkOptions.bulkQueueName, + }) + + await prepareTask.handler({ + input: { runId: String(run.id) }, + req: { payload: canceledPayload } as any, + }) + await pollTask.handler({ + input: { runId: String(run.id) }, + req: { payload: canceledPayload } as any, + }) + + const runDoc = await canceledPayload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + 
expect(runDoc.status).toBe('canceled') + }) + + test('bulk fan-in: multiple documents created before bulk task runs are all processed in single run', async () => { + // Create multiple documents + const post1 = await payload.create({ + collection: 'posts', + data: { title: 'Post 1' } as any, + }) + const post2 = await payload.create({ + collection: 'posts', + data: { title: 'Post 2' } as any, + }) + + // Verify no embeddings initially + const initialEmbeds = await payload.find({ + collection: 'default', + where: { sourceCollection: { equals: 'posts' } }, + }) + expect(initialEmbeds.totalDocs).toBe(0) + + // Create single bulk run + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'queued', + }, + }) + + // Run bulk tasks + const prepareTask = createPrepareBulkEmbeddingTask({ + knowledgePools: pluginOptions.knowledgePools, + bulkQueueName: pluginOptions.bulkQueueName, + }) + const pollTask = createPollOrCompleteBulkEmbeddingTask({ + knowledgePools: pluginOptions.knowledgePools, + bulkQueueName: pluginOptions.bulkQueueName, + }) + + await prepareTask.handler({ input: { runId: String(run.id) }, req: { payload } as any }) + await pollTask.handler({ input: { runId: String(run.id) }, req: { payload } as any }) + + // Verify all documents got embeddings + const finalEmbeds = await payload.find({ + collection: 'default', + where: { sourceCollection: { equals: 'posts' } }, + }) + expect(finalEmbeds.totalDocs).toBe(2) + + const runDoc = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + expect(runDoc.status).toBe('succeeded') + expect(runDoc.inputs).toBe(2) + }) }) diff --git a/package.json b/package.json index fadeddf..06d4ef9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "payloadcms-vectorize", - "version": "0.4.1", + "version": "0.5.0", "description": "A plugin to vectorize collections for RAG in Payload 3.0", "license": "MIT", "type": "module", @@ -14,11 +14,6 @@ "import": "./src/exports/client.ts", "types": "./src/exports/client.ts", "default": "./src/exports/client.ts" - }, - "./rsc": { - "import": "./src/exports/rsc.ts", - "types": "./src/exports/rsc.ts", - "default": "./src/exports/rsc.ts" } }, "main": "./src/index.ts", @@ -111,11 +106,6 @@ "import": "./dist/exports/client.js", "types": "./dist/exports/client.d.ts", "default": "./dist/exports/client.js" - }, - "./rsc": { - "import": "./dist/exports/rsc.js", - "types": "./dist/exports/rsc.d.ts", - "default": "./dist/exports/rsc.js" } }, "main": "./dist/index.js", diff --git a/src/admin/components/EmbedAllButton.tsx b/src/admin/components/EmbedAllButton.tsx index 666f2de..6ca02a7 100644 --- a/src/admin/components/EmbedAllButton.tsx +++ b/src/admin/components/EmbedAllButton.tsx @@ -2,13 +2,22 @@ import React, { useState } from 'react' -type EmbedAllButtonProps = { +type EmbedAllButtonServerProps = { + hasBulkEmbeddings: boolean +} + +type EmbedAllButtonClientProps = { collectionSlug: string hasCreatePermission?: boolean newDocumentURL?: string } -export const EmbedAllButton: React.FC = ({ collectionSlug }) => { +type EmbedAllButtonProps = EmbedAllButtonServerProps & EmbedAllButtonClientProps + +export const EmbedAllButton: React.FC = ({ + collectionSlug, + hasBulkEmbeddings, +}) => { const [isSubmitting, setIsSubmitting] = useState(false) const [message, setMessage] = useState(null) @@ -36,6 +45,22 @@ export const EmbedAllButton: React.FC = ({ collectionSlug } } } + if 
(!hasBulkEmbeddings) { + return ( +
+ + Bulk embedding not configured +
+ ) + } + return (
- {message ? {message} : null} + {message ? ( + + {message.text} + {message.runId ? ( + <> + {' '} + + #{message.runId} + + + ) : null} + + ) : null}
) } diff --git a/src/index.ts b/src/index.ts index 2c48aec..b47adc3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -67,7 +67,6 @@ async function ensurePgvectorArtifacts(args: { for (const sql of sqls) { await postgresPayload.db.drizzle.execute(sql) } - } postgresPayload.logger.info('[payloadcms-vectorize] pgvector extension/columns/index ensured') } catch (err) { @@ -272,18 +271,18 @@ export const createVectorizeIntegration = // Only queue real-time vectorization if realTimeIngestionFn is provided // Bulk embedding is only triggered manually via API (/vector-bulk-embed) or admin UI if (realTimeIngestionFn) { - await payload.jobs.queue<'payloadcms-vectorize:vectorize'>({ - task: 'payloadcms-vectorize:vectorize', - input: { - doc, - collection: collectionSlug, - knowledgePool: pool, - }, - req: req, - ...(pluginOptions.realtimeQueueName - ? { queue: pluginOptions.realtimeQueueName } - : {}), - }) + await payload.jobs.queue<'payloadcms-vectorize:vectorize'>({ + task: 'payloadcms-vectorize:vectorize', + input: { + doc, + collection: collectionSlug, + knowledgePool: pool, + }, + req: req, + ...(pluginOptions.realtimeQueueName + ? { queue: pluginOptions.realtimeQueueName } + : {}), + }) } // If no realTimeIngestionFn, nothing happens on doc change // User must trigger bulk embedding manually @@ -365,4 +364,3 @@ export const createVectorizeIntegration = payloadcmsVectorize, } } - diff --git a/vitest.config.js b/vitest.config.js index 37c533b..9d7b479 100644 --- a/vitest.config.js +++ b/vitest.config.js @@ -25,8 +25,8 @@ export default defineConfig(() => { // (embeddingsTables map and Payload instance caching) fileParallelism: false, // Disable parallel test execution within files as well - threads: false, - maxConcurrency: 1, + //threads: false, + //maxConcurrency: 1, }, } }) From ea041981b06f214d3cff2819e0d9de394185e986 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 8 Jan 2026 15:05:09 +0700 Subject: [PATCH 19/49] WIP --- README.md | 16 + dev/app/(payload)/admin/importMap.js | 6 +- dev/payload.config.ts | 1 - dev/specs/e2e.spec.ts | 135 +++++- dev/specs/utils.ts | 2 +- playwright.config.js | 4 +- src/admin/components/EmbedAllButton.tsx | 95 ---- .../components/EmbedAllButton/client.tsx | 419 ++++++++++++++++++ src/admin/components/EmbedAllButton/index.tsx | 57 +++ src/collections/embeddings.ts | 28 +- src/exports/client.ts | 2 +- src/index.ts | 23 +- src/tasks/bulkEmbedAll.ts | 5 +- src/tasks/vectorize.ts | 4 +- src/types.ts | 18 + 15 files changed, 686 insertions(+), 129 deletions(-) delete mode 100644 src/admin/components/EmbedAllButton.tsx create mode 100644 src/admin/components/EmbedAllButton/client.tsx create mode 100644 src/admin/components/EmbedAllButton/index.tsx diff --git a/README.md b/README.md index a39b3e2..c603274 100644 --- a/README.md +++ b/README.md @@ -148,6 +148,21 @@ export default buildConfig({ **Important:** `knowledgePools` must have **different names than your collections**—reusing a collection name for a knowledge pool **will cause schema conflicts**. (In this example, the knowledge pool is named 'main' and a collection named 'main' will be created.) +### 1.5. 
Generate Import Map (Required for Admin UI) + +After configuring the plugin, you must generate the import map so that Payload can resolve client components (like the "Embed all" button) in the admin UI for bulk embeddings: + +```bash +pnpm run generate:importmap +``` + +**⚠️ Important:** Run this command: + +- After initial plugin setup +- If the "Embed all" button doesn't appear in the admin UI + +The import map tells Payload how to resolve component paths (like `'payloadcms-vectorize/client#EmbedAllButton'`) to actual React components. Without it, client components referenced in your collection configs won't render. + ### 2. Search Your Content The plugin automatically creates a `/api/vector-search` endpoint: @@ -474,6 +489,7 @@ Search for similar content using vector similarity. ### Bulk Embedding (Embed All) - Each knowledge pool's embeddings list shows an **Embed all** admin button that triggers a bulk run. +- **Note:** Make sure you've run `pnpm run generate:importmap` after plugin configuration, otherwise the button won't appear. - Bulk runs only include documents missing embeddings for the pool's current `embeddingConfig.version`. - Progress is recorded in `vector-bulk-embeddings-runs` and `vector-bulk-embeddings-batches` collections. - Endpoint: **POST** `/api/vector-bulk-embed` diff --git a/dev/app/(payload)/admin/importMap.js b/dev/app/(payload)/admin/importMap.js index 5bc8ec3..abe5d88 100644 --- a/dev/app/(payload)/admin/importMap.js +++ b/dev/app/(payload)/admin/importMap.js @@ -21,6 +21,8 @@ import { StrikethroughFeatureClient as StrikethroughFeatureClient_e70f5e05f09f93 import { UnderlineFeatureClient as UnderlineFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' import { BoldFeatureClient as BoldFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' import { ItalicFeatureClient as ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' +import { EmbedAllButton as EmbedAllButton_69051d9d0217691c78245f4f33731b73 } from 'payloadcms-vectorize/client' +import { CollectionCards as CollectionCards_ab83ff7e88da8d3530831f296ec4756a } from '@payloadcms/ui/rsc' export const importMap = { "@payloadcms/richtext-lexical/rsc#RscEntryLexicalCell": RscEntryLexicalCell_44fe37237e0ebf4470c9990d8cb7b07e, @@ -45,5 +47,7 @@ export const importMap = { "@payloadcms/richtext-lexical/client#StrikethroughFeatureClient": StrikethroughFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, "@payloadcms/richtext-lexical/client#UnderlineFeatureClient": UnderlineFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, "@payloadcms/richtext-lexical/client#BoldFeatureClient": BoldFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, - "@payloadcms/richtext-lexical/client#ItalicFeatureClient": ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 + "@payloadcms/richtext-lexical/client#ItalicFeatureClient": ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, + "payloadcms-vectorize/client#EmbedAllButton": EmbedAllButton_69051d9d0217691c78245f4f33731b73, + "@payloadcms/ui/rsc#CollectionCards": CollectionCards_ab83ff7e88da8d3530831f296ec4756a } diff --git a/dev/payload.config.ts b/dev/payload.config.ts index 56d43f1..efc9723 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -145,7 +145,6 @@ const buildConfigWithPostgres = async () => { version: testEmbeddingVersion, queryFn: embedQuery, realTimeIngestionFn: embedDocs, - bulkEmbeddingsFns, }, }, bulkDefault: { diff --git a/dev/specs/e2e.spec.ts 
b/dev/specs/e2e.spec.ts index daef019..6b2b80b 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -5,6 +5,32 @@ import { getPayload } from 'payload' import { getInitialMarkdownContent } from './constants.js' import { waitForVectorizationJobs } from './utils.js' import { testEmbeddingVersion } from 'helpers/embed.js' +import { devUser } from 'helpers/credentials.js' + +// Helper function to log in to the admin panel +const loginToAdmin = async (page: any) => { + console.log('[loginToAdmin] Starting login process...') + await page.goto('/admin/login') + console.log('[loginToAdmin] Navigated to login page') + + await page.waitForLoadState('domcontentloaded') + console.log('[loginToAdmin] Page loaded') + + // Fill in the login form + console.log('[loginToAdmin] Filling in email...') + await page.fill('input[name="email"]', devUser.email) + console.log('[loginToAdmin] Filling in password...') + await page.fill('input[name="password"]', devUser.password) + + // Click the login button + console.log('[loginToAdmin] Clicking submit button...') + await page.click('button[type="submit"]') + + // Wait for redirect to admin dashboard + console.log('[loginToAdmin] Waiting for redirect...') + await page.waitForURL(/\/admin(?!\/login)/, { timeout: 15000 }) + console.log('[loginToAdmin] Login complete!') +} const expectVectorSearchResponse = async (response: any, post: any, title: string) => { expect(response.ok()).toBe(true) @@ -38,9 +64,11 @@ test.describe('Vector embedding e2e tests', () => { let post: any test.beforeAll(async () => { + console.log('[beforeAll] Setting up Payload instance...') // Setup: Create a post and wait for realtime embedding _config = await config payload = await getPayload({ config: _config, key: `e2e-test-${Date.now()}` }) + console.log('[beforeAll] Payload instance created') }) test('querying the endpoint should return the title with testEmbeddingVersion', async ({ @@ -77,9 +105,15 @@ test.describe('Vector embedding e2e tests', () => { page, request, }) => { + console.log('[test] Starting bulk embedding test...') test.setTimeout(120000) + // Login to admin first + console.log('[test] Logging in...') + await loginToAdmin(page) + // Verify bulkDefault pool is EMPTY (no realTimeIngestionFn configured) + console.log('[test] Checking bulkDefault pool is empty...') const emptyResponse = await request.post('/api/vector-search', { data: { query: title, @@ -89,13 +123,41 @@ test.describe('Vector embedding e2e tests', () => { await expectEmptyVectorSearchResponse(emptyResponse) // Navigate to the bulkDefault embeddings collection page in admin - await page.goto('/admin/collections/bulkDefault') + console.log('[test] Navigating to bulkDefault collection page...') + await page.goto('/admin/collections/bulkDefault', { waitUntil: 'networkidle' }) + console.log('[test] Page loaded') + + // Wait for the page to fully load and render + console.log('[test] Waiting for page to fully load...') + await page.waitForLoadState('domcontentloaded') + await page.waitForLoadState('networkidle') + console.log('[test] Page fully loaded') + + // Wait for the collapsible header to appear - use getByText for more flexible matching + // Note: If this fails, ensure `pnpm run generate:importmap` has been run + console.log('[test] Looking for "Bulk Embed All" text...') + const bulkEmbedAllText = page.getByText('Bulk Embed All', { exact: false }) + await expect(bulkEmbedAllText).toBeVisible({ timeout: 15000 }) + console.log('[test] Found "Bulk Embed All" text!') - // Wait for the page to load and 
find the Embed All button - const embedAllButton = page.locator('button:has-text("Embed all")') - await expect(embedAllButton).toBeVisible({ timeout: 10000 }) + // Click the button that contains the h3 with "Bulk Embed All" text + // The button wraps the h3, so we click the button that contains the h3 + const expandButton = page.locator('button:has(h3:has-text("Bulk Embed All"))') + // If that doesn't work, try clicking the parent of the text + if ((await expandButton.count()) === 0) { + const parentButton = bulkEmbedAllText.locator('..').locator('button').first() + await parentButton.click() + } else { + await expandButton.click() + } + + // Wait for the expanded content to appear (the Embed All button should become visible) + await page.waitForTimeout(500) // Small delay for animation - // Click the Embed All button + // Now find and click the Embed All button (should be visible after expansion) + // Use a more specific selector to avoid clicking the expand button again + const embedAllButton = page.locator('button.btn--style-primary:has-text("Embed all")') + await expect(embedAllButton).toBeVisible({ timeout: 5000 }) await embedAllButton.click() // Wait for success message with run link @@ -124,15 +186,21 @@ test.describe('Vector embedding e2e tests', () => { let finalStatus = '' while (attempts < maxAttempts) { + console.log('[test] Polling for status...') // Refresh the page to see updated status await page.reload() await page.waitForLoadState('domcontentloaded') - // Get the status value - it's in a select or text field - const statusValue = await statusField.inputValue().catch(() => null) + // Get the status value - React Select displays value in .rs__single-value + const statusValue = await statusField + .locator('.rs__single-value') + .textContent() + .catch(() => null) + console.log('[test] Status value:', statusValue) if (statusValue) { finalStatus = statusValue - if (statusValue === 'completed') { + console.log('[test] Status value:', statusValue) + if (statusValue === 'succeeded') { break } } @@ -141,7 +209,7 @@ test.describe('Vector embedding e2e tests', () => { await page.waitForTimeout(3000) } - expect(finalStatus).toBe('completed') + expect(finalStatus).toBe('succeeded') // Now verify vector-search returns results for bulkDefault pool const filledResponse = await request.post('/api/vector-search', { @@ -152,4 +220,53 @@ test.describe('Vector embedding e2e tests', () => { }) await expectVectorSearchResponse(filledResponse, post, title) }) + + test('clicking expand section on default collection shows not enabled message', async ({ + page, + }) => { + console.log('[test] Starting default collection test...') + + // Login to admin first + console.log('[test] Logging in...') + await loginToAdmin(page) + + // Navigate to the default embeddings collection page in admin + console.log('[test] Navigating to default collection page...') + await page.goto('/admin/collections/default', { waitUntil: 'networkidle' }) + console.log('[test] Page loaded') + + // Wait for the page to fully load and render + console.log('[test] Waiting for page to fully load...') + await page.waitForLoadState('domcontentloaded') + await page.waitForLoadState('networkidle') + console.log('[test] Page fully loaded') + + // Wait for the collapsible header to appear - use getByText for more flexible matching + // Note: If this fails, ensure `pnpm run generate:importmap` has been run + console.log('[test] Looking for "Bulk Embed All" text...') + const bulkEmbedAllText = page.getByText('Bulk Embed All', { exact: 
false }) + await expect(bulkEmbedAllText).toBeVisible({ timeout: 15000 }) + console.log('[test] Found "Bulk Embed All" text!') + + // Click the button that contains the h3 with "Bulk Embed All" text + const expandButton = page.locator('button:has(h3:has-text("Bulk Embed All"))') + // If that doesn't work, try clicking the parent of the text + if ((await expandButton.count()) === 0) { + const parentButton = bulkEmbedAllText.locator('..').locator('button').first() + await parentButton.click() + } else { + await expandButton.click() + } + + // Wait for the expanded content to appear + await page.waitForTimeout(500) // Small delay for animation + + // Verify the "Bulk embedding not configured" message appears + const notConfiguredMessage = page.locator('text=/Bulk embedding not configured/i') + await expect(notConfiguredMessage).toBeVisible({ timeout: 5000 }) + + // Verify the message about configuring bulkEmbeddingsFns appears + const configMessage = page.locator('text=/bulkEmbeddingsFns/i') + await expect(configMessage).toBeVisible({ timeout: 5000 }) + }) }) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index 4c46570..ae1804a 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -8,7 +8,7 @@ import { createVectorizeIntegration } from 'payloadcms-vectorize' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../src/collections/bulkEmbeddingInputMetadata.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../src/collections/bulkEmbeddingsBatches.js' -import { makeDummyEmbedDocs } from 'helpers/embed.js' +import { makeDummyEmbedDocs } from '../helpers/embed.js' import type { BulkEmbeddingsFns, BulkEmbeddingInput, diff --git a/playwright.config.js b/playwright.config.js index 1404bda..41d2423 100644 --- a/playwright.config.js +++ b/playwright.config.js @@ -25,9 +25,9 @@ export default defineConfig({ /* Fail the build on CI if you accidentally left test.only in the source code. */ forbidOnly: !!process.env.CI, /* Retry on CI only */ - retries: process.env.CI ? 2 : 0, + retries: 0, /* Opt out of parallel tests on CI. */ - workers: process.env.CI ? 1 : undefined, + workers: 1, /* Reporter to use. See https://playwright.dev/docs/test-reporters */ reporter: 'html', /* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. 
*/ diff --git a/src/admin/components/EmbedAllButton.tsx b/src/admin/components/EmbedAllButton.tsx deleted file mode 100644 index 64bd604..0000000 --- a/src/admin/components/EmbedAllButton.tsx +++ /dev/null @@ -1,95 +0,0 @@ -'use client' - -import React, { useState } from 'react' - -type EmbedAllButtonServerProps = { - hasBulkEmbeddings: boolean -} - -type EmbedAllButtonClientProps = { - collectionSlug: string - hasCreatePermission?: boolean - newDocumentURL?: string -} - -type EmbedAllButtonProps = EmbedAllButtonServerProps & EmbedAllButtonClientProps - -export const EmbedAllButton: React.FC = ({ - collectionSlug, - hasBulkEmbeddings, -}) => { - const [isSubmitting, setIsSubmitting] = useState(false) - const [message, setMessage] = useState<{ text: string; runId?: string } | null>(null) - - const handleClick = async () => { - setIsSubmitting(true) - setMessage(null) - try { - const res = await fetch('/api/vector-bulk-embed', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ knowledgePool: collectionSlug }), - }) - const data = await res.json() - if (!res.ok) { - setMessage({ text: data?.error || 'Failed to queue bulk embed run' }) - return - } - setMessage({ text: 'Queued bulk embed run', runId: data.runId }) - } catch (error: any) { - setMessage({ text: error?.message || 'Failed to queue bulk embed run' }) - } finally { - setIsSubmitting(false) - } - } - - if (!hasBulkEmbeddings) { - return ( -
- - Bulk embedding not configured -
- ) - } - - return ( -
- - {message ? ( - - {message.text} - {message.runId ? ( - <> - {' '} - - #{message.runId} - - - ) : null} - - ) : null} -
- ) -} - -export default EmbedAllButton diff --git a/src/admin/components/EmbedAllButton/client.tsx b/src/admin/components/EmbedAllButton/client.tsx new file mode 100644 index 0000000..14d3dbf --- /dev/null +++ b/src/admin/components/EmbedAllButton/client.tsx @@ -0,0 +1,419 @@ +'use client' + +import React, { useState } from 'react' + +type EmbedAllButtonClientProps = { + collectionSlug: string + hasBulkEmbeddings: boolean +} + +export const EmbedAllButtonClient: React.FC = ({ + collectionSlug, + hasBulkEmbeddings, +}) => { + const [isSubmitting, setIsSubmitting] = useState(false) + const [message, setMessage] = useState<{ text: string; runId?: string; error?: boolean } | null>( + null, + ) + const [isExpanded, setIsExpanded] = useState(false) + const [isExpandedDisabled, setIsExpandedDisabled] = useState(false) + + const handleClick = async () => { + setIsSubmitting(true) + setMessage(null) + try { + const res = await fetch('/api/vector-bulk-embed', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ knowledgePool: collectionSlug }), + }) + const data = await res.json() + if (!res.ok) { + setMessage({ text: data?.error || 'Failed to queue bulk embed run', error: true }) + return + } + setMessage({ text: 'Queued bulk embed run', runId: data.runId, error: false }) + } catch (error: any) { + setMessage({ text: error?.message || 'Failed to queue bulk embed run', error: true }) + } finally { + setIsSubmitting(false) + } + } + + if (!hasBulkEmbeddings) { + return ( +
+ + + {isExpandedDisabled && ( +
+ + Bulk embedding not configured + +

+ This knowledge pool does not have bulk embedding configured. Configure{' '} + + bulkEmbeddingsFns + {' '} + in your plugin options to enable this feature. +

+
+ )} + + +
+ ) + } + + return ( +
+
+ + + {isExpanded && ( +
+

+ Generate embeddings for all documents that don't have embeddings in this knowledge + pool. This process will: +

+
    +
  •
+                Collect all documents missing embeddings or with embeddings of a different version
+              
  • +
  • Create batches and submit them to your embedding provider
  • +
  • Monitor batch completion and save embeddings atomically
  • +
  • Track progress in the bulk embeddings runs collection
  • +
+

+ Note: This is a large operation. You can monitor progress by clicking the run link + after submission. +

+ +
+ + + {message && ( +
+ {message.error ? ( + + + + + ) : ( + + + + + )} + + {message.text} + {message.runId && !message.error && ( + <> + {' — '} + { + e.preventDefault() + window.location.href = `/admin/collections/vector-bulk-embeddings-runs/${message.runId}` + }} + > + View run #{message.runId} + + + )} + +
+ )} +
+
+ )} +
+ + +
+ ) +} diff --git a/src/admin/components/EmbedAllButton/index.tsx b/src/admin/components/EmbedAllButton/index.tsx new file mode 100644 index 0000000..472ff6c --- /dev/null +++ b/src/admin/components/EmbedAllButton/index.tsx @@ -0,0 +1,57 @@ +import React from 'react' +import { EmbedAllButtonClient } from './client.js' + +type EmbedAllButtonServerProps = { + hasBulkEmbeddings: boolean + collectionSlug: string +} + +type EmbedAllButtonProps = EmbedAllButtonServerProps + +export const EmbedAllButton: React.FC = ( + props, +) => { + // Payload passes serverProps functions - we need to call them ourselves + // The function receives { payload, params } context + let hasBulkEmbeddings: boolean = false + + if (typeof props.hasBulkEmbeddings === 'function') { + // Call the serverProps function with the payload/params context + try { + hasBulkEmbeddings = Boolean( + (props.hasBulkEmbeddings as any)({ payload: props.payload, params: props.params }), + ) + } catch (error) { + console.error('[EmbedAllButton Server] Error calling hasBulkEmbeddings:', error) + hasBulkEmbeddings = false + } + } else { + hasBulkEmbeddings = Boolean(props.hasBulkEmbeddings) + } + + let collectionSlug: string = '' + + if (typeof props.collectionSlug === 'function') { + // Call the serverProps function with the payload/params context + try { + collectionSlug = String( + (props.collectionSlug as any)({ payload: props.payload, params: props.params }) || '', + ) + } catch (error) { + console.error('[EmbedAllButton Server] Error calling collectionSlug:', error) + collectionSlug = '' + } + } else { + collectionSlug = String(props.collectionSlug || '') + } + + console.log('[EmbedAllButton Server] Resolved hasBulkEmbeddings:', hasBulkEmbeddings) + console.log('[EmbedAllButton Server] Resolved collectionSlug:', collectionSlug) + + // Only pass serializable props to the client component + return ( + + ) +} + +export default EmbedAllButton diff --git a/src/collections/embeddings.ts b/src/collections/embeddings.ts index 603c854..9081ae4 100644 --- a/src/collections/embeddings.ts +++ b/src/collections/embeddings.ts @@ -1,5 +1,6 @@ import type { CollectionConfig, Field } from 'payload' -import type { KnowledgePoolName } from '../types.js' +import type { KnowledgePoolName, VectorizedPayload } from '../types.js' +import { isVectorizedPayload } from '../types.js' const RESERVED_FIELDS = ['sourceCollection', 'docId', 'chunkIndex', 'chunkText', 'embeddingVersion'] @@ -31,19 +32,22 @@ export const createEmbeddingsCollection = ( path: 'payloadcms-vectorize/client#EmbedAllButton', exportName: 'EmbedAllButton', serverProps: { - hasBulkEmbeddings: ({ payload, params }: { payload: any; params: any }) => { - // Get the knowledge pool name from the collection slug - const poolName = params?.slug as string - if (!poolName) return false + hasBulkEmbeddings: ({ payload, params }: { payload: any; params: any }): boolean => { + // Get the knowledge pool name from params.segments + // params structure: { segments: [ 'collections', 'bulkDefault' ] } + const poolName = params?.segments?.[1] - // Access plugin options from payload config - const pluginOptions = payload.config.plugins?.find( - (p: any) => p.payloadcmsVectorize, - )?.payloadcmsVectorize + // Use the _isBulkEmbedEnabled method added by the plugin + if (poolName && typeof poolName === 'string' && isVectorizedPayload(payload)) { + return payload._isBulkEmbedEnabled(poolName) + } - if (!pluginOptions?.knowledgePools?.[poolName]) return false - - return 
!!pluginOptions.knowledgePools[poolName].embeddingConfig.bulkEmbeddingsFns + return false + }, + collectionSlug: ({ params }: { payload: any; params: any }): string => { + // Get the knowledge pool name from params.segments + // params structure: { segments: [ 'collections', 'bulkDefault' ] } + return params?.segments?.[1] || '' }, }, }, diff --git a/src/exports/client.ts b/src/exports/client.ts index eaa8a1d..e864467 100644 --- a/src/exports/client.ts +++ b/src/exports/client.ts @@ -1 +1 @@ -export { EmbedAllButton } from '../admin/components/EmbedAllButton.js' +export { EmbedAllButton } from '../admin/components/EmbedAllButton/index.js' diff --git a/src/index.ts b/src/index.ts index b47adc3..590ae51 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,5 +1,6 @@ import type { Config, Payload } from 'payload' import { customType } from '@payloadcms/db-postgres/drizzle/pg-core' +import toSnakeCase from 'to-snake-case' import { createEmbeddingsCollection } from './collections/embeddings.js' import type { @@ -8,6 +9,7 @@ import type { KnowledgePoolName, KnowledgePoolStaticConfig, KnowledgePoolDynamicConfig, + VectorizedPayload, } from './types.js' import { isPostgresPayload } from './types.js' import type { PostgresAdapterArgs } from '@payloadcms/db-postgres' @@ -109,10 +111,12 @@ export const createVectorizeIntegration = }, }) - const table = schema?.tables?.[poolName] + // Drizzle converts camelCase collection slugs to snake_case table names + const tableName = toSnakeCase(poolName) + const table = schema?.tables?.[tableName] if (!table) { throw new Error( - `[payloadcms-vectorize] Embeddings table "${poolName}" not found during schema initialization. Ensure the collection has been registered.`, + `[payloadcms-vectorize] Embeddings table "${poolName}" (table: "${tableName}") not found during schema initialization. 
Ensure the collection has been registered.`, ) } @@ -321,14 +325,25 @@ export const createVectorizeIntegration = const incomingOnInit = config.onInit config.onInit = async (payload) => { - if (incomingOnInit) await incomingOnInit(payload) + if (incomingOnInit) + await incomingOnInit(payload) + + // Add _isBulkEmbedEnabled method to payload object + // This allows checking if bulk embedding is enabled for a knowledge pool + ;(payload as VectorizedPayload)._isBulkEmbedEnabled = ( + knowledgePool: TPoolNames, + ): boolean => { + const poolConfig = pluginOptions.knowledgePools[knowledgePool] + return !!poolConfig?.embeddingConfig?.bulkEmbeddingsFns + } // Ensure pgvector artifacts for each knowledge pool for (const poolName in staticConfigs) { const staticConfig = staticConfigs[poolName] + // Drizzle converts camelCase collection slugs to snake_case table names await ensurePgvectorArtifacts({ payload, - tableName: poolName, + tableName: toSnakeCase(poolName), dims: staticConfig.dims, ivfflatLists: staticConfig.ivfflatLists, }) diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index cea93db..051dc62 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -10,6 +10,7 @@ import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../collections/bulkEmbeddingInputMetadata.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../collections/bulkEmbeddingsBatches.js' import { isPostgresPayload, PostgresPayload, BulkEmbeddingInput } from '../types.js' +import toSnakeCase from 'to-snake-case' type PrepareBulkEmbeddingTaskInput = { runId: string @@ -622,7 +623,7 @@ async function completeAllBatchesAtomically(args: { await persistVectorColumn({ payload, - poolName, + poolName: toSnakeCase(poolName), vector: embeddingArray, id: String((created as any)?.id ?? 
''), }) @@ -657,7 +658,7 @@ async function persistVectorColumn(args: { const postgresPayload = payload as PostgresPayload const schemaName = postgresPayload.db.schemaName || 'public' const literal = `[${Array.from(vector).join(',')}]` - const sql = `UPDATE "${schemaName}"."${poolName}" SET embedding = $1 WHERE id = $2` + const sql = `UPDATE "${schemaName}"."${toSnakeCase(poolName)}" SET embedding = $1 WHERE id = $2` const runSQL = async (statement: string, params?: any[]) => { if (postgresPayload.db.pool?.query) return postgresPayload.db.pool.query(statement, params) if (postgresPayload.db.drizzle?.execute) return postgresPayload.db.drizzle.execute(statement) diff --git a/src/tasks/vectorize.ts b/src/tasks/vectorize.ts index 0e497f2..5f8364c 100644 --- a/src/tasks/vectorize.ts +++ b/src/tasks/vectorize.ts @@ -6,6 +6,7 @@ import { KnowledgePoolDynamicConfig, ToKnowledgePoolFn, } from '../types.js' +import toSnakeCase from 'to-snake-case' type VectorizeTaskInput = { doc: Record @@ -159,7 +160,8 @@ async function runVectorizeTask(args: { const literal = `[${Array.from(vector).join(',')}]` const postgresPayload = payload as PostgresPayload const schemaName = postgresPayload.db.schemaName || 'public' - const sql = `UPDATE "${schemaName}"."${poolName}" SET embedding = $1 WHERE id = $2` as string + const sql = + `UPDATE "${schemaName}"."${toSnakeCase(poolName)}" SET embedding = $1 WHERE id = $2` as string try { await runSQL(sql, [literal, id]) } catch (e) { diff --git a/src/types.ts b/src/types.ts index 670ae59..9216e71 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,5 +1,23 @@ import type { CollectionSlug, Payload, Field, Where } from 'payload' +/** + * Extended Payload type with vectorize plugin methods + */ +export type VectorizedPayload = + Payload & { + /** Check if bulk embedding is enabled for a knowledge pool */ + _isBulkEmbedEnabled: (knowledgePool: TPoolNames) => boolean + } + +/** + * Type guard to check if a Payload instance has vectorize extensions + */ +export function isVectorizedPayload(payload: Payload): payload is VectorizedPayload { + return ( + '_isBulkEmbedEnabled' in payload && typeof (payload as any)._isBulkEmbedEnabled === 'function' + ) +} + export type EmbedDocsFn = (texts: string[]) => Promise export type EmbedQueryFn = (text: string) => Promise From 1aaf52cbeb793aae152ed6c29288ad79cead3865 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 8 Jan 2026 17:06:27 +0700 Subject: [PATCH 20/49] Adds CI browser --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aa206bf..d0a3d4c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,6 +39,9 @@ jobs: - name: Install dependencies run: pnpm install + - name: Install Playwright browsers + run: pnpm exec playwright install --with-deps chromium + - name: Install pgvector extension run: | sudo apt-get update From 91d0bf72a5cd21966e23a55ae61b52f1fd25c675 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 8 Jan 2026 18:26:34 +0700 Subject: [PATCH 21/49] Runs sequentially so the tests pass in CI --- dev/specs/e2e.spec.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index 6b2b80b..406055e 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -58,6 +58,9 @@ const expectEmptyVectorSearchResponse = async (response: any) => { } test.describe('Vector embedding e2e tests', () => { 
+ // Force tests to run sequentially + test.describe.configure({ mode: 'serial' }) + const title = 'e2e test post title' let payload: Payload let _config: SanitizedConfig From 95bdb714e587586d2d5d94c37b7f9901c98f38e2 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 8 Jan 2026 18:54:59 +0700 Subject: [PATCH 22/49] increases timeout since tests are in parallel now --- dev/specs/e2e.spec.ts | 2 +- playwright.config.js | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index 406055e..baaf4fb 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -59,7 +59,7 @@ const expectEmptyVectorSearchResponse = async (response: any) => { test.describe('Vector embedding e2e tests', () => { // Force tests to run sequentially - test.describe.configure({ mode: 'serial' }) + test.describe.configure({ mode: 'serial', timeout: 120000 }) const title = 'e2e test post title' let payload: Payload diff --git a/playwright.config.js b/playwright.config.js index 41d2423..9c895ac 100644 --- a/playwright.config.js +++ b/playwright.config.js @@ -24,9 +24,7 @@ export default defineConfig({ fullyParallel: true, /* Fail the build on CI if you accidentally left test.only in the source code. */ forbidOnly: !!process.env.CI, - /* Retry on CI only */ retries: 0, - /* Opt out of parallel tests on CI. */ workers: 1, /* Reporter to use. See https://playwright.dev/docs/test-reporters */ reporter: 'html', From 3a7b73cb093bda37c90463b0b33a0edeec9b5488 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Fri, 9 Jan 2026 23:51:13 +0700 Subject: [PATCH 23/49] Better explanation and leaner API --- README.md | 12 ++++++++++++ dev/helpers/embed.ts | 7 ------- dev/specs/utils.ts | 6 ------ src/index.ts | 36 +++++++++++++++++++++++++++++++++++- src/tasks/bulkEmbedAll.ts | 2 -- src/types.ts | 13 +++---------- 6 files changed, 50 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index c603274..80b5485 100644 --- a/README.md +++ b/README.md @@ -255,6 +255,12 @@ type BatchSubmission = { } ``` +**About the `chunk.id` field:** + +- **Plugin-generated**: The plugin automatically generates a unique `id` for each chunk (format: `${collectionSlug}:${docId}:${chunkIndex}`). You don't need to create it. +- **Purpose**: The `id` is used to correlate embedding outputs back to their original inputs, ensuring each embedding is correctly associated with its source document and chunk. +- **Usage**: When submitting batches to your provider, you must pass this `id` along with the text (e.g., as `custom_id` in Voyage AI's batch API). This allows your provider to return the `id` with each embedding result. + **Return values:** - `null` - "I'm accumulating this chunk, not ready to submit yet" @@ -328,6 +334,12 @@ type BulkEmbeddingOutput = { } ``` +**About the `id` field in outputs:** + +- **Correlation**: The `id` in each `BulkEmbeddingOutput` must match the `chunk.id` that was passed to `addChunk`. This is how the plugin correlates outputs back to their original inputs. +- **Extraction**: When processing your provider's response, extract the `id` that you originally sent (e.g., from Voyage's `custom_id` field) and include it in the returned `BulkEmbeddingOutput`. +- **Example**: If you sent `{ custom_id: "posts:123:0", input: [...] }` to your provider, extract `result.custom_id` from the response and return `{ id: result.custom_id, embedding: [...] }`. 
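+
+For illustration, here is a minimal sketch of mapping a provider's JSONL batch output back to `BulkEmbeddingOutput[]`. The response shape (`custom_id`, `response.body.data[0].embedding`) assumes Voyage's batch format; adapt the field names to whatever your provider returns.
+
+```typescript
+import type { BulkEmbeddingOutput } from 'payloadcms-vectorize'
+
+// Assumed shape per JSONL line (provider-specific):
+// { "custom_id": "posts:123:0", "response": { "body": { "data": [{ "embedding": [0.1, ...] }] } } }
+// or { "custom_id": "posts:123:0", "error": { "message": "..." } }
+function parseBatchOutput(jsonl: string): BulkEmbeddingOutput[] {
+  return jsonl
+    .trim()
+    .split('\n')
+    .filter((line) => line.trim().length > 0)
+    .map((line) => {
+      const result = JSON.parse(line)
+      if (result.error) {
+        // Keep the id so the plugin can record which chunk failed
+        return { id: result.custom_id, error: result.error.message ?? 'Unknown error' }
+      }
+      return { id: result.custom_id, embedding: result.response.body.data[0].embedding }
+    })
+}
+```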
+ #### `onError` - Cleanup on Failure (Optional) Called when the bulk run fails. Use this to clean up provider-side resources (delete files, cancel batches). The run can be re-queued after cleanup. diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index c85ae8a..20ca634 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -228,13 +228,6 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { return { status, - counts: batchData.request_counts - ? { - inputs: batchData.request_counts.total || 0, - succeeded: batchData.request_counts.completed || 0, - failed: batchData.request_counts.failed || 0, - } - : undefined, } } catch (error) { console.error('Voyage pollBatch error:', error) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index ae1804a..f6be5a5 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -128,14 +128,8 @@ export function createMockBulkEmbeddings( const callCount = batchPollCount.get(providerBatchId) ?? 0 batchPollCount.set(providerBatchId, callCount + 1) const status = statusSequence[Math.min(callCount, statusSequence.length - 1)] - const inputs = batchInputs.get(providerBatchId) ?? [] - const counts = - status === 'succeeded' - ? { inputs: inputs.length, succeeded: inputs.length, failed: 0 } - : undefined return { status, - counts, } }, diff --git a/src/index.ts b/src/index.ts index 8f5d5a7..8520bc3 100644 --- a/src/index.ts +++ b/src/index.ts @@ -34,7 +34,41 @@ import { } from './tasks/bulkEmbedAll.js' import { createBulkEmbedHandler } from './endpoints/bulkEmbed.js' -export type * from './types.js' +export type { + KnowledgePoolStaticConfig, + PayloadcmsVectorizeConfig, + + // PayloadcmsVectorizeConfig + KnowledgePoolDynamicConfig, + KnowledgePoolName, + + // KnowledgePoolDynamicConfig, + CollectionVectorizeOption, + EmbeddingConfig, + + // CollectionVectorizeOption + ToKnowledgePoolFn, + + // EmbeddingConfig + EmbedQueryFn, + EmbedDocsFn, + BulkEmbeddingsFns, + + // BulkEmbeddingsFns + AddChunkArgs, + BatchSubmission, + PollBatchArgs, + PollBulkEmbeddingsResult, + CompleteBatchArgs, + BulkEmbeddingOutput, + OnBulkErrorArgs, + + // AddChunkArgs + BulkEmbeddingInput, + + // PollBulkEmbeddingsResult + BulkEmbeddingRunStatus, +} from './types.js' async function ensurePgvectorArtifacts(args: { payload: Payload diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index 051dc62..6445827 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -261,8 +261,6 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ collection: BULK_EMBEDDINGS_BATCHES_SLUG, data: { status: pollResult.status, - succeededCount: pollResult.counts?.succeeded, - failedCount: pollResult.counts?.failed, error: pollResult.error, ...(TERMINAL_STATUSES.has(pollResult.status) ? 
{ completedAt: new Date().toISOString() } diff --git a/src/types.ts b/src/types.ts index 9216e71..b62e335 100644 --- a/src/types.ts +++ b/src/types.ts @@ -54,7 +54,7 @@ export type KnowledgePoolDynamicConfig = { embeddingConfig: EmbeddingConfig } -type EmbeddingConfig = { +export type EmbeddingConfig = { /** Version string to track embedding model/version - stored in each embedding document */ version: string /** Embedding function for query provided by the user @@ -75,7 +75,7 @@ type EmbeddingConfig = { export type BulkEmbeddingRunStatus = 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled' export type BulkEmbeddingInput = { - /** Stable identifier for correlating outputs (should be unique per chunk) */ + /** Stable identifier for correlating outputs (is unique per chunk) */ id: string /** Raw text to embed */ text: string @@ -99,15 +99,8 @@ export type BulkEmbeddingOutput = { error?: string | null } -export type BulkEmbeddingCounts = { - inputs?: number - succeeded?: number - failed?: number -} - export type PollBulkEmbeddingsResult = { status: BulkEmbeddingRunStatus - counts?: BulkEmbeddingCounts error?: string } @@ -226,7 +219,7 @@ export type PostgresPayload = any & { // Job task argument types export type VectorizeTaskArgs = { payload: any - pluginOptions: PayloadcmsVectorizeConfig & { embeddingsCollectionSlug?: string } + pluginOptions: PayloadcmsVectorizeConfig doc: Record collection: string knowledgePool: KnowledgePoolName From 306cd3194a19bae3f1482421083b694b606bce6e Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 10 Jan 2026 09:09:20 +0700 Subject: [PATCH 24/49] WIP --- README.md | 28 ++- dev/specs/bulkEmbed/onError.spec.ts | 7 +- dev/specs/bulkEmbed/partialFailure.spec.ts | 244 +++++++++++++++++++++ dev/specs/utils.ts | 7 +- src/collections/bulkEmbeddingsRuns.ts | 8 + src/tasks/bulkEmbedAll.ts | 60 ++++- src/types.ts | 14 ++ 7 files changed, 353 insertions(+), 15 deletions(-) create mode 100644 dev/specs/bulkEmbed/partialFailure.spec.ts diff --git a/README.md b/README.md index 80b5485..3b1e676 100644 --- a/README.md +++ b/README.md @@ -342,21 +342,38 @@ type BulkEmbeddingOutput = { #### `onError` - Cleanup on Failure (Optional) -Called when the bulk run fails. Use this to clean up provider-side resources (delete files, cancel batches). The run can be re-queued after cleanup. +Called when the bulk run fails OR when there are partial chunk failures. Use this to clean up provider-side resources (delete files, cancel batches) and handle failed chunks. The run can be re-queued after cleanup. ```typescript +type FailedChunkData = { + collection: string // Source collection slug + documentId: string // Source document ID + chunkIndex: number // Index of the chunk within the document +} + type OnBulkErrorArgs = { providerBatchIds: string[] error: Error + /** Data about chunks that failed during completion */ + failedChunkData?: FailedChunkData[] + /** Count of failed chunks */ + failedChunkCount?: number } ``` +**Error handling behavior:** + +- **Batch failures**: If any batch fails during polling, the entire run fails and `onError` is called. +- **Partial chunk failures**: If individual chunks fail during completion (e.g., provider returned an error for specific inputs), the run still succeeds but `onError` is called with `failedChunkData` and `failedChunkCount`. +- **Failed chunk data**: The `failedChunkData` array contains structured information about failed chunks, including `collection`, `documentId`, and `chunkIndex`. 
This data is also stored in the run record (`failedChunkData` field) for later inspection and potential retry. +- **Partial success**: Successful embeddings are still written even when some chunks fail. Only the failed chunks are skipped. + ### Bulk Task Model The plugin uses separate Payload jobs for reliability with long-running providers: - **`prepare-bulk-embedding`**: Streams through documents, calls your `addChunk` for each chunk, creates batch records. -- **`poll-or-complete-bulk-embedding`**: Polls all batches, requeues itself until done, then atomically writes all embeddings. +- **`poll-or-complete-bulk-embedding`**: Polls all batches, requeues itself until done, then writes all successful embeddings (partial chunk failures are allowed). ### Queue Configuration @@ -512,7 +529,12 @@ Search for similar content using vector similarity. } ``` -The bulk embedding process is **atomic**: either all embeddings are written or none are. If any batch fails, the run is marked failed and no partial writes occur. +The bulk embedding process has **two levels of atomicity**: + +- **Batch level**: If any batch fails during polling, the entire run fails and no embeddings are written. This is fully atomic. +- **Chunk level**: If individual chunks fail during completion (e.g., provider returns errors for specific inputs), the run still succeeds and successful embeddings are written. Failed chunks are tracked in `failedChunkData` (with structured `collection`, `documentId`, and `chunkIndex` fields) and passed to the `onError` callback for cleanup. + +This design allows for partial success: if 100 chunks are processed and 2 fail, 98 embeddings are written and the 2 failures are tracked for potential retry. **Error Recovery:** If a run fails, you can re-queue it. If you provided an `onError` callback, it will be called with all `providerBatchIds` so you can clean up provider-side resources before retrying. 
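A minimal `onError` sketch that covers both the full-failure and partial-failure cases might look like the following; the `deleteProviderFile` helper is a placeholder for your provider's cleanup API and is not part of the plugin:

```typescript
import type { OnBulkErrorArgs } from 'payloadcms-vectorize'

// Hypothetical cleanup helper for your provider's files/batches — not part of the plugin.
declare function deleteProviderFile(providerBatchId: string): Promise<void>

export const onError = async ({
  providerBatchIds,
  error,
  failedChunkData,
  failedChunkCount,
}: OnBulkErrorArgs): Promise<void> => {
  // Clean up provider-side resources so the run (or batch) can be retried.
  await Promise.all(providerBatchIds.map((id) => deleteProviderFile(id)))

  if (failedChunkCount && failedChunkData) {
    // Partial failure: the run still succeeded, only these chunks were skipped.
    console.warn(`${failedChunkCount} chunk(s) failed during completion`, failedChunkData)
  } else {
    // Full failure: the run was marked failed and no embeddings were written.
    console.error('Bulk embedding run failed:', error.message)
  }
}
```

Because `failedChunkData` is also stored on the run record, the callback only needs to handle provider-side cleanup; retry decisions can be made later from the stored data.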
diff --git a/dev/specs/bulkEmbed/onError.spec.ts b/dev/specs/bulkEmbed/onError.spec.ts index ffc3087..cfc2e89 100644 --- a/dev/specs/bulkEmbed/onError.spec.ts +++ b/dev/specs/bulkEmbed/onError.spec.ts @@ -17,7 +17,12 @@ const dbName = `bulk_onerror_${Date.now()}` describe('Bulk embed - onError callback', () => { let payload: Payload let onErrorCalled = false - let onErrorArgs: { providerBatchIds: string[]; error: Error } | null = null + let onErrorArgs: { + providerBatchIds: string[] + error: Error + failedChunkData?: Array<{ collection: string; documentId: string; chunkIndex: number }> + failedChunkCount?: number + } | null = null beforeAll(async () => { await createTestDb({ dbName }) diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts new file mode 100644 index 0000000..3d2f928 --- /dev/null +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -0,0 +1,244 @@ +import type { Payload } from 'payload' +import { beforeAll, describe, expect, test } from 'vitest' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' +import { + BULK_QUEUE_NAMES, + DEFAULT_DIMS, + buildPayloadWithIntegration, + createMockBulkEmbeddings, + createTestDb, + waitForBulkJobs, +} from '../utils.js' +import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' + +const DIMS = DEFAULT_DIMS +const dbName = `bulk_partial_failure_${Date.now()}` + +describe('Bulk embed - partial chunk failures', () => { + let payload: Payload + let onErrorCalled = false + let onErrorArgs: { + providerBatchIds: string[] + error: Error + failedChunkData?: Array<{ collection: string; documentId: string; chunkIndex: number }> + failedChunkCount?: number + } | null = null + + beforeAll(async () => { + await createTestDb({ dbName }) + // We'll set up the payload dynamically in each test to control failIds + }) + + test('partial chunk failures are tracked and passed to onError', async () => { + // Reset state + onErrorCalled = false + onErrorArgs = null + + // The ID format is collectionSlug:docId:chunkIndex + // We need to fail a specific chunk - but we don't know the docId yet + // So we'll create the payload with a dynamic failIds check + + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [ + { chunk: doc.title }, + { chunk: doc.title + ' chunk2' }, + ], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings( + { + statusSequence: ['succeeded'], + // We'll fail chunks that contain ":1" (second chunk of any doc) + partialFailure: { failIds: [] }, // Will be updated below + onErrorCallback: (args) => { + onErrorCalled = true + onErrorArgs = args + }, + }, + DIMS, + ), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + secret: 'test-secret', + dims: DIMS, + key: `partial-failure-${Date.now()}`, + }) + payload = built.payload + + // Create a post + const post = await payload.create({ + collection: 'posts', + data: { title: 'Partial Failure Test' } as any, + }) + + // Now we know the docId, update the mock to fail the second chunk + const failChunkId = `posts:${post.id}:1` + + // Re-create with the correct failIds + const built2 = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [ + { chunk: doc.title }, + { chunk: 
doc.title + ' chunk2' }, + ], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion + '-v2', + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings( + { + statusSequence: ['succeeded'], + partialFailure: { failIds: [failChunkId] }, + onErrorCallback: (args) => { + onErrorCalled = true + onErrorArgs = args + }, + }, + DIMS, + ), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + secret: 'test-secret', + dims: DIMS, + key: `partial-failure-2-${Date.now()}`, + }) + payload = built2.payload + + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { pool: 'default', embeddingVersion: testEmbeddingVersion + '-v2', status: 'queued' }, + }) + + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: String(run.id) }, + req: { payload } as any, + ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName + ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } + : {}), + }) + + await waitForBulkJobs(payload) + + // Check run status - should still succeed but with failed count + const updatedRun = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + + expect(updatedRun.status).toBe('succeeded') + expect(updatedRun.succeeded).toBe(1) // First chunk succeeded + expect(updatedRun.failed).toBe(1) // Second chunk failed + expect(updatedRun.failedChunkData).toBeDefined() + expect(Array.isArray(updatedRun.failedChunkData)).toBe(true) + expect((updatedRun.failedChunkData as Array<{ collection: string; documentId: string; chunkIndex: number }>).length).toBe(1) + const failedChunk = (updatedRun.failedChunkData as Array<{ collection: string; documentId: string; chunkIndex: number }>)[0] + expect(failedChunk.collection).toBe('posts') + expect(failedChunk.documentId).toBe(String(post.id)) + expect(failedChunk.chunkIndex).toBe(1) // Second chunk (index 1) + + // Check onError callback was called with failed chunk info + expect(onErrorCalled).toBe(true) + expect(onErrorArgs).not.toBeNull() + expect(onErrorArgs!.failedChunkData).toBeDefined() + expect(onErrorArgs!.failedChunkData!.length).toBe(1) + expect(onErrorArgs!.failedChunkData![0].collection).toBe('posts') + expect(onErrorArgs!.failedChunkData![0].documentId).toBe(String(post.id)) + expect(onErrorArgs!.failedChunkData![0].chunkIndex).toBe(1) + expect(onErrorArgs!.failedChunkCount).toBe(1) + expect(onErrorArgs!.error.message).toContain('1 chunk(s) failed') + }) + + test('run with no partial failures does not call onError', async () => { + // Reset state + onErrorCalled = false + onErrorArgs = null + + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion + '-v3', + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings( + { + statusSequence: ['succeeded'], + // No partial failures + onErrorCallback: (args) => { + onErrorCalled = true + onErrorArgs = args + }, + }, + DIMS, + ), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + secret: 'test-secret', + dims: DIMS, + key: `no-partial-failure-${Date.now()}`, + }) + payload = built.payload + + await payload.create({ collection: 'posts', data: { title: 'No Failure Test' } as any }) + + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { pool: 'default', 
embeddingVersion: testEmbeddingVersion + '-v3', status: 'queued' }, + }) + + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: String(run.id) }, + req: { payload } as any, + ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName + ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } + : {}), + }) + + await waitForBulkJobs(payload) + + // Check run status + const updatedRun = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + + expect(updatedRun.status).toBe('succeeded') + expect(updatedRun.failed).toBe(0) + expect(updatedRun.failedChunkData).toBeUndefined() + + // onError should NOT be called when everything succeeds + expect(onErrorCalled).toBe(false) + }) +}) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index f6be5a5..d3d01ce 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -80,7 +80,12 @@ type MockOptions = { /** Optional: flush after this many chunks (for testing multi-batch scenarios) */ flushAfterChunks?: number /** Optional: callback to track onError calls for testing */ - onErrorCallback?: (args: { providerBatchIds: string[]; error: Error }) => void + onErrorCallback?: (args: { + providerBatchIds: string[] + error: Error + failedChunkData?: Array<{ collection: string; documentId: string; chunkIndex: number }> + failedChunkCount?: number + }) => void } /** diff --git a/src/collections/bulkEmbeddingsRuns.ts b/src/collections/bulkEmbeddingsRuns.ts index c2a6757..c6faf25 100644 --- a/src/collections/bulkEmbeddingsRuns.ts +++ b/src/collections/bulkEmbeddingsRuns.ts @@ -90,6 +90,14 @@ export const createBulkEmbeddingsRunsCollection = (): CollectionConfig => ({ description: 'Failure reason if the run ended in error', }, }, + { + name: 'failedChunkData', + type: 'json', + admin: { + description: + 'Data about chunks that failed during completion (collection, documentId, chunkIndex)', + }, + }, ], timestamps: true, indexes: [ diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index 6445827..d8cf536 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -9,7 +9,12 @@ import { import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../collections/bulkEmbeddingInputMetadata.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../collections/bulkEmbeddingsBatches.js' -import { isPostgresPayload, PostgresPayload, BulkEmbeddingInput } from '../types.js' +import { + isPostgresPayload, + PostgresPayload, + BulkEmbeddingInput, + FailedChunkData, +} from '../types.js' import toSnakeCase from 'to-snake-case' type PrepareBulkEmbeddingTaskInput = { @@ -315,9 +320,9 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ return { output: { runId: input.runId, status: 'polling' } } } - // All batches succeeded - complete the embeddings atomically + // All batches succeeded - complete the embeddings (writes successful chunks, tracks failures) if (allSucceeded) { - const completionResult = await completeAllBatchesAtomically({ + const completionResult = await completeBatches({ payload, runId: input.runId, poolName, @@ -333,6 +338,10 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ succeeded: completionResult.succeededCount, failed: completionResult.failedCount, error: completionResult.error, + failedChunkData: + completionResult.failedChunkData.length > 0 + ? 
completionResult.failedChunkData + : undefined, completedAt: new Date().toISOString(), }, }) @@ -343,12 +352,23 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ where: { run: { equals: (run as any).id } }, }) - // If completion failed, call onError so user can clean up provider resources - if (!completionResult.success && callbacks.onError) { + // Call onError if completion failed OR if there were partial chunk failures + if (callbacks.onError && (!completionResult.success || completionResult.failedCount > 0)) { const providerBatchIds = batches.map((b: any) => b.providerBatchId as string) await callbacks.onError({ providerBatchIds, - error: new Error(completionResult.error || 'Completion failed'), + error: new Error( + completionResult.error || + (completionResult.failedCount > 0 + ? `${completionResult.failedCount} chunk(s) failed during completion` + : 'Completion failed'), + ), + failedChunkData: + completionResult.failedChunkData.length > 0 + ? completionResult.failedChunkData + : undefined, + failedChunkCount: + completionResult.failedCount > 0 ? completionResult.failedCount : undefined, }) } @@ -540,9 +560,13 @@ async function streamAndBatchMissingEmbeddings(args: { } /** - * Complete all batches atomically - download all outputs and write all embeddings + * Complete all batches - download all outputs and write successful embeddings. + * + * Note: This function writes partial results. If some chunks fail during completion, + * successful embeddings are still written. Only failed chunks are skipped. + * The operation is atomic in that if an exception is thrown, nothing is written. */ -async function completeAllBatchesAtomically(args: { +async function completeBatches(args: { payload: Payload runId: string poolName: KnowledgePoolName @@ -554,6 +578,7 @@ async function completeAllBatchesAtomically(args: { success: boolean succeededCount: number failedCount: number + failedChunkData: FailedChunkData[] error?: string }> { const { payload, runId, poolName, batches, callbacks } = args @@ -571,9 +596,22 @@ async function completeAllBatchesAtomically(args: { allOutputs.push(...outputs) } - // Filter successful outputs + // Filter successful outputs and collect failed chunk data const successfulOutputs = allOutputs.filter((o) => !o.error && o.embedding) - const failedCount = allOutputs.length - successfulOutputs.length + const failedChunkData: FailedChunkData[] = [] + for (const output of allOutputs) { + if (output.error) { + const meta = metadataById.get(output.id) + if (meta) { + failedChunkData.push({ + collection: meta.sourceCollection, + documentId: meta.docId, + chunkIndex: meta.chunkIndex, + }) + } + } + } + const failedCount = failedChunkData.length // Collect unique doc keys for deletion const docKeys = new Set() @@ -631,6 +669,7 @@ async function completeAllBatchesAtomically(args: { success: true, succeededCount: successfulOutputs.length, failedCount, + failedChunkData, } } catch (error) { const errorMessage = (error as Error).message || String(error) @@ -638,6 +677,7 @@ async function completeAllBatchesAtomically(args: { success: false, succeededCount: 0, failedCount: 0, + failedChunkData: [], error: `Completion failed: ${errorMessage}`, } } diff --git a/src/types.ts b/src/types.ts index b62e335..2368a22 100644 --- a/src/types.ts +++ b/src/types.ts @@ -130,12 +130,26 @@ export type CompleteBatchArgs = { providerBatchId: string } +/** Data about a failed chunk during bulk embedding completion */ +export type FailedChunkData = { + /** Source collection slug */ + collection: 
string + /** Source document ID */ + documentId: string + /** Index of the chunk within the document */ + chunkIndex: number +} + /** Arguments passed to onError callback */ export type OnBulkErrorArgs = { /** All provider batch IDs that were created during this run */ providerBatchIds: string[] /** The error that caused the failure */ error: Error + /** Optional: Data about chunks that failed during completion */ + failedChunkData?: FailedChunkData[] + /** Optional: Count of failed chunks (for quick summary without iterating failedChunkData) */ + failedChunkCount?: number } /** From b60af9f652c7a0a425b0bb613dde54bcae638c9c Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sun, 11 Jan 2026 20:00:31 +0700 Subject: [PATCH 25/49] WIP --- README.md | 33 +- dev/helpers/embed.ts | 101 +-- dev/payload.config.ts | 31 + dev/specs/bulkEmbed/concurrentRuns.spec.ts | 121 +++ dev/specs/bulkEmbed/failedBatch.spec.ts | 50 +- dev/specs/e2e.spec.ts | 213 ++++- dev/specs/utils.ts | 40 +- dev/specs/vectorizedPayload.spec.ts | 74 +- .../RetryFailedBatchButton/client.tsx | 183 +++++ .../RetryFailedBatchButton/index.tsx | 56 ++ src/collections/bulkEmbeddingInputMetadata.ts | 3 + src/collections/bulkEmbeddingsBatches.ts | 13 + src/endpoints/bulkEmbed.ts | 133 +-- src/endpoints/retryFailedBatch.ts | 167 ++++ src/exports/client.ts | 1 + src/index.ts | 57 +- src/tasks/bulkEmbedAll.ts | 757 +++++++++++------- src/types.ts | 71 +- 18 files changed, 1618 insertions(+), 486 deletions(-) create mode 100644 dev/specs/bulkEmbed/concurrentRuns.spec.ts create mode 100644 src/admin/components/RetryFailedBatchButton/client.tsx create mode 100644 src/admin/components/RetryFailedBatchButton/index.tsx create mode 100644 src/endpoints/retryFailedBatch.ts diff --git a/README.md b/README.md index 7d87061..4b0fb91 100644 --- a/README.md +++ b/README.md @@ -224,7 +224,7 @@ if (isVectorizedPayload(payload)) { | `realtimeQueueName` | `string` | ❌ | Custom queue name for realtime vectorization jobs | | `bulkQueueNames` | `{prepareBulkEmbedQueueName: string, pollOrCompleteQueueName: string}` | ❌ | Queue names for bulk embedding jobs (required if any pool uses bulk ingest) | | `endpointOverrides` | `object` | ❌ | Customize the search endpoint | -| `disabled` | `boolean` | ❌ | Disable plugin while keeping schema | +| `disabled` | `boolean` | ❌ | Disable plugin, except embeddings deletions, while keeping schema | ### Knowledge Pool Config @@ -251,6 +251,8 @@ If `realTimeIngestionFn` is provided, documents are embedded immediately on crea If only `bulkEmbeddingsFns` is provided (no `realTimeIngestionFn`), embedding only happens via manual bulk runs. If neither is provided, embedding is disabled for that pool. +**Note:** Embedding deletion cannot be disabled. When a source document is deleted, all its embeddings are automatically deleted from all knowledge pools that contain that collection, regardless of how the embeddings were created (bulk or real-time). This behavior ensures data consistency and cannot be configured. + ### Bulk Embeddings API The bulk embedding API is designed for large-scale embedding using provider batch APIs (like Voyage AI). **Bulk runs are never auto-queued** - they must be triggered manually via the admin UI or API. 
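Besides the admin button and the HTTP endpoint, a run can also be started from server code through the typed local API. A minimal sketch is below; the pool name `main` and the wrapping function are assumptions, so substitute one of your configured knowledge pools:

```typescript
import type { Payload } from 'payload'
import { isVectorizedPayload } from 'payloadcms-vectorize'

// Assumes a pool named "main" with bulkEmbeddingsFns configured.
export const startMainPoolBackfill = async (payload: Payload) => {
  if (!isVectorizedPayload(payload)) return

  const result = await payload.bulkEmbed({ knowledgePool: 'main' })

  if ('conflict' in result && result.conflict) {
    // A run is already queued or running for this pool.
    payload.logger.warn(result.message)
  } else {
    payload.logger.info(`Queued bulk run ${result.runId}`)
  }
}
```

The call throws if the pool has no `bulkEmbeddingsFns` configured, and returns a `conflict` result instead of creating a duplicate run when one is already queued or running for that pool.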
@@ -262,8 +264,7 @@ The plugin streams chunks to your callbacks one at a time, giving you full contr ```typescript type BulkEmbeddingsFns = { addChunk: (args: AddChunkArgs) => Promise - pollBatch: (args: PollBatchArgs) => Promise - completeBatch: (args: CompleteBatchArgs) => Promise + pollOrCompleteBatch: (args: PollOrCompleteBatchArgs) => Promise onError?: (args: OnBulkErrorArgs) => Promise } ``` @@ -334,26 +335,21 @@ addChunk: async ({ chunk, isLastChunk }) => { **Note:** If a single chunk exceeds your provider's file size limit, you'll need to handle that edge case in your implementation (e.g., skip it, split it, or fail gracefully). -#### `pollBatch` - Check Status +#### `pollOrCompleteBatch` - Poll and Stream Results -Called repeatedly until the batch reaches a terminal status. +Called repeatedly until the batch reaches a terminal status. When the batch completes, stream the outputs via the `onChunk` callback. ```typescript -type PollBatchArgs = { providerBatchId: string } +type PollOrCompleteBatchArgs = { + providerBatchId: string + onChunk: (chunk: BulkEmbeddingOutput) => Promise +} type PollBulkEmbeddingsResult = { status: 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled' counts?: { inputs?: number; succeeded?: number; failed?: number } error?: string } -``` - -#### `completeBatch` - Download Results - -Called after all batches succeed. Download the embeddings from your provider. - -```typescript -type CompleteBatchArgs = { providerBatchId: string } type BulkEmbeddingOutput = { id: string // Must match the chunk.id from addChunk @@ -362,11 +358,18 @@ type BulkEmbeddingOutput = { } ``` +**How it works:** + +1. The plugin calls `pollOrCompleteBatch` repeatedly for each batch +2. While the batch is in progress, return the status (`queued` or `running`) without calling `onChunk` +3. When the batch completes, stream each embedding result by calling `onChunk` for each output, then return `{ status: 'succeeded' }` +4. If the batch fails, return `{ status: 'failed', error: '...' }` without calling `onChunk` + **About the `id` field in outputs:** - **Correlation**: The `id` in each `BulkEmbeddingOutput` must match the `chunk.id` that was passed to `addChunk`. This is how the plugin correlates outputs back to their original inputs. - **Extraction**: When processing your provider's response, extract the `id` that you originally sent (e.g., from Voyage's `custom_id` field) and include it in the returned `BulkEmbeddingOutput`. -- **Example**: If you sent `{ custom_id: "posts:123:0", input: [...] }` to your provider, extract `result.custom_id` from the response and return `{ id: result.custom_id, embedding: [...] }`. +- **Example**: If you sent `{ custom_id: "posts:123:0", input: [...] }` to your provider, extract `result.custom_id` from the response and call `await onChunk({ id: result.custom_id, embedding: [...] })`. 
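Putting these pieces together, a minimal `pollOrCompleteBatch` sketch might look like the following; `fetchBatchStatus` and `downloadResults` are hypothetical stand-ins for your provider's SDK, not plugin APIs:

```typescript
import type { PollOrCompleteBatchArgs, PollBulkEmbeddingsResult } from 'payloadcms-vectorize'

// Hypothetical provider helpers — replace with your provider's SDK calls.
declare function fetchBatchStatus(batchId: string): Promise<'pending' | 'completed' | 'failed'>
declare function downloadResults(
  batchId: string,
): Promise<Array<{ custom_id: string; embedding?: number[]; error?: string }>>

export const pollOrCompleteBatch = async ({
  providerBatchId,
  onChunk,
}: PollOrCompleteBatchArgs): Promise<PollBulkEmbeddingsResult> => {
  const providerStatus = await fetchBatchStatus(providerBatchId)

  if (providerStatus === 'pending') return { status: 'running' }
  if (providerStatus === 'failed') return { status: 'failed', error: 'Provider batch failed' }

  // Terminal success: stream every output back to the plugin before reporting success.
  for (const result of await downloadResults(providerBatchId)) {
    await onChunk({ id: result.custom_id, embedding: result.embedding, error: result.error })
  }

  return { status: 'succeeded' }
}
```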
#### `onError` - Cleanup on Failure (Optional) diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index 20ca634..dfb5daf 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -183,7 +183,7 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { return null }, - pollBatch: async ({ providerBatchId }) => { + pollOrCompleteBatch: async ({ providerBatchId, onChunk }) => { try { const response = await fetch(`https://api.voyageai.com/v1/batches/${providerBatchId}`, { headers: { @@ -221,71 +221,56 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { status = 'running' } - // Store output file ID if available for later completion - if (batchData.output_file_id) { - batchOutputFiles.set(providerBatchId, batchData.output_file_id) - } - - return { - status, - } - } catch (error) { - console.error('Voyage pollBatch error:', error) - return { status: 'failed', error: 'Failed to poll batch status' } - } - }, - - completeBatch: async ({ providerBatchId }) => { - try { - const outputFileId = batchOutputFiles.get(providerBatchId) - if (!outputFileId) { - throw new Error('No output file available for batch') - } + // If succeeded, download and stream outputs + if (status === 'succeeded') { + const outputFileId = batchData.output_file_id + if (!outputFileId) { + return { status: 'failed', error: 'No output file available for completed batch' } + } - // Download output file - const response = await fetch(`https://api.voyageai.com/v1/files/${outputFileId}/content`, { - headers: { - Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, - }, - }) + // Download output file + const downloadResponse = await fetch( + `https://api.voyageai.com/v1/files/${outputFileId}/content`, + { + headers: { + Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, + }, + }, + ) - if (!response.ok) { - const error = await response.text() - throw new Error(`Failed to download output file: ${error}`) - } + if (!downloadResponse.ok) { + const error = await downloadResponse.text() + return { status: 'failed', error: `Failed to download output file: ${error}` } + } - const jsonlContent = await response.text() - const lines = jsonlContent.trim().split('\n') - - const outputs: BulkEmbeddingOutput[] = [] - - for (const line of lines) { - if (!line.trim()) continue - try { - const result = JSON.parse(line) - if (result.error) { - outputs.push({ - id: result.custom_id, - error: result.error.message || 'Unknown error', - }) - } else { - outputs.push({ - id: result.custom_id, - embedding: result.response.body.data[0].embedding, - }) + const jsonlContent = await downloadResponse.text() + const lines = jsonlContent.trim().split('\n') + + for (const line of lines) { + if (!line.trim()) continue + try { + const result = JSON.parse(line) + if (result.error) { + await onChunk({ + id: result.custom_id, + error: result.error.message || 'Unknown error', + }) + } else { + await onChunk({ + id: result.custom_id, + embedding: result.response.body.data[0].embedding, + }) + } + } catch (parseError) { + console.error('Failed to parse output line:', line, parseError) } - } catch (parseError) { - console.error('Failed to parse output line:', line, parseError) } } - // Clean up state - batchOutputFiles.delete(providerBatchId) - - return outputs + return { status } } catch (error) { - console.error('Voyage completeBatch error:', error) - throw error + console.error('Voyage pollOrCompleteBatch error:', error) + return { status: 'failed', error: 'Failed to poll batch status' } } }, diff --git 
a/dev/payload.config.ts b/dev/payload.config.ts index efc9723..41d82e2 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -59,6 +59,10 @@ const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration( dims, ivfflatLists, }, + failingBulkDefault: { + dims, + ivfflatLists, + }, }) const buildConfigWithPostgres = async () => { @@ -172,6 +176,33 @@ const buildConfigWithPostgres = async () => { bulkEmbeddingsFns, }, }, + failingBulkDefault: { + collections: { + posts: { + toKnowledgePool: async (doc, payload) => { + const chunks: Array<{ chunk: string }> = [] + // Process title + if (doc.title) { + const titleChunks = chunkText(doc.title) + chunks.push(...titleChunks.map((chunk) => ({ chunk }))) + } + // Process content + if (doc.content) { + const contentChunks = await chunkRichText(doc.content, payload) + chunks.push(...contentChunks.map((chunk) => ({ chunk }))) + } + return chunks + }, + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: embedQuery, + bulkEmbeddingsFns: createMockBulkEmbeddings({ + statusSequence: ['queued', 'running', 'failed'], + }), + }, + }, }, bulkQueueNames: { prepareBulkEmbedQueueName: 'vectorize-bulk-prepare', diff --git a/dev/specs/bulkEmbed/concurrentRuns.spec.ts b/dev/specs/bulkEmbed/concurrentRuns.spec.ts new file mode 100644 index 0000000..289a202 --- /dev/null +++ b/dev/specs/bulkEmbed/concurrentRuns.spec.ts @@ -0,0 +1,121 @@ +import type { Payload } from 'payload' +import { beforeAll, describe, expect, test } from 'vitest' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' +import type { VectorizedPayload } from '../../../src/types.js' +import { + BULK_QUEUE_NAMES, + DEFAULT_DIMS, + buildPayloadWithIntegration, + createMockBulkEmbeddings, + createTestDb, +} from '../utils.js' +import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' + +const DIMS = DEFAULT_DIMS +const dbName = `bulk_concurrent_${Date.now()}` + +describe('Bulk embed - concurrent runs prevention', () => { + let payload: VectorizedPayload<'default'> + + beforeAll(async () => { + await createTestDb({ dbName }) + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ + statusSequence: ['queued', 'running'], + }), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + secret: 'test-secret', + dims: DIMS, + key: `concurrent-${Date.now()}`, + }) + payload = built.payload as VectorizedPayload<'default'> + }) + + test('cannot start concurrent bulk embed runs for the same pool', async () => { + // Create a test post first + await payload.create({ + collection: 'posts', + data: { title: 'Concurrent test post' } as any, + }) + + // Create a run manually in 'running' status + const existingRun = await (payload as any).create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'running', + }, + }) + + // Try to start another bulk embed for the same pool + const result = await payload.bulkEmbed({ knowledgePool: 'default' }) + + expect('conflict' in result && result.conflict).toBe(true) + expect(result.status).toBe('running') + expect(result.runId).toBe(String(existingRun.id)) + expect('message' in result && 
result.message).toContain('already running') + + // Cleanup: mark the run as succeeded so it doesn't interfere with other tests + await (payload as any).update({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: existingRun.id, + data: { + status: 'succeeded', + completedAt: new Date().toISOString(), + }, + }) + }) + + test('can start bulk embed run after previous run completes', async () => { + // Create a test post + await payload.create({ + collection: 'posts', + data: { title: 'Sequential test post' } as any, + }) + + // Create a completed run + await (payload as any).create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'succeeded', + completedAt: new Date().toISOString(), + }, + }) + + // Should be able to start a new run for the same pool + const result = await payload.bulkEmbed({ knowledgePool: 'default' }) + + expect('conflict' in result).toBe(false) + expect(result.status).toBe('queued') + expect(result.runId).toBeDefined() + + // Cleanup: mark the new run as succeeded + await (payload as any).update({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: result.runId, + data: { + status: 'succeeded', + completedAt: new Date().toISOString(), + }, + }) + }) +}) diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index 54c1877..2d56359 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -1,7 +1,9 @@ import type { Payload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../../src/collections/bulkEmbeddingInputMetadata.js' +import type { VectorizedPayload } from '../../../src/types.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, @@ -16,7 +18,7 @@ const DIMS = DEFAULT_DIMS const dbName = `bulk_failed_${Date.now()}` describe('Bulk embed - failed batch', () => { - let payload: Payload + let payload: VectorizedPayload<'default'> beforeAll(async () => { await createTestDb({ dbName }) @@ -43,7 +45,7 @@ describe('Bulk embed - failed batch', () => { dims: DIMS, key: `failed-${Date.now()}`, }) - payload = built.payload + payload = built.payload as VectorizedPayload<'default'> }) test('failed batch marks entire run as failed', async () => { @@ -105,6 +107,48 @@ describe('Bulk embed - failed batch', () => { }) expect(metadata.totalDocs).toBe(0) }) -}) + test('cannot retry batch while run is still running', async () => { + // Create a run in 'running' status + const run = await (payload as any).create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'default', + embeddingVersion: testEmbeddingVersion, + status: 'running', + }, + }) + + // Create a failed batch for this running run + const batch = await (payload as any).create({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + run: run.id, + batchIndex: 0, + providerBatchId: `mock-failed-lock-test-${Date.now()}`, + status: 'failed', + inputCount: 1, + error: 'Test error for lock test', + }, + }) + + // Try to retry the batch while run is running - should be rejected + const result = await payload.retryFailedBatch({ batchId: String(batch.id) }) + + expect('error' in result).toBe(true) + expect('conflict' in result && result.conflict).toBe(true) + if ('error' in result) { + 
expect(result.error).toContain('Cannot retry batch while run is running') + } + // Cleanup: mark the run as failed so the batch can be retried in the future + await (payload as any).update({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + data: { + status: 'failed', + completedAt: new Date().toISOString(), + }, + }) + }) +}) diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index baaf4fb..1706ea7 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -3,9 +3,11 @@ import type { Payload, SanitizedConfig } from 'payload' import config from '@payload-config' import { getPayload } from 'payload' import { getInitialMarkdownContent } from './constants.js' -import { waitForVectorizationJobs } from './utils.js' +import { waitForVectorizationJobs, waitForBulkJobs } from './utils.js' import { testEmbeddingVersion } from 'helpers/embed.js' import { devUser } from 'helpers/credentials.js' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../src/collections/bulkEmbeddingsBatches.js' // Helper function to log in to the admin panel const loginToAdmin = async (page: any) => { @@ -222,6 +224,54 @@ test.describe('Vector embedding e2e tests', () => { }, }) await expectVectorSearchResponse(filledResponse, post, title) + + // Get the run ID from the current URL + const runUrl = page.url() + const runIdMatch = runUrl.match(/\/(\d+)$/) + const bulkRunId = runIdMatch ? runIdMatch[1] : null + expect(bulkRunId).not.toBeNull() + console.log('[test] Bulk run ID:', bulkRunId) + + // Find the succeeded batch that was created + const succeededBatches = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { + and: [{ run: { equals: bulkRunId } }, { status: { equals: 'succeeded' } }], + }, + }) + expect(succeededBatches.totalDocs).toBeGreaterThan(0) + const succeededBatch = succeededBatches.docs[0] + console.log('[test] Found succeeded batch:', succeededBatch.id) + + // Test: Retry endpoint returns 400 for succeeded batch + const succeededRetryResponse = await request.post('/api/vector-retry-failed-batch', { + data: { batchId: String(succeededBatch.id) }, + }) + expect(succeededRetryResponse.status()).toBe(400) + const succeededRetryJson = await succeededRetryResponse.json() + expect(succeededRetryJson.error).toContain('not in failed status') + console.log('[test] Retry endpoint correctly rejected succeeded batch') + + // Navigate to the succeeded batch page and verify retry button is disabled + console.log('[test] Navigating to succeeded batch page...') + await page.goto(`/admin/collections/${BULK_EMBEDDINGS_BATCHES_SLUG}/${succeededBatch.id}`, { + waitUntil: 'networkidle', + }) + await page.waitForLoadState('domcontentloaded') + + // Look for the retry button - it should be present but disabled + const retryButton = page.locator('[data-testid="retry-failed-batch-button"]') + await expect(retryButton).toBeVisible({ timeout: 15000 }) + + // Verify the button is disabled (opacity check) + const buttonStyle = await retryButton.getAttribute('style') + expect(buttonStyle).toContain('opacity: 0.5') + + // Verify the "Retry Not Available" message is shown + const notAvailableMessage = page.locator('text=/Retry Not Available/i') + await expect(notAvailableMessage).toBeVisible({ timeout: 5000 }) + + console.log('[test] Retry button correctly disabled for succeeded batch!') }) test('clicking expand section on default collection shows not enabled message', async ({ @@ -272,4 +322,165 @@ 
test.describe('Vector embedding e2e tests', () => { const configMessage = page.locator('text=/bulkEmbeddingsFns/i') await expect(configMessage).toBeVisible({ timeout: 5000 }) }) + + test('retry failed batch endpoint returns 404 for non-existent batch', async ({ request }) => { + console.log('[test] Testing non-existent batch retry...') + + const nonExistentResponse = await request.post('/api/vector-retry-failed-batch', { + data: { batchId: '999999' }, + }) + expect(nonExistentResponse.status()).toBe(404) + + console.log('[test] Non-existent batch test completed!') + }) + + test('retry failed batch endpoint works correctly', async ({ request }) => { + console.log('[test] Starting retry failed batch endpoint test...') + + // Create a test post first (needed for bulk embedding to have something to embed) + const post = await payload.create({ + collection: 'posts', + data: { + title: 'Failed batch test post', + }, + }) + console.log('[test] Created test post:', post.id) + + // Use the bulk embed endpoint to create a run for failingBulkDefault pool + const bulkEmbedResponse = await request.post('/api/vector-bulk-embed', { + data: { + knowledgePool: 'failingBulkDefault', + }, + }) + expect(bulkEmbedResponse.ok()).toBe(true) + const bulkEmbedJson = await bulkEmbedResponse.json() + const runId = bulkEmbedJson.runId + console.log('[test] Created bulk run via endpoint:', runId) + + // Wait for the bulk jobs to process and fail (failingBulkDefault has a mock that fails) + await waitForBulkJobs(payload, 30000) + console.log('[test] Bulk jobs completed') + + // Find the failed batch that was created + const batches = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { + and: [{ run: { equals: runId } }, { status: { equals: 'failed' } }], + }, + }) + expect(batches.totalDocs).toBeGreaterThan(0) + const batch = batches.docs[0] + console.log('[test] Found failed batch:', batch.id) + + // Retry the failed batch (should succeed) + const retryResponse = await request.post('/api/vector-retry-failed-batch', { + data: { batchId: String(batch.id) }, + }) + expect(retryResponse.status()).toBe(202) + const retryJson = await retryResponse.json() + expect(retryJson.message).toBe('Failed batch has been re-queued for processing') + expect(retryJson.batchId).toBe(String(batch.id)) + expect(retryJson.status).toBe('queued') + + // Verify the batch status was updated + const updatedBatch = await (payload as any).findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: String(batch.id), + }) + expect(updatedBatch.status).toBe('queued') + expect(updatedBatch.error).toBeNull() + + // Verify the run status was reset to running + const updatedRun = await (payload as any).findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: runId, + }) + expect((updatedRun as any).status).toBe('running') + + console.log('[test] Retry failed batch endpoint test completed successfully!') + }) + + test('retry failed batch button works for failed batches', async ({ page }) => { + console.log('[test] Starting retry button click test...') + test.setTimeout(120000) + + // Login first + await loginToAdmin(page) + + // Create a bulk embedding run + const run = await (payload as any).create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: 'failingBulkDefault', + embeddingVersion: testEmbeddingVersion, + status: 'failed', + }, + }) + console.log('[test] Created bulk run:', run.id) + + // Create a failed batch + const failedBatch = await (payload as any).create({ + collection: 
BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + run: run.id, + batchIndex: 0, + providerBatchId: `mock-failed-ui-${Date.now()}`, + status: 'failed', + inputCount: 1, + error: 'Test error for UI test', + }, + }) + console.log('[test] Created failed batch:', failedBatch.id) + + // Navigate to the failed batch edit page + console.log('[test] Navigating to failed batch page...') + await page.goto(`/admin/collections/${BULK_EMBEDDINGS_BATCHES_SLUG}/${failedBatch.id}`, { + waitUntil: 'networkidle', + }) + await page.waitForLoadState('domcontentloaded') + + // Look for the retry button + const retryButton = page.locator('[data-testid="retry-failed-batch-button"]') + await expect(retryButton).toBeVisible({ timeout: 15000 }) + + // Verify the "Retry Failed Batch" message is shown (not "Retry Not Available") + const retryMessage = page.locator('text=/Retry Failed Batch/i') + await expect(retryMessage).toBeVisible({ timeout: 5000 }) + + // Verify the button is NOT disabled + const buttonStyle = await retryButton.getAttribute('style') + expect(buttonStyle).not.toContain('opacity: 0.5') + + // Click the retry button + console.log('[test] Clicking retry button...') + await retryButton.click() + + // Wait for success message + const successMessage = page.locator('text=/Batch re-queued successfully/i') + await expect(successMessage).toBeVisible({ timeout: 10000 }) + + console.log('[test] Retry button click test completed!') + + // Wait a bit for the page reload + await page.waitForTimeout(2000) + + // Verify we're still on the batch page after reload + await page.waitForURL(/\/admin\/collections\/vector-bulk-embeddings-batches\/\d+/) + + console.log('[test] Retry failed batch button test completed successfully!') + }) + + test('missing batchId returns 400 error', async ({ request }) => { + console.log('[test] Testing missing batchId...') + + const response = await request.post('/api/vector-retry-failed-batch', { + data: {}, + }) + + expect(response.status()).toBe(400) + const json = await response.json() + expect(json.error).toContain('batchId is required') + + console.log('[test] Missing batchId test completed!') + }) }) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index d3d01ce..b44643c 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -129,31 +129,31 @@ export function createMockBulkEmbeddings( return null }, - pollBatch: async ({ providerBatchId }) => { + pollOrCompleteBatch: async ({ providerBatchId, onChunk }) => { const callCount = batchPollCount.get(providerBatchId) ?? 0 batchPollCount.set(providerBatchId, callCount + 1) const status = statusSequence[Math.min(callCount, statusSequence.length - 1)] - return { - status, - } - }, - completeBatch: async ({ providerBatchId }) => { - const inputs = batchInputs.get(providerBatchId) ?? [] - if (!inputs.length) { - return [] + // If succeeded, stream the outputs via onChunk + if (status === 'succeeded') { + const inputs = batchInputs.get(providerBatchId) ?? [] + if (inputs.length) { + const vectors = await embeddings(inputs.map((i) => i.text)) + for (let idx = 0; idx < inputs.length; idx++) { + const input = inputs[idx] + const shouldFail = partialFailure?.failIds?.includes(input.id) + const output = shouldFail + ? 
{ id: input.id, error: 'fail' } + : { id: input.id, embedding: vectors[idx] } + await onChunk(output) + } + } + // Clean up state + batchInputs.delete(providerBatchId) + batchPollCount.delete(providerBatchId) } - const vectors = await embeddings(inputs.map((i) => i.text)) - const outputs = inputs.map((input, idx) => { - const shouldFail = partialFailure?.failIds?.includes(input.id) - return shouldFail - ? { id: input.id, error: 'fail' } - : { id: input.id, embedding: vectors[idx] } - }) - // Clean up state - batchInputs.delete(providerBatchId) - batchPollCount.delete(providerBatchId) - return outputs + + return { status } }, onError: async ({ providerBatchIds, error }) => { diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 6424253..e717909 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -79,9 +79,11 @@ describe('VectorizedPayload', () => { }, }, }, - embedDocs: makeDummyEmbedDocs(DIMS), - embedQuery: makeDummyEmbedQuery(DIMS), - embeddingVersion: testEmbeddingVersion, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, }, }, }), @@ -97,12 +99,42 @@ describe('VectorizedPayload', () => { }) test('returns false for a plain object without search method', () => { - const plainObj = { queueEmbed: () => Promise.resolve() } as unknown as Payload + const plainObj = { + _isBulkEmbedEnabled: () => false, + queueEmbed: () => Promise.resolve(), + bulkEmbed: () => Promise.resolve({}), + retryFailedBatch: () => Promise.resolve({}), + } as unknown as Payload expect(isVectorizedPayload(plainObj)).toBe(false) }) test('returns false for a plain object without queueEmbed method', () => { - const plainObj = { search: () => Promise.resolve([]) } as unknown as Payload + const plainObj = { + _isBulkEmbedEnabled: () => false, + search: () => Promise.resolve([]), + bulkEmbed: () => Promise.resolve({}), + retryFailedBatch: () => Promise.resolve({}), + } as unknown as Payload + expect(isVectorizedPayload(plainObj)).toBe(false) + }) + + test('returns false for a plain object without bulkEmbed method', () => { + const plainObj = { + _isBulkEmbedEnabled: () => false, + search: () => Promise.resolve([]), + queueEmbed: () => Promise.resolve(), + retryFailedBatch: () => Promise.resolve({}), + } as unknown as Payload + expect(isVectorizedPayload(plainObj)).toBe(false) + }) + + test('returns false for a plain object without retryFailedBatch method', () => { + const plainObj = { + _isBulkEmbedEnabled: () => false, + search: () => Promise.resolve([]), + queueEmbed: () => Promise.resolve(), + bulkEmbed: () => Promise.resolve({}), + } as unknown as Payload expect(isVectorizedPayload(plainObj)).toBe(false) }) @@ -236,4 +268,36 @@ describe('VectorizedPayload', () => { expect(pendingJobs.totalDocs).toBeGreaterThan(0) }) }) + + describe('bulkEmbed method', () => { + test('payload has bulkEmbed method', () => { + expect(typeof (payload as VectorizedPayload).bulkEmbed).toBe('function') + }) + + test('bulkEmbed throws error when bulk embedding not configured', async () => { + const vectorizedPayload = payload as VectorizedPayload<'default'> + + // This pool doesn't have bulkEmbeddingsFns configured + await expect(vectorizedPayload.bulkEmbed({ knowledgePool: 'default' })).rejects.toThrow( + 'does not have bulk embedding configured', + ) + }) + }) + + describe('retryFailedBatch method', () => { + test('payload has retryFailedBatch method', () => { + 
expect(typeof (payload as VectorizedPayload).retryFailedBatch).toBe('function') + }) + + test('retryFailedBatch returns error for non-existent batch', async () => { + const vectorizedPayload = payload as VectorizedPayload + + const result = await vectorizedPayload.retryFailedBatch({ batchId: '999999' }) + + expect('error' in result).toBe(true) + if ('error' in result) { + expect(result.error).toContain('not found') + } + }) + }) }) diff --git a/src/admin/components/RetryFailedBatchButton/client.tsx b/src/admin/components/RetryFailedBatchButton/client.tsx new file mode 100644 index 0000000..40a4374 --- /dev/null +++ b/src/admin/components/RetryFailedBatchButton/client.tsx @@ -0,0 +1,183 @@ +'use client' + +import React, { useState } from 'react' + +type RetryFailedBatchButtonClientProps = { + batchId: string + status: string +} + +export const RetryFailedBatchButtonClient: React.FC = ({ + batchId, + status, +}) => { + const [isSubmitting, setIsSubmitting] = useState(false) + const [message, setMessage] = useState<{ text: string; error?: boolean } | null>(null) + + const isDisabled = status !== 'failed' + + const handleClick = async () => { + if (isDisabled) return + + setIsSubmitting(true) + setMessage(null) + + try { + const res = await fetch('/api/vector-retry-failed-batch', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ batchId }), + }) + + const data = await res.json() + + if (!res.ok) { + setMessage({ text: data?.error || 'Failed to retry batch', error: true }) + return + } + + setMessage({ text: 'Batch re-queued successfully', error: false }) + + // Reload the page after a short delay to show the updated status + setTimeout(() => { + window.location.reload() + }, 1500) + } catch (error: any) { + setMessage({ text: error?.message || 'Failed to retry batch', error: true }) + } finally { + setIsSubmitting(false) + } + } + + return ( +
+    <div>
+      <h4>{isDisabled ? 'Retry Not Available' : 'Retry Failed Batch'}</h4>
+      <p>
+        {isDisabled
+          ? `This batch is in "${status}" status. Retry is only available for failed batches.`
+          : 'Re-queue this failed batch for processing. The batch will be polled again and embeddings will be written for successful chunks.'}
+      </p>
+      <button
+        type="button"
+        data-testid="retry-failed-batch-button"
+        disabled={isDisabled || isSubmitting}
+        onClick={handleClick}
+        style={{ opacity: isDisabled ? 0.5 : 1 }}
+      >
+        {isSubmitting ? 'Retrying…' : 'Retry failed batch'}
+      </button>
+      {message && <p>{message.text}</p>}
+    </div>
+ ) +} diff --git a/src/admin/components/RetryFailedBatchButton/index.tsx b/src/admin/components/RetryFailedBatchButton/index.tsx new file mode 100644 index 0000000..b06b00d --- /dev/null +++ b/src/admin/components/RetryFailedBatchButton/index.tsx @@ -0,0 +1,56 @@ +import React from 'react' +import { RetryFailedBatchButtonClient } from './client.js' + +type RetryFailedBatchButtonProps = { + batchId: string + status: string +} + +export const RetryFailedBatchButton: React.FC< + RetryFailedBatchButtonProps & { payload?: any; params?: any; data?: any } +> = (props) => { + // Handle both direct props and serverProps functions + let batchId: string = '' + let status: string = '' + + if (typeof props.batchId === 'function') { + try { + batchId = String( + (props.batchId as any)({ payload: props.payload, params: props.params, data: props.data }) || + '', + ) + } catch (error) { + console.error('[RetryFailedBatchButton] Error calling batchId:', error) + batchId = '' + } + } else if (props.data?.id) { + batchId = String(props.data.id) + } else { + batchId = String(props.batchId || '') + } + + if (typeof props.status === 'function') { + try { + status = String( + (props.status as any)({ payload: props.payload, params: props.params, data: props.data }) || + '', + ) + } catch (error) { + console.error('[RetryFailedBatchButton] Error calling status:', error) + status = '' + } + } else if (props.data?.status) { + status = String(props.data.status) + } else { + status = String(props.status || '') + } + + // Only render on the edit view (when we have a batchId) + if (!batchId) { + return null + } + + return +} + +export default RetryFailedBatchButton diff --git a/src/collections/bulkEmbeddingInputMetadata.ts b/src/collections/bulkEmbeddingInputMetadata.ts index 22263fe..29472c1 100644 --- a/src/collections/bulkEmbeddingInputMetadata.ts +++ b/src/collections/bulkEmbeddingInputMetadata.ts @@ -71,5 +71,8 @@ export const createBulkEmbeddingInputMetadataCollection = (): CollectionConfig = { fields: ['run'], }, + { + fields: ['sourceCollection', 'docId'], + }, ], }) diff --git a/src/collections/bulkEmbeddingsBatches.ts b/src/collections/bulkEmbeddingsBatches.ts index 21e89e3..219e1ba 100644 --- a/src/collections/bulkEmbeddingsBatches.ts +++ b/src/collections/bulkEmbeddingsBatches.ts @@ -22,6 +22,19 @@ export const createBulkEmbeddingsBatchesCollection = (): CollectionConfig => ({ description: 'Individual batches within a bulk embedding run. Created when input count exceeds file limits.', defaultColumns: ['run', 'batchIndex', 'status', 'inputCount', 'succeededCount', 'failedCount'], + components: { + edit: { + beforeDocumentControls: [ + { + path: 'payloadcms-vectorize/client#RetryFailedBatchButton', + serverProps: { + batchId: ({ data }: { data: any }) => data?.id, + status: ({ data }: { data: any }) => data?.status, + }, + }, + ], + }, + }, }, access: { // Anyone can read; only internal (local API) can mutate. diff --git a/src/endpoints/bulkEmbed.ts b/src/endpoints/bulkEmbed.ts index a3f0f5b..d3d5503 100644 --- a/src/endpoints/bulkEmbed.ts +++ b/src/endpoints/bulkEmbed.ts @@ -1,6 +1,70 @@ -import type { PayloadHandler } from 'payload' +import type { Payload, PayloadHandler } from 'payload' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' -import type { KnowledgePoolDynamicConfig, KnowledgePoolName } from '../types.js' +import type { BulkEmbedResult, KnowledgePoolDynamicConfig, KnowledgePoolName } from '../types.js' + +/** + * Core logic for starting a bulk embed run. 
+ * Used by both the HTTP handler and VectorizedPayload.bulkEmbed method. + */ +export async function startBulkEmbed< + TPoolNames extends KnowledgePoolName = KnowledgePoolName, +>(args: { + payload: Payload + knowledgePool: TPoolNames + knowledgePools: Record + queueName?: string +}): Promise { + const { payload, knowledgePool, knowledgePools, queueName } = args + + const poolConfig = knowledgePools[knowledgePool] + if (!poolConfig) { + throw new Error(`[payloadcms-vectorize] Knowledge pool "${knowledgePool}" not found`) + } + if (!poolConfig.embeddingConfig.bulkEmbeddingsFns) { + throw new Error( + `[payloadcms-vectorize] Knowledge pool "${knowledgePool}" does not have bulk embedding configured`, + ) + } + + // Check for existing non-terminal run for this pool + const existingActiveRun = await payload.find({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + where: { + and: [{ pool: { equals: knowledgePool } }, { status: { in: ['queued', 'running'] } }], + }, + limit: 1, + }) + + if (existingActiveRun.totalDocs > 0) { + const existing = existingActiveRun.docs[0] as any + return { + runId: String(existing.id), + status: existing.status, + message: `A bulk embedding run is already ${existing.status} for this knowledge pool. Wait for it to complete or cancel it first.`, + conflict: true, + } + } + + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + pool: knowledgePool, + embeddingVersion: poolConfig.embeddingConfig.version, + status: 'queued', + }, + }) + + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: String(run.id) }, + ...(queueName ? { queue: queueName } : {}), + }) + + return { + runId: String(run.id), + status: 'queued', + } +} export const createBulkEmbedHandler = ( knowledgePools: Record, @@ -19,64 +83,31 @@ export const createBulkEmbedHandler = ( { status: 400 }, ) } - const poolConfig = knowledgePools[knowledgePool] - if (!poolConfig) { - return Response.json( - { error: `Knowledge pool "${knowledgePool}" not found` }, - { status: 400 }, - ) - } - const payload = req.payload - - // Check for existing queued run for this pool - return it instead of creating a new one - const existingQueuedRun = await payload.find({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { - and: [{ pool: { equals: knowledgePool } }, { status: { equals: 'queued' } }], - }, - limit: 1, + const result = await startBulkEmbed({ + payload: req.payload, + knowledgePool, + knowledgePools, + queueName, }) - if (existingQueuedRun.totalDocs > 0) { - const existing = existingQueuedRun.docs[0] as any - return Response.json( - { - runId: String(existing.id), - status: existing.status, - message: `A bulk embedding run is already queued for this knowledge pool`, - }, - { status: 200 }, - ) + if ('conflict' in result && result.conflict) { + return Response.json(result, { status: 409 }) } - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { - pool: knowledgePool, - embeddingVersion: poolConfig.embeddingConfig.version, - status: 'queued', - }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { - runId: String(run.id), - }, - req, - ...(queueName ? 
{ queue: queueName } : {}), - }) - + return Response.json(result, { status: 202 }) + } catch (error) { + const errorMessage = (error as Error).message || String(error) + req.payload.logger.error( + `[payloadcms-vectorize] Failed to queue bulk embed run: ${errorMessage}`, + ) return Response.json( { - runId: String(run.id), - status: 'queued', + error: 'Failed to queue bulk embed run', + details: errorMessage, }, - { status: 202 }, + { status: 500 }, ) - } catch (error) { - return Response.json({ error: 'Failed to queue bulk embed run' }, { status: 500 }) } } return handler diff --git a/src/endpoints/retryFailedBatch.ts b/src/endpoints/retryFailedBatch.ts new file mode 100644 index 0000000..d66cb61 --- /dev/null +++ b/src/endpoints/retryFailedBatch.ts @@ -0,0 +1,167 @@ +import type { Payload, PayloadHandler } from 'payload' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../collections/bulkEmbeddingsBatches.js' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' +import type { + KnowledgePoolDynamicConfig, + KnowledgePoolName, + RetryFailedBatchResult, +} from '../types.js' + +/** + * Core logic for retrying a failed batch. + * Used by both the HTTP handler and VectorizedPayload.retryFailedBatch method. + */ +export async function retryBatch(args: { + payload: Payload + batchId: string + knowledgePools: Record + queueName?: string +}): Promise { + const { payload, batchId, knowledgePools, queueName } = args + + // Find the batch + let batch: any + try { + batch = await payload.findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: batchId, + }) + } catch { + return { error: `Batch "${batchId}" not found` } + } + + if (!batch) { + return { error: `Batch "${batchId}" not found` } + } + + // Verify batch has failed status + if (batch.status !== 'failed') { + return { + error: `Batch "${batchId}" is not in failed status. Current status: ${batch.status}`, + } + } + + // Get the parent run + const runId = typeof batch.run === 'object' ? batch.run.id : batch.run + const run = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: String(runId), + }) + + if (!run) { + return { error: `Parent run not found for batch "${batchId}"` } + } + + // Only allow retry when run is in a terminal state + const runStatus = (run as any).status + if (runStatus === 'running' || runStatus === 'queued') { + return { + error: `Cannot retry batch while run is ${runStatus}. 
Wait for the run to complete first.`, + conflict: true, + } + } + + const poolName = (run as any).pool as TPoolNames + const poolConfig = knowledgePools[poolName] + + if (!poolConfig) { + return { error: `Knowledge pool "${poolName}" not found` } + } + + if (!poolConfig.embeddingConfig.bulkEmbeddingsFns) { + return { + error: `Knowledge pool "${poolName}" does not have bulk embedding configured`, + } + } + + // Reset the batch status to queued + await payload.update({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: batchId, + data: { + status: 'queued', + error: null, + completedAt: null, + succeededCount: 0, + failedCount: 0, + }, + }) + + // If the parent run is in failed/succeeded status, reset it to running + if (runStatus === 'failed' || runStatus === 'succeeded') { + await payload.update({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: String(runId), + data: { + status: 'running', + completedAt: null, + }, + }) + } + + // Queue the poll-or-complete task + await payload.jobs.queue<'payloadcms-vectorize:poll-or-complete-bulk-embedding'>({ + task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding', + input: { runId: String(runId) }, + ...(queueName ? { queue: queueName } : {}), + }) + + return { + batchId, + runId: String(runId), + status: 'queued', + message: 'Failed batch has been re-queued for processing', + } +} + +export const createRetryFailedBatchHandler = ( + knowledgePools: Record, + pollOrCompleteQueueName?: string, +): PayloadHandler => { + const handler: PayloadHandler = async (req) => { + if (!req || !req.json) { + return Response.json({ error: 'Request is required' }, { status: 400 }) + } + + try { + const body = await req.json() + const batchId = body?.batchId as string + + if (!batchId) { + return Response.json({ error: 'batchId is required and must be a string' }, { status: 400 }) + } + + const result = await retryBatch({ + payload: req.payload, + batchId, + knowledgePools, + queueName: pollOrCompleteQueueName, + }) + + if ('error' in result) { + if ('conflict' in result && result.conflict) { + return Response.json(result, { status: 409 }) + } + // Check if it's a "not found" error + if (result.error.includes('not found')) { + return Response.json(result, { status: 404 }) + } + return Response.json(result, { status: 400 }) + } + + return Response.json(result, { status: 202 }) + } catch (error) { + const errorMessage = (error as Error).message || String(error) + req.payload.logger.error(`[payloadcms-vectorize] Failed to retry batch: ${errorMessage}`) + return Response.json( + { + error: 'Failed to retry batch', + details: errorMessage, + }, + { status: 500 }, + ) + } + } + + return handler +} diff --git a/src/exports/client.ts b/src/exports/client.ts index e864467..c871ed9 100644 --- a/src/exports/client.ts +++ b/src/exports/client.ts @@ -1 +1,2 @@ export { EmbedAllButton } from '../admin/components/EmbedAllButton/index.js' +export { RetryFailedBatchButton } from '../admin/components/RetryFailedBatchButton/index.js' diff --git a/src/index.ts b/src/index.ts index 50e1611..d0eaf09 100644 --- a/src/index.ts +++ b/src/index.ts @@ -11,6 +11,8 @@ import type { KnowledgePoolDynamicConfig, VectorizedPayload, VectorSearchQuery, + BulkEmbedResult, + RetryFailedBatchResult, } from './types.js' import { isPostgresPayload } from './types.js' import type { PostgresAdapterArgs } from '@payloadcms/db-postgres' @@ -33,7 +35,8 @@ import { createPrepareBulkEmbeddingTask, createPollOrCompleteBulkEmbeddingTask, } from './tasks/bulkEmbedAll.js' -import { createBulkEmbedHandler } from 
'./endpoints/bulkEmbed.js' +import { createBulkEmbedHandler, startBulkEmbed } from './endpoints/bulkEmbed.js' +import { createRetryFailedBatchHandler, retryBatch } from './endpoints/retryFailedBatch.js' export type { KnowledgePoolStaticConfig, @@ -58,9 +61,8 @@ export type { // BulkEmbeddingsFns AddChunkArgs, BatchSubmission, - PollBatchArgs, + PollOrCompleteBatchArgs, PollBulkEmbeddingsResult, - CompleteBatchArgs, BulkEmbeddingOutput, OnBulkErrorArgs, @@ -69,6 +71,8 @@ export type { // PollBulkEmbeddingsResult BulkEmbeddingRunStatus, + isVectorizedPayload, + VectorizedPayload, } from './types.js' async function ensurePgvectorArtifacts(args: { @@ -359,6 +363,25 @@ export const createVectorizeIntegration = ) } } + + // Also clean up any pending bulk embedding metadata for this document + // This prevents embedding a document that was deleted during a bulk run + try { + await payload.delete({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { + and: [ + { sourceCollection: { equals: collectionSlug } }, + { docId: { equals: String(id) } }, + ], + }, + }) + } catch (e) { + payload?.logger?.warn?.( + `[payloadcms-vectorize] Failed to delete bulk embedding metadata for ${collectionSlug}:${id}`, + e as Error, + ) + } }, ], } @@ -367,11 +390,7 @@ export const createVectorizeIntegration = const incomingOnInit = config.onInit const vectorSearchHandlers = createVectorSearchHandlers(pluginOptions.knowledgePools) config.onInit = async (payload) => { - if (incomingOnInit) - await incomingOnInit(payload) - - // Add _isBulkEmbedEnabled method to payload object - // This allows checking if bulk embedding is enabled for a knowledge pool + if (incomingOnInit) await incomingOnInit(payload) ;(payload as VectorizedPayload) = { ...(payload as any), _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { @@ -419,6 +438,20 @@ export const createVectorizeIntegration = } return embedQueue(doc, payload) }, + bulkEmbed: (params: { knowledgePool: TPoolNames }): Promise => + startBulkEmbed({ + payload, + knowledgePool: params.knowledgePool, + knowledgePools: pluginOptions.knowledgePools, + queueName: pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, + }), + retryFailedBatch: (params: { batchId: string }): Promise => + retryBatch({ + payload, + batchId: params.batchId, + knowledgePools: pluginOptions.knowledgePools, + queueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + }), } // Ensure pgvector artifacts for each knowledge pool for (const poolName in staticConfigs) { @@ -452,6 +485,14 @@ export const createVectorizeIntegration = pluginOptions.bulkQueueNames?.prepareBulkEmbedQueueName, ), }, + { + path: '/vector-retry-failed-batch', + method: 'post' as const, + handler: createRetryFailedBatchHandler( + pluginOptions.knowledgePools, + pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, + ), + }, ] config.endpoints = endpoints } diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index d8cf536..fd9af6d 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -215,16 +215,28 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ return { output: { runId: input.runId, status: currentStatus } } } - // Load all batches for this run + // Load all batches for this run with pagination to handle >1000 batches // Convert runId to number for postgres relationship queries const runIdNum = parseInt(input.runId, 10) - const batchesResult = await payload.find({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - where: { run: { equals: runIdNum } }, - limit: 1000, - 
sort: 'batchIndex', - }) - const batches = (batchesResult as any)?.docs || [] + const batches: any[] = [] + let batchPage = 1 + const batchLimit = 100 // Smaller pages for better memory management + + while (true) { + const batchesResult = await payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: runIdNum } }, + limit: batchLimit, + page: batchPage, + sort: 'batchIndex', + }) + const pageDocs = (batchesResult as any)?.docs || [] + batches.push(...pageDocs) + + const totalPages = (batchesResult as any)?.totalPages ?? batchPage + if (batchPage >= totalPages || pageDocs.length === 0) break + batchPage++ + } if (batches.length === 0) { // No batches found - this shouldn't happen but handle gracefully @@ -240,73 +252,155 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ return { output: { runId: input.runId, status: 'failed' } } } - // Poll each non-terminal batch - let allSucceeded = true - let anyFailed = false + // Poll each non-terminal batch and complete succeeded ones incrementally let anyRunning = false + let totalSucceeded = 0 + let totalFailed = 0 + const allFailedChunkData: FailedChunkData[] = [] + const batchStatuses = new Map() // Track batch statuses as we process + // Initialize with current statuses for (const batch of batches) { - const batchStatus = batch.status as string - if (TERMINAL_STATUSES.has(batchStatus)) { - if (batchStatus !== 'succeeded') { - anyFailed = true - allSucceeded = false + batchStatuses.set(String(batch.id), batch.status as string) + // Accumulate counts from already completed batches + if (TERMINAL_STATUSES.has(batch.status as string)) { + if (batch.status === 'succeeded') { + totalSucceeded += batch.succeededCount || 0 + totalFailed += batch.failedCount || 0 } + } + } + + for (const batch of batches) { + const batchStatus = batchStatuses.get(String(batch.id)) as string + + // Skip batches that are already completed + if (TERMINAL_STATUSES.has(batchStatus)) { continue } - // Poll this batch - const pollResult = await callbacks.pollBatch({ - providerBatchId: batch.providerBatchId, - }) + // Poll batch and complete if succeeded (streams embeddings via onChunk callback) + try { + const completionResult = await pollAndCompleteSingleBatch({ + payload, + runId: input.runId, + poolName, + batch, + callbacks, + }) - // Update batch status - await payload.update({ - id: batch.id, - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - data: { - status: pollResult.status, - error: pollResult.error, - ...(TERMINAL_STATUSES.has(pollResult.status) - ? { completedAt: new Date().toISOString() } - : {}), - }, - }) + // Update batch status and counts + await payload.update({ + id: batch.id, + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + status: completionResult.status, + error: completionResult.error, + ...(TERMINAL_STATUSES.has(completionResult.status) + ? { completedAt: new Date().toISOString() } + : {}), + ...(completionResult.status === 'succeeded' + ? 
{ + succeededCount: completionResult.succeededCount, + failedCount: completionResult.failedCount, + } + : {}), + }, + }) + + // Track the new status + batchStatuses.set(String(batch.id), completionResult.status) + + // Accumulate counts from newly succeeded batches + if (completionResult.status === 'succeeded') { + totalSucceeded += completionResult.succeededCount + totalFailed += completionResult.failedCount + allFailedChunkData.push(...completionResult.failedChunkData) + } - if (pollResult.status === 'failed' || pollResult.status === 'canceled') { - anyFailed = true - allSucceeded = false - } else if (!TERMINAL_STATUSES.has(pollResult.status)) { - anyRunning = true - allSucceeded = false + // Track if still running (queued or running) + if (completionResult.status === 'queued' || completionResult.status === 'running') { + anyRunning = true + } + // Failed/canceled batches - leave them, can be re-run later + } catch (error) { + // Completion failed - mark batch as failed + const errorMessage = (error as Error).message || String(error) + await payload.update({ + id: batch.id, + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + status: 'failed', + error: `Completion failed: ${errorMessage}`, + completedAt: new Date().toISOString(), + }, + }) + batchStatuses.set(String(batch.id), 'failed') } } - // If any batch failed, mark the entire run as failed - if (anyFailed) { + // Check if all batches are complete + const allBatchesComplete = Array.from(batchStatuses.values()).every((status) => + TERMINAL_STATUSES.has(status), + ) + + if (allBatchesComplete) { + // All batches are done - finalize the run + const hasAnySucceeded = Array.from(batchStatuses.values()).some( + (status) => status === 'succeeded', + ) + + // Check if any batches are failed (not just canceled) - we keep metadata for potential retries + const hasFailedBatches = Array.from(batchStatuses.values()).some( + (status) => status === 'failed', + ) + await payload.update({ id: input.runId, collection: BULK_EMBEDDINGS_RUNS_SLUG, data: { - status: 'failed', - error: 'One or more batches failed', + status: hasAnySucceeded ? 'succeeded' : 'failed', + succeeded: totalSucceeded, + failed: totalFailed, + failedChunkData: allFailedChunkData.length > 0 ? allFailedChunkData : undefined, completedAt: new Date().toISOString(), }, }) - // Cleanup metadata without writing embeddings - await payload.delete({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - where: { run: { equals: (run as any).id } }, - }) - // Call onError callback so user can clean up provider-side resources - if (callbacks.onError) { + + // Cleanup metadata for succeeded batches only + // Keep metadata for failed batches to allow retry functionality + const succeededBatchIds = Array.from(batchStatuses.entries()) + .filter(([_, status]) => status === 'succeeded') + .map(([id, _]) => parseInt(id, 10)) + + if (succeededBatchIds.length > 0) { + await payload.delete({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { batch: { in: succeededBatchIds } }, + }) + } + + // Call onError if there were any failures + if (callbacks.onError && (totalFailed > 0 || !hasAnySucceeded)) { const providerBatchIds = batches.map((b: any) => b.providerBatchId as string) await callbacks.onError({ providerBatchIds, - error: new Error('One or more batches failed'), + error: new Error( + totalFailed > 0 + ? `${totalFailed} chunk(s) failed during completion` + : 'All batches failed', + ), + failedChunkData: allFailedChunkData.length > 0 ? 
allFailedChunkData : undefined, + failedChunkCount: totalFailed > 0 ? totalFailed : undefined, }) } - return { output: { runId: input.runId, status: 'failed' } } + + return { + output: { + runId: input.runId, + status: hasAnySucceeded ? 'succeeded' : 'failed', + }, + } } // If still running, requeue this task @@ -320,67 +414,47 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ return { output: { runId: input.runId, status: 'polling' } } } - // All batches succeeded - complete the embeddings (writes successful chunks, tracks failures) - if (allSucceeded) { - const completionResult = await completeBatches({ - payload, - runId: input.runId, - poolName, - batches, - callbacks, - }) + // Edge case: allBatchesComplete is false but anyRunning is false + // This happens when all batches are in 'canceled' or 'failed' status but we didn't detect it above + // Check if all batches are canceled + const allCanceled = Array.from(batchStatuses.values()).every( + (status) => status === 'canceled', + ) + if (allCanceled) { await payload.update({ id: input.runId, collection: BULK_EMBEDDINGS_RUNS_SLUG, data: { - status: completionResult.success ? 'succeeded' : 'failed', - succeeded: completionResult.succeededCount, - failed: completionResult.failedCount, - error: completionResult.error, - failedChunkData: - completionResult.failedChunkData.length > 0 - ? completionResult.failedChunkData - : undefined, + status: 'canceled', completedAt: new Date().toISOString(), }, }) - - // Cleanup metadata - await payload.delete({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - where: { run: { equals: (run as any).id } }, - }) - - // Call onError if completion failed OR if there were partial chunk failures - if (callbacks.onError && (!completionResult.success || completionResult.failedCount > 0)) { - const providerBatchIds = batches.map((b: any) => b.providerBatchId as string) - await callbacks.onError({ - providerBatchIds, - error: new Error( - completionResult.error || - (completionResult.failedCount > 0 - ? `${completionResult.failedCount} chunk(s) failed during completion` - : 'Completion failed'), - ), - failedChunkData: - completionResult.failedChunkData.length > 0 - ? completionResult.failedChunkData - : undefined, - failedChunkCount: - completionResult.failedCount > 0 ? completionResult.failedCount : undefined, - }) - } - - return { - output: { - runId: input.runId, - status: completionResult.success ? 'succeeded' : 'failed', - }, - } + return { output: { runId: input.runId, status: 'canceled' } } } - return { output: { runId: input.runId, status: 'unknown' } } + // Fallback: mark as failed with diagnostic info + const statusCounts = Array.from(batchStatuses.values()).reduce( + (acc, status) => { + acc[status] = (acc[status] || 0) + 1 + return acc + }, + {} as Record, + ) + payload.logger.warn( + `[payloadcms-vectorize] Run ${input.runId} reached unexpected state. Batch statuses: ${JSON.stringify(statusCounts)}`, + ) + + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: 'failed', + error: `Run reached unexpected state. Batch statuses: ${JSON.stringify(statusCounts)}`, + completedAt: new Date().toISOString(), + }, + }) + return { output: { runId: input.runId, status: 'failed' } } }, } @@ -390,6 +464,10 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ /** * Stream through missing embeddings, calling addChunk for each. * User controls batching via addChunk return value. + * + * Uses a two-pass approach: + * 1. 
First pass: count total chunks to know when we reach the last one + * 2. Second pass: stream chunks without holding all in memory */ async function streamAndBatchMissingEmbeddings(args: { payload: Payload @@ -419,15 +497,14 @@ async function streamAndBatchMissingEmbeddings(args: { const includeAll = versionMismatch || !hasBaseline const lastCompletedAtDate = lastBulkCompletedAt ? new Date(lastBulkCompletedAt) : undefined - - let batchIndex = 0 - let totalInputs = 0 const collectionSlugs = Object.keys(dynamicConfig.collections) - // Collect all chunks first to know which is the last one - const allChunks: CollectedEmbeddingInput[] = [] + // First pass: count total chunks to know the last one + // We store minimal info (docId + chunkCount) to avoid OOM + type DocChunkInfo = { collectionSlug: string; docId: string; chunkCount: number } + const docsToProcess: DocChunkInfo[] = [] + let totalChunkCount = 0 - // Iterate through all collections and their documents for (const collectionSlug of collectionSlugs) { const collectionConfig = dynamicConfig.collections[collectionSlug] if (!collectionConfig) continue @@ -436,7 +513,6 @@ async function streamAndBatchMissingEmbeddings(args: { let page = 1 const limit = 50 - // Paginate through source collection docs while (true) { const res = await payload.find({ collection: collectionSlug, @@ -465,22 +541,14 @@ async function streamAndBatchMissingEmbeddings(args: { if (!shouldInclude) continue const chunkData = await toKnowledgePool(doc, payload) - for (let idx = 0; idx < chunkData.length; idx++) { - const chunkEntry = chunkData[idx] - if (!chunkEntry?.chunk) continue - - const { chunk, ...extensionFields } = chunkEntry - allChunks.push({ - id: `${collectionSlug}:${doc.id}:${idx}`, - text: chunk, - metadata: { - sourceCollection: collectionSlug, - docId: String(doc.id), - chunkIndex: idx, - embeddingVersion, - extensionFields, - }, + const validChunkCount = chunkData.filter((c) => c?.chunk).length + if (validChunkCount > 0) { + docsToProcess.push({ + collectionSlug, + docId: String(doc.id), + chunkCount: validChunkCount, }) + totalChunkCount += validChunkCount } } @@ -489,70 +557,108 @@ async function streamAndBatchMissingEmbeddings(args: { } } - // Track pending chunks - plugin manages this queue - const pendingChunks: CollectedEmbeddingInput[] = [] + // If no chunks, return early + if (totalChunkCount === 0) { + return { batchCount: 0, totalInputs: 0 } + } - // Stream chunks to addChunk, tracking which is last - for (let i = 0; i < allChunks.length; i++) { - const collectedChunk = allChunks[i] - const isLastChunk = i === allChunks.length - 1 + // Second pass: stream chunks without holding all in memory + let batchIndex = 0 + let totalInputs = 0 + let processedChunkCount = 0 + const pendingChunks: CollectedEmbeddingInput[] = [] - // Add to pending queue BEFORE calling addChunk - pendingChunks.push(collectedChunk) + for (const docInfo of docsToProcess) { + const collectionConfig = dynamicConfig.collections[docInfo.collectionSlug] + if (!collectionConfig) continue - const submission = await addChunk({ - chunk: { id: collectedChunk.id, text: collectedChunk.text }, - isLastChunk, + // Re-fetch the document to get its data + const doc = await payload.findByID({ + collection: docInfo.collectionSlug as any, + id: docInfo.docId, }) + if (!doc) continue - if (submission) { - // User submitted a batch - // - If isLastChunk: all pending chunks were submitted - // - If not isLastChunk: all except current were submitted (current starts fresh) - let submittedChunks: 
CollectedEmbeddingInput[] - if (isLastChunk) { - submittedChunks = pendingChunks.splice(0) - } else { - submittedChunks = pendingChunks.splice(0, pendingChunks.length - 1) + const toKnowledgePool = collectionConfig.toKnowledgePool + const chunkData = await toKnowledgePool(doc, payload) + + for (let idx = 0; idx < chunkData.length; idx++) { + const chunkEntry = chunkData[idx] + if (!chunkEntry?.chunk) continue + + processedChunkCount++ + const isLastChunk = processedChunkCount === totalChunkCount + + const { chunk, ...extensionFields } = chunkEntry + const collectedChunk: CollectedEmbeddingInput = { + id: `${docInfo.collectionSlug}:${doc.id}:${idx}`, + text: chunk, + metadata: { + sourceCollection: docInfo.collectionSlug, + docId: String(doc.id), + chunkIndex: idx, + embeddingVersion, + extensionFields, + }, } - // Convert runId to number for postgres relationships - const runIdNum = parseInt(runId, 10) + // Add to pending queue BEFORE calling addChunk + pendingChunks.push(collectedChunk) - // Store metadata for submitted chunks - await Promise.all( - submittedChunks.map((chunk) => - payload.create({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - data: { - run: runIdNum, - inputId: chunk.id, - text: chunk.text, - sourceCollection: chunk.metadata.sourceCollection, - docId: chunk.metadata.docId, - chunkIndex: chunk.metadata.chunkIndex, - embeddingVersion: chunk.metadata.embeddingVersion, - extensionFields: chunk.metadata.extensionFields, - }, - }), - ), - ) - - // Create batch record - await payload.create({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - data: { - run: runIdNum, - batchIndex, - providerBatchId: submission.providerBatchId, - status: 'queued', - inputCount: submittedChunks.length, - submittedAt: new Date().toISOString(), - }, + const submission = await addChunk({ + chunk: { id: collectedChunk.id, text: collectedChunk.text }, + isLastChunk, }) - totalInputs += submittedChunks.length - batchIndex++ + if (submission) { + // User submitted a batch + // - If isLastChunk: all pending chunks were submitted + // - If not isLastChunk: all except current were submitted (current starts fresh) + let submittedChunks: CollectedEmbeddingInput[] + if (isLastChunk) { + submittedChunks = pendingChunks.splice(0) + } else { + submittedChunks = pendingChunks.splice(0, pendingChunks.length - 1) + } + + // Convert runId to number for postgres relationships + const runIdNum = parseInt(runId, 10) + + // Store metadata for submitted chunks + await Promise.all( + submittedChunks.map((c) => + payload.create({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + data: { + run: runIdNum, + inputId: c.id, + text: c.text, + sourceCollection: c.metadata.sourceCollection, + docId: c.metadata.docId, + chunkIndex: c.metadata.chunkIndex, + embeddingVersion: c.metadata.embeddingVersion, + extensionFields: c.metadata.extensionFields, + }, + }), + ), + ) + + // Create batch record + await payload.create({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + run: runIdNum, + batchIndex, + providerBatchId: submission.providerBatchId, + status: 'queued', + inputCount: submittedChunks.length, + submittedAt: new Date().toISOString(), + }, + }) + + totalInputs += submittedChunks.length + batchIndex++ + } } } @@ -560,86 +666,141 @@ async function streamAndBatchMissingEmbeddings(args: { } /** - * Complete all batches - download all outputs and write successful embeddings. - * - * Note: This function writes partial results. If some chunks fail during completion, - * successful embeddings are still written. 
Only failed chunks are skipped. - * The operation is atomic in that if an exception is thrown, nothing is written. + * Check if a source document exists */ -async function completeBatches(args: { +async function documentExists(args: { + payload: Payload + collection: string + docId: string +}): Promise { + const { payload, collection, docId } = args + try { + await payload.findByID({ + collection: collection as any, + id: docId, + }) + return true + } catch (error) { + // Document not found or other error + return false + } +} + +/** + * Poll a single batch and complete if succeeded - stream outputs and write embeddings incrementally. + * Checks document existence before writing each embedding (skips deleted docs). + * Returns both the batch status and completion counts. + */ +async function pollAndCompleteSingleBatch(args: { payload: Payload runId: string poolName: KnowledgePoolName - batches: any[] + batch: any callbacks: { - completeBatch: (args: { providerBatchId: string }) => Promise + pollOrCompleteBatch: (args: { + providerBatchId: string + onChunk: (chunk: BulkEmbeddingOutput) => Promise + }) => Promise<{ status: string; error?: string }> } }): Promise<{ - success: boolean + status: string + error?: string succeededCount: number failedCount: number failedChunkData: FailedChunkData[] - error?: string }> { - const { payload, runId, poolName, batches, callbacks } = args + const { payload, runId, poolName, batch, callbacks } = args + + let succeededCount = 0 + let failedCount = 0 + const failedChunkData: FailedChunkData[] = [] + const processedDocs = new Set() // Track which docs we've processed (for deletion) + + // Poll batch and stream chunks when complete + const pollResult = await callbacks.pollOrCompleteBatch({ + providerBatchId: batch.providerBatchId, + onChunk: async (output: BulkEmbeddingOutput) => { + // Lookup metadata on-demand (O(1) with index) instead of loading all into memory + const meta = await getMetadataByInputId({ + payload, + runId, + inputId: output.id, + }) + if (!meta) { + // Metadata not found - log and skip this chunk (may have been deleted or cleanup ran) + payload.logger.warn( + `[payloadcms-vectorize] Metadata not found for chunk ${output.id} in run ${runId}. 
Skipping chunk.`, + ) + failedCount++ + return + } - try { - // Load all metadata for this run - const metadataById = await loadInputMetadataByRun({ payload, runId }) - - // Collect all outputs from all batches - const allOutputs: BulkEmbeddingOutput[] = [] - for (const batch of batches) { - const outputs = await callbacks.completeBatch({ - providerBatchId: batch.providerBatchId, + // Check if document still exists (may have been deleted during bulk embedding) + const docExists = await documentExists({ + payload, + collection: meta.sourceCollection, + docId: meta.docId, }) - allOutputs.push(...outputs) - } - // Filter successful outputs and collect failed chunk data - const successfulOutputs = allOutputs.filter((o) => !o.error && o.embedding) - const failedChunkData: FailedChunkData[] = [] - for (const output of allOutputs) { - if (output.error) { - const meta = metadataById.get(output.id) - if (meta) { - failedChunkData.push({ - collection: meta.sourceCollection, - documentId: meta.docId, - chunkIndex: meta.chunkIndex, - }) - } + if (!docExists) { + // Document was deleted - skip this chunk and clean up metadata + await payload.delete({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { + and: [{ run: { equals: parseInt(runId, 10) } }, { inputId: { equals: output.id } }], + }, + }) + failedCount++ + failedChunkData.push({ + collection: meta.sourceCollection, + documentId: meta.docId, + chunkIndex: meta.chunkIndex, + }) + return } - } - const failedCount = failedChunkData.length - - // Collect unique doc keys for deletion - const docKeys = new Set() - for (const output of successfulOutputs) { - const meta = metadataById.get(output.id) - if (!meta) continue - docKeys.add(`${meta.sourceCollection}:${meta.docId}`) - } - // Delete existing embeddings for docs we're about to update - for (const key of docKeys) { - const [sourceCollection, docId] = key.split(':') - await payload.delete({ - collection: poolName, - where: { - and: [ - { sourceCollection: { equals: sourceCollection } }, - { docId: { equals: String(docId) } }, - ], - }, - }) - } + // Handle errors from provider + if (output.error || !output.embedding) { + failedCount++ + failedChunkData.push({ + collection: meta.sourceCollection, + documentId: meta.docId, + chunkIndex: meta.chunkIndex, + }) + return + } - // Write all new embeddings - for (const output of successfulOutputs) { - const meta = metadataById.get(output.id) - if (!meta || !output.embedding) continue + // Track this doc for potential deletion of old embeddings + const docKey = `${meta.sourceCollection}:${meta.docId}` + const isFirstChunkForDoc = !processedDocs.has(docKey) + if (isFirstChunkForDoc) { + processedDocs.add(docKey) + // Check if embeddings already exist for this document+version (from a previous batch) + const hasCurrentEmbedding = await docHasEmbeddingVersion({ + payload, + poolName, + sourceCollection: meta.sourceCollection, + docId: meta.docId, + embeddingVersion: meta.embeddingVersion, + }) + + // Only delete if no embeddings exist for this version (they're from an old version) + if (!hasCurrentEmbedding) { + // Delete existing embeddings for this document (from old version) + await payload.delete({ + collection: poolName, + where: { + and: [ + { sourceCollection: { equals: meta.sourceCollection } }, + { docId: { equals: String(meta.docId) } }, + ], + }, + }) + } + } + + // Write the embedding const embeddingArray = Array.isArray(output.embedding) ? 
output.embedding : Array.from(output.embedding) @@ -663,23 +824,17 @@ async function completeBatches(args: { vector: embeddingArray, id: String((created as any)?.id ?? ''), }) - } - return { - success: true, - succeededCount: successfulOutputs.length, - failedCount, - failedChunkData, - } - } catch (error) { - const errorMessage = (error as Error).message || String(error) - return { - success: false, - succeededCount: 0, - failedCount: 0, - failedChunkData: [], - error: `Completion failed: ${errorMessage}`, - } + succeededCount++ + }, + }) + + return { + status: pollResult.status, + error: pollResult.error, + succeededCount, + failedCount, + failedChunkData, } } @@ -733,63 +888,43 @@ async function docHasEmbeddingVersion(args: { return (existing as any)?.totalDocs > 0 } -async function loadInputMetadataByRun(args: { payload: Payload; runId: string }): Promise< - Map< - string, - { - text: string - sourceCollection: string - docId: string - chunkIndex: number - embeddingVersion: string - extensionFields?: Record - } - > -> { - const { payload, runId } = args - const map = new Map< - string, - { - text: string - sourceCollection: string - docId: string - chunkIndex: number - embeddingVersion: string - extensionFields?: Record - } - >() - - // Convert runId to number for postgres relationship queries +/** + * Lookup metadata for a single input by runId + inputId. + * Uses the composite index ['run', 'inputId'] for O(1) lookup. + * This approach uses constant memory instead of loading all metadata into memory. + */ +async function getMetadataByInputId(args: { + payload: Payload + runId: string + inputId: string +}): Promise<{ + text: string + sourceCollection: string + docId: string + chunkIndex: number + embeddingVersion: string + extensionFields?: Record +} | null> { + const { payload, runId, inputId } = args const runIdNum = parseInt(runId, 10) - let page = 1 - const limit = 100 - while (true) { - const res = await payload.find({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - page, - limit, - where: { run: { equals: runIdNum } }, - sort: 'inputId', - }) - const docs = (res as any)?.docs || [] - if (!docs.length) break - - for (const doc of docs) { - map.set(String(doc.inputId), { - text: doc.text, - sourceCollection: doc.sourceCollection, - docId: String(doc.docId), - chunkIndex: doc.chunkIndex, - embeddingVersion: doc.embeddingVersion, - extensionFields: doc.extensionFields || undefined, - }) - } + const result = await payload.find({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { + and: [{ run: { equals: runIdNum } }, { inputId: { equals: inputId } }], + }, + limit: 1, + }) - const totalPages = (res as any)?.totalPages ?? 
page - page++ - if (page > totalPages) break - } + const doc = (result as any)?.docs?.[0] + if (!doc) return null - return map + return { + text: doc.text, + sourceCollection: doc.sourceCollection, + docId: String(doc.docId), + chunkIndex: doc.chunkIndex, + embeddingVersion: doc.embeddingVersion, + extensionFields: doc.extensionFields || undefined, + } } diff --git a/src/types.ts b/src/types.ts index 220605f..63aeb58 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,5 +1,43 @@ import type { CollectionSlug, Payload, Field, Where } from 'payload' +/** Result from bulkEmbed method */ +export type BulkEmbedResult = + | { + /** ID of the created run */ + runId: string + /** Status of the run */ + status: 'queued' + } + | { + /** ID of existing active run */ + runId: string + /** Status of existing run */ + status: 'queued' | 'running' + /** Message explaining why a new run wasn't started */ + message: string + /** Indicates a conflict occurred */ + conflict: true + } + +/** Result from retryFailedBatch method */ +export type RetryFailedBatchResult = + | { + /** ID of the batch being retried */ + batchId: string + /** ID of the parent run */ + runId: string + /** New status of the batch */ + status: 'queued' + /** Confirmation message */ + message: string + } + | { + /** Error message */ + error: string + /** Indicates a conflict occurred (e.g., run still active) */ + conflict?: true + } + /** * Extended Payload type with vectorize plugin methods */ @@ -19,6 +57,10 @@ export type VectorizedPayload }, ) => Promise + /** Start a bulk embedding run for a knowledge pool */ + bulkEmbed: (params: { knowledgePool: TPoolNames }) => Promise + /** Retry a failed batch */ + retryFailedBatch: (params: { batchId: string }) => Promise } /** @@ -31,7 +73,11 @@ export function isVectorizedPayload(payload: Payload): payload is VectorizedPayl 'search' in payload && typeof (payload as any).search === 'function' && 'queueEmbed' in payload && - typeof (payload as any).queueEmbed === 'function' + typeof (payload as any).queueEmbed === 'function' && + 'bulkEmbed' in payload && + typeof (payload as any).bulkEmbed === 'function' && + 'retryFailedBatch' in payload && + typeof (payload as any).retryFailedBatch === 'function' ) } @@ -135,16 +181,12 @@ export type BatchSubmission = { providerBatchId: string } -/** Arguments for polling a single batch */ -export type PollBatchArgs = { - /** Provider-specific batch identifier */ - providerBatchId: string -} - -/** Arguments for completing/downloading a single batch */ -export type CompleteBatchArgs = { +/** Arguments for polling or completing a single batch */ +export type PollOrCompleteBatchArgs = { /** Provider-specific batch identifier */ providerBatchId: string + /** Callback function to stream completed chunks as they become available */ + onChunk: (chunk: BulkEmbeddingOutput) => Promise } /** Data about a failed chunk during bulk embedding completion */ @@ -190,11 +232,12 @@ export type BulkEmbeddingsFns = { */ addChunk: (args: AddChunkArgs) => Promise - /** Poll a specific batch by providerBatchId */ - pollBatch: (args: PollBatchArgs) => Promise - - /** Download outputs for a completed batch */ - completeBatch: (args: CompleteBatchArgs) => Promise + /** + * Poll a specific batch by providerBatchId, and stream outputs when complete. + * Call onChunk for each output as it becomes available once the batch completes. + * The function completes when all chunks have been streamed. 
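+   *
+   * A minimal sketch of one possible implementation (illustrative only; `providerBatches`
+   * and `embedTexts` are assumed helpers, not part of this plugin's API):
+   *
+   *   pollOrCompleteBatch: async ({ providerBatchId, onChunk }) => {
+   *     const job = providerBatches.get(providerBatchId)
+   *     if (!job) return { status: 'failed', error: 'unknown batch' }
+   *     if (!job.finished) return { status: 'running' }
+   *     const vectors = await embedTexts(job.inputs.map((i) => i.text))
+   *     for (let i = 0; i < job.inputs.length; i++) {
+   *       // Stream each output back; failed chunks can be reported as { id, error }.
+   *       await onChunk({ id: job.inputs[i].id, embedding: vectors[i] })
+   *     }
+   *     return { status: 'succeeded' }
+   *   }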
+ */ + pollOrCompleteBatch: (args: PollOrCompleteBatchArgs) => Promise /** * Called when the bulk run fails. Use this to clean up provider-side resources From 313ef3f99cccb5228c48891565de2d5d7a014175 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sun, 11 Jan 2026 20:05:02 +0700 Subject: [PATCH 26/49] adds import map --- dev/app/(payload)/admin/importMap.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/app/(payload)/admin/importMap.js b/dev/app/(payload)/admin/importMap.js index abe5d88..a13c2ea 100644 --- a/dev/app/(payload)/admin/importMap.js +++ b/dev/app/(payload)/admin/importMap.js @@ -21,6 +21,7 @@ import { StrikethroughFeatureClient as StrikethroughFeatureClient_e70f5e05f09f93 import { UnderlineFeatureClient as UnderlineFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' import { BoldFeatureClient as BoldFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' import { ItalicFeatureClient as ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' +import { RetryFailedBatchButton as RetryFailedBatchButton_69051d9d0217691c78245f4f33731b73 } from 'payloadcms-vectorize/client' import { EmbedAllButton as EmbedAllButton_69051d9d0217691c78245f4f33731b73 } from 'payloadcms-vectorize/client' import { CollectionCards as CollectionCards_ab83ff7e88da8d3530831f296ec4756a } from '@payloadcms/ui/rsc' @@ -48,6 +49,7 @@ export const importMap = { "@payloadcms/richtext-lexical/client#UnderlineFeatureClient": UnderlineFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, "@payloadcms/richtext-lexical/client#BoldFeatureClient": BoldFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, "@payloadcms/richtext-lexical/client#ItalicFeatureClient": ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, + "payloadcms-vectorize/client#RetryFailedBatchButton": RetryFailedBatchButton_69051d9d0217691c78245f4f33731b73, "payloadcms-vectorize/client#EmbedAllButton": EmbedAllButton_69051d9d0217691c78245f4f33731b73, "@payloadcms/ui/rsc#CollectionCards": CollectionCards_ab83ff7e88da8d3530831f296ec4756a } From 1db26018ccb8dcfe9514e0b892c582077f12989a Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sun, 11 Jan 2026 21:20:03 +0700 Subject: [PATCH 27/49] assigns the extra funcs to the payload instance --- src/index.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/index.ts b/src/index.ts index d0eaf09..2da03ea 100644 --- a/src/index.ts +++ b/src/index.ts @@ -391,8 +391,7 @@ export const createVectorizeIntegration = const vectorSearchHandlers = createVectorSearchHandlers(pluginOptions.knowledgePools) config.onInit = async (payload) => { if (incomingOnInit) await incomingOnInit(payload) - ;(payload as VectorizedPayload) = { - ...(payload as any), + Object.assign(payload, { _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { const poolConfig = pluginOptions.knowledgePools[knowledgePool] return !!poolConfig?.embeddingConfig?.bulkEmbeddingsFns @@ -452,7 +451,7 @@ export const createVectorizeIntegration = knowledgePools: pluginOptions.knowledgePools, queueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, }), - } + } as Partial>) // Ensure pgvector artifacts for each knowledge pool for (const poolName in staticConfigs) { const staticConfig = staticConfigs[poolName] From 2ca94b505d73d5648f8bda7e1ef0253dfec73fc8 Mon Sep 17 00:00:00 2001 From: techiejd 
<62455039+techiejd@users.noreply.github.com> Date: Sun, 11 Jan 2026 21:59:32 +0700 Subject: [PATCH 28/49] WIP --- dev/payload-types.ts | 148 +++++++++++++++++++-- dev/specs/bulkEmbed/partialFailure.spec.ts | 20 ++- 2 files changed, 156 insertions(+), 12 deletions(-) diff --git a/dev/payload-types.ts b/dev/payload-types.ts index cf54a34..5fb6133 100644 --- a/dev/payload-types.ts +++ b/dev/payload-types.ts @@ -70,8 +70,10 @@ export interface Config { posts: Post; 'vector-bulk-embeddings-runs': VectorBulkEmbeddingsRun; 'vector-bulk-embedding-input-metadata': VectorBulkEmbeddingInputMetadatum; + 'vector-bulk-embeddings-batches': VectorBulkEmbeddingsBatch; default: Default; bulkDefault: BulkDefault; + failingBulkDefault: FailingBulkDefault; 'payload-kv': PayloadKv; users: User; 'payload-jobs': PayloadJob; @@ -84,8 +86,10 @@ export interface Config { posts: PostsSelect | PostsSelect; 'vector-bulk-embeddings-runs': VectorBulkEmbeddingsRunsSelect | VectorBulkEmbeddingsRunsSelect; 'vector-bulk-embedding-input-metadata': VectorBulkEmbeddingInputMetadataSelect | VectorBulkEmbeddingInputMetadataSelect; + 'vector-bulk-embeddings-batches': VectorBulkEmbeddingsBatchesSelect | VectorBulkEmbeddingsBatchesSelect; default: DefaultSelect | DefaultSelect; bulkDefault: BulkDefaultSelect | BulkDefaultSelect; + failingBulkDefault: FailingBulkDefaultSelect | FailingBulkDefaultSelect; 'payload-kv': PayloadKvSelect | PayloadKvSelect; users: UsersSelect | UsersSelect; 'payload-jobs': PayloadJobsSelect | PayloadJobsSelect; @@ -175,15 +179,11 @@ export interface VectorBulkEmbeddingsRun { * Embedding version at submission time */ embeddingVersion: string; + status: 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled'; /** - * Provider file or input reference used for the batch - */ - inputFileRef?: string | null; - /** - * Provider batch identifier + * Total number of batches in this run */ - providerBatchId?: string | null; - status: 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled'; + totalBatches?: number | null; inputs?: number | null; succeeded?: number | null; failed?: number | null; @@ -199,6 +199,18 @@ export interface VectorBulkEmbeddingsRun { * Failure reason if the run ended in error */ error?: string | null; + /** + * Data about chunks that failed during completion (collection, documentId, chunkIndex) + */ + failedChunkData?: + | { + [k: string]: unknown; + } + | unknown[] + | string + | number + | boolean + | null; updatedAt: string; createdAt: string; } @@ -238,6 +250,54 @@ export interface VectorBulkEmbeddingInputMetadatum { updatedAt: string; createdAt: string; } +/** + * Individual batches within a bulk embedding run. Created when input count exceeds file limits. + * + * This interface was referenced by `Config`'s JSON-Schema + * via the `definition` "vector-bulk-embeddings-batches". 
+ */ +export interface VectorBulkEmbeddingsBatch { + id: number; + /** + * Parent bulk embedding run + */ + run: number | VectorBulkEmbeddingsRun; + /** + * Zero-based index of this batch within the run + */ + batchIndex: number; + /** + * Provider-specific batch identifier + */ + providerBatchId: string; + status: 'queued' | 'running' | 'succeeded' | 'failed' | 'canceled'; + /** + * Number of inputs in this batch + */ + inputCount: number; + /** + * Number of successful embeddings + */ + succeededCount?: number | null; + /** + * Number of failed embeddings + */ + failedCount?: number | null; + /** + * Timestamp when the batch was submitted to provider + */ + submittedAt?: string | null; + /** + * Timestamp when the batch finished + */ + completedAt?: string | null; + /** + * Error message if the batch failed + */ + error?: string | null; + updatedAt: string; + createdAt: string; +} /** * Vector embeddings for search and similarity queries. Created by the payloadcms-vectorize plugin. Embeddings cannot be added or modified, only deleted, through the admin panel. No other restrictions enforced. * @@ -300,6 +360,37 @@ export interface BulkDefault { updatedAt: string; createdAt: string; } +/** + * Vector embeddings for search and similarity queries. Created by the payloadcms-vectorize plugin. Embeddings cannot be added or modified, only deleted, through the admin panel. No other restrictions enforced. + * + * This interface was referenced by `Config`'s JSON-Schema + * via the `definition` "failingBulkDefault". + */ +export interface FailingBulkDefault { + id: number; + /** + * The collection that this embedding belongs to + */ + sourceCollection: string; + /** + * The ID of the source document + */ + docId: string; + /** + * The index of this chunk + */ + chunkIndex: number; + /** + * The original text that was vectorized + */ + chunkText?: string | null; + /** + * The version of the embedding model used + */ + embeddingVersion?: string | null; + updatedAt: string; + createdAt: string; +} /** * This interface was referenced by `Config`'s JSON-Schema * via the `definition` "payload-kv". @@ -463,6 +554,10 @@ export interface PayloadLockedDocument { relationTo: 'vector-bulk-embedding-input-metadata'; value: number | VectorBulkEmbeddingInputMetadatum; } | null) + | ({ + relationTo: 'vector-bulk-embeddings-batches'; + value: number | VectorBulkEmbeddingsBatch; + } | null) | ({ relationTo: 'default'; value: number | Default; @@ -471,6 +566,10 @@ export interface PayloadLockedDocument { relationTo: 'bulkDefault'; value: number | BulkDefault; } | null) + | ({ + relationTo: 'failingBulkDefault'; + value: number | FailingBulkDefault; + } | null) | ({ relationTo: 'users'; value: number | User; @@ -534,15 +633,15 @@ export interface PostsSelect { export interface VectorBulkEmbeddingsRunsSelect { pool?: T; embeddingVersion?: T; - inputFileRef?: T; - providerBatchId?: T; status?: T; + totalBatches?: T; inputs?: T; succeeded?: T; failed?: T; submittedAt?: T; completedAt?: T; error?: T; + failedChunkData?: T; updatedAt?: T; createdAt?: T; } @@ -562,6 +661,24 @@ export interface VectorBulkEmbeddingInputMetadataSelect { + run?: T; + batchIndex?: T; + providerBatchId?: T; + status?: T; + inputCount?: T; + succeededCount?: T; + failedCount?: T; + submittedAt?: T; + completedAt?: T; + error?: T; + updatedAt?: T; + createdAt?: T; +} /** * This interface was referenced by `Config`'s JSON-Schema * via the `definition` "default_select". 
@@ -588,6 +705,19 @@ export interface BulkDefaultSelect { updatedAt?: T; createdAt?: T; } +/** + * This interface was referenced by `Config`'s JSON-Schema + * via the `definition` "failingBulkDefault_select". + */ +export interface FailingBulkDefaultSelect { + sourceCollection?: T; + docId?: T; + chunkIndex?: T; + chunkText?: T; + embeddingVersion?: T; + updatedAt?: T; + createdAt?: T; +} /** * This interface was referenced by `Config`'s JSON-Schema * via the `definition` "payload-kv_select". diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts index 3d2f928..961274a 100644 --- a/dev/specs/bulkEmbed/partialFailure.spec.ts +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -152,8 +152,22 @@ describe('Bulk embed - partial chunk failures', () => { expect(updatedRun.failed).toBe(1) // Second chunk failed expect(updatedRun.failedChunkData).toBeDefined() expect(Array.isArray(updatedRun.failedChunkData)).toBe(true) - expect((updatedRun.failedChunkData as Array<{ collection: string; documentId: string; chunkIndex: number }>).length).toBe(1) - const failedChunk = (updatedRun.failedChunkData as Array<{ collection: string; documentId: string; chunkIndex: number }>)[0] + expect( + ( + updatedRun.failedChunkData as Array<{ + collection: string + documentId: string + chunkIndex: number + }> + ).length, + ).toBe(1) + const failedChunk = ( + updatedRun.failedChunkData as Array<{ + collection: string + documentId: string + chunkIndex: number + }> + )[0] expect(failedChunk.collection).toBe('posts') expect(failedChunk.documentId).toBe(String(post.id)) expect(failedChunk.chunkIndex).toBe(1) // Second chunk (index 1) @@ -236,7 +250,7 @@ describe('Bulk embed - partial chunk failures', () => { expect(updatedRun.status).toBe('succeeded') expect(updatedRun.failed).toBe(0) - expect(updatedRun.failedChunkData).toBeUndefined() + expect(updatedRun.failedChunkData).toBeNull() // onError should NOT be called when everything succeeds expect(onErrorCalled).toBe(false) From 22661e8d81f90ba47d7c496bd5311bfe220e7caa Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 12 Jan 2026 12:06:14 +0700 Subject: [PATCH 29/49] WIP --- dev/specs/bulkEmbed/partialFailure.spec.ts | 134 ++---------------- .../bulkEmbed/partialFailureNoFail.spec.ts | 105 ++++++++++++++ dev/specs/utils.ts | 14 +- src/collections/bulkEmbeddingInputMetadata.ts | 13 +- src/tasks/bulkEmbedAll.ts | 31 ++-- 5 files changed, 154 insertions(+), 143 deletions(-) create mode 100644 dev/specs/bulkEmbed/partialFailureNoFail.spec.ts diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts index 961274a..bf84443 100644 --- a/dev/specs/bulkEmbed/partialFailure.spec.ts +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -26,7 +26,6 @@ describe('Bulk embed - partial chunk failures', () => { beforeAll(async () => { await createTestDb({ dbName }) - // We'll set up the payload dynamically in each test to control failIds }) test('partial chunk failures are tracked and passed to onError', async () => { @@ -34,10 +33,11 @@ describe('Bulk embed - partial chunk failures', () => { onErrorCalled = false onErrorArgs = null - // The ID format is collectionSlug:docId:chunkIndex - // We need to fail a specific chunk - but we don't know the docId yet - // So we'll create the payload with a dynamic failIds check + // Use unique version to ensure this test only processes its own data + const testVersion = `${testEmbeddingVersion}-partial-${Date.now()}` + // 
Use a function-based failure check to avoid needing to know docId ahead of time + // Fail any chunk with index 1 (second chunk of any doc) const built = await buildPayloadWithIntegration({ dbName, pluginOpts: { @@ -52,13 +52,13 @@ describe('Bulk embed - partial chunk failures', () => { }, }, embeddingConfig: { - version: testEmbeddingVersion, + version: testVersion, queryFn: makeDummyEmbedQuery(DIMS), bulkEmbeddingsFns: createMockBulkEmbeddings( { statusSequence: ['succeeded'], - // We'll fail chunks that contain ":1" (second chunk of any doc) - partialFailure: { failIds: [] }, // Will be updated below + // Fail any chunk with index 1 (second chunk) - ID format is collection:docId:chunkIndex + partialFailure: { shouldFail: (id: string) => id.endsWith(':1') }, onErrorCallback: (args) => { onErrorCalled = true onErrorArgs = args @@ -73,61 +73,19 @@ describe('Bulk embed - partial chunk failures', () => { }, secret: 'test-secret', dims: DIMS, - key: `partial-failure-${Date.now()}`, + key: `partial-failure-${Date.now()}-${Math.random()}`, }) payload = built.payload - // Create a post + // Create a post with 2 chunks const post = await payload.create({ collection: 'posts', data: { title: 'Partial Failure Test' } as any, }) - // Now we know the docId, update the mock to fail the second chunk - const failChunkId = `posts:${post.id}:1` - - // Re-create with the correct failIds - const built2 = await buildPayloadWithIntegration({ - dbName, - pluginOpts: { - knowledgePools: { - default: { - collections: { - posts: { - toKnowledgePool: async (doc: any) => [ - { chunk: doc.title }, - { chunk: doc.title + ' chunk2' }, - ], - }, - }, - embeddingConfig: { - version: testEmbeddingVersion + '-v2', - queryFn: makeDummyEmbedQuery(DIMS), - bulkEmbeddingsFns: createMockBulkEmbeddings( - { - statusSequence: ['succeeded'], - partialFailure: { failIds: [failChunkId] }, - onErrorCallback: (args) => { - onErrorCalled = true - onErrorArgs = args - }, - }, - DIMS, - ), - }, - }, - }, - bulkQueueNames: BULK_QUEUE_NAMES, - }, - secret: 'test-secret', - dims: DIMS, - key: `partial-failure-2-${Date.now()}`, - }) - payload = built2.payload - const run = await payload.create({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion + '-v2', status: 'queued' }, + data: { pool: 'default', embeddingVersion: testVersion, status: 'queued' }, }) await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ @@ -183,76 +141,4 @@ describe('Bulk embed - partial chunk failures', () => { expect(onErrorArgs!.failedChunkCount).toBe(1) expect(onErrorArgs!.error.message).toContain('1 chunk(s) failed') }) - - test('run with no partial failures does not call onError', async () => { - // Reset state - onErrorCalled = false - onErrorArgs = null - - const built = await buildPayloadWithIntegration({ - dbName, - pluginOpts: { - knowledgePools: { - default: { - collections: { - posts: { - toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], - }, - }, - embeddingConfig: { - version: testEmbeddingVersion + '-v3', - queryFn: makeDummyEmbedQuery(DIMS), - bulkEmbeddingsFns: createMockBulkEmbeddings( - { - statusSequence: ['succeeded'], - // No partial failures - onErrorCallback: (args) => { - onErrorCalled = true - onErrorArgs = args - }, - }, - DIMS, - ), - }, - }, - }, - bulkQueueNames: BULK_QUEUE_NAMES, - }, - secret: 'test-secret', - dims: DIMS, - key: `no-partial-failure-${Date.now()}`, - }) - payload = built.payload - - await payload.create({ collection: 'posts', data: { title: 
'No Failure Test' } as any }) - - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion + '-v3', status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) - - await waitForBulkJobs(payload) - - // Check run status - const updatedRun = await payload.findByID({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - id: run.id, - }) - - expect(updatedRun.status).toBe('succeeded') - expect(updatedRun.failed).toBe(0) - expect(updatedRun.failedChunkData).toBeNull() - - // onError should NOT be called when everything succeeds - expect(onErrorCalled).toBe(false) - }) }) diff --git a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts new file mode 100644 index 0000000..133e97c --- /dev/null +++ b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts @@ -0,0 +1,105 @@ +import type { Payload } from 'payload' +import { beforeAll, describe, expect, test } from 'vitest' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' +import { + BULK_QUEUE_NAMES, + DEFAULT_DIMS, + buildPayloadWithIntegration, + createMockBulkEmbeddings, + createTestDb, + waitForBulkJobs, +} from '../utils.js' +import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' + +const DIMS = DEFAULT_DIMS +const dbName = `bulk_partial_failure_nofail_${Date.now()}` + +describe('Bulk embed - no partial failures', () => { + let payload: Payload + let onErrorCalled = false + let onErrorArgs: { + providerBatchIds: string[] + error: Error + failedChunkData?: Array<{ collection: string; documentId: string; chunkIndex: number }> + failedChunkCount?: number + } | null = null + + beforeAll(async () => { + await createTestDb({ dbName }) + }) + + test('run with no partial failures does not call onError', async () => { + // Reset state + onErrorCalled = false + onErrorArgs = null + + // Use unique version to ensure this test only processes its own data + const testVersion = `${testEmbeddingVersion}-nofail-${Date.now()}` + + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: testVersion, + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings( + { + statusSequence: ['succeeded'], + // No partial failures + onErrorCallback: (args) => { + onErrorCalled = true + onErrorArgs = args + }, + }, + DIMS, + ), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + secret: 'test-secret', + dims: DIMS, + key: `no-partial-failure-${Date.now()}-${Math.random()}`, + }) + payload = built.payload + + await payload.create({ collection: 'posts', data: { title: 'No Failure Test' } as any }) + + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { pool: 'default', embeddingVersion: testVersion, status: 'queued' }, + }) + + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: String(run.id) }, + req: { payload } as any, + ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName + ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } + : {}), + }) + + await waitForBulkJobs(payload) + + // Check run status + const updatedRun = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: run.id, + }) + + expect(updatedRun.status).toBe('succeeded') + expect(updatedRun.failed).toBe(0) + expect(updatedRun.failedChunkData).toBeNull() + + // onError should NOT be called when everything succeeds + expect(onErrorCalled).toBe(false) + }) +}) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index b44643c..474f4a1 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -76,7 +76,8 @@ export const BULK_QUEUE_NAMES = { type MockOptions = { statusSequence: BulkEmbeddingRunStatus[] - partialFailure?: { failIds: string[] } + /** Static list of IDs to fail, OR a function to decide at runtime */ + partialFailure?: { failIds: string[] } | { shouldFail: (id: string) => boolean } /** Optional: flush after this many chunks (for testing multi-batch scenarios) */ flushAfterChunks?: number /** Optional: callback to track onError calls for testing */ @@ -141,7 +142,12 @@ export function createMockBulkEmbeddings( const vectors = await embeddings(inputs.map((i) => i.text)) for (let idx = 0; idx < inputs.length; idx++) { const input = inputs[idx] - const shouldFail = partialFailure?.failIds?.includes(input.id) + // Support both static array and function-based failure check + const shouldFail = partialFailure + ? 'shouldFail' in partialFailure + ? partialFailure.shouldFail(input.id) + : partialFailure.failIds?.includes(input.id) + : false const output = shouldFail ? { id: input.id, error: 'fail' } : { id: input.id, embedding: vectors[idx] } @@ -156,7 +162,7 @@ export function createMockBulkEmbeddings( return { status } }, - onError: async ({ providerBatchIds, error }) => { + onError: async ({ providerBatchIds, error, failedChunkData, failedChunkCount }) => { // Clean up state for (const batchId of providerBatchIds) { batchInputs.delete(batchId) @@ -167,7 +173,7 @@ export function createMockBulkEmbeddings( // Call the test callback if provided if (onErrorCallback) { - onErrorCallback({ providerBatchIds, error }) + onErrorCallback({ providerBatchIds, error, failedChunkData, failedChunkCount }) } }, } diff --git a/src/collections/bulkEmbeddingInputMetadata.ts b/src/collections/bulkEmbeddingInputMetadata.ts index 29472c1..806c8df 100644 --- a/src/collections/bulkEmbeddingInputMetadata.ts +++ b/src/collections/bulkEmbeddingInputMetadata.ts @@ -1,5 +1,6 @@ import type { CollectionConfig } from 'payload' import { BULK_EMBEDDINGS_RUNS_SLUG } from './bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from './bulkEmbeddingsBatches.js' export const BULK_EMBEDDINGS_INPUT_METADATA_SLUG = 'vector-bulk-embedding-input-metadata' @@ -8,7 +9,7 @@ export const createBulkEmbeddingInputMetadataCollection = (): CollectionConfig = admin: { useAsTitle: 'inputId', description: 'Stores per-input metadata for bulk embedding runs.', - defaultColumns: ['run', 'inputId', 'sourceCollection', 'docId', 'chunkIndex'], + defaultColumns: ['run', 'batch', 'inputId', 'sourceCollection', 'docId', 'chunkIndex'], }, access: { // Anyone can read; only internal (local API) can mutate. 
@@ -25,6 +26,13 @@ export const createBulkEmbeddingInputMetadataCollection = (): CollectionConfig = required: true, admin: { description: 'Bulk run this input belongs to' }, }, + { + name: 'batch', + type: 'relationship', + relationTo: BULK_EMBEDDINGS_BATCHES_SLUG, + required: true, + admin: { description: 'Batch this input belongs to' }, + }, { name: 'inputId', type: 'text', @@ -71,6 +79,9 @@ export const createBulkEmbeddingInputMetadataCollection = (): CollectionConfig = { fields: ['run'], }, + { + fields: ['batch'], + }, { fields: ['sourceCollection', 'docId'], }, diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index fd9af6d..839eb40 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -624,13 +624,29 @@ async function streamAndBatchMissingEmbeddings(args: { // Convert runId to number for postgres relationships const runIdNum = parseInt(runId, 10) - // Store metadata for submitted chunks + // Create batch record first so we have the batch ID for metadata + const batchRecord = await payload.create({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + run: runIdNum, + batchIndex, + providerBatchId: submission.providerBatchId, + status: 'queued', + inputCount: submittedChunks.length, + submittedAt: new Date().toISOString(), + }, + }) + + const batchId = (batchRecord as any).id + + // Store metadata for submitted chunks with batch reference await Promise.all( submittedChunks.map((c) => payload.create({ collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, data: { run: runIdNum, + batch: batchId, inputId: c.id, text: c.text, sourceCollection: c.metadata.sourceCollection, @@ -643,19 +659,6 @@ async function streamAndBatchMissingEmbeddings(args: { ), ) - // Create batch record - await payload.create({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - data: { - run: runIdNum, - batchIndex, - providerBatchId: submission.providerBatchId, - status: 'queued', - inputCount: submittedChunks.length, - submittedAt: new Date().toISOString(), - }, - }) - totalInputs += submittedChunks.length batchIndex++ } From 3a02647f8721cfe5e788a21d2ac762dc19c7eb4e Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 12 Jan 2026 17:06:44 +0700 Subject: [PATCH 30/49] fixes tests --- dev/specs/bulkEmbed/failedBatch.spec.ts | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index 2d56359..e6010d8 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -82,8 +82,8 @@ describe('Bulk embed - failed batch', () => { expect(embeds.totalDocs).toBe(0) }) - test('metadata table is cleaned after failed run (no partial writes)', async () => { - await payload.create({ collection: 'posts', data: { title: 'FailCleanup' } as any }) + test('metadata table is kept after failed run (to allow retries)', async () => { + const post = await payload.create({ collection: 'posts', data: { title: 'FailCleanup' } as any }) const run = await payload.create({ collection: BULK_EMBEDDINGS_RUNS_SLUG, @@ -101,11 +101,20 @@ describe('Bulk embed - failed batch', () => { await waitForBulkJobs(payload) + // Metadata should be kept for failed batches to allow retries + const runIdNum = typeof run.id === 'number' ? 
run.id : parseInt(String(run.id), 10) const metadata = await payload.find({ collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - where: { run: { exists: true } }, + where: { run: { equals: runIdNum } }, }) - expect(metadata.totalDocs).toBe(0) + expect(metadata.totalDocs).toBeGreaterThan(0) + + // Verify no partial embeddings were written (no partial writes) + const embeds = await payload.find({ + collection: 'default', + where: { docId: { equals: String(post.id) } }, + }) + expect(embeds.totalDocs).toBe(0) }) test('cannot retry batch while run is still running', async () => { From c2d745bcea388331216bcb9bad0f81c5a6cd0db3 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 12 Jan 2026 19:16:19 +0700 Subject: [PATCH 31/49] Adds better retry strategy --- dev/specs/bulkEmbed/failedBatch.spec.ts | 120 ++++++++++- .../RetryFailedBatchButton/client.tsx | 196 ++++++++++++------ .../RetryFailedBatchButton/index.tsx | 57 ++--- src/collections/bulkEmbeddingsBatches.ts | 14 +- src/endpoints/retryFailedBatch.ts | 129 +++++++++++- src/tasks/bulkEmbedAll.ts | 2 +- src/types.ts | 10 +- 7 files changed, 402 insertions(+), 126 deletions(-) diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index e6010d8..20f580e 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -83,7 +83,10 @@ describe('Bulk embed - failed batch', () => { }) test('metadata table is kept after failed run (to allow retries)', async () => { - const post = await payload.create({ collection: 'posts', data: { title: 'FailCleanup' } as any }) + const post = await payload.create({ + collection: 'posts', + data: { title: 'FailCleanup' } as any, + }) const run = await payload.create({ collection: BULK_EMBEDDINGS_RUNS_SLUG, data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, }) await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ task: 'payloadcms-vectorize:prepare-bulk-embedding', input: { runId: String(run.id) }, req: { payload } as any, ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } + : {}), + }) + + await waitForBulkJobs(payload) + + // Find the failed batch + const batchesResult = await payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: run.id } }, + }) + const failedBatch = (batchesResult as any).docs[0] + expect(failedBatch.status).toBe('failed') + + // Retry the batch + const retryResult = await payload.retryFailedBatch({ batchId: String(failedBatch.id) }) + + expect('error' in retryResult).toBe(false) + if (!('error' in retryResult)) { + expect(retryResult.newBatchId).toBeDefined() + expect(retryResult.status).toBe('queued') + expect(retryResult.message).toContain('resubmitted') + + // Check that the old batch is marked as retried + const oldBatch = await payload.findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: String(failedBatch.id), + }) + expect((oldBatch as any).status).toBe('retried') + expect((oldBatch as any).retriedBatch).toBeDefined() + + // Check that the new batch exists and is queued + const newBatch = await payload.findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: retryResult.newBatchId!, + }) + expect((newBatch as any).status).toBe('queued') + expect((newBatch as any).providerBatchId).toBeDefined() + expect((newBatch as any).providerBatchId).not.toBe(failedBatch.providerBatchId) + + // Check that metadata points to the new batch + const runIdNum = typeof run.id === 'number' ? run.id : parseInt(String(run.id), 10) + const metadata = await payload.find({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { run: { equals: runIdNum } }, + }) + expect(metadata.totalDocs).toBeGreaterThan(0) + // All metadata should point to the new batch + for (const meta of (metadata as any).docs) { + const metaBatchId = + typeof meta.batch === 'object' ? meta.batch.id : parseInt(String(meta.batch), 10) + expect(metaBatchId).toBe(parseInt(retryResult.newBatchId!, 10)) + } + } + }) + + test('retrying a retried batch returns the existing retry batch', async () => { + const post = await payload.create({ + collection: 'posts', + data: { title: 'RetryRetryTest' } as any, + }) + + const run = await payload.create({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, + }) + + await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ + task: 'payloadcms-vectorize:prepare-bulk-embedding', + input: { runId: String(run.id) }, + req: { payload } as any, + ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName + ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } + : {}), + }) + + await waitForBulkJobs(payload) + + // Find the failed batch + const batchesResult = await payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: run.id } }, + }) + const failedBatch = (batchesResult as any).docs[0] + + // Retry the batch first time + const firstRetryResult = await payload.retryFailedBatch({ batchId: String(failedBatch.id) }) + expect('error' in firstRetryResult).toBe(false) + if ('error' in firstRetryResult) return + + const firstRetryBatchId = firstRetryResult.newBatchId! 
+ + // Retry the retried batch - should return the existing retry batch + const secondRetryResult = await payload.retryFailedBatch({ batchId: String(failedBatch.id) }) + + expect('error' in secondRetryResult).toBe(false) + if (!('error' in secondRetryResult)) { + expect(secondRetryResult.newBatchId).toBe(firstRetryBatchId) + expect(secondRetryResult.message).toContain('already retried') + } + }) }) diff --git a/src/admin/components/RetryFailedBatchButton/client.tsx b/src/admin/components/RetryFailedBatchButton/client.tsx index 40a4374..c1aa874 100644 --- a/src/admin/components/RetryFailedBatchButton/client.tsx +++ b/src/admin/components/RetryFailedBatchButton/client.tsx @@ -1,20 +1,25 @@ 'use client' import React, { useState } from 'react' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../collections/bulkEmbeddingsBatches.js' type RetryFailedBatchButtonClientProps = { batchId: string status: string + retriedBatchId?: string | null } export const RetryFailedBatchButtonClient: React.FC = ({ batchId, status, + retriedBatchId, }) => { + console.log('RetryFailedBatchButtonClient', batchId, status, retriedBatchId) const [isSubmitting, setIsSubmitting] = useState(false) const [message, setMessage] = useState<{ text: string; error?: boolean } | null>(null) - const isDisabled = status !== 'failed' + const isDisabled = status !== 'failed' && status !== 'retried' + const isRetried = status === 'retried' && retriedBatchId const handleClick = async () => { if (isDisabled) return @@ -38,12 +43,24 @@ export const RetryFailedBatchButtonClient: React.FC { - window.location.reload() - }, 1500) + // If a new batch was created, show that in the message + const newBatchId = data?.newBatchId + if (newBatchId) { + setMessage({ + text: `Batch resubmitted successfully. New batch ID: ${newBatchId}`, + error: false, + }) + // Redirect to the new batch after a delay + setTimeout(() => { + window.location.href = `/admin/collections/${BULK_EMBEDDINGS_BATCHES_SLUG}/${newBatchId}` + }, 2000) + } else { + setMessage({ text: 'Batch resubmitted successfully', error: false }) + // Reload the page after a short delay to show the updated status + setTimeout(() => { + window.location.reload() + }, 1500) + } } catch (error: any) { setMessage({ text: error?.message || 'Failed to retry batch', error: true }) } finally { @@ -68,10 +85,14 @@ export const RetryFailedBatchButtonClient: React.FC - {isDisabled ? 'Retry Not Available' : 'Retry Failed Batch'} + {isRetried + ? 'Batch Retried' + : isDisabled + ? 'Retry Not Available' + : 'Retry Failed Batch'}

- {isDisabled - ? `This batch is in "${status}" status. Retry is only available for failed batches.` - : 'Re-queue this failed batch for processing. The batch will be polled again and embeddings will be written for successful chunks.'} + {isRetried ? ( + <> + This batch was retried.{' '} + {retriedBatchId && ( + + View retry batch + + )} + + ) : isDisabled ? ( + `This batch is in "${status}" status. Retry is only available for failed or retried batches.` + ) : ( + 'Resubmit this failed batch to the provider. The batch will be resubmitted and processed from the beginning.' + )}

- + Retrying... + + ) : ( + <> + + + + Retry + + )} + + )} {message && ( @@ -157,14 +194,37 @@ export const RetryFailedBatchButtonClient: React.FC {message.error ? ( - + - + ) : ( - + - + )} {message.text} diff --git a/src/admin/components/RetryFailedBatchButton/index.tsx b/src/admin/components/RetryFailedBatchButton/index.tsx index b06b00d..7f47387 100644 --- a/src/admin/components/RetryFailedBatchButton/index.tsx +++ b/src/admin/components/RetryFailedBatchButton/index.tsx @@ -1,5 +1,6 @@ import React from 'react' import { RetryFailedBatchButtonClient } from './client.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../collections/bulkEmbeddingsBatches.js' type RetryFailedBatchButtonProps = { batchId: string @@ -7,50 +8,22 @@ type RetryFailedBatchButtonProps = { } export const RetryFailedBatchButton: React.FC< - RetryFailedBatchButtonProps & { payload?: any; params?: any; data?: any } -> = (props) => { - // Handle both direct props and serverProps functions - let batchId: string = '' - let status: string = '' + RetryFailedBatchButtonProps & { payload?: any; id?: string } +> = async (props) => { + const batch = await props.payload?.findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: props.id, + }) - if (typeof props.batchId === 'function') { - try { - batchId = String( - (props.batchId as any)({ payload: props.payload, params: props.params, data: props.data }) || - '', - ) - } catch (error) { - console.error('[RetryFailedBatchButton] Error calling batchId:', error) - batchId = '' - } - } else if (props.data?.id) { - batchId = String(props.data.id) - } else { - batchId = String(props.batchId || '') - } + console.log('RetryFailedBatchButtonBatch', batch) - if (typeof props.status === 'function') { - try { - status = String( - (props.status as any)({ payload: props.payload, params: props.params, data: props.data }) || - '', - ) - } catch (error) { - console.error('[RetryFailedBatchButton] Error calling status:', error) - status = '' - } - } else if (props.data?.status) { - status = String(props.data.status) - } else { - status = String(props.status || '') - } - - // Only render on the edit view (when we have a batchId) - if (!batchId) { - return null - } - - return + return ( + + ) } export default RetryFailedBatchButton diff --git a/src/collections/bulkEmbeddingsBatches.ts b/src/collections/bulkEmbeddingsBatches.ts index 219e1ba..488e013 100644 --- a/src/collections/bulkEmbeddingsBatches.ts +++ b/src/collections/bulkEmbeddingsBatches.ts @@ -9,6 +9,7 @@ const statusOptions: BulkEmbeddingRunStatus[] = [ 'succeeded', 'failed', 'canceled', + 'retried', ] /** @@ -27,10 +28,6 @@ export const createBulkEmbeddingsBatchesCollection = (): CollectionConfig => ({ beforeDocumentControls: [ { path: 'payloadcms-vectorize/client#RetryFailedBatchButton', - serverProps: { - batchId: ({ data }: { data: any }) => data?.id, - status: ({ data }: { data: any }) => data?.status, - }, }, ], }, @@ -118,6 +115,15 @@ export const createBulkEmbeddingsBatchesCollection = (): CollectionConfig => ({ description: 'Error message if the batch failed', }, }, + { + name: 'retriedBatch', + type: 'relationship', + relationTo: BULK_EMBEDDINGS_BATCHES_SLUG, + admin: { + description: 'The new batch created when this batch was retried', + }, + hasMany: false, + }, ], timestamps: true, indexes: [ diff --git a/src/endpoints/retryFailedBatch.ts b/src/endpoints/retryFailedBatch.ts index d66cb61..8d86e50 100644 --- a/src/endpoints/retryFailedBatch.ts +++ b/src/endpoints/retryFailedBatch.ts @@ -1,10 +1,12 @@ import type { 
Payload, PayloadHandler } from 'payload' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../collections/bulkEmbeddingsBatches.js' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../collections/bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../collections/bulkEmbeddingInputMetadata.js' import type { KnowledgePoolDynamicConfig, KnowledgePoolName, RetryFailedBatchResult, + BulkEmbeddingInput, } from '../types.js' /** @@ -34,10 +36,25 @@ export async function retryBatch= totalPages || pageDocs.length === 0) break + metadataPage++ + } + + if (metadataDocs.length === 0) { + return { + error: `No metadata found for batch "${batchId}". Cannot retry without chunk data.`, + } + } + + // Reconstruct chunks from metadata (only id and text for addChunk) + const chunks: BulkEmbeddingInput[] = metadataDocs.map((meta: any) => ({ + id: meta.inputId, + text: meta.text, + })) + + // Find the highest batchIndex for this run to determine the new batch index + const existingBatchesResult = await payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: runIdNum } }, + limit: 1000, + sort: '-batchIndex', + }) + const existingBatches = (existingBatchesResult as any)?.docs || [] + const maxBatchIndex = existingBatches.length > 0 ? (existingBatches[0].batchIndex as number) : -1 + const newBatchIndex = maxBatchIndex + 1 + + // Resubmit chunks via addChunk to get a new providerBatchId + // Submit all chunks - addChunk will accumulate and return a BatchSubmission when ready + let submission: { providerBatchId: string } | null = null + for (let i = 0; i < chunks.length; i++) { + const chunk = chunks[i] + const isLastChunk = i === chunks.length - 1 + + const result = await callbacks.addChunk({ + chunk, + isLastChunk, + }) + + if (result) { + submission = result + break // Batch was submitted + } + } + + if (!submission) { + return { + error: 'Failed to resubmit batch - no providerBatchId was returned from addChunk', + } + } + + // Create the new batch + const newBatch = await payload.create({ collection: BULK_EMBEDDINGS_BATCHES_SLUG, - id: batchId, data: { + run: runIdNum, + batchIndex: newBatchIndex, + providerBatchId: submission.providerBatchId, status: 'queued', - error: null, - completedAt: null, + inputCount: chunks.length, succeededCount: 0, failedCount: 0, + submittedAt: new Date().toISOString(), + }, + }) + + // Update metadata to point to the new batch + await payload.update({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + where: { batch: { equals: batchIdNum } }, + data: { + batch: newBatch.id, + }, + }) + + // Update the old batch to point to the new batch and set status to 'retried' + await payload.update({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: batchId, + data: { + status: 'retried', + retriedBatch: newBatch.id, }, }) @@ -108,9 +218,10 @@ export async function retryBatch Date: Mon, 12 Jan 2026 19:35:58 +0700 Subject: [PATCH 32/49] Increases test time --- playwright.config.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playwright.config.js b/playwright.config.js index 9c895ac..f6e2af2 100644 --- a/playwright.config.js +++ b/playwright.config.js @@ -46,7 +46,7 @@ export default defineConfig({ command: 'cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS=--require=dotenv/config next dev dev --turbo', reuseExistingServer: true, - timeout: 180_000, + timeout: 300_000, url: 'http://localhost:3000/admin', }, }) From 5fdd48aac11291a80f340d68f52e6cb34ba51ddd Mon Sep 17 00:00:00 2001 From: techiejd 
<62455039+techiejd@users.noreply.github.com> Date: Mon, 12 Jan 2026 20:36:18 +0700 Subject: [PATCH 33/49] WIP --- dev/specs/e2e.spec.ts | 89 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index 1706ea7..532edb3 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -249,7 +249,7 @@ test.describe('Vector embedding e2e tests', () => { }) expect(succeededRetryResponse.status()).toBe(400) const succeededRetryJson = await succeededRetryResponse.json() - expect(succeededRetryJson.error).toContain('not in failed status') + expect(succeededRetryJson.error).toContain('not in failed or retried status') console.log('[test] Retry endpoint correctly rejected succeeded batch') // Navigate to the succeeded batch page and verify retry button is disabled @@ -265,7 +265,8 @@ test.describe('Vector embedding e2e tests', () => { // Verify the button is disabled (opacity check) const buttonStyle = await retryButton.getAttribute('style') - expect(buttonStyle).toContain('opacity: 0.5') + console.log('[test] Button style:', buttonStyle) + expect(buttonStyle).toContain('opacity:0.5') // Verify the "Retry Not Available" message is shown const notAvailableMessage = page.locator('text=/Retry Not Available/i') @@ -361,14 +362,64 @@ test.describe('Vector embedding e2e tests', () => { await waitForBulkJobs(payload, 30000) console.log('[test] Bulk jobs completed') - // Find the failed batch that was created - const batches = await (payload as any).find({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - where: { - and: [{ run: { equals: runId } }, { status: { equals: 'failed' } }], - }, - }) - expect(batches.totalDocs).toBeGreaterThan(0) + // Wait for the batch to actually fail (poll-or-complete job needs to finish) + const runIdNum = parseInt(runId, 10) + let batches: any + let attempts = 0 + const maxAttempts = 30 // Wait up to 30 seconds + + while (attempts < maxAttempts) { + batches = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { + and: [{ run: { equals: runIdNum } }, { status: { equals: 'failed' } }], + }, + }) + + if (batches.totalDocs > 0) { + break + } + + // Check current batch status + const allBatches = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: runIdNum } }, + }) + if (allBatches.totalDocs > 0) { + const currentStatus = allBatches.docs[0].status + if (currentStatus === 'failed') { + batches = allBatches + break + } + } + + // Wait a bit before retrying + await new Promise((resolve) => setTimeout(resolve, 1000)) + attempts++ + } + + if (!batches || batches.totalDocs === 0) { + // Final check for debugging + const allBatchesFinal = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: runIdNum } }, + }) + const runFinal = await (payload as any).findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: runId, + }) + console.log('[test] Failed to find failed batch after', attempts, 'attempts') + console.log('[test] Run status:', runFinal.status) + console.log('[test] Batches found:', allBatchesFinal.totalDocs) + if (allBatchesFinal.totalDocs > 0) { + console.log( + '[test] Batch statuses:', + allBatchesFinal.docs.map((b: any) => b.status), + ) + } + } + + expect(batches?.totalDocs).toBeGreaterThan(0) const batch = batches.docs[0] console.log('[test] Found failed batch:', batch.id) @@ -378,17 +429,27 @@ test.describe('Vector embedding e2e tests', () => { }) 
expect(retryResponse.status()).toBe(202) const retryJson = await retryResponse.json() - expect(retryJson.message).toBe('Failed batch has been re-queued for processing') + expect(retryJson.message).toBe('Failed batch has been resubmitted and re-queued for processing') expect(retryJson.batchId).toBe(String(batch.id)) + expect(retryJson.newBatchId).toBeDefined() expect(retryJson.status).toBe('queued') - // Verify the batch status was updated + // Verify the old batch status was updated to 'retried' const updatedBatch = await (payload as any).findByID({ collection: BULK_EMBEDDINGS_BATCHES_SLUG, id: String(batch.id), }) - expect(updatedBatch.status).toBe('queued') - expect(updatedBatch.error).toBeNull() + expect(updatedBatch.status).toBe('retried') + expect(updatedBatch.retriedBatch).toBeDefined() + + // Verify the new batch exists and is queued + const newBatch = await (payload as any).findByID({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + id: retryJson.newBatchId, + }) + expect(newBatch.status).toBe('queued') + expect(newBatch.providerBatchId).toBeDefined() + expect(newBatch.providerBatchId).not.toBe(batch.providerBatchId) // Verify the run status was reset to running const updatedRun = await (payload as any).findByID({ From 5194ce94e6c426da9ec363bcfde854f1face7be8 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Mon, 12 Jan 2026 23:59:03 +0700 Subject: [PATCH 34/49] WIP --- dev/app/(payload)/admin/importMap.js | 2 + dev/specs/e2e.spec.ts | 99 +++++++--- .../components/FailedBatchesList/client.tsx | 182 ++++++++++++++++++ .../components/FailedBatchesList/index.tsx | 57 ++++++ src/collections/bulkEmbeddingInputMetadata.ts | 1 + src/collections/bulkEmbeddingsBatches.ts | 6 +- src/collections/bulkEmbeddingsRuns.ts | 15 +- src/exports/client.ts | 1 + 8 files changed, 335 insertions(+), 28 deletions(-) create mode 100644 src/admin/components/FailedBatchesList/client.tsx create mode 100644 src/admin/components/FailedBatchesList/index.tsx diff --git a/dev/app/(payload)/admin/importMap.js b/dev/app/(payload)/admin/importMap.js index a13c2ea..8fa9361 100644 --- a/dev/app/(payload)/admin/importMap.js +++ b/dev/app/(payload)/admin/importMap.js @@ -21,6 +21,7 @@ import { StrikethroughFeatureClient as StrikethroughFeatureClient_e70f5e05f09f93 import { UnderlineFeatureClient as UnderlineFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' import { BoldFeatureClient as BoldFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' import { ItalicFeatureClient as ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864 } from '@payloadcms/richtext-lexical/client' +import { FailedBatchesList as FailedBatchesList_69051d9d0217691c78245f4f33731b73 } from 'payloadcms-vectorize/client' import { RetryFailedBatchButton as RetryFailedBatchButton_69051d9d0217691c78245f4f33731b73 } from 'payloadcms-vectorize/client' import { EmbedAllButton as EmbedAllButton_69051d9d0217691c78245f4f33731b73 } from 'payloadcms-vectorize/client' import { CollectionCards as CollectionCards_ab83ff7e88da8d3530831f296ec4756a } from '@payloadcms/ui/rsc' @@ -49,6 +50,7 @@ export const importMap = { "@payloadcms/richtext-lexical/client#UnderlineFeatureClient": UnderlineFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, "@payloadcms/richtext-lexical/client#BoldFeatureClient": BoldFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, "@payloadcms/richtext-lexical/client#ItalicFeatureClient": ItalicFeatureClient_e70f5e05f09f93e00b997edb1ef0c864, + 
"payloadcms-vectorize/client#FailedBatchesList": FailedBatchesList_69051d9d0217691c78245f4f33731b73, "payloadcms-vectorize/client#RetryFailedBatchButton": RetryFailedBatchButton_69051d9d0217691c78245f4f33731b73, "payloadcms-vectorize/client#EmbedAllButton": EmbedAllButton_69051d9d0217691c78245f4f33731b73, "@payloadcms/ui/rsc#CollectionCards": CollectionCards_ab83ff7e88da8d3530831f296ec4756a diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index 532edb3..e0be340 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -461,45 +461,100 @@ test.describe('Vector embedding e2e tests', () => { console.log('[test] Retry failed batch endpoint test completed successfully!') }) - test('retry failed batch button works for failed batches', async ({ page }) => { + test('retry failed batch button works for failed batches', async ({ page, request }) => { console.log('[test] Starting retry button click test...') test.setTimeout(120000) // Login first await loginToAdmin(page) - // Create a bulk embedding run - const run = await (payload as any).create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, + // Create a test post first (needed for bulk embedding to have something to embed) + const post = await payload.create({ + collection: 'posts', data: { - pool: 'failingBulkDefault', - embeddingVersion: testEmbeddingVersion, - status: 'failed', + title: 'Failed batch UI test post', }, }) - console.log('[test] Created bulk run:', run.id) + console.log('[test] Created test post:', post.id) - // Create a failed batch - const failedBatch = await (payload as any).create({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, + // Use the bulk embed endpoint to create a run for failingBulkDefault pool + const bulkEmbedResponse = await request.post('/api/vector-bulk-embed', { data: { - run: run.id, - batchIndex: 0, - providerBatchId: `mock-failed-ui-${Date.now()}`, - status: 'failed', - inputCount: 1, - error: 'Test error for UI test', + knowledgePool: 'failingBulkDefault', }, }) - console.log('[test] Created failed batch:', failedBatch.id) + expect(bulkEmbedResponse.ok()).toBe(true) + const bulkEmbedJson = await bulkEmbedResponse.json() + const runId = bulkEmbedJson.runId + console.log('[test] Created bulk run via endpoint:', runId) - // Navigate to the failed batch edit page - console.log('[test] Navigating to failed batch page...') - await page.goto(`/admin/collections/${BULK_EMBEDDINGS_BATCHES_SLUG}/${failedBatch.id}`, { + // Wait for the bulk jobs to process and fail (failingBulkDefault has a mock that fails) + await waitForBulkJobs(payload, 30000) + console.log('[test] Bulk jobs completed') + + // Wait for the batch to actually fail (poll-or-complete job needs to finish) + const runIdNum = parseInt(runId, 10) + let batches: any + let attempts = 0 + const maxAttempts = 30 // Wait up to 30 seconds + + while (attempts < maxAttempts) { + batches = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { + and: [{ run: { equals: runIdNum } }, { status: { equals: 'failed' } }], + }, + }) + + if (batches.totalDocs > 0) { + break + } + + // Check current batch status + const allBatches = await (payload as any).find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { run: { equals: runIdNum } }, + }) + if (allBatches.totalDocs > 0) { + const currentStatus = allBatches.docs[0].status + if (currentStatus === 'failed') { + batches = allBatches + break + } + } + + // Wait a bit before retrying + await new Promise((resolve) => setTimeout(resolve, 1000)) + attempts++ + } + + 
expect(batches?.totalDocs).toBeGreaterThan(0) + const failedBatch = batches.docs[0] + console.log('[test] Found failed batch:', failedBatch.id) + + // Navigate to the run edit page (where FailedBatchesList component should be visible) + console.log('[test] Navigating to run page...') + await page.goto(`/admin/collections/${BULK_EMBEDDINGS_RUNS_SLUG}/${runId}`, { waitUntil: 'networkidle', }) await page.waitForLoadState('domcontentloaded') + // Wait for the FailedBatchesList component to appear + const failedBatchesList = page.locator('[data-testid^="failed-batch-link-"]').first() + await expect(failedBatchesList).toBeVisible({ timeout: 10000 }) + console.log('[test] Failed batches list is visible') + + // Click on the failed batch link to navigate to the batch page + console.log('[test] Clicking failed batch link...') + await failedBatchesList.click() + + // Wait for navigation to batch page + await page.waitForURL(/\/admin\/collections\/vector-bulk-embeddings-batches\/\d+/, { + timeout: 10000, + }) + await page.waitForLoadState('domcontentloaded') + console.log('[test] Navigated to batch page') + // Look for the retry button const retryButton = page.locator('[data-testid="retry-failed-batch-button"]') await expect(retryButton).toBeVisible({ timeout: 15000 }) @@ -517,7 +572,7 @@ test.describe('Vector embedding e2e tests', () => { await retryButton.click() // Wait for success message - const successMessage = page.locator('text=/Batch re-queued successfully/i') + const successMessage = page.locator('text=/Batch resubmitted successfully/i') await expect(successMessage).toBeVisible({ timeout: 10000 }) console.log('[test] Retry button click test completed!') diff --git a/src/admin/components/FailedBatchesList/client.tsx b/src/admin/components/FailedBatchesList/client.tsx new file mode 100644 index 0000000..5503a1b --- /dev/null +++ b/src/admin/components/FailedBatchesList/client.tsx @@ -0,0 +1,182 @@ +'use client' + +import React from 'react' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../collections/bulkEmbeddingsBatches.js' + +type FailedBatch = { + id: string + batchIndex: number + providerBatchId: string + error?: string | null +} + +type FailedBatchesListClientProps = { + runId: string + failedCount: number + batches: FailedBatch[] +} + +export const FailedBatchesListClient: React.FC = ({ + runId, + failedCount, + batches, +}) => { + if (batches.length === 0) { + return null + } + + return ( +
+
+

+ + + + Failed Batches ({failedCount}) +

+

+ {batches.length === failedCount + ? 'All failed batches are listed below. Click to view details and retry.' + : `Showing ${batches.length} of ${failedCount} failed batches.`} +

+
+ + + + {batches.length < failedCount && ( + + )} +
+ ) +} + +export default FailedBatchesListClient diff --git a/src/admin/components/FailedBatchesList/index.tsx b/src/admin/components/FailedBatchesList/index.tsx new file mode 100644 index 0000000..a666529 --- /dev/null +++ b/src/admin/components/FailedBatchesList/index.tsx @@ -0,0 +1,57 @@ +import React from 'react' +import { FailedBatchesListClient } from './client.js' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../collections/bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../collections/bulkEmbeddingsBatches.js' + +type FailedBatchesListProps = { + payload?: any + id?: string + data?: any // The document data passed by beforeDocumentControls +} + +export const FailedBatchesList: React.FC = async (props) => { + // Always render something for debugging + console.log('[FailedBatchesList] Component called with props:', { + hasPayload: !!props.payload, + hasId: !!props.id, + allProps: Object.keys(props), + }) + + const run = await props.payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: props.id, + }) + + console.log('[FailedBatchesList] Fetching failed batches for run:', run.id) + + // Fetch failed batches for this run + const runIdNum = typeof run.id === 'number' ? run.id : parseInt(String(run.id), 10) + const failedBatches = await props.payload.find({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + where: { + and: [{ run: { equals: runIdNum } }, { status: { equals: 'failed' } }], + }, + limit: 100, // Limit to first 100 failed batches + sort: 'batchIndex', + }) + + const batches = (failedBatches as any)?.docs || [] + const runId = props.id || String(run.id) + + console.log('[FailedBatchesList] Found batches:', batches.length, 'for run:', runId) + + return ( + ({ + id: String(b.id), + batchIndex: b.batchIndex, + providerBatchId: b.providerBatchId, + error: b.error, + }))} + /> + ) +} + +export default FailedBatchesList diff --git a/src/collections/bulkEmbeddingInputMetadata.ts b/src/collections/bulkEmbeddingInputMetadata.ts index 806c8df..5ebce1b 100644 --- a/src/collections/bulkEmbeddingInputMetadata.ts +++ b/src/collections/bulkEmbeddingInputMetadata.ts @@ -7,6 +7,7 @@ export const BULK_EMBEDDINGS_INPUT_METADATA_SLUG = 'vector-bulk-embedding-input- export const createBulkEmbeddingInputMetadataCollection = (): CollectionConfig => ({ slug: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, admin: { + hidden: true, useAsTitle: 'inputId', description: 'Stores per-input metadata for bulk embedding runs.', defaultColumns: ['run', 'batch', 'inputId', 'sourceCollection', 'docId', 'chunkIndex'], diff --git a/src/collections/bulkEmbeddingsBatches.ts b/src/collections/bulkEmbeddingsBatches.ts index 488e013..e47c18a 100644 --- a/src/collections/bulkEmbeddingsBatches.ts +++ b/src/collections/bulkEmbeddingsBatches.ts @@ -36,9 +36,9 @@ export const createBulkEmbeddingsBatchesCollection = (): CollectionConfig => ({ access: { // Anyone can read; only internal (local API) can mutate. 
read: () => true, - create: ({ req }) => req?.payloadAPI === 'local', - update: ({ req }) => req?.payloadAPI === 'local', - delete: ({ req }) => req?.payloadAPI === 'local', + create: ({ req }) => false, + update: ({ req }) => false, + delete: ({ req }) => false, }, fields: [ { diff --git a/src/collections/bulkEmbeddingsRuns.ts b/src/collections/bulkEmbeddingsRuns.ts index c6faf25..c9f5b36 100644 --- a/src/collections/bulkEmbeddingsRuns.ts +++ b/src/collections/bulkEmbeddingsRuns.ts @@ -18,13 +18,22 @@ export const createBulkEmbeddingsRunsCollection = (): CollectionConfig => ({ description: 'Bulk embedding run records. Created automatically when the Embed all action is triggered.', defaultColumns: ['pool', 'status', 'inputs', 'succeeded', 'failed', 'submittedAt'], + components: { + edit: { + beforeDocumentControls: [ + { + path: 'payloadcms-vectorize/client#FailedBatchesList', + }, + ], + }, + }, }, access: { // Anyone can read; only internal (local API) can mutate. read: () => true, - create: ({ req }) => req?.payloadAPI === 'local', - update: ({ req }) => req?.payloadAPI === 'local', - delete: ({ req }) => req?.payloadAPI === 'local', + create: ({ req }) => false, + update: ({ req }) => false, + delete: ({ req }) => false, }, fields: [ { diff --git a/src/exports/client.ts b/src/exports/client.ts index c871ed9..0eb8619 100644 --- a/src/exports/client.ts +++ b/src/exports/client.ts @@ -1,2 +1,3 @@ export { EmbedAllButton } from '../admin/components/EmbedAllButton/index.js' export { RetryFailedBatchButton } from '../admin/components/RetryFailedBatchButton/index.js' +export { FailedBatchesList } from '../admin/components/FailedBatchesList/index.js' From 171411a11a82bf2e8625905596946e375ad3e21d Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Tue, 13 Jan 2026 12:07:48 +0700 Subject: [PATCH 35/49] Fixes tests WIP --- dev/specs/e2e.spec.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index e0be340..8d5669c 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -353,6 +353,7 @@ test.describe('Vector embedding e2e tests', () => { knowledgePool: 'failingBulkDefault', }, }) + console.log('[test] Bulk embed response:', await bulkEmbedResponse.json()) expect(bulkEmbedResponse.ok()).toBe(true) const bulkEmbedJson = await bulkEmbedResponse.json() const runId = bulkEmbedJson.runId @@ -477,12 +478,17 @@ test.describe('Vector embedding e2e tests', () => { }) console.log('[test] Created test post:', post.id) + // Wait for any existing bulk embedding jobs to complete before starting a new run + await waitForBulkJobs(payload, 30000) + console.log('[test] Existing bulk jobs completed, proceeding...') + // Use the bulk embed endpoint to create a run for failingBulkDefault pool const bulkEmbedResponse = await request.post('/api/vector-bulk-embed', { data: { knowledgePool: 'failingBulkDefault', }, }) + console.log('[test] Bulk embed response:', await bulkEmbedResponse.json()) expect(bulkEmbedResponse.ok()).toBe(true) const bulkEmbedJson = await bulkEmbedResponse.json() const runId = bulkEmbedJson.runId From da8965c1ca19d48a74fc188cb3e380c72b4f06ca Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Tue, 13 Jan 2026 20:28:13 +0700 Subject: [PATCH 36/49] betters embed --- dev/helpers/embed.ts | 79 +++++++++++--------------------------------- 1 file changed, 19 insertions(+), 60 deletions(-) diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index 
dfb5daf..04a5e83 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -62,35 +62,18 @@ export function makeDummyEmbedDocs(dims: number) { } export const testEmbeddingVersion = 'test-v1' -// Voyage file size limit (approximately 100MB, we use a safer threshold) -const VOYAGE_FILE_SIZE_LIMIT = 50 * 1024 * 1024 // 50MB to be safe +// Voyage line limit (100,000 lines per batch) +// https://docs.voyageai.com/docs/batch-inference +const VOYAGE_LINE_LIMIT = 100_000 /** * Real Voyage Batch API implementation using the new streaming API. - * User controls batching based on file size. */ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { // Accumulated chunks for current batch let accumulatedChunks: BulkEmbeddingInput[] = [] - let accumulatedSize = 0 let batchIndex = 0 - // Store batch state in memory for dev purposes (output file IDs for completion) - const batchOutputFiles = new Map() - - // Helper to estimate JSONL line size for a chunk - const estimateChunkSize = (chunk: BulkEmbeddingInput): number => { - const jsonLine = JSON.stringify({ - custom_id: chunk.id, - body: { - input: [chunk.text], - model: 'voyage-3.5-lite', - input_type: 'document', - }, - }) - return jsonLine.length + 1 // +1 for newline - } - // Helper to submit accumulated chunks to Voyage const submitBatch = async (chunks: BulkEmbeddingInput[]): Promise => { // Create JSONL content for Voyage batch @@ -98,9 +81,7 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { return JSON.stringify({ custom_id: input.id, body: { - input: [input.text], - model: 'voyage-3.5-lite', - input_type: 'document', + input: input.text, }, }) }) @@ -138,7 +119,11 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { body: JSON.stringify({ input_file_id: fileId, endpoint: '/v1/embeddings', - completion_window: '24h', + completion_window: '12h', + request_params: { + model: 'voyage-3.5-lite', + input_type: 'document', + }, }), }) @@ -157,26 +142,20 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { return { addChunk: async ({ chunk, isLastChunk }) => { - const chunkSize = estimateChunkSize(chunk) + // Add chunk to accumulator + accumulatedChunks.push(chunk) - // Check if adding this chunk would exceed the file size limit - if (accumulatedSize + chunkSize > VOYAGE_FILE_SIZE_LIMIT && accumulatedChunks.length > 0) { - // Submit what we have (without this chunk) + // If we hit the 100,000 limit, submit and start a new batch + if (accumulatedChunks.length === VOYAGE_LINE_LIMIT) { const toSubmit = [...accumulatedChunks] - accumulatedChunks = [chunk] - accumulatedSize = chunkSize + accumulatedChunks = [] return await submitBatch(toSubmit) } - // Add chunk to accumulator - accumulatedChunks.push(chunk) - accumulatedSize += chunkSize - // If this is the last chunk, flush everything if (isLastChunk && accumulatedChunks.length > 0) { const toSubmit = [...accumulatedChunks] accumulatedChunks = [] - accumulatedSize = 0 return await submitBatch(toSubmit) } @@ -275,33 +254,13 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { }, onError: async ({ providerBatchIds, error }) => { + // TODO: Could implement error recovery here, e.g.: + // - Cancel running batches via API + // - Retry failed embeddings one by one using the regular embed API + // - Clean up uploaded files console.log( - `Voyage bulk run failed: ${error.message}. Cleaning up ${providerBatchIds.length} batches...`, + `Voyage bulk run failed: ${error.message}. 
${providerBatchIds.length} batches affected.`, ) - - // Cancel any running batches - for (const batchId of providerBatchIds) { - try { - await fetch(`https://api.voyageai.com/v1/batches/${batchId}/cancel`, { - method: 'POST', - headers: { - Authorization: `Bearer ${process.env.VOYAGE_API_KEY}`, - }, - }) - } catch (cancelError) { - console.error(`Failed to cancel batch ${batchId}:`, cancelError) - } - } - - // Clean up local state - for (const batchId of providerBatchIds) { - batchOutputFiles.delete(batchId) - } - - // Reset accumulator state for potential retry - accumulatedChunks = [] - accumulatedSize = 0 - batchIndex = 0 }, } } From 9861c90dacb83410ee86b97b27c9363d4c08af7d Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:33:44 +0700 Subject: [PATCH 37/49] WIP --- dev/payload.config.ts | 3 +- dev/specs/bulkEmbed/concurrentRuns.spec.ts | 12 +-- dev/specs/bulkEmbed/failedBatch.spec.ts | 23 +++-- dev/specs/vectorizedPayload.spec.ts | 88 ++++++------------- src/admin/components/EmbedAllButton/index.tsx | 3 + src/collections/embeddings.ts | 13 +-- src/index.ts | 24 +++-- src/types.ts | 66 +++++++------- 8 files changed, 114 insertions(+), 118 deletions(-) diff --git a/dev/payload.config.ts b/dev/payload.config.ts index 41d82e2..b171cdf 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -42,6 +42,7 @@ const bulkEmbeddingsFns = : createMockBulkEmbeddings({ statusSequence: ['queued', 'running', 'running', 'succeeded'], }) +console.log('bulkEmbeddingsFns', bulkEmbeddingsFns) const ssl = process.env.DATABASE_URI !== undefined ? { @@ -106,7 +107,7 @@ const buildConfigWithPostgres = async () => { queue: 'vectorize-bulk-prepare', }, { - cron: '*/10 * * * * *', // Run every 10 seconds for bulk jobs + cron: '0 * * * *', // Run every hour limit: 5, queue: 'vectorize-bulk-poll', }, diff --git a/dev/specs/bulkEmbed/concurrentRuns.spec.ts b/dev/specs/bulkEmbed/concurrentRuns.spec.ts index 289a202..4d3d01b 100644 --- a/dev/specs/bulkEmbed/concurrentRuns.spec.ts +++ b/dev/specs/bulkEmbed/concurrentRuns.spec.ts @@ -1,7 +1,7 @@ import type { Payload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' -import type { VectorizedPayload } from '../../../src/types.js' +import { getVectorizedPayload } from '../../../src/types.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, @@ -15,7 +15,7 @@ const DIMS = DEFAULT_DIMS const dbName = `bulk_concurrent_${Date.now()}` describe('Bulk embed - concurrent runs prevention', () => { - let payload: VectorizedPayload<'default'> + let payload: Payload beforeAll(async () => { await createTestDb({ dbName }) @@ -44,10 +44,11 @@ describe('Bulk embed - concurrent runs prevention', () => { dims: DIMS, key: `concurrent-${Date.now()}`, }) - payload = built.payload as VectorizedPayload<'default'> + payload = built.payload }) test('cannot start concurrent bulk embed runs for the same pool', async () => { + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! 
// Create a test post first await payload.create({ collection: 'posts', @@ -65,7 +66,7 @@ describe('Bulk embed - concurrent runs prevention', () => { }) // Try to start another bulk embed for the same pool - const result = await payload.bulkEmbed({ knowledgePool: 'default' }) + const result = await vectorizedPayload.bulkEmbed({ knowledgePool: 'default' }) expect('conflict' in result && result.conflict).toBe(true) expect(result.status).toBe('running') @@ -84,6 +85,7 @@ describe('Bulk embed - concurrent runs prevention', () => { }) test('can start bulk embed run after previous run completes', async () => { + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! // Create a test post await payload.create({ collection: 'posts', @@ -102,7 +104,7 @@ describe('Bulk embed - concurrent runs prevention', () => { }) // Should be able to start a new run for the same pool - const result = await payload.bulkEmbed({ knowledgePool: 'default' }) + const result = await vectorizedPayload.bulkEmbed({ knowledgePool: 'default' }) expect('conflict' in result).toBe(false) expect(result.status).toBe('queued') diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index 20f580e..f25a0f3 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -3,7 +3,7 @@ import { beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../../src/collections/bulkEmbeddingInputMetadata.js' -import type { VectorizedPayload } from '../../../src/types.js' +import { getVectorizedPayload } from '../../../src/types.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, @@ -18,7 +18,7 @@ const DIMS = DEFAULT_DIMS const dbName = `bulk_failed_${Date.now()}` describe('Bulk embed - failed batch', () => { - let payload: VectorizedPayload<'default'> + let payload: Payload beforeAll(async () => { await createTestDb({ dbName }) @@ -45,7 +45,7 @@ describe('Bulk embed - failed batch', () => { dims: DIMS, key: `failed-${Date.now()}`, }) - payload = built.payload as VectorizedPayload<'default'> + payload = built.payload }) test('failed batch marks entire run as failed', async () => { @@ -121,6 +121,7 @@ describe('Bulk embed - failed batch', () => { }) test('cannot retry batch while run is still running', async () => { + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! // Create a run in 'running' status const run = await (payload as any).create({ collection: BULK_EMBEDDINGS_RUNS_SLUG, @@ -145,7 +146,7 @@ describe('Bulk embed - failed batch', () => { }) // Try to retry the batch while run is running - should be rejected - const result = await payload.retryFailedBatch({ batchId: String(batch.id) }) + const result = await vectorizedPayload.retryFailedBatch({ batchId: String(batch.id) }) expect('error' in result).toBe(true) expect('conflict' in result && result.conflict).toBe(true) @@ -165,6 +166,7 @@ describe('Bulk embed - failed batch', () => { }) test('retrying a failed batch creates a new batch and marks old batch as retried', async () => { + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! 
const post = await payload.create({ collection: 'posts', data: { title: 'RetryTest' } as any }) const run = await payload.create({ @@ -192,7 +194,9 @@ describe('Bulk embed - failed batch', () => { expect(failedBatch.status).toBe('failed') // Retry the batch - const retryResult = await payload.retryFailedBatch({ batchId: String(failedBatch.id) }) + const retryResult = await vectorizedPayload.retryFailedBatch({ + batchId: String(failedBatch.id), + }) expect('error' in retryResult).toBe(false) if (!('error' in retryResult)) { @@ -234,6 +238,7 @@ describe('Bulk embed - failed batch', () => { }) test('retrying a retried batch returns the existing retry batch', async () => { + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! const post = await payload.create({ collection: 'posts', data: { title: 'RetryRetryTest' } as any, @@ -263,14 +268,18 @@ describe('Bulk embed - failed batch', () => { const failedBatch = (batchesResult as any).docs[0] // Retry the batch first time - const firstRetryResult = await payload.retryFailedBatch({ batchId: String(failedBatch.id) }) + const firstRetryResult = await vectorizedPayload.retryFailedBatch({ + batchId: String(failedBatch.id), + }) expect('error' in firstRetryResult).toBe(false) if ('error' in firstRetryResult) return const firstRetryBatchId = firstRetryResult.newBatchId! // Retry the retried batch - should return the existing retry batch - const secondRetryResult = await payload.retryFailedBatch({ batchId: String(failedBatch.id) }) + const secondRetryResult = await vectorizedPayload.retryFailedBatch({ + batchId: String(failedBatch.id), + }) expect('error' in secondRetryResult).toBe(false) if (!('error' in secondRetryResult)) { diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index e717909..6ea9539 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -2,7 +2,7 @@ import type { Payload } from 'payload' import { getPayload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' -import { isVectorizedPayload, VectorizedPayload } from '../../src/types.js' +import { getVectorizedPayload, VectorizedPayload } from '../../src/types.js' import { buildDummyConfig, DIMS, getInitialMarkdownContent } from './constants.js' import { createTestDb, waitForVectorizationJobs } from './utils.js' import { postgresAdapter } from '@payloadcms/db-postgres' @@ -93,54 +93,16 @@ describe('VectorizedPayload', () => { markdownContent = await getInitialMarkdownContent(config) }) - describe('isVectorizedPayload type guard', () => { - test('returns true for a payload instance with vectorize extensions', () => { - expect(isVectorizedPayload(payload)).toBe(true) + describe('getVectorizedPayload', () => { + test('returns vectorized payload object for a payload instance with vectorize extensions', () => { + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() + expect(vectorizedPayload).toBeDefined() }) - test('returns false for a plain object without search method', () => { - const plainObj = { - _isBulkEmbedEnabled: () => false, - queueEmbed: () => Promise.resolve(), - bulkEmbed: () => Promise.resolve({}), - retryFailedBatch: () => Promise.resolve({}), - } as unknown as Payload - expect(isVectorizedPayload(plainObj)).toBe(false) - }) - - test('returns false for a plain object without queueEmbed method', () => { - const plainObj = { - _isBulkEmbedEnabled: () => false, - search: () => Promise.resolve([]), - bulkEmbed: () => 
Promise.resolve({}), - retryFailedBatch: () => Promise.resolve({}), - } as unknown as Payload - expect(isVectorizedPayload(plainObj)).toBe(false) - }) - - test('returns false for a plain object without bulkEmbed method', () => { - const plainObj = { - _isBulkEmbedEnabled: () => false, - search: () => Promise.resolve([]), - queueEmbed: () => Promise.resolve(), - retryFailedBatch: () => Promise.resolve({}), - } as unknown as Payload - expect(isVectorizedPayload(plainObj)).toBe(false) - }) - - test('returns false for a plain object without retryFailedBatch method', () => { - const plainObj = { - _isBulkEmbedEnabled: () => false, - search: () => Promise.resolve([]), - queueEmbed: () => Promise.resolve(), - bulkEmbed: () => Promise.resolve({}), - } as unknown as Payload - expect(isVectorizedPayload(plainObj)).toBe(false) - }) - - test('returns false for an empty object', () => { - const emptyObj = {} as unknown as Payload - expect(isVectorizedPayload(emptyObj)).toBe(false) + test('returns null for a payload instance without vectorize extensions', () => { + const plainPayload = {} as unknown as Payload + expect(getVectorizedPayload(plainPayload)).toBeNull() }) }) @@ -161,11 +123,13 @@ describe('VectorizedPayload', () => { }) test('payload has search method', () => { - expect(typeof (payload as VectorizedPayload).search).toBe('function') + const vectorizedPayload = getVectorizedPayload<'default'>(payload) + expect(vectorizedPayload).not.toBeNull() + expect(typeof vectorizedPayload!.search).toBe('function') }) test('search returns an array of VectorSearchResult', async () => { - const vectorizedPayload = payload as VectorizedPayload<'default'> + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -177,7 +141,7 @@ describe('VectorizedPayload', () => { }) test('search results are ordered by similarity (highest first)', async () => { - const vectorizedPayload = payload as VectorizedPayload<'default'> + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -189,7 +153,7 @@ describe('VectorizedPayload', () => { }) test('search respects limit parameter', async () => { - const vectorizedPayload = payload as VectorizedPayload<'default'> + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -201,7 +165,7 @@ describe('VectorizedPayload', () => { }) test('search respects where clause', async () => { - const vectorizedPayload = payload as VectorizedPayload<'default'> + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -216,7 +180,7 @@ describe('VectorizedPayload', () => { }) test('querying a title should return the title as top result', async () => { - const vectorizedPayload = payload as VectorizedPayload<'default'> + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! 
const results = await vectorizedPayload.search({ query: titleAndQuery, @@ -230,11 +194,13 @@ describe('VectorizedPayload', () => { describe('queueEmbed method', () => { test('payload has queueEmbed method', () => { - expect(typeof (payload as VectorizedPayload).queueEmbed).toBe('function') + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() + expect(typeof vectorizedPayload!.queueEmbed).toBe('function') }) test('queueEmbed queues a vectorization job', async () => { - const vectorizedPayload = payload as VectorizedPayload + const vectorizedPayload = getVectorizedPayload(payload)! // Create a post (triggers automatic embedding) const post = await payload.create({ @@ -271,11 +237,13 @@ describe('VectorizedPayload', () => { describe('bulkEmbed method', () => { test('payload has bulkEmbed method', () => { - expect(typeof (payload as VectorizedPayload).bulkEmbed).toBe('function') + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() + expect(typeof vectorizedPayload!.bulkEmbed).toBe('function') }) test('bulkEmbed throws error when bulk embedding not configured', async () => { - const vectorizedPayload = payload as VectorizedPayload<'default'> + const vectorizedPayload = getVectorizedPayload<'default'>(payload)! // This pool doesn't have bulkEmbeddingsFns configured await expect(vectorizedPayload.bulkEmbed({ knowledgePool: 'default' })).rejects.toThrow( @@ -286,11 +254,13 @@ describe('VectorizedPayload', () => { describe('retryFailedBatch method', () => { test('payload has retryFailedBatch method', () => { - expect(typeof (payload as VectorizedPayload).retryFailedBatch).toBe('function') + const vectorizedPayload = getVectorizedPayload(payload) + expect(vectorizedPayload).not.toBeNull() + expect(typeof vectorizedPayload!.retryFailedBatch).toBe('function') }) test('retryFailedBatch returns error for non-existent batch', async () => { - const vectorizedPayload = payload as VectorizedPayload + const vectorizedPayload = getVectorizedPayload(payload)! 
const result = await vectorizedPayload.retryFailedBatch({ batchId: '999999' }) diff --git a/src/admin/components/EmbedAllButton/index.tsx b/src/admin/components/EmbedAllButton/index.tsx index 472ff6c..de0dbeb 100644 --- a/src/admin/components/EmbedAllButton/index.tsx +++ b/src/admin/components/EmbedAllButton/index.tsx @@ -15,12 +15,15 @@ export const EmbedAllButton: React.FC } } - const incomingOnInit = config.onInit const vectorSearchHandlers = createVectorSearchHandlers(pluginOptions.knowledgePools) - config.onInit = async (payload) => { - if (incomingOnInit) await incomingOnInit(payload) - Object.assign(payload, { + + // Create vectorized payload object factory that creates methods bound to a payload instance + const createVectorizedPayloadObject = (payload: Payload): VectorizedPayload => { + console.log('createVectorizedPayloadObject', payload) + return { _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { const poolConfig = pluginOptions.knowledgePools[knowledgePool] return !!poolConfig?.embeddingConfig?.bulkEmbeddingsFns @@ -451,7 +452,18 @@ export const createVectorizeIntegration = knowledgePools: pluginOptions.knowledgePools, queueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, }), - } as Partial>) + } as VectorizedPayload + } + + // Store factory in config.custom + config.custom = { + ...(config.custom || {}), + createVectorizedPayloadObject, + } + + const incomingOnInit = config.onInit + config.onInit = async (payload) => { + if (incomingOnInit) await incomingOnInit(payload) // Ensure pgvector artifacts for each knowledge pool for (const poolName in staticConfigs) { const staticConfig = staticConfigs[poolName] diff --git a/src/types.ts b/src/types.ts index 3a0a31d..0a6cd4c 100644 --- a/src/types.ts +++ b/src/types.ts @@ -43,44 +43,40 @@ export type RetryFailedBatchResult = /** * Extended Payload type with vectorize plugin methods */ -export type VectorizedPayload = - Payload & { - /** Check if bulk embedding is enabled for a knowledge pool */ - _isBulkEmbedEnabled: (knowledgePool: TPoolNames) => boolean - search: (params: VectorSearchQuery) => Promise> - queueEmbed: ( - params: - | { - collection: string - docId: string - } - | { - collection: string - doc: Record - }, - ) => Promise - /** Start a bulk embedding run for a knowledge pool */ - bulkEmbed: (params: { knowledgePool: TPoolNames }) => Promise - /** Retry a failed batch */ - retryFailedBatch: (params: { batchId: string }) => Promise - } +export type VectorizedPayload = { + /** Check if bulk embedding is enabled for a knowledge pool */ + _isBulkEmbedEnabled: (knowledgePool: TPoolNames) => boolean + search: (params: VectorSearchQuery) => Promise> + queueEmbed: ( + params: + | { + collection: string + docId: string + } + | { + collection: string + doc: Record + }, + ) => Promise + /** Start a bulk embedding run for a knowledge pool */ + bulkEmbed: (params: { knowledgePool: TPoolNames }) => Promise + /** Retry a failed batch */ + retryFailedBatch: (params: { batchId: string }) => Promise +} /** - * Type guard to check if a Payload instance has vectorize extensions + * Get the vectorized payload object from config.custom + * Returns null if the payload instance doesn't have vectorize extensions */ -export function isVectorizedPayload(payload: Payload): payload is VectorizedPayload { - return ( - '_isBulkEmbedEnabled' in payload && - typeof (payload as any)._isBulkEmbedEnabled === 'function' && - 'search' in payload && - typeof (payload as any).search === 'function' && - 'queueEmbed' in payload && - 
typeof (payload as any).queueEmbed === 'function' && - 'bulkEmbed' in payload && - typeof (payload as any).bulkEmbed === 'function' && - 'retryFailedBatch' in payload && - typeof (payload as any).retryFailedBatch === 'function' - ) +export function getVectorizedPayload( + payload: Payload, +): VectorizedPayload | null { + const custom = (payload.config as any)?.custom + const vectorizedPayloadFactory = custom?.createVectorizedPayloadObject + if (vectorizedPayloadFactory && typeof vectorizedPayloadFactory === 'function') { + return vectorizedPayloadFactory(payload) as VectorizedPayload + } + return null } export type EmbedDocsFn = (texts: string[]) => Promise From fff3ef5e6f41ff2b6bac01b9c8cf8914e1360a7e Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Tue, 13 Jan 2026 21:58:36 +0700 Subject: [PATCH 38/49] WIP --- CHANGELOG.md | 33 ++++++++++++++++ README.md | 92 ++++++++++++++++++++++++++----------------- dev/payload.config.ts | 5 ++- 3 files changed, 93 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc80d09..2b4ee5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,37 @@ All notable changes to this project will be documented in this file. +## Unreleased + +### Changed + +- **`isVectorizedPayload` replaced with `getVectorizedPayload`**: The type guard `isVectorizedPayload(payload)` has been replaced with `getVectorizedPayload(payload)` which returns the vectorized payload object directly (or `null` if not available). This provides a cleaner API that doesn't require type assertions. + +### Migration + +**Before:** + +```typescript +import { isVectorizedPayload, type VectorizedPayload } from 'payloadcms-vectorize' + +if (isVectorizedPayload(payload)) { + const results = await payload.search({ ... }) + await payload.queueEmbed({ ... }) +} +``` + +**After:** + +```typescript +import { getVectorizedPayload } from 'payloadcms-vectorize' + +const vectorizedPayload = getVectorizedPayload(payload) +if (vectorizedPayload) { + const results = await vectorizedPayload.search({ ... }) + await vectorizedPayload.queueEmbed({ ... }) +} +``` + ## 0.5.0 - 2026-01-10 ### Breaking Changes @@ -27,6 +58,8 @@ All notable changes to this project will be documented in this file. ## 0.4.5 - 2025-01-09 +**Note:** This version is deprecated due to a critical bug with `isVectorizedPayload`. Use `getVectorizedPayload(payload)` instead (see 0.5.0 section above). No 0.4 line fix (0.4.6) exists yet. 
+ ### Added - **Local API**: Added `payload.search()` and `payload.queueEmbed()` methods directly on the Payload instance for programmatic vector search without HTTP requests diff --git a/README.md b/README.md index 4b0fb91..814a1f9 100644 --- a/README.md +++ b/README.md @@ -189,14 +189,14 @@ const { results } = await response.json() Alternatively, you can use the local API directly on the Payload instance: ```typescript -import { isVectorizedPayload, type VectorizedPayload } from 'payloadcms-vectorize' +import { getVectorizedPayload } from 'payloadcms-vectorize' -// After initializing Payload, it will have the search and queueEmbed methods +// After initializing Payload, get the vectorized payload object const payload = await getPayload({ config, cron: true }) +const vectorizedPayload = getVectorizedPayload(payload) -// Type guard to ensure payload has vectorize extensions -if (isVectorizedPayload(payload)) { - const results = await payload.search({ +if (vectorizedPayload) { + const results = await vectorizedPayload.search({ query: 'What is machine learning?', knowledgePool: 'main', where: { @@ -207,7 +207,7 @@ if (isVectorizedPayload(payload)) { // results is an array of VectorSearchResult // Manually queue an embedding job - await payload.queueEmbed({ + await vectorizedPayload.queueEmbed({ collection: 'posts', docId: 'some-post-id', }) @@ -591,18 +591,21 @@ Perform vector search programmatically without making an HTTP request. **Example:** ```typescript -import type { VectorizedPayload } from 'payloadcms-vectorize' +import { getVectorizedPayload } from 'payloadcms-vectorize' const payload = await getPayload({ config, cron: true }) +const vectorizedPayload = getVectorizedPayload<'main'>(payload) -const results = await (payload as VectorizedPayload<'main'>).search({ - query: 'What is machine learning?', - knowledgePool: 'main', - where: { - category: { equals: 'guides' }, - }, - limit: 5, -}) +if (vectorizedPayload) { + const results = await vectorizedPayload.search({ + query: 'What is machine learning?', + knowledgePool: 'main', + where: { + category: { equals: 'guides' }, + }, + limit: 5, + }) +} ``` #### `payload.queueEmbed(params)` @@ -626,40 +629,57 @@ Or: **Example:** ```typescript -// Queue by document ID (fetches document first) -await (payload as VectorizedPayload).queueEmbed({ - collection: 'posts', - docId: 'some-post-id', -}) +import { getVectorizedPayload } from 'payloadcms-vectorize' -// Queue with document object directly -await (payload as VectorizedPayload).queueEmbed({ - collection: 'posts', - doc: { - id: 'some-post-id', - title: 'Post Title', - content: { - /* ... */ +const payload = await getPayload({ config, cron: true }) +const vectorizedPayload = getVectorizedPayload(payload) + +if (vectorizedPayload) { + // Queue by document ID (fetches document first) + await vectorizedPayload.queueEmbed({ + collection: 'posts', + docId: 'some-post-id', + }) + + // Queue with document object directly + await vectorizedPayload.queueEmbed({ + collection: 'posts', + doc: { + id: 'some-post-id', + title: 'Post Title', + content: { + /* ... 
*/ + }, }, - }, -}) + }) +} ``` -#### Type Guard +#### Getting the Vectorized Payload Object -Use the `isVectorizedPayload` type guard to check if a Payload instance has vectorize extensions: +Use the `getVectorizedPayload` function to get the vectorized payload object with all vectorize methods: ```typescript -import { isVectorizedPayload } from 'payloadcms-vectorize' +import { getVectorizedPayload } from 'payloadcms-vectorize' const payload = await getPayload({ config, cron: true }) +const vectorizedPayload = getVectorizedPayload(payload) -if (isVectorizedPayload(payload)) { - // TypeScript now knows payload has search and queueEmbed methods - const results = await payload.search({ +if (vectorizedPayload) { + // Use all vectorize methods + const results = await vectorizedPayload.search({ query: 'search query', knowledgePool: 'main', }) + + await vectorizedPayload.queueEmbed({ + collection: 'posts', + docId: 'some-id', + }) + + await vectorizedPayload.bulkEmbed({ + knowledgePool: 'main', + }) } ``` diff --git a/dev/payload.config.ts b/dev/payload.config.ts index b171cdf..c9238bc 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -42,6 +42,9 @@ const bulkEmbeddingsFns = : createMockBulkEmbeddings({ statusSequence: ['queued', 'running', 'running', 'succeeded'], }) + +// Run every hour for voyage, every 5 seconds for mock +const bulkPollCronSchedule = process.env.USE_VOYAGE !== undefined ? '0 * * * *' : '*/5 * * * * *' console.log('bulkEmbeddingsFns', bulkEmbeddingsFns) const ssl = process.env.DATABASE_URI !== undefined @@ -107,7 +110,7 @@ const buildConfigWithPostgres = async () => { queue: 'vectorize-bulk-prepare', }, { - cron: '0 * * * *', // Run every hour + cron: bulkPollCronSchedule, limit: 5, queue: 'vectorize-bulk-poll', }, From f0614813669f7680ec372282db648a3d75dceb99 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 15 Jan 2026 16:44:20 +0700 Subject: [PATCH 39/49] Clean up --- CHANGELOG.md | 36 +---- README.md | 12 +- dev/helpers/embed.ts | 34 +++- dev/payload.config.ts | 1 - dev/specs/bulkEmbed/basic.spec.ts | 103 ++----------- dev/specs/bulkEmbed/canceledBatch.spec.ts | 26 ++-- dev/specs/bulkEmbed/extensionFields.spec.ts | 22 +-- dev/specs/bulkEmbed/failedBatch.spec.ts | 81 +++------- dev/specs/bulkEmbed/multipleBatches.spec.ts | 25 +-- dev/specs/bulkEmbed/multipleChunks.spec.ts | 21 +-- dev/specs/bulkEmbed/partialFailure.spec.ts | 20 +-- .../bulkEmbed/partialFailureNoFail.spec.ts | 22 +-- dev/specs/bulkEmbed/polling.spec.ts | 29 ++-- dev/specs/bulkEmbed/realtimeMode.spec.ts | 18 +-- dev/specs/bulkEmbed/versionBump.spec.ts | 145 ++++++++++-------- dev/specs/config.spec.ts | 11 +- dev/specs/e2e.spec.ts | 92 ----------- dev/specs/utils.ts | 13 ++ src/admin/components/EmbedAllButton/index.tsx | 28 +--- .../components/FailedBatchesList/index.tsx | 11 -- .../RetryFailedBatchButton/client.tsx | 1 - .../RetryFailedBatchButton/index.tsx | 2 - src/index.ts | 4 +- src/tasks/vectorize.ts | 1 - 24 files changed, 243 insertions(+), 515 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2b4ee5f..15f2d73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,43 +2,13 @@ All notable changes to this project will be documented in this file. -## Unreleased - -### Changed - -- **`isVectorizedPayload` replaced with `getVectorizedPayload`**: The type guard `isVectorizedPayload(payload)` has been replaced with `getVectorizedPayload(payload)` which returns the vectorized payload object directly (or `null` if not available). 
This provides a cleaner API that doesn't require type assertions. - -### Migration - -**Before:** - -```typescript -import { isVectorizedPayload, type VectorizedPayload } from 'payloadcms-vectorize' - -if (isVectorizedPayload(payload)) { - const results = await payload.search({ ... }) - await payload.queueEmbed({ ... }) -} -``` - -**After:** - -```typescript -import { getVectorizedPayload } from 'payloadcms-vectorize' - -const vectorizedPayload = getVectorizedPayload(payload) -if (vectorizedPayload) { - const results = await vectorizedPayload.search({ ... }) - await vectorizedPayload.queueEmbed({ ... }) -} -``` - -## 0.5.0 - 2026-01-10 +## 0.5.0 - 2026-01-15 ### Breaking Changes - **`queueName` renamed to `realtimeQueueName`**: The plugin option `queueName` has been renamed to `realtimeQueueName` to clarify that it only affects realtime vectorization jobs. - **`bulkQueueName` changed to `bulkQueueNames`**: The plugin option `bulkQueueName` has been replaced with `bulkQueueNames` object containing `prepareBulkEmbedQueueName` and `pollOrCompleteQueueName` for separate queue isolation of bulk preparation vs polling workloads. +- **`isVectorizedPayload` replaced with `getVectorizedPayload`**: The type guard `isVectorizedPayload(payload)` has been replaced with `getVectorizedPayload(payload)` which returns the vectorized payload object directly (or `null` if not available). This fixes a bug where methods are missing because onInit was not called. ### New Features @@ -51,7 +21,7 @@ if (vectorizedPayload) { ### Tests & Reliability -- Added comprehensive tests for realtime vs bulk ingest behavior +- Added comprehensive tests for realtime vs bulk ingest behavior, and failing bulk situations - Added tests for bulk polling error conditions (`failed`, `canceled` statuses) - Added tests for bulk fan-in behavior (multiple documents processed in single run) - Improved test coverage for edge cases in bulk embedding workflow diff --git a/README.md b/README.md index 814a1f9..4ce2cf8 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,10 @@ export default buildConfig({ // realtimeQueueName: 'custom-queue', // endpointOverrides: { path: '/custom-vector-search', enabled: true }, // disabled: false, + // bulkQueueNames: { // Required iff `bulkEmbeddingsFns` included + // prepareBulkEmbedQueueName: ..., + // pollOrCompleteQueueName: ..., + // }, }), ], }) @@ -148,9 +152,9 @@ export default buildConfig({ **Important:** `knowledgePools` must have **different names than your collections**—reusing a collection name for a knowledge pool **will cause schema conflicts**. (In this example, the knowledge pool is named 'main' and a collection named 'main' will be created.) -### 1.5. Generate Import Map (Required for Admin UI) +### 1.5. Generate Import Map (If Needed) -After configuring the plugin, you must generate the import map so that Payload can resolve client components (like the "Embed all" button) in the admin UI for bulk embeddings: +Payload automatically generates the import map on startup and during development (HMR), so you typically don't need to run this manually. 
However, if client components (like the "Embed all" button) don't appear in the admin UI, you may need to manually generate the import map: ```bash pnpm run generate:importmap @@ -671,12 +675,12 @@ if (vectorizedPayload) { query: 'search query', knowledgePool: 'main', }) - + await vectorizedPayload.queueEmbed({ collection: 'posts', docId: 'some-id', }) - + await vectorizedPayload.bulkEmbed({ knowledgePool: 'main', }) diff --git a/dev/helpers/embed.ts b/dev/helpers/embed.ts index 04a5e83..229dde1 100644 --- a/dev/helpers/embed.ts +++ b/dev/helpers/embed.ts @@ -229,17 +229,49 @@ export function makeVoyageBulkEmbeddingsConfig(): BulkEmbeddingsFns { if (!line.trim()) continue try { const result = JSON.parse(line) + // Check for error in result.error field if (result.error) { await onChunk({ id: result.custom_id, error: result.error.message || 'Unknown error', }) - } else { + } + // Check for error in result.response.status_code (Voyage AI format) + // Error if status_code exists and is >= 400 or not 200 + else if (result.response?.status_code && result.response.status_code !== 200) { + await onChunk({ + id: result.custom_id, + error: result.response.message || `HTTP ${result.response.status_code}`, + }) + } + // Success case - check for embedding data + // Handle body.object === "list" with data array + else if ( + result.response?.body?.object === 'list' && + result.response.body.data?.[0]?.embedding + ) { await onChunk({ id: result.custom_id, embedding: result.response.body.data[0].embedding, }) } + // Handle body.object === "embedding" (direct embedding) + else if ( + result.response?.body?.object === 'embedding' && + result.response.body.embedding + ) { + await onChunk({ + id: result.custom_id, + embedding: result.response.body.embedding, + }) + } + // Unknown format + else { + await onChunk({ + id: result.custom_id, + error: 'Unexpected response format', + }) + } } catch (parseError) { console.error('Failed to parse output line:', line, parseError) } diff --git a/dev/payload.config.ts b/dev/payload.config.ts index c9238bc..cb8a738 100644 --- a/dev/payload.config.ts +++ b/dev/payload.config.ts @@ -45,7 +45,6 @@ const bulkEmbeddingsFns = // Run every hour for voyage, every 5 seconds for mock const bulkPollCronSchedule = process.env.USE_VOYAGE !== undefined ? '0 * * * *' : '*/5 * * * * *' -console.log('bulkEmbeddingsFns', bulkEmbeddingsFns) const ssl = process.env.DATABASE_URI !== undefined ? 
{ diff --git a/dev/specs/bulkEmbed/basic.spec.ts b/dev/specs/bulkEmbed/basic.spec.ts index 9825387..46b0d29 100644 --- a/dev/specs/bulkEmbed/basic.spec.ts +++ b/dev/specs/bulkEmbed/basic.spec.ts @@ -10,9 +10,12 @@ import { clearAllCollections, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' +import { BulkEmbedResult } from '../../../src/types.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_basic_${Date.now()}` @@ -38,6 +41,7 @@ const basePluginOptions = { describe('Bulk embed - basic tests', () => { let payload: Payload let config: SanitizedConfig + let vectorizedPayload: VectorizedPayload | null = null beforeAll(async () => { await createTestDb({ dbName }) @@ -50,6 +54,7 @@ describe('Bulk embed - basic tests', () => { }) payload = built.payload config = built.config + vectorizedPayload = getVectorizedPayload(payload) }) beforeEach(async () => { @@ -60,38 +65,11 @@ describe('Bulk embed - basic tests', () => { vi.restoreAllMocks() }) - test('no bulk run is queued on init or doc creation (bulk-only mode)', async () => { - const runsBeforeCreate = await (payload as any).find({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { pool: { equals: 'default' } }, - }) - expect(runsBeforeCreate.totalDocs).toBe(0) - - await payload.create({ collection: 'posts', data: { title: 'First' } as any }) - - const runsAfterCreate = await (payload as any).find({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { pool: { equals: 'default' } }, - }) - expect(runsAfterCreate.totalDocs).toBe(0) - }) - test('manually triggered bulk run embeds documents', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'First' } as any }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) @@ -103,7 +81,7 @@ describe('Bulk embed - basic tests', () => { const runDoc = ( await (payload as any).find({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { id: { equals: String(run.id) } }, + where: { id: { equals: String(result!.runId) } }, }) ).docs[0] expect(runDoc.status).toBe('succeeded') @@ -111,26 +89,14 @@ describe('Bulk embed - basic tests', () => { test('bulk run creates batch records', async () => { await payload.create({ collection: 'posts', data: { title: 'Batch Test' } as any }) - - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) const batches = await payload.find({ collection: BULK_EMBEDDINGS_BATCHES_SLUG as any, - where: { run: { equals: String(run.id) } }, + where: { run: { equals: String(result!.runId) } }, }) expect(batches.totalDocs).toBe(1) expect(batches.docs[0]).toHaveProperty('batchIndex', 0) @@ -141,18 +107,8 @@ describe('Bulk embed - basic tests', () => { const post = await payload.create({ collection: 'posts', data: { title: 'Stable' } as any }) // First bulk run - const baselineRun = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(baselineRun.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result0 = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result0) await waitForBulkJobs(payload) const embeds = await payload.find({ @@ -162,26 +118,15 @@ describe('Bulk embed - basic tests', () => { expect(embeds.totalDocs).toBe(1) // Second bulk run - should find zero eligible - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result1 = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expect(result1).toBeDefined() await waitForBulkJobs(payload) const runDoc = ( await (payload as any).find({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { id: { equals: String(run.id) } }, + where: { id: { equals: String(result1!.runId) } }, }) ).docs[0] expect(runDoc.status).toBe('succeeded') @@ -192,19 +137,7 @@ describe('Bulk embed - basic tests', () => { test('metadata table is cleaned after successful completion', async () => { await payload.create({ collection: 'posts', data: { title: 'Cleanup' } as any }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) await waitForBulkJobs(payload) @@ -215,5 +148,3 @@ describe('Bulk embed - basic tests', () => { expect(metadata.totalDocs).toBe(0) }) }) - - diff --git a/dev/specs/bulkEmbed/canceledBatch.spec.ts b/dev/specs/bulkEmbed/canceledBatch.spec.ts index 60d2170..f14c4d2 100644 --- a/dev/specs/bulkEmbed/canceledBatch.spec.ts +++ b/dev/specs/bulkEmbed/canceledBatch.spec.ts @@ -1,21 +1,26 @@ import type { Payload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' -import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' const DIMS = DEFAULT_DIMS const dbName = `bulk_canceled_${Date.now()}` +// Right now, we only test if the batch was canceled outside of the bulk embed process. +// TODO(techiejd): Add a way to cancel a batch and/or a run inside the bulk embed process. + describe('Bulk embed - canceled batch', () => { let payload: Payload + let vectorizedPayload: VectorizedPayload | null = null beforeAll(async () => { await createTestDb({ dbName }) @@ -43,24 +48,13 @@ describe('Bulk embed - canceled batch', () => { key: `canceled-${Date.now()}`, }) payload = built.payload + vectorizedPayload = getVectorizedPayload(payload) }) test('canceled batch marks entire run as failed', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Cancel' } as any }) - - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) @@ -71,5 +65,3 @@ describe('Bulk embed - canceled batch', () => { expect(embeds.totalDocs).toBe(0) }) }) - - diff --git a/dev/specs/bulkEmbed/extensionFields.spec.ts b/dev/specs/bulkEmbed/extensionFields.spec.ts index cc8e92c..4b829e0 100644 --- a/dev/specs/bulkEmbed/extensionFields.spec.ts +++ b/dev/specs/bulkEmbed/extensionFields.spec.ts @@ -7,15 +7,18 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' const DIMS = DEFAULT_DIMS const dbName = `bulk_extfields_${Date.now()}` describe('Bulk embed - extension fields', () => { let payload: Payload + let vectorizedPayload: VectorizedPayload | null = null beforeAll(async () => { await createTestDb({ dbName }) @@ -49,6 +52,7 @@ describe('Bulk embed - extension fields', () => { key: `extfields-${Date.now()}`, }) payload = built.payload + vectorizedPayload = getVectorizedPayload(payload) }) test('extension fields are merged when writing embeddings', async () => { @@ -56,20 +60,8 @@ describe('Bulk embed - extension fields', () => { collection: 'posts', data: { title: 'Ext merge' } as any, }) - - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) @@ -82,5 +74,3 @@ describe('Bulk embed - extension fields', () => { expect(embeds.docs[0]).toHaveProperty('priority', 3) }) }) - - diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index f25a0f3..5e09d16 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -3,13 +3,14 @@ import { beforeAll, describe, expect, test } from 'vitest' import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../../src/collections/bulkEmbeddingInputMetadata.js' -import { getVectorizedPayload } from '../../../src/types.js' +import { getVectorizedPayload, VectorizedPayload } from '../../../src/types.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' @@ -19,6 +20,7 @@ const dbName = `bulk_failed_${Date.now()}` describe('Bulk embed - failed batch', () => { let payload: Payload + let vectorizedPayload: VectorizedPayload | null = null beforeAll(async () => { await createTestDb({ dbName }) @@ -46,31 +48,21 @@ describe('Bulk embed - failed batch', () => { key: `failed-${Date.now()}`, }) payload = built.payload + vectorizedPayload = getVectorizedPayload(payload) }) test('failed batch marks entire run as failed', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Fail' } as any }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) const runDoc = ( await (payload as any).find({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { id: { equals: String(run.id) } }, + where: { id: { equals: String(result!.runId) } }, }) ).docs[0] expect(runDoc.status).toBe('failed') @@ -88,27 +80,17 @@ describe('Bulk embed - failed batch', () => { data: { title: 'FailCleanup' } as any, }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) + const runIdNum = parseInt(String(result!.runId), 10) + // Metadata should be kept for failed batches to allow retries - const runIdNum = typeof run.id === 'number' ? run.id : parseInt(String(run.id), 10) const metadata = await payload.find({ collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - where: { run: { equals: runIdNum } }, + where: { run: { equals: runIdNum } as any }, }) expect(metadata.totalDocs).toBeGreaterThan(0) @@ -167,28 +149,17 @@ describe('Bulk embed - failed batch', () => { test('retrying a failed batch creates a new batch and marks old batch as retried', async () => { const vectorizedPayload = getVectorizedPayload<'default'>(payload)! - const post = await payload.create({ collection: 'posts', data: { title: 'RetryTest' } as any }) + await payload.create({ collection: 'posts', data: { title: 'RetryTest' } as any }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) // Find the failed batch const batchesResult = await payload.find({ collection: BULK_EMBEDDINGS_BATCHES_SLUG, - where: { run: { equals: run.id } }, + where: { run: { equals: result.runId } }, }) const failedBatch = (batchesResult as any).docs[0] expect(failedBatch.status).toBe('failed') @@ -222,7 +193,7 @@ describe('Bulk embed - failed batch', () => { expect((newBatch as any).providerBatchId).not.toBe(failedBatch.providerBatchId) // Check that metadata points to the new batch - const runIdNum = typeof run.id === 'number' ? run.id : parseInt(String(run.id), 10) + const runIdNum = parseInt(String(result!.runId), 10) const metadata = await payload.find({ collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, where: { run: { equals: runIdNum } }, @@ -239,31 +210,19 @@ describe('Bulk embed - failed batch', () => { test('retrying a retried batch returns the existing retry batch', async () => { const vectorizedPayload = getVectorizedPayload<'default'>(payload)! - const post = await payload.create({ + await payload.create({ collection: 'posts', data: { title: 'RetryRetryTest' } as any, }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) await waitForBulkJobs(payload) // Find the failed batch const batchesResult = await payload.find({ collection: BULK_EMBEDDINGS_BATCHES_SLUG, - where: { run: { equals: run.id } }, + where: { run: { equals: result!.runId } }, }) const failedBatch = (batchesResult as any).docs[0] diff --git a/dev/specs/bulkEmbed/multipleBatches.spec.ts b/dev/specs/bulkEmbed/multipleBatches.spec.ts index 7820f7a..6612847 100644 --- a/dev/specs/bulkEmbed/multipleBatches.spec.ts +++ b/dev/specs/bulkEmbed/multipleBatches.spec.ts @@ -8,15 +8,18 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' const DIMS = DEFAULT_DIMS const dbName = `bulk_multibatch_${Date.now()}` describe('Bulk embed - multiple batches', () => { let payload: Payload + let vectorizedPayload: VectorizedPayload | null = null beforeAll(async () => { await createTestDb({ dbName }) @@ -47,6 +50,7 @@ describe('Bulk embed - multiple batches', () => { key: `multibatch-${Date.now()}`, }) payload = built.payload + vectorizedPayload = getVectorizedPayload(payload) }) test('multiple batches are created when flushing after N chunks', async () => { @@ -55,25 +59,14 @@ describe('Bulk embed - multiple batches', () => { await payload.create({ collection: 'posts', data: { title: `Post ${i}` } as any }) } - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload, 20000) const batches = await payload.find({ collection: BULK_EMBEDDINGS_BATCHES_SLUG as any, - where: { run: { equals: String(run.id) } }, + where: { run: { equals: result!.runId } }, sort: 'batchIndex', }) expect(batches.totalDocs).toBe(3) @@ -89,12 +82,10 @@ describe('Bulk embed - multiple batches', () => { const runDoc = ( await (payload as any).find({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { id: { equals: String(run.id) } }, + where: { id: { equals: result!.runId } }, }) ).docs[0] expect(runDoc.totalBatches).toBe(3) expect(runDoc.status).toBe('succeeded') }) }) - - diff --git a/dev/specs/bulkEmbed/multipleChunks.spec.ts b/dev/specs/bulkEmbed/multipleChunks.spec.ts index b621b15..1b913e5 100644 --- a/dev/specs/bulkEmbed/multipleChunks.spec.ts +++ b/dev/specs/bulkEmbed/multipleChunks.spec.ts @@ -1,15 +1,16 @@ import type { Payload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' -import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload } from 'payloadcms-vectorize' const DIMS = DEFAULT_DIMS const dbName = `bulk_multichunk_${Date.now()}` @@ -58,19 +59,9 @@ describe('Bulk embed - multiple chunks with extension fields', () => { data: { title: 'Two' } as any, }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const vectorizedPayload = getVectorizedPayload(payload) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) @@ -84,5 +75,3 @@ describe('Bulk embed - multiple chunks with extension fields', () => { expect(embeds.docs[1]).toMatchObject({ category: 'b', priority: 2, chunkIndex: 1 }) }) }) - - diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts index bf84443..1cb5171 100644 --- a/dev/specs/bulkEmbed/partialFailure.spec.ts +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -7,9 +7,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload } from 'payloadcms-vectorize' const DIMS = DEFAULT_DIMS const dbName = `bulk_partial_failure_${Date.now()}` @@ -83,26 +85,16 @@ describe('Bulk embed - partial chunk failures', () => { data: { title: 'Partial Failure Test' } as any, }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const vectorizedPayload = getVectorizedPayload(payload) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) // Check run status - should still succeed but with failed count const updatedRun = await payload.findByID({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - id: run.id, + id: result!.runId, }) expect(updatedRun.status).toBe('succeeded') diff --git a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts index 133e97c..2211e40 100644 --- a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts +++ b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts @@ -7,14 +7,16 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload } from 'payloadcms-vectorize' const DIMS = DEFAULT_DIMS const dbName = `bulk_partial_failure_nofail_${Date.now()}` -describe('Bulk embed - no partial failures', () => { +describe('Bulk embed - partial failures', () => { let payload: Payload let onErrorCalled = false let onErrorArgs: { @@ -73,26 +75,16 @@ describe('Bulk embed - no partial failures', () => { await payload.create({ collection: 'posts', data: { title: 'No Failure Test' } as any }) - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testVersion, status: 'queued' }, - }) - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? 
{ queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const vectorizedPayload = getVectorizedPayload(payload) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload) // Check run status const updatedRun = await payload.findByID({ collection: BULK_EMBEDDINGS_RUNS_SLUG, - id: run.id, + id: result!.runId, }) expect(updatedRun.status).toBe('succeeded') diff --git a/dev/specs/bulkEmbed/polling.spec.ts b/dev/specs/bulkEmbed/polling.spec.ts index ba47e34..9ffae7e 100644 --- a/dev/specs/bulkEmbed/polling.spec.ts +++ b/dev/specs/bulkEmbed/polling.spec.ts @@ -7,8 +7,10 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' +import { getVectorizedPayload } from 'payloadcms-vectorize' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' const DIMS = DEFAULT_DIMS @@ -49,26 +51,19 @@ describe('Bulk embed - polling requeue', () => { test('polling requeues when non-terminal then succeeds', async () => { const post = await payload.create({ collection: 'posts', data: { title: 'Loop' } as any }) - - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: testEmbeddingVersion, status: 'queued' }, - }) - const queueSpy = vi.spyOn(payload.jobs, 'queue') - - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const vectorizedPayload = getVectorizedPayload(payload) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) await waitForBulkJobs(payload, 15000) - expect(queueSpy).toHaveBeenCalledWith( + expect(queueSpy).toHaveBeenNthCalledWith( + 2, // 2nd call + expect.objectContaining({ task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding' }), + ) + expect(queueSpy).toHaveBeenNthCalledWith( + 3, // 3rd call expect.objectContaining({ task: 'payloadcms-vectorize:poll-or-complete-bulk-embedding' }), ) @@ -79,5 +74,3 @@ describe('Bulk embed - polling requeue', () => { expect(embeds.totalDocs).toBe(1) }) }) - - diff --git a/dev/specs/bulkEmbed/realtimeMode.spec.ts b/dev/specs/bulkEmbed/realtimeMode.spec.ts index 82eb79d..e59da32 100644 --- a/dev/specs/bulkEmbed/realtimeMode.spec.ts +++ b/dev/specs/bulkEmbed/realtimeMode.spec.ts @@ -1,12 +1,12 @@ import type { Payload } from 'payload' -import { beforeAll, describe, expect, test, vi } from 'vitest' -import { createVectorizeTask } from '../../../src/tasks/vectorize.js' +import { beforeAll, describe, expect, test } from 'vitest' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + waitForVectorizationJobs, } from '../utils.js' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' @@ -54,18 +54,7 @@ describe('Bulk embed - realtime mode', () => { data: { title: 'Realtime Test' } as any, }) - const vectorizeTask = createVectorizeTask({ - knowledgePools: realtimeOptions.knowledgePools, - }) - const vectorizeHandler = vectorizeTask.handler as any - - await vectorizeHandler({ - input: { doc: post, collection: 'posts', knowledgePool: 'default' } as any, - req: { payload } as any, - inlineTask: 
vi.fn(), - tasks: {} as any, - job: {} as any, - }) + await waitForVectorizationJobs(payload) const embeds = await payload.find({ collection: 'default', @@ -74,4 +63,3 @@ describe('Bulk embed - realtime mode', () => { expect(embeds.totalDocs).toBeGreaterThan(0) }) }) - diff --git a/dev/specs/bulkEmbed/versionBump.spec.ts b/dev/specs/bulkEmbed/versionBump.spec.ts index 2facc85..7f0dd4c 100644 --- a/dev/specs/bulkEmbed/versionBump.spec.ts +++ b/dev/specs/bulkEmbed/versionBump.spec.ts @@ -1,100 +1,115 @@ -import type { Payload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' -import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' import { BULK_QUEUE_NAMES, DEFAULT_DIMS, buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, + expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery } from 'helpers/embed.js' +import { getVectorizedPayload } from '../../../src/types.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_version_${Date.now()}` -describe('Bulk embed - version bump', () => { - let payload: Payload +// Use distinct bulk queue names per payload instance so that +// the second payload's cron worker handles its own bulk runs, +// instead of the first payload instance continuing to process them. +const BULK_QUEUE_NAMES_0 = BULK_QUEUE_NAMES +const BULK_QUEUE_NAMES_1 = { + prepareBulkEmbedQueueName: `${BULK_QUEUE_NAMES.prepareBulkEmbedQueueName}-v2`, + pollOrCompleteQueueName: `${BULK_QUEUE_NAMES.pollOrCompleteQueueName}-v2`, +} +describe('Bulk embed - version bump', () => { + let post: any beforeAll(async () => { await createTestDb({ dbName }) - const built = await buildPayloadWithIntegration({ - dbName, - pluginOpts: { - knowledgePools: { - default: { - collections: { - posts: { - toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }) + + test('version bump re-embeds all even without updates', async () => { + const payload0 = ( + await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: 'old-version', + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), }, - }, - embeddingConfig: { - version: 'new-version', - queryFn: makeDummyEmbedQuery(DIMS), - bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), }, }, + bulkQueueNames: BULK_QUEUE_NAMES_0, }, - bulkQueueNames: BULK_QUEUE_NAMES, - }, - secret: 'test-secret', - dims: DIMS, - key: `version-${Date.now()}`, - }) - payload = built.payload - }) + secret: 'test-secret', + dims: DIMS, + key: `payload0`, + }) + ).payload - test('version bump re-embeds all even without updates', async () => { - const post = await payload.create({ collection: 'posts', data: { title: 'Old' } as any }) + post = await payload0.create({ collection: 'posts', data: { title: 'Old' } as any }) + + const vectorizedPayload0 = getVectorizedPayload(payload0) + const result0 = await vectorizedPayload0?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result0) + + await waitForBulkJobs(payload0) - // Create an embedding with old version manually - await payload.create({ + // Debug: log embeddings after first run + const embeds0 = await payload0.find({ collection: 'default', - data: { - docId: String(post.id), - sourceCollection: 'posts', - text: 'Old', - chunkIndex: 0, - embedding: 
Array(DIMS).fill(0.1), - embeddingVersion: 'old-version', - updatedAt: new Date().toISOString(), - } as any, + where: { docId: { equals: String(post.id) } }, }) + expect(embeds0.totalDocs).toBe(1) + expect(embeds0.docs[0].embeddingVersion).toBe('old-version') - // Run bulk embed with new version - const run = await payload.create({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - data: { pool: 'default', embeddingVersion: 'new-version', status: 'queued' }, - }) + const payload1 = ( + await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.title }], + }, + }, + embeddingConfig: { + version: 'new-version', + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ statusSequence: ['succeeded'] }), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES_1, + }, + secret: 'test-secret', + dims: DIMS, + key: `payload1`, + }) + ).payload - await payload.jobs.queue<'payloadcms-vectorize:prepare-bulk-embedding'>({ - task: 'payloadcms-vectorize:prepare-bulk-embedding', - input: { runId: String(run.id) }, - req: { payload } as any, - ...(BULK_QUEUE_NAMES.prepareBulkEmbedQueueName - ? { queue: BULK_QUEUE_NAMES.prepareBulkEmbedQueueName } - : {}), - }) + const vectorizedPayload1 = getVectorizedPayload(payload1) + const result1 = await vectorizedPayload1?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result1) - await waitForBulkJobs(payload) + await waitForBulkJobs(payload1) - // Should have 1 embedding with new version (old one replaced) - const embeds = await payload.find({ + const embeds1 = await payload1.find({ collection: 'default', where: { docId: { equals: String(post.id) } }, }) - expect(embeds.totalDocs).toBe(1) - expect(embeds.docs[0].embeddingVersion).toBe('new-version') - const runDoc = ( - await (payload as any).find({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - where: { id: { equals: String(run.id) } }, - }) - ).docs[0] - expect(runDoc.inputs).toBe(1) + expect(embeds1.totalDocs).toBe(1) + expect(embeds1.docs[0].embeddingVersion).toBe('new-version') }) }) - - diff --git a/dev/specs/config.spec.ts b/dev/specs/config.spec.ts index 45aa676..b183af7 100644 --- a/dev/specs/config.spec.ts +++ b/dev/specs/config.spec.ts @@ -57,10 +57,16 @@ describe('endpoints: /vector-search, /vector-bulk-embed', () => { method: 'post', handler: expect.any(Function), }), + expect.objectContaining({ + path: '/vector-retry-failed-batch', + method: 'post', + handler: expect.any(Function), + }), ]), ) }) test('uses the custom path when provided', async () => { + // TODO: Add test for custom path for bulk embed and retry failed batch const cfg = await buildDummyConfig({ plugins: [plugin({ ...dummyPluginOptions, endpointOverrides: { path: '/custom-path' } })], }) @@ -73,11 +79,6 @@ describe('endpoints: /vector-search, /vector-bulk-embed', () => { method: 'post', handler: expect.any(Function), }), - expect.objectContaining({ - path: '/vector-bulk-embed', - method: 'post', - handler: expect.any(Function), - }), ]), ) }) diff --git a/dev/specs/e2e.spec.ts b/dev/specs/e2e.spec.ts index 8d5669c..d4661aa 100644 --- a/dev/specs/e2e.spec.ts +++ b/dev/specs/e2e.spec.ts @@ -11,27 +11,19 @@ import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../src/collections/bulkEmbeddin // Helper function to log in to the admin panel const loginToAdmin = async (page: any) => { - console.log('[loginToAdmin] Starting login process...') await page.goto('/admin/login') - console.log('[loginToAdmin] 
Navigated to login page') await page.waitForLoadState('domcontentloaded') - console.log('[loginToAdmin] Page loaded') // Fill in the login form - console.log('[loginToAdmin] Filling in email...') await page.fill('input[name="email"]', devUser.email) - console.log('[loginToAdmin] Filling in password...') await page.fill('input[name="password"]', devUser.password) // Click the login button - console.log('[loginToAdmin] Clicking submit button...') await page.click('button[type="submit"]') // Wait for redirect to admin dashboard - console.log('[loginToAdmin] Waiting for redirect...') await page.waitForURL(/\/admin(?!\/login)/, { timeout: 15000 }) - console.log('[loginToAdmin] Login complete!') } const expectVectorSearchResponse = async (response: any, post: any, title: string) => { @@ -69,11 +61,9 @@ test.describe('Vector embedding e2e tests', () => { let post: any test.beforeAll(async () => { - console.log('[beforeAll] Setting up Payload instance...') // Setup: Create a post and wait for realtime embedding _config = await config payload = await getPayload({ config: _config, key: `e2e-test-${Date.now()}` }) - console.log('[beforeAll] Payload instance created') }) test('querying the endpoint should return the title with testEmbeddingVersion', async ({ @@ -110,15 +100,12 @@ test.describe('Vector embedding e2e tests', () => { page, request, }) => { - console.log('[test] Starting bulk embedding test...') test.setTimeout(120000) // Login to admin first - console.log('[test] Logging in...') await loginToAdmin(page) // Verify bulkDefault pool is EMPTY (no realTimeIngestionFn configured) - console.log('[test] Checking bulkDefault pool is empty...') const emptyResponse = await request.post('/api/vector-search', { data: { query: title, @@ -128,22 +115,16 @@ test.describe('Vector embedding e2e tests', () => { await expectEmptyVectorSearchResponse(emptyResponse) // Navigate to the bulkDefault embeddings collection page in admin - console.log('[test] Navigating to bulkDefault collection page...') await page.goto('/admin/collections/bulkDefault', { waitUntil: 'networkidle' }) - console.log('[test] Page loaded') // Wait for the page to fully load and render - console.log('[test] Waiting for page to fully load...') await page.waitForLoadState('domcontentloaded') await page.waitForLoadState('networkidle') - console.log('[test] Page fully loaded') // Wait for the collapsible header to appear - use getByText for more flexible matching // Note: If this fails, ensure `pnpm run generate:importmap` has been run - console.log('[test] Looking for "Bulk Embed All" text...') const bulkEmbedAllText = page.getByText('Bulk Embed All', { exact: false }) await expect(bulkEmbedAllText).toBeVisible({ timeout: 15000 }) - console.log('[test] Found "Bulk Embed All" text!') // Click the button that contains the h3 with "Bulk Embed All" text // The button wraps the h3, so we click the button that contains the h3 @@ -191,7 +172,6 @@ test.describe('Vector embedding e2e tests', () => { let finalStatus = '' while (attempts < maxAttempts) { - console.log('[test] Polling for status...') // Refresh the page to see updated status await page.reload() await page.waitForLoadState('domcontentloaded') @@ -201,10 +181,8 @@ test.describe('Vector embedding e2e tests', () => { .locator('.rs__single-value') .textContent() .catch(() => null) - console.log('[test] Status value:', statusValue) if (statusValue) { finalStatus = statusValue - console.log('[test] Status value:', statusValue) if (statusValue === 'succeeded') { break } @@ -230,7 +208,6 @@ 
test.describe('Vector embedding e2e tests', () => { const runIdMatch = runUrl.match(/\/(\d+)$/) const bulkRunId = runIdMatch ? runIdMatch[1] : null expect(bulkRunId).not.toBeNull() - console.log('[test] Bulk run ID:', bulkRunId) // Find the succeeded batch that was created const succeededBatches = await (payload as any).find({ @@ -241,7 +218,6 @@ test.describe('Vector embedding e2e tests', () => { }) expect(succeededBatches.totalDocs).toBeGreaterThan(0) const succeededBatch = succeededBatches.docs[0] - console.log('[test] Found succeeded batch:', succeededBatch.id) // Test: Retry endpoint returns 400 for succeeded batch const succeededRetryResponse = await request.post('/api/vector-retry-failed-batch', { @@ -250,10 +226,8 @@ test.describe('Vector embedding e2e tests', () => { expect(succeededRetryResponse.status()).toBe(400) const succeededRetryJson = await succeededRetryResponse.json() expect(succeededRetryJson.error).toContain('not in failed or retried status') - console.log('[test] Retry endpoint correctly rejected succeeded batch') // Navigate to the succeeded batch page and verify retry button is disabled - console.log('[test] Navigating to succeeded batch page...') await page.goto(`/admin/collections/${BULK_EMBEDDINGS_BATCHES_SLUG}/${succeededBatch.id}`, { waitUntil: 'networkidle', }) @@ -265,42 +239,30 @@ test.describe('Vector embedding e2e tests', () => { // Verify the button is disabled (opacity check) const buttonStyle = await retryButton.getAttribute('style') - console.log('[test] Button style:', buttonStyle) expect(buttonStyle).toContain('opacity:0.5') // Verify the "Retry Not Available" message is shown const notAvailableMessage = page.locator('text=/Retry Not Available/i') await expect(notAvailableMessage).toBeVisible({ timeout: 5000 }) - - console.log('[test] Retry button correctly disabled for succeeded batch!') }) test('clicking expand section on default collection shows not enabled message', async ({ page, }) => { - console.log('[test] Starting default collection test...') - // Login to admin first - console.log('[test] Logging in...') await loginToAdmin(page) // Navigate to the default embeddings collection page in admin - console.log('[test] Navigating to default collection page...') await page.goto('/admin/collections/default', { waitUntil: 'networkidle' }) - console.log('[test] Page loaded') // Wait for the page to fully load and render - console.log('[test] Waiting for page to fully load...') await page.waitForLoadState('domcontentloaded') await page.waitForLoadState('networkidle') - console.log('[test] Page fully loaded') // Wait for the collapsible header to appear - use getByText for more flexible matching // Note: If this fails, ensure `pnpm run generate:importmap` has been run - console.log('[test] Looking for "Bulk Embed All" text...') const bulkEmbedAllText = page.getByText('Bulk Embed All', { exact: false }) await expect(bulkEmbedAllText).toBeVisible({ timeout: 15000 }) - console.log('[test] Found "Bulk Embed All" text!') // Click the button that contains the h3 with "Bulk Embed All" text const expandButton = page.locator('button:has(h3:has-text("Bulk Embed All"))') @@ -325,19 +287,13 @@ test.describe('Vector embedding e2e tests', () => { }) test('retry failed batch endpoint returns 404 for non-existent batch', async ({ request }) => { - console.log('[test] Testing non-existent batch retry...') - const nonExistentResponse = await request.post('/api/vector-retry-failed-batch', { data: { batchId: '999999' }, }) expect(nonExistentResponse.status()).toBe(404) - - 
console.log('[test] Non-existent batch test completed!') }) test('retry failed batch endpoint works correctly', async ({ request }) => { - console.log('[test] Starting retry failed batch endpoint test...') - // Create a test post first (needed for bulk embedding to have something to embed) const post = await payload.create({ collection: 'posts', @@ -345,7 +301,6 @@ test.describe('Vector embedding e2e tests', () => { title: 'Failed batch test post', }, }) - console.log('[test] Created test post:', post.id) // Use the bulk embed endpoint to create a run for failingBulkDefault pool const bulkEmbedResponse = await request.post('/api/vector-bulk-embed', { @@ -353,15 +308,12 @@ test.describe('Vector embedding e2e tests', () => { knowledgePool: 'failingBulkDefault', }, }) - console.log('[test] Bulk embed response:', await bulkEmbedResponse.json()) expect(bulkEmbedResponse.ok()).toBe(true) const bulkEmbedJson = await bulkEmbedResponse.json() const runId = bulkEmbedJson.runId - console.log('[test] Created bulk run via endpoint:', runId) // Wait for the bulk jobs to process and fail (failingBulkDefault has a mock that fails) await waitForBulkJobs(payload, 30000) - console.log('[test] Bulk jobs completed') // Wait for the batch to actually fail (poll-or-complete job needs to finish) const runIdNum = parseInt(runId, 10) @@ -399,30 +351,8 @@ test.describe('Vector embedding e2e tests', () => { attempts++ } - if (!batches || batches.totalDocs === 0) { - // Final check for debugging - const allBatchesFinal = await (payload as any).find({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - where: { run: { equals: runIdNum } }, - }) - const runFinal = await (payload as any).findByID({ - collection: BULK_EMBEDDINGS_RUNS_SLUG, - id: runId, - }) - console.log('[test] Failed to find failed batch after', attempts, 'attempts') - console.log('[test] Run status:', runFinal.status) - console.log('[test] Batches found:', allBatchesFinal.totalDocs) - if (allBatchesFinal.totalDocs > 0) { - console.log( - '[test] Batch statuses:', - allBatchesFinal.docs.map((b: any) => b.status), - ) - } - } - expect(batches?.totalDocs).toBeGreaterThan(0) const batch = batches.docs[0] - console.log('[test] Found failed batch:', batch.id) // Retry the failed batch (should succeed) const retryResponse = await request.post('/api/vector-retry-failed-batch', { @@ -458,12 +388,9 @@ test.describe('Vector embedding e2e tests', () => { id: runId, }) expect((updatedRun as any).status).toBe('running') - - console.log('[test] Retry failed batch endpoint test completed successfully!') }) test('retry failed batch button works for failed batches', async ({ page, request }) => { - console.log('[test] Starting retry button click test...') test.setTimeout(120000) // Login first @@ -476,11 +403,9 @@ test.describe('Vector embedding e2e tests', () => { title: 'Failed batch UI test post', }, }) - console.log('[test] Created test post:', post.id) // Wait for any existing bulk embedding jobs to complete before starting a new run await waitForBulkJobs(payload, 30000) - console.log('[test] Existing bulk jobs completed, proceeding...') // Use the bulk embed endpoint to create a run for failingBulkDefault pool const bulkEmbedResponse = await request.post('/api/vector-bulk-embed', { @@ -488,15 +413,12 @@ test.describe('Vector embedding e2e tests', () => { knowledgePool: 'failingBulkDefault', }, }) - console.log('[test] Bulk embed response:', await bulkEmbedResponse.json()) expect(bulkEmbedResponse.ok()).toBe(true) const bulkEmbedJson = await bulkEmbedResponse.json() 
const runId = bulkEmbedJson.runId - console.log('[test] Created bulk run via endpoint:', runId) // Wait for the bulk jobs to process and fail (failingBulkDefault has a mock that fails) await waitForBulkJobs(payload, 30000) - console.log('[test] Bulk jobs completed') // Wait for the batch to actually fail (poll-or-complete job needs to finish) const runIdNum = parseInt(runId, 10) @@ -536,10 +458,8 @@ test.describe('Vector embedding e2e tests', () => { expect(batches?.totalDocs).toBeGreaterThan(0) const failedBatch = batches.docs[0] - console.log('[test] Found failed batch:', failedBatch.id) // Navigate to the run edit page (where FailedBatchesList component should be visible) - console.log('[test] Navigating to run page...') await page.goto(`/admin/collections/${BULK_EMBEDDINGS_RUNS_SLUG}/${runId}`, { waitUntil: 'networkidle', }) @@ -548,10 +468,8 @@ test.describe('Vector embedding e2e tests', () => { // Wait for the FailedBatchesList component to appear const failedBatchesList = page.locator('[data-testid^="failed-batch-link-"]').first() await expect(failedBatchesList).toBeVisible({ timeout: 10000 }) - console.log('[test] Failed batches list is visible') // Click on the failed batch link to navigate to the batch page - console.log('[test] Clicking failed batch link...') await failedBatchesList.click() // Wait for navigation to batch page @@ -559,7 +477,6 @@ test.describe('Vector embedding e2e tests', () => { timeout: 10000, }) await page.waitForLoadState('domcontentloaded') - console.log('[test] Navigated to batch page') // Look for the retry button const retryButton = page.locator('[data-testid="retry-failed-batch-button"]') @@ -574,27 +491,20 @@ test.describe('Vector embedding e2e tests', () => { expect(buttonStyle).not.toContain('opacity: 0.5') // Click the retry button - console.log('[test] Clicking retry button...') await retryButton.click() // Wait for success message const successMessage = page.locator('text=/Batch resubmitted successfully/i') await expect(successMessage).toBeVisible({ timeout: 10000 }) - console.log('[test] Retry button click test completed!') - // Wait a bit for the page reload await page.waitForTimeout(2000) // Verify we're still on the batch page after reload await page.waitForURL(/\/admin\/collections\/vector-bulk-embeddings-batches\/\d+/) - - console.log('[test] Retry failed batch button test completed successfully!') }) test('missing batchId returns 400 error', async ({ request }) => { - console.log('[test] Testing missing batchId...') - const response = await request.post('/api/vector-retry-failed-batch', { data: {}, }) @@ -602,7 +512,5 @@ test.describe('Vector embedding e2e tests', () => { expect(response.status()).toBe(400) const json = await response.json() expect(json.error).toContain('batchId is required') - - console.log('[test] Missing batchId test completed!') }) }) diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index 474f4a1..cd1abf2 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -13,7 +13,9 @@ import type { BulkEmbeddingsFns, BulkEmbeddingInput, BulkEmbeddingRunStatus, + BulkEmbedResult, } from '../../src/types.js' +import { expect } from 'vitest' export const createTestDb = async ({ dbName }: { dbName: string }) => { const adminUri = @@ -221,6 +223,11 @@ export async function buildPayloadWithIntegration({ jobs: { tasks: [], autoRun: [ + { + cron: '*/2 * * * * *', + limit: 10, + queue: pluginOpts.realtimeQueueName ?? 
'default', + }, { cron: '*/2 * * * * *', limit: 10, @@ -281,3 +288,9 @@ export async function createSucceededBaselineRun( }, }) } + +export const expectGoodResult = (result: BulkEmbedResult | undefined) => { + expect(result).toBeDefined() + expect(result!.status).toBe('queued') + expect((result as any).conflict).toBeUndefined() +} diff --git a/src/admin/components/EmbedAllButton/index.tsx b/src/admin/components/EmbedAllButton/index.tsx index de0dbeb..04d20bb 100644 --- a/src/admin/components/EmbedAllButton/index.tsx +++ b/src/admin/components/EmbedAllButton/index.tsx @@ -15,19 +15,11 @@ export const EmbedAllButton: React.FC diff --git a/src/admin/components/FailedBatchesList/index.tsx b/src/admin/components/FailedBatchesList/index.tsx index a666529..56341a0 100644 --- a/src/admin/components/FailedBatchesList/index.tsx +++ b/src/admin/components/FailedBatchesList/index.tsx @@ -10,20 +10,11 @@ type FailedBatchesListProps = { } export const FailedBatchesList: React.FC = async (props) => { - // Always render something for debugging - console.log('[FailedBatchesList] Component called with props:', { - hasPayload: !!props.payload, - hasId: !!props.id, - allProps: Object.keys(props), - }) - const run = await props.payload.findByID({ collection: BULK_EMBEDDINGS_RUNS_SLUG, id: props.id, }) - console.log('[FailedBatchesList] Fetching failed batches for run:', run.id) - // Fetch failed batches for this run const runIdNum = typeof run.id === 'number' ? run.id : parseInt(String(run.id), 10) const failedBatches = await props.payload.find({ @@ -38,8 +29,6 @@ export const FailedBatchesList: React.FC = async (props) const batches = (failedBatches as any)?.docs || [] const runId = props.id || String(run.id) - console.log('[FailedBatchesList] Found batches:', batches.length, 'for run:', runId) - return ( { - console.log('RetryFailedBatchButtonClient', batchId, status, retriedBatchId) const [isSubmitting, setIsSubmitting] = useState(false) const [message, setMessage] = useState<{ text: string; error?: boolean } | null>(null) diff --git a/src/admin/components/RetryFailedBatchButton/index.tsx b/src/admin/components/RetryFailedBatchButton/index.tsx index 7f47387..e4b798c 100644 --- a/src/admin/components/RetryFailedBatchButton/index.tsx +++ b/src/admin/components/RetryFailedBatchButton/index.tsx @@ -15,8 +15,6 @@ export const RetryFailedBatchButton: React.FC< id: props.id, }) - console.log('RetryFailedBatchButtonBatch', batch) - return ( // Create vectorized payload object factory that creates methods bound to a payload instance const createVectorizedPayloadObject = (payload: Payload): VectorizedPayload => { - console.log('createVectorizedPayloadObject', payload) return { _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { const poolConfig = pluginOptions.knowledgePools[knowledgePool] diff --git a/src/tasks/vectorize.ts b/src/tasks/vectorize.ts index 80f1ac0..5dc191c 100644 --- a/src/tasks/vectorize.ts +++ b/src/tasks/vectorize.ts @@ -29,7 +29,6 @@ export const createVectorizeTask = ({ /** * Vectorize Task Configuration * @description Scheduled task that vectorizes on data change. - * Runs every 5 seconds to call the embedding function. 
*/ const processVectorizationTask: TaskConfig = { slug: 'payloadcms-vectorize:vectorize', From 0ecd01c83f7ed6cb24fbc1f30575aa3adabe0ba9 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 15 Jan 2026 17:22:42 +0700 Subject: [PATCH 40/49] trying to fix CI tests --- dev/specs/bulkEmbed/basic.spec.ts | 3 +-- dev/specs/bulkEmbed/canceledBatch.spec.ts | 2 +- dev/specs/bulkEmbed/extensionFields.spec.ts | 2 +- dev/specs/bulkEmbed/failedBatch.spec.ts | 2 +- dev/specs/bulkEmbed/multipleBatches.spec.ts | 2 +- dev/specs/bulkEmbed/multipleChunks.spec.ts | 2 +- dev/specs/bulkEmbed/partialFailure.spec.ts | 2 +- dev/specs/bulkEmbed/partialFailureNoFail.spec.ts | 2 +- dev/specs/bulkEmbed/polling.spec.ts | 2 +- dev/specs/bulkEmbed/versionBump.spec.ts | 2 +- dev/specs/utils.ts | 8 -------- dev/specs/utils.vitest.ts | 8 ++++++++ src/collections/embeddings.ts | 2 -- 13 files changed, 18 insertions(+), 21 deletions(-) create mode 100644 dev/specs/utils.vitest.ts diff --git a/dev/specs/bulkEmbed/basic.spec.ts b/dev/specs/bulkEmbed/basic.spec.ts index 46b0d29..6664ecc 100644 --- a/dev/specs/bulkEmbed/basic.spec.ts +++ b/dev/specs/bulkEmbed/basic.spec.ts @@ -10,12 +10,11 @@ import { clearAllCollections, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' -import { BulkEmbedResult } from '../../../src/types.js' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_basic_${Date.now()}` diff --git a/dev/specs/bulkEmbed/canceledBatch.spec.ts b/dev/specs/bulkEmbed/canceledBatch.spec.ts index f14c4d2..46922d9 100644 --- a/dev/specs/bulkEmbed/canceledBatch.spec.ts +++ b/dev/specs/bulkEmbed/canceledBatch.spec.ts @@ -6,11 +6,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_canceled_${Date.now()}` diff --git a/dev/specs/bulkEmbed/extensionFields.spec.ts b/dev/specs/bulkEmbed/extensionFields.spec.ts index 4b829e0..c47fefd 100644 --- a/dev/specs/bulkEmbed/extensionFields.spec.ts +++ b/dev/specs/bulkEmbed/extensionFields.spec.ts @@ -7,11 +7,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_extfields_${Date.now()}` diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index 5e09d16..7819def 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -10,10 +10,10 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = 
`bulk_failed_${Date.now()}` diff --git a/dev/specs/bulkEmbed/multipleBatches.spec.ts b/dev/specs/bulkEmbed/multipleBatches.spec.ts index 6612847..ee17bdc 100644 --- a/dev/specs/bulkEmbed/multipleBatches.spec.ts +++ b/dev/specs/bulkEmbed/multipleBatches.spec.ts @@ -8,11 +8,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload, VectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_multibatch_${Date.now()}` diff --git a/dev/specs/bulkEmbed/multipleChunks.spec.ts b/dev/specs/bulkEmbed/multipleChunks.spec.ts index 1b913e5..7e05791 100644 --- a/dev/specs/bulkEmbed/multipleChunks.spec.ts +++ b/dev/specs/bulkEmbed/multipleChunks.spec.ts @@ -6,11 +6,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_multichunk_${Date.now()}` diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts index 1cb5171..7eae88b 100644 --- a/dev/specs/bulkEmbed/partialFailure.spec.ts +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -7,11 +7,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_partial_failure_${Date.now()}` diff --git a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts index 2211e40..586d4e6 100644 --- a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts +++ b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts @@ -7,11 +7,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { getVectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_partial_failure_nofail_${Date.now()}` diff --git a/dev/specs/bulkEmbed/polling.spec.ts b/dev/specs/bulkEmbed/polling.spec.ts index 9ffae7e..9b884c7 100644 --- a/dev/specs/bulkEmbed/polling.spec.ts +++ b/dev/specs/bulkEmbed/polling.spec.ts @@ -7,11 +7,11 @@ import { buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { getVectorizedPayload } from 'payloadcms-vectorize' import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_polling_${Date.now()}` diff --git a/dev/specs/bulkEmbed/versionBump.spec.ts b/dev/specs/bulkEmbed/versionBump.spec.ts index 7f0dd4c..8c5166c 100644 --- a/dev/specs/bulkEmbed/versionBump.spec.ts +++ b/dev/specs/bulkEmbed/versionBump.spec.ts @@ -5,11 +5,11 @@ import { 
buildPayloadWithIntegration, createMockBulkEmbeddings, createTestDb, - expectGoodResult, waitForBulkJobs, } from '../utils.js' import { makeDummyEmbedQuery } from 'helpers/embed.js' import { getVectorizedPayload } from '../../../src/types.js' +import { expectGoodResult } from '../utils.vitest.js' const DIMS = DEFAULT_DIMS const dbName = `bulk_version_${Date.now()}` diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index cd1abf2..68e8d63 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -13,9 +13,7 @@ import type { BulkEmbeddingsFns, BulkEmbeddingInput, BulkEmbeddingRunStatus, - BulkEmbedResult, } from '../../src/types.js' -import { expect } from 'vitest' export const createTestDb = async ({ dbName }: { dbName: string }) => { const adminUri = @@ -288,9 +286,3 @@ export async function createSucceededBaselineRun( }, }) } - -export const expectGoodResult = (result: BulkEmbedResult | undefined) => { - expect(result).toBeDefined() - expect(result!.status).toBe('queued') - expect((result as any).conflict).toBeUndefined() -} diff --git a/dev/specs/utils.vitest.ts b/dev/specs/utils.vitest.ts new file mode 100644 index 0000000..50000e9 --- /dev/null +++ b/dev/specs/utils.vitest.ts @@ -0,0 +1,8 @@ +import { expect } from 'vitest' +import type { BulkEmbedResult } from '../../src/types.js' + +export const expectGoodResult = (result: BulkEmbedResult | undefined) => { + expect(result).toBeDefined() + expect(result!.status).toBe('queued') + expect((result as any).conflict).toBeUndefined() +} diff --git a/src/collections/embeddings.ts b/src/collections/embeddings.ts index da1bc5a..3f8634b 100644 --- a/src/collections/embeddings.ts +++ b/src/collections/embeddings.ts @@ -39,8 +39,6 @@ export const createEmbeddingsCollection = ( // Use getVectorizedPayload to get the vectorized payload object const vectorizedPayload = getVectorizedPayload(payload) - console.log('vectorizedPayload', vectorizedPayload) - console.log('payload.config.custom', payload.config.custom) if (poolName && typeof poolName === 'string' && vectorizedPayload) { return vectorizedPayload._isBulkEmbedEnabled(poolName) } From b3312f300250ea022db1321310f5c7ce3af24492 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 15 Jan 2026 17:58:48 +0700 Subject: [PATCH 41/49] Clean up --- dev/specs/bulkEmbed/basic.spec.ts | 2 -- dev/specs/bulkEmbed/canceledBatch.spec.ts | 2 -- dev/specs/bulkEmbed/concurrentRuns.spec.ts | 2 -- dev/specs/bulkEmbed/extensionFields.spec.ts | 2 -- dev/specs/bulkEmbed/failedBatch.spec.ts | 2 -- dev/specs/bulkEmbed/multipleBatches.spec.ts | 2 -- dev/specs/bulkEmbed/multipleChunks.spec.ts | 2 -- dev/specs/bulkEmbed/onError.spec.ts | 2 -- dev/specs/bulkEmbed/partialFailure.spec.ts | 2 -- dev/specs/bulkEmbed/partialFailureNoFail.spec.ts | 2 -- dev/specs/bulkEmbed/polling.spec.ts | 2 -- dev/specs/bulkEmbed/realtimeMode.spec.ts | 2 -- dev/specs/bulkEmbed/versionBump.spec.ts | 4 ---- dev/specs/constants.ts | 2 +- dev/specs/int.spec.ts | 2 +- dev/specs/multipools.spec.ts | 2 +- dev/specs/utils.ts | 8 ++------ 17 files changed, 5 insertions(+), 37 deletions(-) diff --git a/dev/specs/bulkEmbed/basic.spec.ts b/dev/specs/bulkEmbed/basic.spec.ts index 6664ecc..bb0219f 100644 --- a/dev/specs/bulkEmbed/basic.spec.ts +++ b/dev/specs/bulkEmbed/basic.spec.ts @@ -47,8 +47,6 @@ describe('Bulk embed - basic tests', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: basePluginOptions, - secret: 'test-secret', - dims: DIMS, key: `basic-${Date.now()}`, }) payload = 
built.payload diff --git a/dev/specs/bulkEmbed/canceledBatch.spec.ts b/dev/specs/bulkEmbed/canceledBatch.spec.ts index 46922d9..2b99b88 100644 --- a/dev/specs/bulkEmbed/canceledBatch.spec.ts +++ b/dev/specs/bulkEmbed/canceledBatch.spec.ts @@ -43,8 +43,6 @@ describe('Bulk embed - canceled batch', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `canceled-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/concurrentRuns.spec.ts b/dev/specs/bulkEmbed/concurrentRuns.spec.ts index 4d3d01b..c03f212 100644 --- a/dev/specs/bulkEmbed/concurrentRuns.spec.ts +++ b/dev/specs/bulkEmbed/concurrentRuns.spec.ts @@ -40,8 +40,6 @@ describe('Bulk embed - concurrent runs prevention', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `concurrent-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/extensionFields.spec.ts b/dev/specs/bulkEmbed/extensionFields.spec.ts index c47fefd..c564bea 100644 --- a/dev/specs/bulkEmbed/extensionFields.spec.ts +++ b/dev/specs/bulkEmbed/extensionFields.spec.ts @@ -47,8 +47,6 @@ describe('Bulk embed - extension fields', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `extfields-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/failedBatch.spec.ts b/dev/specs/bulkEmbed/failedBatch.spec.ts index 7819def..037fdb5 100644 --- a/dev/specs/bulkEmbed/failedBatch.spec.ts +++ b/dev/specs/bulkEmbed/failedBatch.spec.ts @@ -43,8 +43,6 @@ describe('Bulk embed - failed batch', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `failed-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/multipleBatches.spec.ts b/dev/specs/bulkEmbed/multipleBatches.spec.ts index ee17bdc..aa2de86 100644 --- a/dev/specs/bulkEmbed/multipleBatches.spec.ts +++ b/dev/specs/bulkEmbed/multipleBatches.spec.ts @@ -45,8 +45,6 @@ describe('Bulk embed - multiple batches', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `multibatch-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/multipleChunks.spec.ts b/dev/specs/bulkEmbed/multipleChunks.spec.ts index 7e05791..0f99eab 100644 --- a/dev/specs/bulkEmbed/multipleChunks.spec.ts +++ b/dev/specs/bulkEmbed/multipleChunks.spec.ts @@ -46,8 +46,6 @@ describe('Bulk embed - multiple chunks with extension fields', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `multichunk-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/onError.spec.ts b/dev/specs/bulkEmbed/onError.spec.ts index cfc2e89..f128009 100644 --- a/dev/specs/bulkEmbed/onError.spec.ts +++ b/dev/specs/bulkEmbed/onError.spec.ts @@ -51,8 +51,6 @@ describe('Bulk embed - onError callback', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `onerror-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/partialFailure.spec.ts b/dev/specs/bulkEmbed/partialFailure.spec.ts index 7eae88b..d3ef57e 100644 --- a/dev/specs/bulkEmbed/partialFailure.spec.ts +++ b/dev/specs/bulkEmbed/partialFailure.spec.ts @@ -73,8 +73,6 @@ describe('Bulk embed - partial chunk failures', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `partial-failure-${Date.now()}-${Math.random()}`, }) payload = built.payload diff --git 
a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts index 586d4e6..35e877f 100644 --- a/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts +++ b/dev/specs/bulkEmbed/partialFailureNoFail.spec.ts @@ -67,8 +67,6 @@ describe('Bulk embed - partial failures', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `no-partial-failure-${Date.now()}-${Math.random()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/polling.spec.ts b/dev/specs/bulkEmbed/polling.spec.ts index 9b884c7..eedd32a 100644 --- a/dev/specs/bulkEmbed/polling.spec.ts +++ b/dev/specs/bulkEmbed/polling.spec.ts @@ -42,8 +42,6 @@ describe('Bulk embed - polling requeue', () => { }, bulkQueueNames: BULK_QUEUE_NAMES, }, - secret: 'test-secret', - dims: DIMS, key: `polling-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/realtimeMode.spec.ts b/dev/specs/bulkEmbed/realtimeMode.spec.ts index e59da32..8e4c224 100644 --- a/dev/specs/bulkEmbed/realtimeMode.spec.ts +++ b/dev/specs/bulkEmbed/realtimeMode.spec.ts @@ -41,8 +41,6 @@ describe('Bulk embed - realtime mode', () => { const built = await buildPayloadWithIntegration({ dbName, pluginOpts: realtimeOptions, - secret: 'test-secret', - dims: DIMS, key: `realtime-${Date.now()}`, }) payload = built.payload diff --git a/dev/specs/bulkEmbed/versionBump.spec.ts b/dev/specs/bulkEmbed/versionBump.spec.ts index 8c5166c..39ab08f 100644 --- a/dev/specs/bulkEmbed/versionBump.spec.ts +++ b/dev/specs/bulkEmbed/versionBump.spec.ts @@ -50,8 +50,6 @@ describe('Bulk embed - version bump', () => { }, bulkQueueNames: BULK_QUEUE_NAMES_0, }, - secret: 'test-secret', - dims: DIMS, key: `payload0`, }) ).payload @@ -92,8 +90,6 @@ describe('Bulk embed - version bump', () => { }, bulkQueueNames: BULK_QUEUE_NAMES_1, }, - secret: 'test-secret', - dims: DIMS, key: `payload1`, }) ).payload diff --git a/dev/specs/constants.ts b/dev/specs/constants.ts index 47e5784..e695599 100644 --- a/dev/specs/constants.ts +++ b/dev/specs/constants.ts @@ -73,7 +73,7 @@ export const dummyPluginOptions = { export async function buildDummyConfig(cfg: Partial) { const built = await buildConfig({ - secret: 'test-secret', + secret: process.env.PAYLOAD_SECRET || 'test-secret', collections: [], editor: lexicalEditor(), // Provide a dummy db adapter to satisfy types; not used by these tests diff --git a/dev/specs/int.spec.ts b/dev/specs/int.spec.ts index 6995412..bea7dab 100644 --- a/dev/specs/int.spec.ts +++ b/dev/specs/int.spec.ts @@ -41,7 +41,7 @@ describe('Plugin integration tests', () => { }) config = await buildConfig({ - secret: 'test-secret', + secret: process.env.PAYLOAD_SECRET || 'test-secret', editor: lexicalEditor(), collections: [ { diff --git a/dev/specs/multipools.spec.ts b/dev/specs/multipools.spec.ts index 7288132..8b9c30d 100644 --- a/dev/specs/multipools.spec.ts +++ b/dev/specs/multipools.spec.ts @@ -54,7 +54,7 @@ describe('Multiple knowledge pools', () => { } config = await buildConfig({ - secret: 'test-secret', + secret: process.env.PAYLOAD_SECRET || 'test-secret', collections: [], editor: lexicalEditor(), db: postgresAdapter({ diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index 68e8d63..214891d 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -182,27 +182,23 @@ export function createMockBulkEmbeddings( export type BuildPayloadArgs = { dbName: string pluginOpts: any - secret?: string - dims?: number key?: string } export async function buildPayloadWithIntegration({ dbName, 
pluginOpts, - secret = 'test-secret', - dims = DEFAULT_DIMS, key, }: BuildPayloadArgs): Promise<{ payload: Payload; config: SanitizedConfig }> { const integration = createVectorizeIntegration({ default: { - dims, + dims: DEFAULT_DIMS, ivfflatLists: 1, }, }) const config = await buildConfig({ - secret, + secret: 'test-secret', editor: lexicalEditor(), collections: [ { From 8c75e1a4baa386eb82421dd1935eaba70801b183 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Thu, 15 Jan 2026 23:53:38 +0700 Subject: [PATCH 42/49] Better bulkEmbedAll --- README.md | 30 +- dev/specs/bulkEmbed/ingestionFailure.spec.ts | 101 +++++ src/tasks/bulkEmbedAll.ts | 365 +++++++++++-------- src/types.ts | 22 +- 4 files changed, 335 insertions(+), 183 deletions(-) create mode 100644 dev/specs/bulkEmbed/ingestionFailure.spec.ts diff --git a/README.md b/README.md index 4ce2cf8..2ededdd 100644 --- a/README.md +++ b/README.md @@ -299,37 +299,37 @@ type BatchSubmission = { - `null` - "I'm accumulating this chunk, not ready to submit yet" - `{ providerBatchId }` - "I just submitted a batch to my provider" -**⚠️ Important contract about which chunks are included in a submission:** +**⚠️ Important contract:** -- When `isLastChunk=false` and you return a submission: all pending chunks **EXCEPT** the current one were submitted (current chunk starts fresh accumulation) -- When `isLastChunk=true` and you return a submission: all pending chunks **INCLUDING** the current one were submitted +When you return a submission, the plugin assumes **all chunks currently in `pendingChunks` were submitted**. The plugin tracks chunks and creates batch records based on this assumption. You control which chunks get submitted by managing your own accumulation logic. + +**About `isLastChunk`:** + +- `isLastChunk=true` indicates this is the final chunk in the run +- Use this to flush any remaining accumulated chunks before the run completes +- The plugin uses this only to know when to stop iterating, not to determine which chunks were submitted **Example implementation:** ```typescript let accumulated: BulkEmbeddingInput[] = [] -let accumulatedSize = 0 -const FILE_SIZE_LIMIT = 50 * 1024 * 1024 // 50MB +const LINE_LIMIT = 100_000 // e.g., Voyage AI's limit addChunk: async ({ chunk, isLastChunk }) => { - const chunkSize = JSON.stringify(chunk).length + // Add current chunk to accumulation first + accumulated.push(chunk) - // Would exceed limit? Submit what we have, keep current for next batch - if (accumulatedSize + chunkSize > FILE_SIZE_LIMIT && accumulated.length > 0) { + // Check if we've hit the line limit (after adding current chunk) + if (accumulated.length === LINE_LIMIT) { const result = await submitToProvider(accumulated) - accumulated = [chunk] // Start fresh WITH current chunk - accumulatedSize = chunkSize + accumulated = [] // Clear for next batch return { providerBatchId: result.id } } - accumulated.push(chunk) - accumulatedSize += chunkSize - // Last chunk? Must flush everything if (isLastChunk && accumulated.length > 0) { const result = await submitToProvider(accumulated) accumulated = [] - accumulatedSize = 0 return { providerBatchId: result.id } } @@ -337,7 +337,7 @@ addChunk: async ({ chunk, isLastChunk }) => { } ``` -**Note:** If a single chunk exceeds your provider's file size limit, you'll need to handle that edge case in your implementation (e.g., skip it, split it, or fail gracefully). 
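For byte-based provider limits the same add-first pattern applies. The sketch below is a minimal illustration only — `MAX_BATCH_BYTES`, `MAX_CHUNK_BYTES`, and `submitToProvider` are assumed placeholders for your provider's real limits and client, not plugin APIs — showing one way to flush with headroom and to fail the run early when a single chunk can never fit:

```typescript
// Minimal sketch, assuming hypothetical provider limits — adjust to your provider.
let accumulated: { id: string; text: string }[] = []
let accumulatedBytes = 0
const MAX_BATCH_BYTES = 50 * 1024 * 1024 // assumed hard per-batch limit
const MAX_CHUNK_BYTES = 1 * 1024 * 1024 // assumed cap you enforce on a single chunk

const addChunk = async ({
  chunk,
  isLastChunk,
}: {
  chunk: { id: string; text: string }
  isLastChunk: boolean
}) => {
  const chunkBytes = Buffer.byteLength(chunk.text, 'utf8')

  // A single chunk that can never fit: fail the run with a clear error
  // (or split the text upstream in your toKnowledgePool function instead).
  if (chunkBytes > MAX_CHUNK_BYTES) {
    throw new Error(`Chunk ${chunk.id} is ${chunkBytes} bytes, over the ${MAX_CHUNK_BYTES}-byte cap`)
  }

  // Add first, per the contract above: every pending chunk is part of whatever
  // submission this call returns.
  accumulated.push(chunk)
  accumulatedBytes += chunkBytes

  // Flush once we are within one max-size chunk of the batch limit, or on the final
  // chunk, so the submitted payload never exceeds MAX_BATCH_BYTES.
  if (accumulatedBytes >= MAX_BATCH_BYTES - MAX_CHUNK_BYTES || isLastChunk) {
    const result = await submitToProvider(accumulated) // same placeholder helper as the example above
    accumulated = []
    accumulatedBytes = 0
    return { providerBatchId: result.id }
  }

  return null
}
```

Throwing from `addChunk` surfaces the error on the prepare job and marks the run as failed, which is usually preferable to submitting a batch the provider will reject.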
+**Note:** If a single chunk exceeds your provider's file size or line limit, you'll need to handle that edge case in your implementation (e.g., skip it, split it, or fail gracefully). #### `pollOrCompleteBatch` - Poll and Stream Results diff --git a/dev/specs/bulkEmbed/ingestionFailure.spec.ts b/dev/specs/bulkEmbed/ingestionFailure.spec.ts new file mode 100644 index 0000000..24542c6 --- /dev/null +++ b/dev/specs/bulkEmbed/ingestionFailure.spec.ts @@ -0,0 +1,101 @@ +import type { Payload } from 'payload' +import { beforeAll, describe, expect, test } from 'vitest' +import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../../src/collections/bulkEmbeddingsRuns.js' +import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../../src/collections/bulkEmbeddingsBatches.js' +import { + BULK_QUEUE_NAMES, + DEFAULT_DIMS, + buildPayloadWithIntegration, + createMockBulkEmbeddings, + createTestDb, + waitForBulkJobs, +} from '../utils.js' +import { makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' +import { getVectorizedPayload } from 'payloadcms-vectorize' +import { expectGoodResult } from '../utils.vitest.js' + +const DIMS = DEFAULT_DIMS +const dbName = `bulk_ingestion_failure_${Date.now()}` + +describe('Bulk embed - ingestion validation failures', () => { + let payload: Payload + + beforeAll(async () => { + await createTestDb({ dbName }) + }) + + test('malformed chunk entry fails the bulk embedding run', async () => { + // Use unique version to ensure this test only processes its own data + const testVersion = `${testEmbeddingVersion}-ingestion-fail-${Date.now()}` + + const built = await buildPayloadWithIntegration({ + dbName, + pluginOpts: { + knowledgePools: { + default: { + collections: { + posts: { + // Malformed: second entry missing required "chunk" string + toKnowledgePool: async () => [{ chunk: 'ok chunk' }, { bad: 'oops' } as any], + }, + }, + embeddingConfig: { + version: testVersion, + queryFn: makeDummyEmbedQuery(DIMS), + bulkEmbeddingsFns: createMockBulkEmbeddings({ + statusSequence: ['succeeded'], + }), + }, + }, + }, + bulkQueueNames: BULK_QUEUE_NAMES, + }, + key: `ingestion-failure-${Date.now()}-${Math.random()}`, + }) + payload = built.payload + + // Create a post + await payload.create({ + collection: 'posts', + data: { title: 'bad chunks' } as any, + }) + + const vectorizedPayload = getVectorizedPayload(payload) + const result = await vectorizedPayload?.bulkEmbed({ knowledgePool: 'default' }) + expectGoodResult(result) + + // Wait for bulk jobs to finish (or fail) + await waitForBulkJobs(payload, 15000) + + // Check the run status - should be failed + const run = await payload.findByID({ + collection: BULK_EMBEDDINGS_RUNS_SLUG, + id: result!.runId, + }) + + expect(run.status).toBe('failed') + + // Check the prepare-bulk-embedding job failed with validation error + const res = await payload.find({ + collection: 'payload-jobs', + where: { + and: [{ taskSlug: { equals: 'payloadcms-vectorize:prepare-bulk-embedding' } }], + }, + limit: 1, + sort: '-createdAt', + }) + const failedJob = (res as any)?.docs?.[0] + expect(failedJob.hasError).toBe(true) + const errMsg = failedJob.error.message + expect(errMsg).toMatch(/chunk/i) + expect(errMsg).toMatch(/Invalid indices: 1/) + + // Ensure no embeddings were created (all-or-nothing validation) + const embeddingsCount = await payload.count({ collection: 'default' }) + expect(embeddingsCount.totalDocs).toBe(0) + + // Ensure no batches were created (validation happens before batching) + const batchesCount = await payload.count({ collection: 
BULK_EMBEDDINGS_BATCHES_SLUG }) + expect(batchesCount.totalDocs).toBe(0) + }) +}) diff --git a/src/tasks/bulkEmbedAll.ts b/src/tasks/bulkEmbedAll.ts index 369ea93..973371c 100644 --- a/src/tasks/bulkEmbedAll.ts +++ b/src/tasks/bulkEmbedAll.ts @@ -1,4 +1,11 @@ -import { Payload, TaskConfig, TaskHandlerResult } from 'payload' +import { + JsonObject, + PaginatedDocs, + Payload, + TaskConfig, + TaskHandlerResult, + TypeWithID, +} from 'payload' import { BatchSubmission, BulkEmbeddingOutput, @@ -97,7 +104,7 @@ export const createPrepareBulkEmbeddingTask = ({ throw new Error('[payloadcms-vectorize] bulk embed runId is required') } const payload = req.payload - const { run, poolName, dynamicConfig } = await loadRunAndConfig({ + const { poolName, dynamicConfig } = await loadRunAndConfig({ payload, runId: input.runId, knowledgePools, @@ -126,17 +133,34 @@ export const createPrepareBulkEmbeddingTask = ({ const versionMismatch = baselineVersion !== undefined && baselineVersion !== embeddingVersion // Stream missing embeddings and create batches - const result = await streamAndBatchMissingEmbeddings({ - payload, - runId: input.runId, - poolName, - dynamicConfig, - embeddingVersion, - lastBulkCompletedAt, - versionMismatch, - hasBaseline: Boolean(baselineRun), - addChunk: callbacks.addChunk, - }) + let result + try { + result = await streamAndBatchMissingEmbeddings({ + payload, + runId: input.runId, + poolName, + dynamicConfig, + embeddingVersion, + lastBulkCompletedAt, + versionMismatch, + hasBaseline: Boolean(baselineRun), + addChunk: callbacks.addChunk, + }) + } catch (error) { + // Ingestion failed (e.g., validation error) - mark run as failed + const errorMessage = (error as Error).message || String(error) + await payload.update({ + id: input.runId, + collection: BULK_EMBEDDINGS_RUNS_SLUG, + data: { + status: 'failed', + error: errorMessage, + completedAt: new Date().toISOString(), + }, + }) + // Re-throw so Payload's job system marks the job as failed + throw error + } if (result.totalInputs === 0) { // No inputs to process - mark run as succeeded @@ -465,9 +489,8 @@ export const createPollOrCompleteBulkEmbeddingTask = ({ * Stream through missing embeddings, calling addChunk for each. * User controls batching via addChunk return value. * - * Uses a two-pass approach: - * 1. First pass: count total chunks to know when we reach the last one - * 2. Second pass: stream chunks without holding all in memory + * Single-pass approach using async generator to yield chunks sequentially. + * This avoids the need for a pre-counting pass while correctly determining isLastChunk. */ async function streamAndBatchMissingEmbeddings(args: { payload: Payload @@ -499,170 +522,192 @@ async function streamAndBatchMissingEmbeddings(args: { const lastCompletedAtDate = lastBulkCompletedAt ? 
new Date(lastBulkCompletedAt) : undefined const collectionSlugs = Object.keys(dynamicConfig.collections) - // First pass: count total chunks to know the last one - // We store minimal info (docId + chunkCount) to avoid OOM - type DocChunkInfo = { collectionSlug: string; docId: string; chunkCount: number } - const docsToProcess: DocChunkInfo[] = [] - let totalChunkCount = 0 - - for (const collectionSlug of collectionSlugs) { - const collectionConfig = dynamicConfig.collections[collectionSlug] - if (!collectionConfig) continue - - const toKnowledgePool = collectionConfig.toKnowledgePool - let page = 1 - const limit = 50 + // Async generator that yields chunks one at a time + async function* generateChunks(): AsyncGenerator { + for (const collectionSlug of collectionSlugs) { + const collectionConfig = dynamicConfig.collections[collectionSlug] + if (!collectionConfig) continue + + const toKnowledgePool = collectionConfig.toKnowledgePool + const limit = 50 + + // Build where clause: filter by updatedAt if we have lastBulkCompletedAt and !includeAll + const where = includeAll + ? undefined + : lastCompletedAtDate + ? { + updatedAt: { + greater_than: lastCompletedAtDate.toISOString(), + }, + } + : undefined - while (true) { - const res = await payload.find({ + let res: PaginatedDocs | undefined = await payload.find({ collection: collectionSlug, - page, + where, limit, }) - const docs = (res as any)?.docs || [] - if (!docs.length) break - const totalPages = (res as any)?.totalPages ?? page - - for (const doc of docs) { - const docUpdatedAt = doc?.updatedAt ? new Date(doc.updatedAt) : undefined - let shouldInclude = includeAll - if (!shouldInclude) { - const updatedAfter = - docUpdatedAt && lastCompletedAtDate ? docUpdatedAt > lastCompletedAtDate : false - const hasCurrentEmbedding = await docHasEmbeddingVersion({ - payload, - poolName, - sourceCollection: collectionSlug, - docId: String(doc.id), - embeddingVersion, - }) - shouldInclude = updatedAfter || !hasCurrentEmbedding - } - if (!shouldInclude) continue - - const chunkData = await toKnowledgePool(doc, payload) - const validChunkCount = chunkData.filter((c) => c?.chunk).length - if (validChunkCount > 0) { - docsToProcess.push({ - collectionSlug, - docId: String(doc.id), - chunkCount: validChunkCount, - }) - totalChunkCount += validChunkCount - } - } + do { + const docs = res?.docs || [] + if (!docs.length) break + + for (const doc of docs) { + // If !includeAll, we still need to check if document has current embedding + // (can't filter this in the where clause since it's a cross-collection check) + if (!includeAll && !lastCompletedAtDate) { + const hasCurrentEmbedding = await docHasEmbeddingVersion({ + payload, + poolName, + sourceCollection: collectionSlug, + docId: String(doc.id), + embeddingVersion, + }) + if (hasCurrentEmbedding) continue + } - page++ - if (page > totalPages) break - } - } + const chunkData = await toKnowledgePool(doc, payload) + + // Validate chunks (same validation as real-time ingestion) + const invalidEntries = chunkData + .map((entry, idx) => { + if (!entry || typeof entry !== 'object') return idx + if (typeof entry.chunk !== 'string') return idx + return null + }) + .filter((idx): idx is number => idx !== null) + + if (invalidEntries.length > 0) { + throw new Error( + `[payloadcms-vectorize] toKnowledgePool returned ${invalidEntries.length} invalid entr${ + invalidEntries.length === 1 ? 'y' : 'ies' + } for document ${doc.id} in collection "${collectionSlug}". Each entry must be an object with a "chunk" string. 
Invalid indices: ${invalidEntries.join( + ', ', + )}`, + ) + } - // If no chunks, return early - if (totalChunkCount === 0) { - return { batchCount: 0, totalInputs: 0 } + // Yield valid chunks + for (let idx = 0; idx < chunkData.length; idx++) { + const chunkEntry = chunkData[idx] + const { chunk, ...extensionFields } = chunkEntry + + yield { + id: `${collectionSlug}:${doc.id}:${idx}`, + text: chunk, + metadata: { + sourceCollection: collectionSlug, + docId: String(doc.id), + chunkIndex: idx, + embeddingVersion, + extensionFields, + }, + } + } + } + } while ( + (res = res.nextPage + ? await payload.find({ + collection: collectionSlug, + where, + limit, + page: res.nextPage, + }) + : undefined) + ) + } } - // Second pass: stream chunks without holding all in memory + // Process chunks from generator let batchIndex = 0 let totalInputs = 0 - let processedChunkCount = 0 const pendingChunks: CollectedEmbeddingInput[] = [] - - for (const docInfo of docsToProcess) { - const collectionConfig = dynamicConfig.collections[docInfo.collectionSlug] - if (!collectionConfig) continue - - // Re-fetch the document to get its data - const doc = await payload.findByID({ - collection: docInfo.collectionSlug as any, - id: docInfo.docId, - }) - if (!doc) continue - - const toKnowledgePool = collectionConfig.toKnowledgePool - const chunkData = await toKnowledgePool(doc, payload) - - for (let idx = 0; idx < chunkData.length; idx++) { - const chunkEntry = chunkData[idx] - if (!chunkEntry?.chunk) continue - - processedChunkCount++ - const isLastChunk = processedChunkCount === totalChunkCount - - const { chunk, ...extensionFields } = chunkEntry - const collectedChunk: CollectedEmbeddingInput = { - id: `${docInfo.collectionSlug}:${doc.id}:${idx}`, - text: chunk, - metadata: { - sourceCollection: docInfo.collectionSlug, - docId: String(doc.id), - chunkIndex: idx, - embeddingVersion, - extensionFields, + const chunkIterator = generateChunks() + const runIdNum = parseInt(runId, 10) + let currentBatchId: number | undefined = undefined + + async function processChunk( + chunk: CollectedEmbeddingInput, + isLastChunk: boolean = false, + ): Promise { + // Add to pending queue BEFORE calling addChunk + pendingChunks.push(chunk) + + // If this is the first chunk in a new batch, create a placeholder batch record + if (pendingChunks.length === 1) { + // Starting a new batch - create placeholder batch record + const placeholderBatch = await payload.create({ + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + run: runIdNum, + batchIndex, + providerBatchId: `placeholder-${runId}-${batchIndex}`, // Temporary, will be updated + status: 'queued', + inputCount: 0, // Will be updated after submission + submittedAt: new Date().toISOString(), }, - } + }) + currentBatchId = (placeholderBatch as any).id + } - // Add to pending queue BEFORE calling addChunk - pendingChunks.push(collectedChunk) + if (!currentBatchId) { + throw new Error( + `[payloadcms-vectorize] Failed to get batch ID for chunk ${chunk.id} in run ${runId}`, + ) + } - const submission = await addChunk({ - chunk: { id: collectedChunk.id, text: collectedChunk.text }, - isLastChunk, - }) + // Save metadata with the batch ID + await payload.create({ + collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, + data: { + run: runIdNum, + batch: currentBatchId, + inputId: chunk.id, + text: chunk.text, + sourceCollection: chunk.metadata.sourceCollection, + docId: chunk.metadata.docId, + chunkIndex: chunk.metadata.chunkIndex, + embeddingVersion: chunk.metadata.embeddingVersion, + 
extensionFields: chunk.metadata.extensionFields, + }, + }) - if (submission) { - // User submitted a batch - // - If isLastChunk: all pending chunks were submitted - // - If not isLastChunk: all except current were submitted (current starts fresh) - let submittedChunks: CollectedEmbeddingInput[] - if (isLastChunk) { - submittedChunks = pendingChunks.splice(0) - } else { - submittedChunks = pendingChunks.splice(0, pendingChunks.length - 1) - } + const submission = await addChunk({ + chunk: { id: chunk.id, text: chunk.text }, + isLastChunk, + }) - // Convert runId to number for postgres relationships - const runIdNum = parseInt(runId, 10) + if (submission) { + // When addChunk returns a submission, all chunks in pendingChunks were submitted + // (the provider controls which chunks get submitted) + const submittedChunks = pendingChunks.splice(0) + const inputCount = submittedChunks.length - // Create batch record first so we have the batch ID for metadata - const batchRecord = await payload.create({ - collection: BULK_EMBEDDINGS_BATCHES_SLUG, - data: { - run: runIdNum, - batchIndex, - providerBatchId: submission.providerBatchId, - status: 'queued', - inputCount: submittedChunks.length, - submittedAt: new Date().toISOString(), - }, - }) + // Update the batch record with the real providerBatchId and inputCount + await payload.update({ + id: currentBatchId, + collection: BULK_EMBEDDINGS_BATCHES_SLUG, + data: { + providerBatchId: submission.providerBatchId, + inputCount, + }, + }) - const batchId = (batchRecord as any).id - - // Store metadata for submitted chunks with batch reference - await Promise.all( - submittedChunks.map((c) => - payload.create({ - collection: BULK_EMBEDDINGS_INPUT_METADATA_SLUG, - data: { - run: runIdNum, - batch: batchId, - inputId: c.id, - text: c.text, - sourceCollection: c.metadata.sourceCollection, - docId: c.metadata.docId, - chunkIndex: c.metadata.chunkIndex, - embeddingVersion: c.metadata.embeddingVersion, - extensionFields: c.metadata.extensionFields, - }, - }), - ), - ) + totalInputs += inputCount + batchIndex++ + currentBatchId = undefined // Reset for next batch + } + } - totalInputs += submittedChunks.length - batchIndex++ - } + // Process chunks from generator + let prevChunk: CollectedEmbeddingInput | undefined = undefined + for await (const currentChunk of chunkIterator) { + if (prevChunk) { + await processChunk(prevChunk) } + prevChunk = currentChunk + } + if (prevChunk) { + await processChunk(prevChunk, true) } return { batchCount: batchIndex, totalInputs } diff --git a/src/types.ts b/src/types.ts index 0a6cd4c..f211516 100644 --- a/src/types.ts +++ b/src/types.ts @@ -221,18 +221,24 @@ export type OnBulkErrorArgs = { */ export type BulkEmbeddingsFns = { /** - * Called for each chunk. User accumulates internally based on file size logic. + * Called for each chunk. User accumulates internally based on file size/line limits. * - Return null to keep accumulating * - Return BatchSubmission when ready to submit a batch * - * **Important contract about which chunks are included:** - * - When `isLastChunk=false` and you return a submission: all pending chunks EXCEPT the current one were submitted - * - When `isLastChunk=true` and you return a submission: all pending chunks INCLUDING the current one were submitted + * **Important contract:** + * When you return a submission, all chunks that you've accumulated (and decided to submit) + * are considered submitted. 
The plugin tracks chunks in `pendingChunks` and assumes all + * of them were submitted when you return a BatchSubmission. + * + * **About `isLastChunk`:** + * - `isLastChunk=true` indicates this is the final chunk in the run + * - Use this to flush any remaining accumulated chunks before the run completes + * - The plugin uses this only to know when to stop iterating, not to determine which chunks were submitted + * + * **Example flow when chunk would exceed limit:** + * 1. Check if adding current chunk == limit or if isLastChunk is true + * 2. If yes: submit accumulated chunks and return the BatchSubmission + * 3. Start fresh in the next call */ addChunk: (args: AddChunkArgs) => Promise<BatchSubmission | null> From 2d15b68fab3b1a3b5947d9ba7c425e818cdfc14f Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Fri, 16 Jan 2026 01:18:50 +0700 Subject: [PATCH 43/49] new Readme --- CHANGELOG.md | 20 ++-- README.md | 329 ++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 270 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 15f2d73..72a4fbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,20 +4,22 @@ All notable changes to this project will be documented in this file. ## 0.5.0 - 2026-01-15 -### Breaking Changes - -- **`queueName` renamed to `realtimeQueueName`**: The plugin option `queueName` has been renamed to `realtimeQueueName` to clarify that it only affects realtime vectorization jobs. -- **`bulkQueueName` changed to `bulkQueueNames`**: The plugin option `bulkQueueName` has been replaced with `bulkQueueNames` object containing `prepareBulkEmbedQueueName` and `pollOrCompleteQueueName` for separate queue isolation of bulk preparation vs polling workloads. -- **`isVectorizedPayload` replaced with `getVectorizedPayload`**: The type guard `isVectorizedPayload(payload)` has been replaced with `getVectorizedPayload(payload)` which returns the vectorized payload object directly (or `null` if not available). This fixes a bug where methods are missing because onInit was not called. - ### New Features -- **`bulkQueueNames` option**: New plugin option to isolate bulk embedding workloads across separate queues for preparation and polling. Required when any knowledge pool uses bulk ingest mode (`bulkEmbeddings.ingestMode === 'bulk'`). +- **Bulk Embedding**: That's right! You can now embed in bulk. Very useful for saving money. +- **`bulkQueueNames` option**: New plugin option to isolate bulk embedding workloads across separate queues for preparation and polling. Required when any knowledge pool uses bulk embeddings. - **Non-blocking bulk polling**: Bulk jobs now use separate, short-lived tasks that can safely handle long-running providers (hours/days) without blocking worker processes. -- **Improved admin UX**: The "Embed all" button now: +- **Improved admin UX**: The new "Embed all" button: + - Can be used to trigger an 'embed all' bulk embedding - Disables when bulk embeddings are not configured for a pool - Links to the latest bulk run for easy status tracking -- **Enhanced bulk provider support**: Added real Voyage AI Batch API integration in dev environment, demonstrating production-ready bulk embedding with file uploads and async polling.
+- **Voyage AI example**: Added a real Voyage AI Batch API integration in helpers/embed, demonstrating production-ready bulk embedding with file uploads and async polling. + +### Breaking Changes + +- **`queueName` renamed to `realtimeQueueName`**: The plugin option `queueName` has been renamed to `realtimeQueueName` to clarify that it only affects realtime vectorization jobs. +- **`bulkQueueName` changed to `bulkQueueNames`**: The plugin option `bulkQueueName` has been replaced with `bulkQueueNames` object containing `prepareBulkEmbedQueueName` and `pollOrCompleteQueueName` for separate queue isolation of bulk preparation vs polling workloads. +- **`isVectorizedPayload` replaced with `getVectorizedPayload`**: The type guard `isVectorizedPayload(payload)` has been replaced with `getVectorizedPayload(payload)` which returns the vectorized payload object directly (or `null` if not available). This fixes a bug where methods are missing because onInit was not called. ### Tests & Reliability diff --git a/README.md b/README.md index 2ededdd..a335339 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ A Payload CMS plugin that adds vector search capabilities to your collections us ## Features - 🔍 **Semantic Search**: Vectorize any collection for intelligent content discovery -- 🚀 **Automatic**: Documents are automatically vectorized when created or updated, and vectors are deleted as soon as the document is deleted. -- 🧵 **Bulk embedding**: Run “Embed all” batches that backfill only documents missing the current `embeddingVersion`. +- 🚀 **Realtime**: Documents are automatically vectorized when created or updated in realtime, and vectors are deleted as soon as the document is deleted. +- 🧵 **Bulk embedding**: Run “Embed all” batches that backfill only documents missing the current `embeddingVersion` since the last bulk run, saving on embedding costs. - 📊 **PostgreSQL Integration**: Built on pgvector for high-performance vector operations - ⚡ **Background Processing**: Uses Payload's job system for non-blocking vectorization - 🎯 **Flexible Chunking**: Drive chunk creation yourself with `toKnowledgePool` functions so you can combine any fields or content types @@ -20,12 +20,6 @@ A Payload CMS plugin that adds vector search capabilities to your collections us - PostgreSQL with pgvector extension - Node.js 18+ -**Note for Payload 3.54.0+:** When initializing Payload with `getPayload`, you must include `cron: true` if you want the cron jobs to run correctly: - -```typescript -payload = await getPayload({ config, cron: true }) -``` - ## Installation ```bash pnpm add payloadcms-vectorize ``` ### 0. Have pgvector permissions -The plugin expects `vector` extension to be configured when Payload initializes. Your PostgreSQL database user must have permission to create extensions. If your user doesn't have these permissions, someone with permissions may need to manually create the extension once: +The plugin expects the `vector` extension to be configured (`db: postgresAdapter({extensions: ['vector'],...})`) when Payload initializes. Your PostgreSQL database user must have permission to create extensions.
If your user doesn't have these permissions, someone with permissions may need to manually create the extension once: ```sql CREATE EXTENSION IF NOT EXISTS vector; @@ -101,7 +95,7 @@ const postsToKnowledgePool: ToKnowledgePoolFn = async (doc, payload) => { // Create the integration with static configs (dims, ivfflatLists) const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ // Note limitation: Changing these values requires a migration. - main: { + mainKnowledgePool: { dims: 1536, // Vector dimensions ivfflatLists: 100, // IVFFLAT index parameter }, @@ -119,7 +113,7 @@ export default buildConfig({ plugins: [ payloadcmsVectorize({ knowledgePools: { - main: { + mainKnowledgePool: { collections: { posts: { toKnowledgePool: postsToKnowledgePool, @@ -147,18 +141,15 @@ export default buildConfig({ // }, }), ], + jobs: { // Remember to setup your cron for the embedding + autoRun: [ + ... + ], + }, }) ``` -**Important:** `knowledgePools` must have **different names than your collections**—reusing a collection name for a knowledge pool **will cause schema conflicts**. (In this example, the knowledge pool is named 'main' and a collection named 'main' will be created.) - -### 1.5. Generate Import Map (If Needed) - -Payload automatically generates the import map on startup and during development (HMR), so you typically don't need to run this manually. However, if client components (like the "Embed all" button) don't appear in the admin UI, you may need to manually generate the import map: - -```bash -pnpm run generate:importmap -``` +**Important:** `knowledgePools` must have **different names than your collections**—reusing a collection name for a knowledge pool **will cause schema conflicts**. (In this example, the knowledge pool is named 'mainKnowledgePool' and a collection named 'main-knowledge-pool' will be created.) **⚠️ Important:** Run this command: @@ -209,12 +200,6 @@ if (vectorizedPayload) { limit: 5, }) // results is an array of VectorSearchResult - - // Manually queue an embedding job - await vectorizedPayload.queueEmbed({ - collection: 'posts', - docId: 'some-post-id', - }) } ``` @@ -261,9 +246,9 @@ If neither is provided, embedding is disabled for that pool. The bulk embedding API is designed for large-scale embedding using provider batch APIs (like Voyage AI). **Bulk runs are never auto-queued** - they must be triggered manually via the admin UI or API. -#### The Streaming Model +#### The bulk embedding callbacks -The plugin streams chunks to your callbacks one at a time, giving you full control over batching based on your provider's file size limits: +In order to get bulk embeddings to interface with your provider, you must define the following three callbacks per knowledge pool (the functions do not have to be unique so you can re-use across knowledge pools). ```typescript type BulkEmbeddingsFns = { @@ -275,7 +260,7 @@ type BulkEmbeddingsFns = { #### `addChunk` - Accumulate and Submit -Called for each chunk. You manage your own accumulation and decide when to submit based on file size. +The plugin streams chunks to your callbacks one at a time; the callback is called for each chunk. You manage your own accumulation and decide when to submit based on file size. ```typescript type AddChunkArgs = { @@ -301,13 +286,12 @@ type BatchSubmission = { **⚠️ Important contract:** -When you return a submission, the plugin assumes **all chunks currently in `pendingChunks` were submitted**. The plugin tracks chunks and creates batch records based on this assumption. 
You control which chunks get submitted by managing your own accumulation logic. +When you return a submission, the plugin assumes **all chunks currently in `pendingChunks` were submitted**. The plugin tracks chunks and creates batch records based on this assumption. **About `isLastChunk`:** - `isLastChunk=true` indicates this is the final chunk in the run - Use this to flush any remaining accumulated chunks before the run completes -- The plugin uses this only to know when to stop iterating, not to determine which chunks were submitted **Example implementation:** @@ -345,7 +329,7 @@ Called repeatedly until the batch reaches a terminal status. When the batch comp ```typescript type PollOrCompleteBatchArgs = { - providerBatchId: string + providerBatchId: string // You provided it in the earlier step when you submitted a batch. onChunk: (chunk: BulkEmbeddingOutput) => Promise } @@ -412,14 +396,14 @@ The plugin uses separate Payload jobs for reliability with long-running provider ### Queue Configuration -For production deployments with bulk embedding: +For bulk embedding, you must provide the bulk queue names. ```typescript plugins: [ payloadcmsVectorize({ knowledgePools: { /* ... */ }, - realtimeQueueName: 'vectorize-realtime', - bulkQueueNames: { + realtimeQueueName: 'vectorize-realtime', // optional + bulkQueueNames: { // required iff you are using bulk embeddings prepareBulkEmbedQueueName: 'vectorize-bulk-prepare', pollOrCompleteQueueName: 'vectorize-bulk-poll', }, @@ -427,7 +411,7 @@ plugins: [ ] jobs: { - autoRun: [ + autoRun: [ // Must match { cron: '*/5 * * * * *', limit: 10, queue: 'vectorize-realtime' }, { cron: '0 0 * * * *', limit: 1, queue: 'vectorize-bulk-prepare' }, { cron: '*/30 * * * * *', limit: 5, queue: 'vectorize-bulk-poll' }, @@ -435,6 +419,103 @@ jobs: { } ``` +### Endpoints + +#### POST `/api/vector-bulk-embed` + +Starts a bulk embedding run for a knowledge pool via HTTP. This is the REST API equivalent of `vectorizedPayload.bulkEmbed()`. + +**Request Body:** + +```json +{ + "knowledgePool": "default" +} +``` + +**Success Response** (202 Accepted): + +```json +{ + "runId": "123", + "status": "queued" +} +``` + +**Conflict Response** (409 Conflict) - when a run is already active: + +```json +{ + "runId": "456", + "status": "running", + "message": "A bulk embedding run is already running for this knowledge pool. Wait for it to complete or cancel it first.", + "conflict": true +} +``` + +**Error Responses:** + +- `400 Bad Request`: Missing or invalid `knowledgePool` parameter +- `500 Internal Server Error`: Server error during processing + +**Example:** + +```bash +curl -X POST http://localhost:3000/api/vector-bulk-embed \ + -H "Content-Type: application/json" \ + -d '{"knowledgePool": "default"}' +``` + +#### POST `/api/vector-retry-failed-batch` + +Retries a failed batch from a bulk embedding run via HTTP. This is the REST API equivalent of `vectorizedPayload.retryFailedBatch()`. + +**Request Body:** + +```json +{ + "batchId": "123" +} +``` + +**Success Response** (202 Accepted): + +```json +{ + "batchId": "123", + "newBatchId": "456", + "runId": "789", + "status": "queued" +} +``` + +**Already Retried Response** (202 Accepted) - when batch was already retried: + +```json +{ + "batchId": "123", + "newBatchId": "456", + "runId": "789", + "status": "queued", + "message": "Batch was already retried. Returning the retry batch." 
+}
+```
+
+**Error Responses:**
+
+- `400 Bad Request`: Missing or invalid `batchId` parameter, or batch is not in a retriable state
+- `404 Not Found`: Batch not found
+- `409 Conflict`: Cannot retry while parent run is still active
+- `500 Internal Server Error`: Server error during processing
+
+**Example:**
+
+```bash
+curl -X POST http://localhost:3000/api/vector-retry-failed-batch \
+  -H "Content-Type: application/json" \
+  -d '{"batchId": "123"}'
+```
+
 #### CollectionVectorizeOption

 - `toKnowledgePool (doc, payload)` – return an array of `{ chunk, ...extensionFieldValues }`. Each object becomes one embedding row and the index in the array determines `chunkIndex`.
@@ -501,6 +582,8 @@ export const embedQuery = async (text: string): Promise => {
 }
 ```

+You can see more examples in `dev/helpers/embed.ts`.
+
 ## API Reference

 ### Search Endpoint
@@ -553,10 +636,11 @@ Search for similar content using vector similarity.

 ### Bulk Embedding (Embed All)

 - Each knowledge pool's embeddings list shows an **Embed all** admin button that triggers a bulk run.
-- **Note:** Make sure you've run `pnpm run generate:importmap` after plugin configuration, otherwise the button won't appear.
-- Bulk runs only include documents missing embeddings for the pool's current `embeddingConfig.version`.
-- Progress is recorded in `vector-bulk-embeddings-runs` and `vector-bulk-embeddings-batches` collections.
-- Endpoint: **POST** `/api/vector-bulk-embed`
+- **Note:** Payload automatically generates the import map on startup and during development (HMR), so you typically don't need to generate it manually. However, if client components (like the "Embed all" button) don't appear in the admin UI, you may need to regenerate it with `pnpm run generate:importmap`.
+- Bulk runs only include documents whose embedding version does not match the pool's current `embeddingConfig.version` since the previous bulk run (if no bulk run has happened yet, all documents are embedded).
+- Progress is recorded in the `vector-bulk-embeddings-runs` and `vector-bulk-embeddings-batches` collections, visible in the admin UI.
+- You can re-run failed batches from the `vector-bulk-embeddings-batches` admin UI, and the `vector-bulk-embeddings-runs` admin UI links to any failed batches.
+- Endpoints: **POST** `/api/vector-bulk-embed` and `/api/vector-retry-failed-batch`

 ```jsonc
 {
@@ -564,9 +648,10 @@ Search for similar content using vector similarity.
 }
 ```

-The bulk embedding process has **two levels of atomicity**:
+The bulk embedding process has **three levels of failure handling**:

-- **Batch level**: If any batch fails during polling, the entire run fails and no embeddings are written. This is fully atomic.
+- **Run level**: If any chunk fails during ingestion (toKnowledgePool), the entire run fails and no embeddings are written. This is fully atomic. Your `onError` callback is expected to handle cleanup for this stage.
+- **Batch level**: If any batch fails during polling, the entire run is marked as failed, but embeddings from successful batches are still written.
 - **Chunk level**: If individual chunks fail during completion (e.g., provider returns errors for specific inputs), the run still succeeds and successful embeddings are written. Failed chunks are tracked in `failedChunkData` (with structured `collection`, `documentId`, and `chunkIndex` fields) and passed to the `onError` callback for cleanup.

This design allows for partial success: if 100 chunks are processed and 2 fail, 98 embeddings are written and the 2 failures are tracked for potential retry.
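+
+For illustration, here is a minimal sketch of an `onError` handler that fits this contract. The handler name, the `FailedChunk` type, and the argument shape below are illustrative assumptions; only the `failedChunkData` fields (`collection`, `documentId`, `chunkIndex`) come from the description above, so check the plugin's exported types for the exact signature.
+
+```typescript
+import type { Payload } from 'payload'
+
+// Field names taken from the failedChunkData description above; the wrapper shape is assumed.
+type FailedChunk = { collection: string; documentId: string; chunkIndex: number }
+
+// Hypothetical onError sketch: record each failed chunk so it can be re-embedded later,
+// e.g. via vectorizedPayload.queueEmbed or another bulk run.
+export const onBulkEmbedError = async (args: {
+  payload: Payload
+  failedChunkData: FailedChunk[]
+}) => {
+  for (const failed of args.failedChunkData) {
+    args.payload.logger.error(
+      `Bulk embedding failed for ${failed.collection}/${failed.documentId} (chunk ${failed.chunkIndex})`,
+    )
+  }
+}
+```
+
+Because each failed chunk carries its source `collection` and `documentId`, a handler like this can re-queue just the affected documents instead of re-running the whole pool.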
@@ -577,9 +662,37 @@ If `bulkEmbeddingsFns` is not provided, the "Embed all" button is disabled. ### Local API -The plugin extends the Payload instance with `search` and `queueEmbed` methods. +The plugin provides a `getVectorizedPayload(payload)` function which returns a 'vectorizedPayload' (an object) with `search`, `queueEmbed`, `bulkEmbed` and `retryFailedBatch` methods. + +#### Getting the Vectorized Payload Object + +Use the `getVectorizedPayload` function to get the vectorized payload object with all vectorize methods: + +```typescript +import { getVectorizedPayload } from 'payloadcms-vectorize' -#### `payload.search(params)` +const payload = await getPayload({ config, cron: true }) +const vectorizedPayload = getVectorizedPayload(payload) + +if (vectorizedPayload) { + // Use all vectorize methods + const results = await vectorizedPayload.search({ + query: 'search query', + knowledgePool: 'main', + }) + + await vectorizedPayload.queueEmbed({ + collection: 'posts', + docId: 'some-id', + }) + + await vectorizedPayload.bulkEmbed({ + knowledgePool: 'main', + }) +} +``` + +#### `vectorizedPayload.search(params)` Perform vector search programmatically without making an HTTP request. @@ -612,7 +725,7 @@ if (vectorizedPayload) { } ``` -#### `payload.queueEmbed(params)` +#### `vectorizedPayload.queueEmbed(params)` Manually queue a vectorization job for a document. @@ -659,43 +772,120 @@ if (vectorizedPayload) { } ``` -#### Getting the Vectorized Payload Object +#### `vectorizedPayload.bulkEmbed(params)` -Use the `getVectorizedPayload` function to get the vectorized payload object with all vectorize methods: +Starts a bulk embedding run for a knowledge pool. This method queues a background job that will process all documents in the knowledge pool's collections, chunk them, and submit them to your embedding provider via the `bulkEmbeddingsFns.addChunk` callback. + +**Parameters:** + +- `params.knowledgePool` (required): The name of the knowledge pool to embed + +**Returns:** `Promise` + +**Success Response:** ```typescript -import { getVectorizedPayload } from 'payloadcms-vectorize' +{ + runId: string // ID of the created bulk embedding run + status: 'queued' // Initial status of the run +} +``` -const payload = await getPayload({ config, cron: true }) -const vectorizedPayload = getVectorizedPayload(payload) +**Conflict Response** (if a run is already active): -if (vectorizedPayload) { - // Use all vectorize methods - const results = await vectorizedPayload.search({ - query: 'search query', - knowledgePool: 'main', - }) +```typescript +{ + runId: string // ID of the existing active run + status: 'queued' | 'running' // Status of the existing run + message: string // Explanation of why a new run wasn't started + conflict: true // Indicates a conflict occurred +} +``` - await vectorizedPayload.queueEmbed({ - collection: 'posts', - docId: 'some-id', - }) +**Example:** - await vectorizedPayload.bulkEmbed({ - knowledgePool: 'main', - }) +```typescript +const result = await vectorizedPayload.bulkEmbed({ knowledgePool: 'default' }) +if ('conflict' in result && result.conflict) { + console.log('A run is already active:', result.message) +} else { + console.log('Bulk embed started with run ID:', result.runId) } ``` -## Changelog +**Notes:** -See [CHANGELOG.md](./CHANGELOG.md) for release history, migration notes, and upgrade guides. 
+- Only one bulk embedding run can be active per knowledge pool at a time +- The run will process documents that need embedding (those with mismatched `embeddingVersion` or new documents since the last successful run) +- Progress can be tracked via the `vector-bulk-embeddings-runs` and `vector-bulk-embeddings-batches` collections in the admin UI +- The run status will progress: `queued` → `running` → `succeeded` or `failed` -## Requirements +#### `vectorizedPayload.retryFailedBatch(params)` -- Payload CMS >=3.0.0 <4.0.0 (tested on 3.69.0, previously tested on 3.37.0) -- PostgreSQL with pgvector extension -- Node.js ^18.20.2 +Retries a failed batch from a bulk embedding run. This method reconstructs the chunks from the batch's metadata, resubmits them to your embedding provider, and creates a new batch record. The original batch is marked as `retried` and linked to the new batch. + +**Parameters:** + +- `params.batchId` (required): The ID of the failed batch to retry + +**Returns:** `Promise` + +**Success Response:** + +```typescript +{ + batchId: string // ID of the batch being retried + newBatchId: string // ID of the newly created batch + runId: string // ID of the parent run + status: 'queued' // Status of the new batch + message?: string // Optional confirmation message +} +``` + +**Already Retried Response** (if batch was already retried): + +```typescript +{ + batchId: string // ID of the original batch + newBatchId: string // ID of the existing retry batch + runId: string // ID of the parent run + status: 'queued' // Status of the retry batch + message: string // Message indicating batch was already retried +} +``` + +**Error Response:** + +```typescript +{ + error: string // Error message + conflict?: true // Present if error is due to a conflict (e.g., run still active) +} +``` + +**Example:** + +```typescript +const result = await vectorizedPayload.retryFailedBatch({ batchId: '123' }) +if ('error' in result) { + console.error('Failed to retry batch:', result.error) +} else { + console.log(`Batch ${result.batchId} retried. New batch ID: ${result.newBatchId}`) +} +``` + +**Notes:** + +- Only batches with `failed` or `retried` status can be retried +- The parent run must be in a terminal state (`succeeded` or `failed`) - cannot retry while run is `queued` or `running` +- If the parent run was `succeeded` or `failed`, it will be reset to `running` status +- The original batch is marked as `retried` and linked to the new batch via the `retriedBatch` field +- Chunks are reconstructed from the batch's metadata, so metadata must still exist for the retry to work +- If a batch was already retried, calling this method again returns the existing retry batch instead of creating a duplicate + +## Changelog + +See [CHANGELOG.md](./CHANGELOG.md) for release history, migration notes, and upgrade guides. ## License @@ -731,7 +921,6 @@ Thank you for the stars! 
The following updates have been completed: The following features are planned for future releases based on community interest and stars: -- **Bulk prepare progress visibility**: Real-time progress tracking during the prepare phase for large collections - **Migrations for vector dimensions**: Easy migration tools for changing vector dimensions and/or ivfflatLists after initial setup - **MongoDB support**: Extend vector search capabilities to MongoDB databases - **Vercel support**: Optimized deployment and configuration for Vercel hosting From 4609ef5097f67aac8e2d6cfc0c79dff9ee148de8 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:36:39 +0700 Subject: [PATCH 44/49] Working on adding migrations --- README.md | 93 +++- dev/specs/migrationCli.spec.ts | 863 +++++++++++++++++++++++++++++++++ package.json | 3 +- pnpm-lock.yaml | 45 +- src/bin/vectorize-migrate.ts | 588 ++++++++++++++++++++++ src/endpoints/vectorSearch.ts | 1 + src/index.ts | 328 +++++++++++-- src/types.ts | 2 + 8 files changed, 1873 insertions(+), 50 deletions(-) create mode 100644 dev/specs/migrationCli.spec.ts create mode 100644 src/bin/vectorize-migrate.ts diff --git a/README.md b/README.md index a335339..f447455 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,34 @@ export default buildConfig({ The import map tells Payload how to resolve component paths (like `'payloadcms-vectorize/client#EmbedAllButton'`) to actual React components. Without it, client components referenced in your collection configs won't render. -### 2. Search Your Content +### 2. Initial Migration Setup + +After configuring the plugin, you need to create an initial migration to set up the IVFFLAT indexes in your database. + +**For new setups:** + +1. Create your initial Payload migration (this will include the embedding columns via Drizzle schema): + + ```bash + pnpm payload migrate:create --name initial + ``` + +2. Use the migration CLI helper to add IVFFLAT index setup: + + ```bash + pnpm payload vectorize:migrate + ``` + + The CLI automatically extracts your static configs from the Payload config and patches the migration file with the necessary IVFFLAT index creation SQL. + +3. Review and apply the migration: + ```bash + pnpm payload migrate + ``` + +**Note:** The embedding columns are created automatically by Drizzle via the `afterSchemaInitHook`, but the IVFFLAT indexes need to be added via migrations for proper schema management. + +### 3. Search Your Content The plugin automatically creates a `/api/vector-search` endpoint: @@ -419,7 +446,68 @@ jobs: { } ``` -### Endpoints +## Changing Static Config (ivfflatLists or dims) & Migrations + +**⚠️ Important:** Changing `dims` is **destructive** - it requires re-embedding all your data. Changing `ivfflatLists` rebuilds the index (non-destructive but may take time). + +When you change static config values (`dims` or `ivfflatLists`): + +1. **Update your static config** in `payload.config.ts`: + + ```typescript + const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration({ + mainKnowledgePool: { + dims: 1536, // Changed from previous value + ivfflatLists: 200, // Changed from previous value + }, + }) + ``` + +2. 
**Create a migration** using the CLI helper: + + ```bash + pnpm payload vectorize:migrate + ``` + + The CLI will: + - Detect changes in your static configs + - Create a new Payload migration using `payload.db.createMigration` + - Patch it with appropriate SQL: + - **If `ivfflatLists` changed**: Rebuilds the IVFFLAT index with the new `lists` parameter (DROP + CREATE INDEX) + - **If `dims` changed**: Truncates the embeddings table (destructive - you'll need to re-embed) + +3. **Review the migration file** in `src/migrations/` - it will be named something like `*_vectorize-config.ts` + +4. **Apply the migration**: + + ```bash + pnpm payload migrate + ``` + +5. **If `dims` changed**: Re-embed all your documents using the bulk embed feature. + +**Schema name qualification:** + +The CLI automatically uses the `schemaName` from your Postgres adapter configuration. If you use a custom schema (e.g., `postgresAdapter({ schemaName: 'custom' })`), all SQL in the migration will be properly qualified with that schema name. + +**Idempotency:** + +Running `pnpm payload vectorize:migrate` multiple times with no config changes will not create duplicate migrations. The CLI detects when no changes are needed and exits early. + +**Development workflow:** + +During development, you may want to disable Payload's automatic schema push to ensure migrations are used: + +- Set `migrations: { disableAutomaticMigrations: true }` in your Payload config, or +- Avoid using `pnpm payload migrate:status --force` which auto-generates migrations + +This ensures your vector-specific migrations are properly applied. + +**Runtime behavior:** + +The `ensurePgvectorArtifacts` function is now **presence-only** - it checks that pgvector artifacts (extension, column, index) exist but does not create or modify them. If artifacts are missing, it throws descriptive errors prompting you to run migrations. This ensures migrations are the single source of truth for schema changes. + +## Endpoints #### POST `/api/vector-bulk-embed` @@ -921,7 +1009,6 @@ Thank you for the stars! 
The following updates have been completed: The following features are planned for future releases based on community interest and stars: -- **Migrations for vector dimensions**: Easy migration tools for changing vector dimensions and/or ivfflatLists after initial setup - **MongoDB support**: Extend vector search capabilities to MongoDB databases - **Vercel support**: Optimized deployment and configuration for Vercel hosting diff --git a/dev/specs/migrationCli.spec.ts b/dev/specs/migrationCli.spec.ts new file mode 100644 index 0000000..edb4473 --- /dev/null +++ b/dev/specs/migrationCli.spec.ts @@ -0,0 +1,863 @@ +import type { Payload, SanitizedConfig } from 'payload' +import { beforeAll, describe, expect, test, afterAll } from 'vitest' +import { postgresAdapter } from '@payloadcms/db-postgres' +import { buildConfig, getPayload } from 'payload' +import { createVectorizeIntegration } from 'payloadcms-vectorize' +import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from '../helpers/embed.js' +import { createTestDb } from './utils.js' +import { DIMS } from './constants.js' +import type { PostgresPayload } from '../../src/types.js' +import { script as vectorizeMigrateScript } from '../../src/bin/vectorize-migrate.js' +import { readdirSync, statSync, existsSync, readFileSync, rmSync } from 'fs' +import { join, resolve } from 'path' + +describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { + const dbName = `migration_cli_test_${Date.now()}` + let payload: Payload + + beforeAll(async () => { + await createTestDb({ dbName }) + + const integration = createVectorizeIntegration({ + default: { + dims: DIMS, + ivfflatLists: 10, + }, + }) + + const config = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + integration.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + // Temporarily disable onInit for runtime behavior tests + // This prevents ensurePgvectorArtifacts from running before tests can set up their state + + payload = await getPayload({ + config, + cron: true, + disableOnInit: true, + key: `test-runtime-behavior-${Date.now()}`, + }) + }) + + describe('Runtime behavior', () => { + test('ensurePgvectorArtifacts is presence-only and does not rebuild index', async () => { + const postgresPayload = payload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + const tableName = 'default' + + // Manually create the index first (simulating a migration) + await postgresPayload.db.pool?.query( + `CREATE INDEX IF NOT EXISTS ${tableName}_embedding_ivfflat ON "${schemaName}"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = 10)`, + ) + + // Get initial index definition + const initialIndex = await postgresPayload.db.pool?.query( + `SELECT pg_get_indexdef(c.oid) as def + FROM pg_indexes i + JOIN pg_class c ON c.relname = i.indexname + JOIN 
pg_namespace n ON n.oid = c.relnamespace AND n.nspname = i.schemaname + WHERE i.schemaname = $1 AND i.tablename = $2 AND i.indexname = $3`, + [schemaName, tableName, `${tableName}_embedding_ivfflat`], + ) + const initialDef = initialIndex?.rows[0]?.def || '' + + // Call ensurePgvectorArtifacts (via onInit which should check presence) + // Since we already have the artifacts, it should pass without modifying + // Note: onInit calls ensurePgvectorArtifacts, but since artifacts exist, it should just verify + await payload.config.onInit?.(payload) + + // Verify index definition hasn't changed + const afterIndex = await postgresPayload.db.pool?.query( + `SELECT pg_get_indexdef(c.oid) as def + FROM pg_indexes i + JOIN pg_class c ON c.relname = i.indexname + JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = i.schemaname + WHERE i.schemaname = $1 AND i.tablename = $2 AND i.indexname = $3`, + [schemaName, tableName, `${tableName}_embedding_ivfflat`], + ) + const afterDef = afterIndex?.rows[0]?.def || '' + + // Index should still exist and be the same + expect(afterDef).toBeTruthy() + expect(afterDef).toBe(initialDef) + }) + + test('VectorizedPayload has _staticConfigs', async () => { + const { getVectorizedPayload } = await import('payloadcms-vectorize') + const vectorizedPayload = getVectorizedPayload(payload) + + expect(vectorizedPayload).toBeTruthy() + expect(vectorizedPayload?._staticConfigs).toBeDefined() + expect(vectorizedPayload?._staticConfigs.default).toBeDefined() + expect(vectorizedPayload?._staticConfigs.default.dims).toBe(DIMS) + expect(vectorizedPayload?._staticConfigs.default.ivfflatLists).toBe(10) + }) + + test('ensurePgvectorArtifacts throws error when artifacts are missing (user has not run migrations)', async () => { + // Create a new database without any migrations applied + // This simulates the state when a user hasn't run migrations yet + const testDbName = `migration_cli_test_missing_${Date.now()}` + console.log('[TEST] Step 1: Creating test database:', testDbName) + await createTestDb({ dbName: testDbName }) + console.log('[TEST] Step 2: Database created') + + console.log('[TEST] Step 3: Creating integration') + const integration = createVectorizeIntegration({ + default: { + dims: DIMS, + ivfflatLists: 10, + }, + }) + console.log('[TEST] Step 4: Integration created') + + console.log('[TEST] Step 5: Starting buildConfig...') + const config = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${testDbName}`, + }, + }), + plugins: [ + integration.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [], + }, + }) + console.log('[TEST] Step 6: buildConfig completed') + + // Note: onInit will be called during getPayload and will throw because artifacts don't exist + // This simulates the real-world scenario where a user hasn't run migrations yet + // The error will be "Embedding column not found" (first check that fails) + console.log('[TEST] Step 7: Calling getPayload (should throw)...') + await expect( 
+ getPayload({ config, cron: true, key: `test-missing-artifacts-${Date.now()}` }), + ).rejects.toThrow('Embedding column not found') + console.log('[TEST] Step 8: getPayload threw as expected') + }) + }) + + describe('CLI workflow (sequential)', () => { + const cliDbName = `migration_cli_e2e_test_${Date.now()}` + let cliPayload: Payload + let cliConfig: SanitizedConfig + const migrationsDir = resolve(process.cwd(), 'dev', 'test-migrations-cli') + + beforeAll(async () => { + await createTestDb({ dbName: cliDbName }) + + // Clean up any existing migrations directory to ensure clean state + if (existsSync(migrationsDir)) { + rmSync(migrationsDir, { recursive: true, force: true }) + } + + // Create test migrations directory + const { mkdirSync } = await import('fs') + mkdirSync(migrationsDir, { recursive: true }) + }) + + afterAll(async () => { + // Cleanup: remove test migrations directory + if (existsSync(migrationsDir)) { + rmSync(migrationsDir, { recursive: true, force: true }) + } + }) + + test('1. Initial setup: create migration with IVFFLAT index', async () => { + // Step 1: Create integration with initial config + const integration = createVectorizeIntegration({ + default: { + dims: DIMS, + ivfflatLists: 10, // Initial lists parameter + }, + }) + + cliConfig = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${cliDbName}`, + }, + }), + plugins: [ + integration.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*\/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + // Temporarily disable onInit to avoid ensurePgvectorArtifacts check before migrations are applied + const savedOnInit = cliConfig.onInit + cliConfig.onInit = async () => { + // No-op: migrations haven't been applied yet + } + + cliPayload = await getPayload({ + config: cliConfig, + cron: true, + key: `test-initial-setup-${Date.now()}`, + disableOnInit: true, + }) + + // Step 2: Create initial migration (this will include the embedding column via Drizzle) + console.log('[TEST] Step 2: Creating initial migration...') + await cliPayload.db.createMigration({ + migrationName: 'initial', + payload: cliPayload, + }) + console.log('[TEST] Step 2.5: Initial migration created') + + // Step 3: Run vectorize:migrate to add IVFFLAT index to the migration + console.log('[TEST] Step 3: Running vectorize:migrate...') + await vectorizeMigrateScript(cliConfig) + + // Debug: Print all files in migrations directory + console.log('[TEST] Step 3.5: Listing all files in migrations directory:') + const allFiles = readdirSync(migrationsDir) + for (const file of allFiles) { + const filePath = join(migrationsDir, file) + const stats = statSync(filePath) + console.log( + `[TEST] - ${file} (${stats.size} bytes, modified: ${stats.mtime.toISOString()})`, + ) + if (file.endsWith('.ts') && file !== 'index.ts') { + const content = readFileSync(filePath, 'utf-8') + console.log(`[TEST] Content preview (first 500 chars): ${content.substring(0, 500)}`) 
+ console.log( + `[TEST] Contains 'up' function: ${content.includes('export async function up')}`, + ) + console.log(`[TEST] Contains 'CREATE INDEX': ${content.includes('CREATE INDEX')}`) + console.log(`[TEST] Contains 'ivfflat': ${content.includes('ivfflat')}`) + console.log(`[TEST] Contains 'lists =': ${content.includes('lists =')}`) + console.log( + `[TEST] Contains 'default_embedding_ivfflat': ${content.includes('default_embedding_ivfflat')}`, + ) + // Show the last 1000 chars where our code should be + console.log( + `[TEST] Content preview (last 1000 chars): ${content.substring(Math.max(0, content.length - 1000))}`, + ) + } + } + + // Step 4: Apply the migration + console.log('[TEST] Step 4: Applying migration...') + try { + // Try using db.migrate() if it exists (internal API) + if (typeof (cliPayload.db as any).migrate === 'function') { + console.log('[TEST] Step 4.1: Using db.migrate() method') + await (cliPayload.db as any).migrate() + } else { + // Fallback: manually load and execute migration files + console.log( + '[TEST] Step 4.1: db.migrate() not available, using manual migration execution', + ) + const migrationFiles = readdirSync(migrationsDir) + .filter((f) => f.endsWith('.ts') && f !== 'index.ts') + .sort() + + for (const file of migrationFiles) { + const migrationPath = join(migrationsDir, file) + console.log(`[TEST] Step 4.2: Loading migration: ${file}`) + const migration = await import(migrationPath) + if (migration.up) { + console.log(`[TEST] Step 4.3: Executing up() for ${file}`) + await migration.up({ db: cliPayload.db.drizzle, payload: cliPayload, req: {} as any }) + } + } + } + console.log('[TEST] Step 4.5: Migration applied') + } catch (error) { + console.error('[TEST] Step 4.5: Migration failed with error:', error) + throw error + } + + // Step 4.55: Check database directly to see if index exists + const postgresPayloadCheck = cliPayload as PostgresPayload + const schemaNameCheck = postgresPayloadCheck.db.schemaName || 'public' + const indexNameCheck = 'default_embedding_ivfflat' + try { + const directIndexCheck = await postgresPayloadCheck.db.pool?.query( + `SELECT indexname FROM pg_indexes WHERE schemaname = $1 AND indexname = $2`, + [schemaNameCheck, indexNameCheck], + ) + console.log( + `[TEST] Step 4.55: Direct database check - index exists: ${(directIndexCheck?.rows.length || 0) > 0}`, + ) + if (directIndexCheck?.rows.length === 0) { + console.log(`[TEST] Step 4.55: WARNING - Index not found in database after migration!`) + // List all indexes on the default table + const allIndexes = await postgresPayloadCheck.db.pool?.query( + `SELECT indexname FROM pg_indexes WHERE schemaname = $1 AND tablename = 'default'`, + [schemaNameCheck], + ) + console.log( + `[TEST] Step 4.55: All indexes on 'default' table: ${allIndexes?.rows.map((r: any) => r.indexname).join(', ') || 'none'}`, + ) + } + } catch (error) { + console.error('[TEST] Step 4.55: Error checking database:', error) + } + + // Step 4.6: Verify the migration file actually contains the IVFFLAT code + const allMigrationsAfter = readdirSync(migrationsDir) + .filter((f) => f.endsWith('.ts') && f !== 'index.ts') + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + const latestMigrationFile = allMigrationsAfter[0]?.path + if (latestMigrationFile) { + const migrationFileAfterApply = readFileSync(latestMigrationFile, 'utf-8') + console.log(`[TEST] Step 4.6: Checking migration file after apply: 
${latestMigrationFile}`) + console.log( + `[TEST] File contains 'ivfflat': ${migrationFileAfterApply.includes('ivfflat')}`, + ) + console.log( + `[TEST] File contains 'lists = 10': ${migrationFileAfterApply.includes('lists = 10')}`, + ) + console.log( + `[TEST] File contains 'drizzle.execute': ${migrationFileAfterApply.includes('drizzle.execute')}`, + ) + // Find the IVFFLAT code section + const ivfflatMatch = migrationFileAfterApply.match(/ivfflat[\s\S]{0,500}/i) + if (ivfflatMatch) { + console.log(`[TEST] IVFFLAT code section: ${ivfflatMatch[0]}`) + } + // Show the end of the up function where our code should be + const upFunctionEnd = migrationFileAfterApply.lastIndexOf('export async function up') + if (upFunctionEnd !== -1) { + const upFunctionContent = migrationFileAfterApply.substring(upFunctionEnd) + const last500OfUp = upFunctionContent.substring( + Math.max(0, upFunctionContent.length - 500), + ) + console.log(`[TEST] Last 500 chars of up function: ${last500OfUp}`) + } + } + + // Restore onInit and run it now that migrations are applied + cliConfig.onInit = savedOnInit + if (cliConfig.onInit) { + await cliConfig.onInit(cliPayload) + } + + // Step 5: Verify index exists with correct lists parameter + const postgresPayload = cliPayload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + const tableName = 'default' + const indexName = `${tableName}_embedding_ivfflat` + + const indexCheck = await postgresPayload.db.pool?.query( + `SELECT pg_get_indexdef(c.oid) as def + FROM pg_indexes i + JOIN pg_class c ON c.relname = i.indexname + JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = i.schemaname + WHERE i.schemaname = $1 AND i.tablename = $2 AND i.indexname = $3`, + [schemaName, tableName, indexName], + ) + const indexDef = indexCheck?.rows[0]?.def || '' + console.log(`[TEST] Step 5.5: Index definition: ${indexDef}`) + expect(indexDef).toBeTruthy() + // PostgreSQL returns lists='10' (with quotes), so match either format + expect(indexDef).toMatch(/lists\s*=\s*['"]?10['"]?/i) + console.log('[TEST] Test 1 completed successfully') + }) + + test('2. 
Change ivfflatLists: CLI creates migration, apply and verify', async () => { + // Step 1: Recreate integration with changed ivfflatLists + const integration = createVectorizeIntegration({ + default: { + dims: DIMS, + ivfflatLists: 20, // Changed from 10 to 20 + }, + }) + + // Update config with new integration (this simulates changing static config in payload.config.ts) + cliConfig = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${cliDbName}`, + }, + }), + plugins: [ + integration.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*\/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + // Temporarily disable onInit to avoid ensurePgvectorArtifacts check before migrations are applied + const savedOnInit = cliConfig.onInit + cliConfig.onInit = async () => { + // No-op: migrations haven't been applied yet + } + + cliPayload = await getPayload({ + config: cliConfig, + cron: true, + key: `test-ivfflat-change-${Date.now()}`, + disableOnInit: true, + }) + + // Step 2: Run vectorize:migrate (should detect change and create migration) + console.log('[TEST] Step 2: Running vectorize:migrate...') + const migrateScriptStart = Date.now() + try { + await Promise.race([ + vectorizeMigrateScript(cliConfig), + new Promise((_, reject) => + setTimeout(() => reject(new Error('vectorize:migrate timed out after 30s')), 30000), + ), + ]) + const migrateScriptEnd = Date.now() + console.log( + `[TEST] Step 2.5: vectorize:migrate completed in ${migrateScriptEnd - migrateScriptStart}ms`, + ) + } catch (error) { + console.error('[TEST] Step 2.5: vectorize:migrate failed:', error) + throw error + } + + // Step 3: Verify migration file was created and contains correct SQL + console.log('[TEST] Step 3: Listing all files in migrations directory:') + const allFiles = readdirSync(migrationsDir) + for (const file of allFiles) { + const filePath = join(migrationsDir, file) + const stats = statSync(filePath) + console.log( + `[TEST] - ${file} (${stats.size} bytes, modified: ${stats.mtime.toISOString()})`, + ) + } + + const migrations = readdirSync(migrationsDir) + .filter( + (f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js', + ) + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + + console.log(`[TEST] Found ${migrations.length} migration files (excluding index.ts/js)`) + migrations.forEach((m, i) => { + console.log(`[TEST] ${i + 1}. 
${m.name} (${m.mtime.toISOString()})`) + }) + + const newestMigration = migrations[0] + expect(newestMigration).toBeTruthy() + console.log(`[TEST] Reading migration file: ${newestMigration.path}`) + + // Verify migration file contains IVFFLAT rebuild SQL + const migrationContent = readFileSync(newestMigration.path, 'utf-8') + console.log(`[TEST] Migration file content length: ${migrationContent.length} characters`) + console.log( + `[TEST] Migration file preview (first 1000 chars):\n${migrationContent.substring(0, 1000)}`, + ) + // PostgreSQL returns lists='20' (with quotes), so match either format + expect(migrationContent).toMatch(/lists\s*=\s*['"]?20['"]?/i) + expect(migrationContent).toContain('DROP INDEX') + expect(migrationContent).toContain('CREATE INDEX') + + // Step 4: Apply the migration + if (typeof (cliPayload.db as any).migrate === 'function') { + await (cliPayload.db as any).migrate() + } else { + // Fallback: manually load and execute migration files + const migrationFiles = readdirSync(migrationsDir) + .filter((f) => f.endsWith('.ts') && f !== 'index.ts') + .sort() + + for (const file of migrationFiles) { + const migrationPath = join(migrationsDir, file) + const migration = await import(migrationPath) + if (migration.up) { + await migration.up({ db: cliPayload.db.drizzle, payload: cliPayload, req: {} as any }) + } + } + } + + // Restore onInit and run it now that migrations are applied + if (savedOnInit) { + cliConfig.onInit = savedOnInit + await savedOnInit(cliPayload) + } + + // Step 5: Verify index was rebuilt with new lists parameter + const postgresPayload = cliPayload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + const tableName = 'default' + const indexName = `${tableName}_embedding_ivfflat` + + const indexCheck = await postgresPayload.db.pool?.query( + `SELECT pg_get_indexdef(c.oid) as def + FROM pg_indexes i + JOIN pg_class c ON c.relname = i.indexname + JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = i.schemaname + WHERE i.schemaname = $1 AND i.tablename = $2 AND i.indexname = $3`, + [schemaName, tableName, indexName], + ) + const indexDef = indexCheck?.rows[0]?.def || '' + expect(indexDef).toBeTruthy() + // PostgreSQL returns lists='20' (with quotes), so match either format + expect(indexDef).toMatch(/lists\s*=\s*['"]?20['"]?/i) + }) + + test('3. Idempotency: CLI does not create duplicate migration when config unchanged', async () => { + // Get migration count before + const migrationsBefore = readdirSync(migrationsDir).filter( + (f) => f.endsWith('.ts') || f.endsWith('.js'), + ).length + + // Run vectorize:migrate again (config hasn't changed) + console.log('[TEST] Running vectorize:migrate for idempotency check...') + const startTime = Date.now() + try { + await Promise.race([ + vectorizeMigrateScript(cliConfig), + new Promise((_, reject) => + setTimeout(() => reject(new Error('vectorize:migrate timed out after 30s')), 30000), + ), + ]) + const endTime = Date.now() + console.log(`[TEST] vectorize:migrate completed in ${endTime - startTime}ms`) + } catch (error) { + console.error('[TEST] vectorize:migrate failed:', error) + throw error + } + + // Verify no new migration was created + const migrationsAfter = readdirSync(migrationsDir).filter( + (f) => f.endsWith('.ts') || f.endsWith('.js'), + ).length + + expect(migrationsAfter).toBe(migrationsBefore) + }) + + test('4. 
Change dims: CLI creates destructive migration', async () => { + console.log('[TEST] Starting test 4: Change dims') + const NEW_DIMS = DIMS + 2 // Change dimensions (destructive) + console.log(`[TEST] NEW_DIMS: ${NEW_DIMS}`) + + // Step 1: Recreate integration with changed dims + console.log('[TEST] Step 1: Creating integration with changed dims...') + const integration = createVectorizeIntegration({ + default: { + dims: NEW_DIMS, // Changed dimensions + ivfflatLists: 20, // Keep same lists + }, + }) + + // Update config with new integration + cliConfig = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${cliDbName}`, + }, + }), + plugins: [ + integration.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(NEW_DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(NEW_DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*\/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + // Temporarily disable onInit to avoid ensurePgvectorArtifacts check before migrations are applied + const savedOnInitDims = cliConfig.onInit + cliConfig.onInit = async () => { + // No-op: migrations haven't been applied yet + } + + cliPayload = await getPayload({ + config: cliConfig, + cron: true, + key: `test-dims-change-${Date.now()}`, + disableOnInit: true, + }) + + // Step 2: Run vectorize:migrate (should detect dims change) + console.log('[TEST] Step 2: Running vectorize:migrate...') + await vectorizeMigrateScript(cliConfig) + console.log('[TEST] Step 2.5: vectorize:migrate completed') + + // Step 3: Verify migration file contains destructive SQL (truncate + column type change) + console.log('[TEST] Step 3: Listing all files in migrations directory:') + const allFiles = readdirSync(migrationsDir) + for (const file of allFiles) { + const filePath = join(migrationsDir, file) + const stats = statSync(filePath) + console.log( + `[TEST] - ${file} (${stats.size} bytes, modified: ${stats.mtime.toISOString()})`, + ) + } + + const migrations = readdirSync(migrationsDir) + .filter( + (f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js', + ) + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + + console.log(`[TEST] Found ${migrations.length} migration files (excluding index.ts/js)`) + const newestMigration = migrations[0] + console.log(`[TEST] Reading newest migration: ${newestMigration.path}`) + const migrationContent = readFileSync(newestMigration.path, 'utf-8') + console.log(`[TEST] Migration content length: ${migrationContent.length} characters`) + console.log( + `[TEST] Migration content preview (first 1000 chars):\n${migrationContent.substring(0, 1000)}`, + ) + + // Verify it contains dims change SQL + expect(migrationContent).toContain('Changing dims') + expect(migrationContent).toContain('TRUNCATE TABLE') + expect(migrationContent).toContain(`vector(${NEW_DIMS})`) + expect(migrationContent).toContain('ALTER COLUMN embedding TYPE') + console.log('[TEST] Step 
3.5: Migration file verification passed') + + // Step 4: Apply the migration + console.log('[TEST] Step 4: Applying migration...') + console.log('[TEST] Step 4.1: About to call cliPayload.db.migrate()...') + console.log('[TEST] Step 4.1.1: Migration directory:', migrationsDir) + console.log( + '[TEST] Step 4.1.2: Payload instance migrationDir:', + (cliPayload.db as any).migrationDir, + ) + try { + const migrateStart = Date.now() + console.log('[TEST] Step 4.1.3: Calling migrate() at', new Date().toISOString()) + if (typeof (cliPayload.db as any).migrate === 'function') { + await (cliPayload.db as any).migrate() + } else { + // Fallback: manually load and execute migration files + const migrationFiles = readdirSync(migrationsDir) + .filter((f) => f.endsWith('.ts') && f !== 'index.ts') + .sort() + + for (const file of migrationFiles) { + const migrationPath = join(migrationsDir, file) + const migration = await import(migrationPath) + if (migration.up) { + await migration.up({ db: cliPayload.db.drizzle, payload: cliPayload, req: {} as any }) + } + } + } + const migrateEnd = Date.now() + console.log( + `[TEST] Step 4.2: cliPayload.db.migrate() completed in ${migrateEnd - migrateStart}ms`, + ) + } catch (error) { + console.error('[TEST] Step 4.2: Error during migration:', error) + throw error + } + console.log('[TEST] Step 4.5: Migration applied successfully') + + // Restore onInit and run it now that migrations are applied + console.log('[TEST] Step 4.6: Restoring onInit...') + if (savedOnInitDims) { + cliConfig.onInit = savedOnInitDims + await savedOnInitDims(cliPayload) + } + console.log('[TEST] Step 4.7: onInit restored and executed') + + // Step 5: Verify column type changed and table was truncated + console.log('[TEST] Step 5: Verifying column type and table state...') + const postgresPayload = cliPayload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + const tableName = 'default' + + // Check column type + const columnCheck = await postgresPayload.db.pool?.query( + `SELECT format_type(atttypid, atttypmod) as column_type + FROM pg_attribute + JOIN pg_class ON pg_attribute.attrelid = pg_class.oid + JOIN pg_namespace ON pg_class.relnamespace = pg_namespace.oid + WHERE pg_namespace.nspname = $1 + AND pg_class.relname = $2 + AND pg_attribute.attname = 'embedding' + AND pg_attribute.attnum > 0 + AND NOT pg_attribute.attisdropped`, + [schemaName, tableName], + ) + const columnType = columnCheck?.rows[0]?.column_type || '' + expect(columnType).toContain(`vector(${NEW_DIMS})`) + + // Verify table was truncated (should be empty or have no embeddings) + console.log('[TEST] Step 5.5: Checking table row count...') + const countCheck = await postgresPayload.db.pool?.query( + `SELECT COUNT(*) as count FROM "${schemaName}"."${tableName}"`, + ) + const rowCount = parseInt(countCheck?.rows[0]?.count || '0', 10) + console.log(`[TEST] Table row count: ${rowCount}`) + // Table should be empty after truncate (unless new embeddings were created during test) + expect(rowCount).toBe(0) + console.log('[TEST] Test 4 completed successfully') + }) + }) +}) diff --git a/package.json b/package.json index da2e38e..317c47c 100644 --- a/package.json +++ b/package.json @@ -40,7 +40,7 @@ "test:teardown": "docker-compose -f dev/docker-compose.test.yml down", "test": "pnpm test:int && pnpm test:e2e", "test:e2e": "playwright test", - "test:int": "cross-env DOTENV_CONFIG_PATH=dev/.env.test NODE_OPTIONS=--require=dotenv/config vitest" + "test:int": "cross-env DOTENV_CONFIG_PATH=dev/.env.test 
NODE_OPTIONS='--require=dotenv/config --import=tsx' vitest" }, "devDependencies": { "@eslint/eslintrc": "^3.2.0", @@ -82,6 +82,7 @@ "sharp": "0.34.2", "sort-package-json": "^2.10.0", "tailwindcss": "^4.1.14", + "tsx": "^4.21.0", "typescript": "5.7.3", "vite-tsconfig-paths": "^5.1.4", "vitest": "^3.1.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index cf56fb2..9c865c8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -129,15 +129,18 @@ importers: tailwindcss: specifier: ^4.1.14 version: 4.1.18 + tsx: + specifier: ^4.21.0 + version: 4.21.0 typescript: specifier: 5.7.3 version: 5.7.3 vite-tsconfig-paths: specifier: ^5.1.4 - version: 5.1.4(typescript@5.7.3)(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2)) + version: 5.1.4(typescript@5.7.3)(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2)) vitest: specifier: ^3.1.2 - version: 3.2.4(@types/debug@4.1.12)(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2) + version: 3.2.4(@types/debug@4.1.12)(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2) voyage-ai-provider: specifier: ^2.0.0 version: 2.0.0(zod@4.3.4) @@ -6098,6 +6101,11 @@ packages: engines: {node: '>=18.0.0'} hasBin: true + tsx@4.21.0: + resolution: {integrity: sha512-5C1sg4USs1lfG0GFb2RLXsdpXqBSEhAaA/0kPL01wxzpMqLILNxIxIOKiILz+cdg/pLnOUxFYOR5yhHU666wbw==} + engines: {node: '>=18.0.0'} + hasBin: true + tweetnacl@0.14.5: resolution: {integrity: sha512-KXXFFdAbFXY4geFIwoyNK+f5Z1b7swfXABfL7HXCmoIWMKU3dmS26672A4EeQtDzLKy7SXmfBu51JolvEKwtGA==} @@ -9197,13 +9205,13 @@ snapshots: chai: 5.3.3 tinyrainbow: 2.0.0 - '@vitest/mocker@3.2.4(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2))': + '@vitest/mocker@3.2.4(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2))': dependencies: '@vitest/spy': 3.2.4 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: - vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2) + vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2) '@vitest/pretty-format@3.2.4': dependencies: @@ -13518,7 +13526,7 @@ snapshots: tsx@4.20.3: dependencies: esbuild: 0.25.12 - get-tsconfig: 4.8.1 + get-tsconfig: 4.13.0 optionalDependencies: fsevents: 2.3.3 @@ -13529,6 +13537,13 @@ snapshots: optionalDependencies: fsevents: 2.3.3 + tsx@4.21.0: + dependencies: + esbuild: 0.27.2 + get-tsconfig: 4.13.0 + optionalDependencies: + fsevents: 2.3.3 + tweetnacl@0.14.5: {} type-check@0.4.0: @@ -13700,13 +13715,13 @@ snapshots: '@types/unist': 3.0.3 unist-util-stringify-position: 4.0.0 - vite-node@3.2.4(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2): + vite-node@3.2.4(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2): dependencies: cac: 6.7.14 debug: 4.4.3 es-module-lexer: 1.7.0 pathe: 2.0.3 - vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2) + vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2) transitivePeerDependencies: - '@types/node' - jiti @@ -13721,18 +13736,18 @@ snapshots: - tsx - yaml - 
vite-tsconfig-paths@5.1.4(typescript@5.7.3)(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2)): + vite-tsconfig-paths@5.1.4(typescript@5.7.3)(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2)): dependencies: debug: 4.4.3 globrex: 0.1.2 tsconfck: 3.1.6(typescript@5.7.3) optionalDependencies: - vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2) + vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2) transitivePeerDependencies: - supports-color - typescript - vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2): + vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2): dependencies: esbuild: 0.27.2 fdir: 6.5.0(picomatch@4.0.3) @@ -13746,14 +13761,14 @@ snapshots: jiti: 2.6.1 lightningcss: 1.30.2 sass: 1.77.4 - tsx: 4.20.6 + tsx: 4.21.0 yaml: 2.8.2 - vitest@3.2.4(@types/debug@4.1.12)(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2): + vitest@3.2.4(@types/debug@4.1.12)(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2): dependencies: '@types/chai': 5.2.3 '@vitest/expect': 3.2.4 - '@vitest/mocker': 3.2.4(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2)) + '@vitest/mocker': 3.2.4(vite@7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2)) '@vitest/pretty-format': 3.2.4 '@vitest/runner': 3.2.4 '@vitest/snapshot': 3.2.4 @@ -13771,8 +13786,8 @@ snapshots: tinyglobby: 0.2.15 tinypool: 1.1.1 tinyrainbow: 2.0.0 - vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2) - vite-node: 3.2.4(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.20.6)(yaml@2.8.2) + vite: 7.3.0(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2) + vite-node: 3.2.4(@types/node@22.19.3)(jiti@2.6.1)(lightningcss@1.30.2)(sass@1.77.4)(tsx@4.21.0)(yaml@2.8.2) why-is-node-running: 2.3.0 optionalDependencies: '@types/debug': 4.1.12 diff --git a/src/bin/vectorize-migrate.ts b/src/bin/vectorize-migrate.ts new file mode 100644 index 0000000..6f17e59 --- /dev/null +++ b/src/bin/vectorize-migrate.ts @@ -0,0 +1,588 @@ +import type { SanitizedConfig } from 'payload' +import { getPayload } from 'payload' +import { readFileSync, writeFileSync, readdirSync, statSync, existsSync, rmSync } from 'fs' +import { join, resolve } from 'path' +import toSnakeCase from 'to-snake-case' + +import { getVectorizedPayload } from '../types.js' +import type { KnowledgePoolStaticConfig } from '../types.js' + +/** + * Get prior state from existing migrations + */ +function getPriorStateFromMigrations( + migrationsDir: string, + poolNames: string[], +): Map { + const state = new Map() + + // Initialize with null (unknown state) + for (const poolName of poolNames) { + state.set(poolName, { dims: null, ivfflatLists: null }) + } + + if (!existsSync(migrationsDir)) { + return state + } + + // Find all migration files and read them in reverse order (newest first) + // Exclude index.ts/index.js as those are not migration files + const migrationFiles = readdirSync(migrationsDir) + .filter((f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js') + .map((f) => ({ + 
name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + + console.log(`[payloadcms-vectorize] Found ${migrationFiles.length} migration file(s) to scan for prior state`) + + // Read migration files to find vector config + for (const file of migrationFiles) { + try { + const content = readFileSync(file.path, 'utf-8') + + // Extract only the UP function content to avoid matching values in DOWN function + // The DOWN function contains previous/rollback values which we don't want + const upFunctionMatch = content.match( + /export\s+async\s+function\s+up\s*\([^)]*\)[^{]*\{([\s\S]*?)(?=\}\s*(?:export\s+async\s+function\s+down|$))/i, + ) + const upContent = upFunctionMatch ? upFunctionMatch[1] : content + + // Look for IVFFLAT index creation with lists parameter + for (const poolName of poolNames) { + const tableName = toSnakeCase(poolName) + const indexName = `${tableName}_embedding_ivfflat` + + // Check if this migration creates the index (only in UP function) + // The code format is: await db.execute(sql.raw(`CREATE INDEX "indexName" ... WITH (lists = 10)`)) + // We need to match the lists parameter in the template literal + // Use non-greedy .*? to match the FIRST occurrence + const indexMatch = + // Match: db.execute(sql.raw(`...CREATE INDEX..."indexName"...WITH (lists = 10)...`)) + upContent.match( + new RegExp( + `db\\.execute\\(sql\\.raw.*?CREATE INDEX.*?"${indexName}".*?WITH\\s*\\(lists\\s*=\\s*(\\d+)\\)`, + 'is', + ), + ) || + // Match: CREATE INDEX "indexName" ... WITH (lists = 10) (in any context) + upContent.match( + new RegExp(`CREATE INDEX.*?"${indexName}".*?WITH\\s*\\(lists\\s*=\\s*(\\d+)\\)`, 'is'), + ) || + // Match: lists = near ivfflat (non-greedy) + upContent.match(new RegExp(`ivfflat.*?lists\\s*=\\s*(\\d+)`, 'is')) + + if (indexMatch && !state.get(poolName)?.ivfflatLists) { + const lists = parseInt(indexMatch[1], 10) + const current = state.get(poolName) || { dims: null, ivfflatLists: null } + state.set(poolName, { ...current, ivfflatLists: lists }) + console.log( + `[payloadcms-vectorize] Found prior ivfflatLists=${lists} for pool "${poolName}" in ${file.name}`, + ) + } else if (!state.get(poolName)?.ivfflatLists) { + // Debug: log if we didn't find it + console.log( + `[payloadcms-vectorize] No ivfflatLists found for pool "${poolName}" in ${file.name}`, + ) + } + + // Check for dims in vector column definition (search full content as dims should be consistent) + const dimsMatch = content.match(new RegExp(`vector\\((\\d+)\\)`, 'i')) + if (dimsMatch && !state.get(poolName)?.dims) { + const dims = parseInt(dimsMatch[1], 10) + const current = state.get(poolName) || { dims: null, ivfflatLists: null } + state.set(poolName, { ...current, dims }) + console.log( + `[payloadcms-vectorize] Found prior dims=${dims} for pool "${poolName}" in ${file.name}`, + ) + } + } + } catch (err) { + // Skip files that can't be read + continue + } + } + + return state +} + +/** + * Generate SQL code for IVFFLAT index rebuild + */ +function generateIvfflatRebuildCode( + tableName: string, + schemaName: string, + ivfflatLists: number, +): string { + const indexName = `${tableName}_embedding_ivfflat` + return ` await db.execute(sql.raw(\`DROP INDEX IF EXISTS "${schemaName}"."${indexName}"\`)); + await db.execute(sql.raw(\`CREATE INDEX "${indexName}" ON "${schemaName}"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = ${ivfflatLists})\`));` +} + +/** + * Generate SQL code for column 
type change + */ +function generateColumnTypeChangeCode( + tableName: string, + schemaName: string, + newDims: number, +): string { + return ` // Change column type to new dimensions + await db.execute(sql.raw(\`ALTER TABLE "${schemaName}"."${tableName}" ALTER COLUMN embedding TYPE vector(${newDims})\`));` +} + +/** + * Generate SQL code for destructive dims change + */ +function generateDimsChangeCode( + tableName: string, + schemaName: string, + newDims: number, + newIvfflatLists: number, +): string { + const indexName = `${tableName}_embedding_ivfflat` + return ` // WARNING: Changing vector dimensions is destructive and requires re-embedding + // Step 1: Drop existing index + await db.execute(sql.raw(\`DROP INDEX IF EXISTS "${schemaName}"."${indexName}"\`)); + // Step 2: Change column type (Payload migration may also generate this, but explicit is safer) + await db.execute(sql.raw(\`ALTER TABLE "${schemaName}"."${tableName}" ALTER COLUMN embedding TYPE vector(${newDims})\`)); + // Step 3: Truncate table (destructive - all embeddings are lost) + // Use CASCADE to handle foreign key constraints + await db.execute(sql.raw(\`TRUNCATE TABLE "${schemaName}"."${tableName}" CASCADE\`)); + // Step 4: Recreate index with new parameters + await db.execute(sql.raw(\`CREATE INDEX "${indexName}" ON "${schemaName}"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = ${newIvfflatLists})\`));` +} + +/** + * Patch a migration file with vector-specific SQL + */ +function patchMigrationFile( + migrationPath: string, + staticConfigs: Record, + schemaName: string, + priorState: Map, +): void { + console.log(`[vectorize-migrate] Reading migration file: ${migrationPath}`) + const content = readFileSync(migrationPath, 'utf-8') + console.log(`[vectorize-migrate] File read successfully, length: ${content.length} characters`) + + // Generate SQL code for each pool + const vectorUpCode: string[] = [] + const vectorDownCode: string[] = [] + + for (const [poolName, config] of Object.entries(staticConfigs)) { + const tableName = toSnakeCase(poolName) + const priorConfig = priorState.get(poolName) || { dims: null, ivfflatLists: null } + const dimsChanged = priorConfig.dims !== null && priorConfig.dims !== config.dims + const ivfflatListsChanged = + priorConfig.ivfflatLists !== null && priorConfig.ivfflatLists !== config.ivfflatLists + + // Check if dims changed (destructive) - handle this first as it includes index operations + if (dimsChanged) { + vectorUpCode.push( + ` // payloadcms-vectorize: WARNING - Changing dims from ${priorConfig.dims} to ${config.dims} is destructive`, + ) + // When dims changes, we need to: + // 1. Drop existing index first + // 2. Change column type (Payload migration may also generate this) + // 3. Truncate table (destructive) + // 4. 
Recreate index with new ivfflatLists + vectorUpCode.push( + generateDimsChangeCode(tableName, schemaName, config.dims, config.ivfflatLists), + ) + // Down migration: restore to previous state (but can't restore data) + vectorDownCode.push( + ` // payloadcms-vectorize: Revert dims change (WARNING: data was truncated and cannot be restored)`, + ) + // Restore previous column type and index + vectorDownCode.push( + generateColumnTypeChangeCode(tableName, schemaName, priorConfig.dims || config.dims), + ) + vectorDownCode.push( + generateIvfflatRebuildCode( + tableName, + schemaName, + priorConfig.ivfflatLists || config.ivfflatLists, + ), + ) + vectorDownCode.push(` // WARNING: Original data cannot be restored`) + } else if (ivfflatListsChanged) { + // Check if ivfflatLists changed (only if dims didn't change, since dims change handles index) + vectorUpCode.push( + ` // payloadcms-vectorize: Rebuild IVFFLAT index for ${poolName} with lists=${config.ivfflatLists}`, + ) + vectorUpCode.push(generateIvfflatRebuildCode(tableName, schemaName, config.ivfflatLists)) + // Down migration: rebuild with old lists + vectorDownCode.push( + ` // payloadcms-vectorize: Revert IVFFLAT index for ${poolName} to lists=${priorConfig.ivfflatLists}`, + ) + vectorDownCode.push( + generateIvfflatRebuildCode( + tableName, + schemaName, + priorConfig.ivfflatLists || config.ivfflatLists, + ), + ) + } + + // If this is the first migration, ensure index exists + // Note: Column is handled by Drizzle schema via afterSchemaInit + // We only check ivfflatLists because dims will always be found from Drizzle schema + if (priorConfig.ivfflatLists === null) { + vectorUpCode.push(` // payloadcms-vectorize: Initial IVFFLAT index setup for ${poolName}`) + vectorUpCode.push( + ` // Note: Embedding column is created via Drizzle schema (afterSchemaInit hook)`, + ) + vectorUpCode.push(generateIvfflatRebuildCode(tableName, schemaName, config.ivfflatLists)) + vectorDownCode.push(` // payloadcms-vectorize: Drop index on rollback`) + const indexName = `${tableName}_embedding_ivfflat` + vectorDownCode.push( + ` await db.execute(sql.raw(\`DROP INDEX IF EXISTS "${schemaName}"."${indexName}"\`));`, + ) + } + } + + if (vectorUpCode.length === 0) { + // No changes needed + return + } + + // Find the up function and insert code before the closing brace + const upFunctionMatch = content.match( + /export\s+async\s+function\s+up\s*\([^)]*\)\s*:\s*Promise\s*\{/i, + ) + if (!upFunctionMatch) { + console.error( + `[vectorize-migrate] Could not find 'up' function in migration file: ${migrationPath}`, + ) + console.error(`[vectorize-migrate] File content length: ${content.length} characters`) + console.error(`[vectorize-migrate] File content (first 1000 chars):`) + console.error(content.substring(0, 1000)) + console.error(`[vectorize-migrate] File content (last 1000 chars):`) + console.error(content.substring(Math.max(0, content.length - 1000))) + console.error( + `[vectorize-migrate] Searching for pattern: /export\\s+async\\s+function\\s+up\\s*\\([^)]*\\)\\s*:\\s*Promise\\s*\\{/i`, + ) + throw new Error(`Could not find 'up' function in migration file: ${migrationPath}`) + } + + const upFunctionStart = upFunctionMatch.index! + upFunctionMatch[0].length + const downFunctionMatch = content.match(/export\s+async\s+function\s+down\s*\([^)]*\)/i) + const searchEnd = downFunctionMatch ? downFunctionMatch.index! 
: content.length + + // Find the last closing brace before down function or end + const upFunctionBody = content.substring(upFunctionStart, searchEnd) + const lastBraceIndex = upFunctionBody.lastIndexOf('}') + console.log(`[vectorize-migrate] up function body length: ${upFunctionBody.length}`) + console.log(`[vectorize-migrate] lastBraceIndex in body: ${lastBraceIndex}`) + console.log(`[vectorize-migrate] up function body ends with: ${upFunctionBody.substring(Math.max(0, upFunctionBody.length - 200))}`) + if (lastBraceIndex === -1) { + throw new Error( + `Could not find closing brace for 'up' function in migration file: ${migrationPath}`, + ) + } + + // Insert our code before the closing brace + const beforeBrace = content.substring(0, upFunctionStart + lastBraceIndex) + const afterBrace = content.substring(upFunctionStart + lastBraceIndex) + console.log(`[vectorize-migrate] Insertion point: beforeBrace ends with: ${beforeBrace.substring(Math.max(0, beforeBrace.length - 100))}`) + console.log(`[vectorize-migrate] Insertion point: afterBrace starts with: ${afterBrace.substring(0, 100)}`) + + const codeToInsert = '\n' + vectorUpCode.join('\n') + '\n' + console.log(`[vectorize-migrate] Inserting ${vectorUpCode.length} line(s) of code into migration`) + console.log(`[vectorize-migrate] Code to insert:\n${codeToInsert}`) + let newContent = beforeBrace + codeToInsert + afterBrace + console.log(`[vectorize-migrate] Migration file will be ${newContent.length} characters after patching (was ${content.length})`) + + // Verify insertion point looks correct + const insertionPointPreview = newContent.substring( + Math.max(0, beforeBrace.length - 50), + Math.min(newContent.length, beforeBrace.length + codeToInsert.length + 50), + ) + console.log(`[vectorize-migrate] Insertion point preview:\n${insertionPointPreview}`) + + // Handle down function + if (downFunctionMatch) { + const downFunctionStart = downFunctionMatch.index! + downFunctionMatch[0].length + const downBraceMatch = newContent.substring(downFunctionStart).match(/\{/) + if (downBraceMatch) { + const downBodyStart = downFunctionStart + downBraceMatch.index! 
+ 1 + const downBody = newContent.substring(downBodyStart) + const downLastBraceIndex = downBody.lastIndexOf('}') + if (downLastBraceIndex !== -1) { + const beforeDownBrace = newContent.substring(0, downBodyStart + downLastBraceIndex) + const afterDownBrace = newContent.substring(downBodyStart + downLastBraceIndex) + const downCodeToInsert = '\n' + vectorDownCode.join('\n') + '\n' + newContent = beforeDownBrace + downCodeToInsert + afterDownBrace + } + } + } else if (vectorDownCode.length > 0) { + // Add down function if it doesn't exist + const lastFileBrace = newContent.lastIndexOf('}') + if (lastFileBrace !== -1) { + const beforeLastBrace = newContent.substring(0, lastFileBrace) + const afterLastBrace = newContent.substring(lastFileBrace) + const downFunctionCode = `\n\nexport async function down({ payload, req }: { payload: any; req: any }): Promise {\n${vectorDownCode.join('\n')}\n}` + newContent = beforeLastBrace + downFunctionCode + afterLastBrace + } + } + + writeFileSync(migrationPath, newContent, 'utf-8') + console.log(`[vectorize-migrate] Migration file written successfully`) + // Verify the code was inserted + const verifyContent = readFileSync(migrationPath, 'utf-8') + const hasIvfflatCode = verifyContent.includes('ivfflat') && verifyContent.includes('lists =') + console.log(`[vectorize-migrate] Verification: migration contains IVFFLAT code: ${hasIvfflatCode}`) + if (!hasIvfflatCode && vectorUpCode.length > 0) { + console.error(`[vectorize-migrate] WARNING: IVFFLAT code was supposed to be inserted but not found in file!`) + console.error(`[vectorize-migrate] Expected to find: ${vectorUpCode.join(' | ')}`) + } +} + +/** + * Bin script entry point for creating vector migrations + */ +export const script = async (config: SanitizedConfig): Promise => { + // Disable onInit to avoid ensurePgvectorArtifacts check - migrations may not be applied yet + const payload = await getPayload({ + config, + disableOnInit: true, + key: `vectorize-migrate-payload-instance-${Date.now()}`, + }) + const vectorizedPayload = getVectorizedPayload(payload) + + if (!vectorizedPayload) { + throw new Error( + '[payloadcms-vectorize] Vectorize plugin not found. 
Ensure payloadcmsVectorize is configured in your Payload config.', + ) + } + + const staticConfigs = vectorizedPayload._staticConfigs + if (!staticConfigs || Object.keys(staticConfigs).length === 0) { + throw new Error('[payloadcms-vectorize] No static configs found') + } + + const poolNames = Object.keys(staticConfigs) + const schemaName = (payload.db as any).schemaName || 'public' + const migrationsDir = (payload.db as any).migrationDir || resolve(process.cwd(), 'src/migrations') + + console.log('[payloadcms-vectorize] Checking for configuration changes...') + + // Get prior state from migrations + const priorState = getPriorStateFromMigrations(migrationsDir, poolNames) + + // Debug: log prior state + console.log('[payloadcms-vectorize] Prior state from migrations:') + for (const [poolName, state] of priorState.entries()) { + console.log(`[payloadcms-vectorize] ${poolName}: dims=${state.dims}, ivfflatLists=${state.ivfflatLists}`) + } + console.log('[payloadcms-vectorize] Current static configs:') + for (const [poolName, config] of Object.entries(staticConfigs)) { + console.log(`[payloadcms-vectorize] ${poolName}: dims=${config.dims}, ivfflatLists=${config.ivfflatLists}`) + } + + // Check if any changes are needed + let hasChanges = false + let isFirstMigration = false + for (const [poolName, currentConfig] of Object.entries(staticConfigs)) { + const prior = priorState.get(poolName) || { dims: null, ivfflatLists: null } + + // Check if this is the first migration (no IVFFLAT index exists yet) + // Note: dims might be found from Drizzle schema, but ivfflatLists won't be found until we create the index + if (prior.ivfflatLists === null) { + isFirstMigration = true + hasChanges = true + console.log( + `[payloadcms-vectorize] First migration detected for pool "${poolName}" (ivfflatLists not found in prior migrations)`, + ) + break + } + + // Check for actual changes + if ( + prior.dims !== null && prior.dims !== currentConfig.dims || + (prior.ivfflatLists !== null && prior.ivfflatLists !== currentConfig.ivfflatLists) + ) { + hasChanges = true + console.log( + `[payloadcms-vectorize] Change detected for pool "${poolName}": dims ${prior.dims}→${currentConfig.dims}, ivfflatLists ${prior.ivfflatLists}→${currentConfig.ivfflatLists}`, + ) + break + } + } + + // If no changes detected, check if artifacts exist (idempotency) + if (!hasChanges) { + console.log('[payloadcms-vectorize] No configuration changes detected.') + console.log( + '[payloadcms-vectorize] If this is the first migration, ensure your initial migration creates the embedding columns via Drizzle schema.', + ) + return + } + + console.log('[payloadcms-vectorize] Changes detected.') + + // Determine if there are actual schema changes (dims change) or just index parameter changes (ivfflatLists) + // payload.db.createMigration only works when there are schema changes + // For index-only changes, we need to create the migration file manually + let hasSchemaChanges = false + for (const [poolName, currentConfig] of Object.entries(staticConfigs)) { + const prior = priorState.get(poolName) || { dims: null, ivfflatLists: null } + if (prior.dims !== null && prior.dims !== currentConfig.dims) { + hasSchemaChanges = true + console.log(`[payloadcms-vectorize] Schema change detected for pool "${poolName}": dims ${prior.dims}→${currentConfig.dims}`) + break + } + } + + if (isFirstMigration) { + console.log('[payloadcms-vectorize] This is the first migration - checking if we should patch existing migration or create new one') + + // Check if there's a 
very recent migration file (created in last 10 seconds) that we should patch + const recentMigrations = existsSync(migrationsDir) + ? readdirSync(migrationsDir) + .filter( + (f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js', + ) + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .filter((m) => Date.now() - m.mtime.getTime() < 10000) // Created in last 10 seconds + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + : [] + + if (recentMigrations.length > 0) { + const recentMigration = recentMigrations[0] + console.log(`[payloadcms-vectorize] Found recent migration to patch: ${recentMigration.name}`) + // Check if it already has IVFFLAT index code + const recentContent = readFileSync(recentMigration.path, 'utf-8') + const hasIvfflatCode = recentContent.includes('ivfflat') && (recentContent.includes('drizzle.execute') || recentContent.includes('CREATE INDEX')) + + if (!hasIvfflatCode) { + console.log(`[payloadcms-vectorize] Patching existing migration: ${recentMigration.path}`) + patchMigrationFile(recentMigration.path, staticConfigs, schemaName, priorState) + console.log('[payloadcms-vectorize] Migration patched successfully!') + return + } else { + console.log(`[payloadcms-vectorize] Recent migration already has IVFFLAT code, creating new migration instead`) + } + } + + console.log('[payloadcms-vectorize] Creating new migration with IVFFLAT index setup') + } else { + console.log('[payloadcms-vectorize] Creating new migration for configuration change') + } + + // Create migration using Payload's API OR create manually for index-only changes + // Note: createMigration may not return the path, so we'll find the newest migration file after creation + const migrationsBefore = existsSync(migrationsDir) + ? readdirSync(migrationsDir) + .filter( + (f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js', + ) + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + : [] + + let migrationPath: string + + // If there are schema changes (dims changed), use Payload's createMigration + // Otherwise (only ivfflatLists changed), create the migration file manually + // because Payload's createMigration hangs when there are no schema changes to detect + if (hasSchemaChanges) { + console.log('[payloadcms-vectorize] Schema changes detected - using payload.db.createMigration...') + try { + await payload.db.createMigration({ + migrationName: 'vectorize-config', + payload, + }) + console.log('[payloadcms-vectorize] Migration created successfully') + } catch (error) { + console.error('[payloadcms-vectorize] Error creating migration:', error) + throw error + } + + // Find the newest migration file (should be the one just created) + const migrationsAfter = existsSync(migrationsDir) + ? readdirSync(migrationsDir) + .filter( + (f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js', + ) + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + : [] + + // Find the migration that was just created (newest that wasn't there before) + const beforePaths = new Set(migrationsBefore.map((m) => m.path)) + const newMigrations = migrationsAfter.filter((m) => !beforePaths.has(m.path)) + const foundPath = newMigrations.length > 0 ? 
newMigrations[0].path : migrationsAfter[0]?.path + + if (!foundPath) { + throw new Error( + '[payloadcms-vectorize] Failed to create migration file - no new migration found.', + ) + } + migrationPath = foundPath + } else { + // No schema changes (only ivfflatLists changed) - create migration file manually + // Payload's createMigration API doesn't support this case (it hangs when no schema changes detected) + console.log('[payloadcms-vectorize] No schema changes (only index parameter changes) - creating migration file manually...') + + // Generate timestamp for migration filename (format: YYYYMMDD_HHMMSS) + const now = new Date() + const timestamp = [ + now.getFullYear(), + String(now.getMonth() + 1).padStart(2, '0'), + String(now.getDate()).padStart(2, '0'), + '_', + String(now.getHours()).padStart(2, '0'), + String(now.getMinutes()).padStart(2, '0'), + String(now.getSeconds()).padStart(2, '0'), + ].join('') + + const migrationFileName = `${timestamp}_vectorize_ivfflat_rebuild.ts` + migrationPath = join(migrationsDir, migrationFileName) + + // Create a minimal migration file that we'll patch with our IVFFLAT code + const migrationTemplate = `import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' + +export async function up({ db, payload, req }: MigrateUpArgs): Promise { + // Index parameter changes only - no schema changes +} + +export async function down({ db, payload, req }: MigrateDownArgs): Promise { + // Revert index parameter changes +} +` + + writeFileSync(migrationPath, migrationTemplate, 'utf-8') + console.log(`[payloadcms-vectorize] Created migration file: ${migrationPath}`) + } + + console.log(`[payloadcms-vectorize] Patching migration: ${migrationPath}`) + + // Patch the migration file + patchMigrationFile(migrationPath, staticConfigs, schemaName, priorState) + + console.log('[payloadcms-vectorize] Migration created and patched successfully!') + console.log( + '[payloadcms-vectorize] Review the migration file and apply it with: pnpm payload migrate', + ) + + // Only exit if not in test environment (when called from tests, just return) + if (process.env.NODE_ENV !== 'test' && !process.env.VITEST) { + process.exit(0) + } +} diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts index 274c618..8634eeb 100644 --- a/src/endpoints/vectorSearch.ts +++ b/src/endpoints/vectorSearch.ts @@ -100,6 +100,7 @@ async function performCosineSearch( throw new Error('Only works with Postgres') } + payload.db.createMigration // In PayloadCMS, payload.db IS the adapter, and drizzle is at payload.db.drizzle const adapter = payload.db if (!adapter) { diff --git a/src/index.ts b/src/index.ts index 60d7497..834e393 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,8 @@ import type { Config, Payload, PayloadRequest } from 'payload' import { customType } from '@payloadcms/db-postgres/drizzle/pg-core' import toSnakeCase from 'to-snake-case' +import { fileURLToPath } from 'url' +import { dirname, resolve } from 'path' import { createEmbeddingsCollection } from './collections/embeddings.js' import type { @@ -76,13 +78,21 @@ export type { export { getVectorizedPayload } from './types.js' +/** + * Presence-only safety net: checks that pgvector artifacts exist. + * Does NOT create or modify them - migrations should handle that. + * This is a runtime check to fail fast if migrations haven't been applied. 
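+ * For example, if the `vector` extension, the `embedding` column, or the IVFFLAT index is missing, the check below throws a "Please ensure migrations have been applied" style error instead of creating the artifact itself.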
+ */ async function ensurePgvectorArtifacts(args: { payload: Payload tableName: string - dims: number ivfflatLists: number }): Promise { - const { payload, tableName, dims, ivfflatLists } = args + const { payload, tableName } = args + + payload.logger.info( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Starting verification for table "${tableName}"`, + ) if (!isPostgresPayload(payload)) { throw new Error( @@ -94,29 +104,102 @@ async function ensurePgvectorArtifacts(args: { const postgresPayload = payload as PostgresPayload const schemaName = postgresPayload.db.schemaName || 'public' - const sqls: string[] = [ - `CREATE EXTENSION IF NOT EXISTS vector;`, - `ALTER TABLE "${schemaName}"."${tableName}" ADD COLUMN IF NOT EXISTS embedding vector(${dims});`, - `CREATE INDEX IF NOT EXISTS ${tableName}_embedding_ivfflat ON "${schemaName}"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = ${ivfflatLists});`, - ] + payload.logger.info( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Using schema "${schemaName}" for table "${tableName}"`, + ) - try { + const runQuery = async (sql: string, params?: any[]): Promise => { + payload.logger.debug(`[payloadcms-vectorize] ensurePgvectorArtifacts: Executing query: ${sql}`) if (postgresPayload.db.pool?.query) { - for (const sql of sqls) { - await postgresPayload.db.pool.query(sql) - } - } else if (postgresPayload.db.drizzle?.execute) { - for (const sql of sqls) { - await postgresPayload.db.drizzle.execute(sql) - } + return postgresPayload.db.pool.query(sql, params) } - postgresPayload.logger.info('[payloadcms-vectorize] pgvector extension/columns/index ensured') + if (postgresPayload.db.drizzle?.execute) { + return postgresPayload.db.drizzle.execute(sql) + } + throw new Error('[payloadcms-vectorize] No database query function available') + } + + try { + // Check extension exists + payload.logger.info( + '[payloadcms-vectorize] ensurePgvectorArtifacts: Checking pgvector extension...', + ) + const extensionCheck = await runQuery(`SELECT 1 FROM pg_extension WHERE extname = 'vector'`) + const extensionRows = Array.isArray(extensionCheck) + ? extensionCheck + : extensionCheck?.rows || [] + if (extensionRows.length === 0) { + payload.logger.error( + '[payloadcms-vectorize] ensurePgvectorArtifacts: pgvector extension not found', + ) + throw new Error( + `[payloadcms-vectorize] pgvector extension not found. Please ensure migrations have been applied or manually create the extension: CREATE EXTENSION IF NOT EXISTS vector;`, + ) + } + payload.logger.info('[payloadcms-vectorize] ensurePgvectorArtifacts: pgvector extension found') + + // Check column exists with correct dims + payload.logger.info( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Checking embedding column in "${schemaName}"."${tableName}"...`, + ) + const columnCheck = await runQuery( + `SELECT column_name, udt_name + FROM information_schema.columns + WHERE table_schema = $1 AND table_name = $2 AND column_name = 'embedding'`, + [schemaName, tableName], + ) + const columnRows = Array.isArray(columnCheck) ? columnCheck : columnCheck?.rows || [] + if (columnRows.length === 0) { + payload.logger.error( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Embedding column not found in "${schemaName}"."${tableName}"`, + ) + throw new Error( + `[payloadcms-vectorize] Embedding column not found in table "${schemaName}"."${tableName}". 
Please ensure migrations have been applied.`, + ) + } + payload.logger.info( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Embedding column found (type: ${columnRows[0]?.udt_name || 'unknown'})`, + ) + + // Check index exists (don't verify lists parameter - migrations handle that) + const indexName = `${tableName}_embedding_ivfflat` + payload.logger.info( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Checking IVFFLAT index "${indexName}"...`, + ) + const indexCheck = await runQuery( + `SELECT 1 + FROM pg_indexes + WHERE schemaname = $1 AND tablename = $2 AND indexname = $3`, + [schemaName, tableName, indexName], + ) + const indexRows = Array.isArray(indexCheck) ? indexCheck : indexCheck?.rows || [] + if (indexRows.length === 0) { + payload.logger.error( + `[payloadcms-vectorize] ensurePgvectorArtifacts: IVFFLAT index "${indexName}" not found on "${schemaName}"."${tableName}"`, + ) + throw new Error( + `[payloadcms-vectorize] IVFFLAT index not found on table "${schemaName}"."${tableName}". Please ensure migrations have been applied.`, + ) + } + payload.logger.info( + `[payloadcms-vectorize] ensurePgvectorArtifacts: IVFFLAT index "${indexName}" found`, + ) + + postgresPayload.logger.info( + `[payloadcms-vectorize] pgvector artifacts verified for table "${schemaName}"."${tableName}"`, + ) } catch (err) { + payload.logger.error( + `[payloadcms-vectorize] ensurePgvectorArtifacts: Error occurred: ${err instanceof Error ? err.message : String(err)}`, + ) + if (err instanceof Error && err.message.includes('[payloadcms-vectorize]')) { + throw err + } postgresPayload.logger.error( - '[payloadcms-vectorize] Failed ensuring pgvector artifacts', + '[payloadcms-vectorize] Failed checking pgvector artifacts', err as Error, ) - throw new Error(`[payloadcms-vectorize] Failed ensuring pgvector artifacts: ${err}`) + throw new Error(`[payloadcms-vectorize] Failed checking pgvector artifacts: ${err}`) } } @@ -177,28 +260,64 @@ export const createVectorizeIntegration = const payloadcmsVectorize = (pluginOptions: PayloadcmsVectorizeConfig) => (config: Config): Config => { + console.log('[payloadcms-vectorize] payloadcmsVectorize: Plugin initialization started') + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Processing ${Object.keys(pluginOptions.knowledgePools).length} knowledge pool(s)`, + ) + // Ensure collections array exists config.collections = [...(config.collections || [])] + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Initial collections count: ${config.collections.length}`, + ) // Ensure bulk runs collection exists once + console.log('[payloadcms-vectorize] payloadcmsVectorize: Adding bulk runs collection...') const bulkRunsCollection = createBulkEmbeddingsRunsCollection() if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_RUNS_SLUG)) { config.collections.push(bulkRunsCollection) + console.log('[payloadcms-vectorize] payloadcmsVectorize: Bulk runs collection added') + } else { + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Bulk runs collection already exists', + ) } // Ensure bulk input metadata collection exists once + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Adding bulk input metadata collection...', + ) const bulkInputMetadataCollection = createBulkEmbeddingInputMetadataCollection() if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_INPUT_METADATA_SLUG)) { config.collections.push(bulkInputMetadataCollection) + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Bulk input metadata collection 
added', + ) + } else { + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Bulk input metadata collection already exists', + ) } // Ensure bulk batches collection exists once + console.log('[payloadcms-vectorize] payloadcmsVectorize: Adding bulk batches collection...') const bulkBatchesCollection = createBulkEmbeddingsBatchesCollection() if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_BATCHES_SLUG)) { config.collections.push(bulkBatchesCollection) + console.log('[payloadcms-vectorize] payloadcmsVectorize: Bulk batches collection added') + } else { + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Bulk batches collection already exists', + ) } // Validate static/dynamic configs share the same pool names + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Validating static/dynamic config alignment...', + ) for (const poolName in pluginOptions.knowledgePools) { if (!staticConfigs[poolName]) { + console.error( + `[payloadcms-vectorize] payloadcmsVectorize: Knowledge pool "${poolName}" not found in static configs`, + ) throw new Error( `[payloadcms-vectorize] Knowledge pool "${poolName}" not found in static configs`, ) @@ -212,10 +331,16 @@ export const createVectorizeIntegration = } } if (unusedStaticPools.length > 0) { + console.error( + `[payloadcms-vectorize] payloadcmsVectorize: Static pools without dynamic config: ${unusedStaticPools.join(', ')}`, + ) throw new Error( `[payloadcms-vectorize] Static knowledge pool(s) ${unusedStaticPools.join(', ')} lack dynamic configuration`, ) } + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Static/dynamic config validation passed', + ) // Build reverse mapping: collectionSlug -> KnowledgePoolName[] const collectionToPools = new Map< @@ -227,68 +352,124 @@ export const createVectorizeIntegration = >() // Process each knowledge pool + console.log('[payloadcms-vectorize] payloadcmsVectorize: Processing knowledge pools...') for (const poolName in pluginOptions.knowledgePools) { + console.log(`[payloadcms-vectorize] payloadcmsVectorize: Processing pool "${poolName}"...`) const dynamicConfig = pluginOptions.knowledgePools[poolName] // Add the embeddings collection for this knowledge pool with extensionFields + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Creating embeddings collection for pool "${poolName}"...`, + ) const embeddingsCollection = createEmbeddingsCollection( poolName, dynamicConfig.extensionFields, ) if (!config.collections.find((c) => c.slug === poolName)) { config.collections.push(embeddingsCollection) + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Embeddings collection "${poolName}" added`, + ) + } else { + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Embeddings collection "${poolName}" already exists`, + ) } // Build reverse mapping for hooks - for (const collectionSlug of Object.keys(dynamicConfig.collections)) { + const collectionSlugs = Object.keys(dynamicConfig.collections) + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Pool "${poolName}" maps to ${collectionSlugs.length} collection(s): ${collectionSlugs.join(', ')}`, + ) + for (const collectionSlug of collectionSlugs) { if (!collectionToPools.has(collectionSlug)) { collectionToPools.set(collectionSlug, []) } collectionToPools.get(collectionSlug)!.push({ pool: poolName, dynamic: dynamicConfig }) } + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Pool "${poolName}" processing complete`, + ) } + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: 
Knowledge pools processed. Total collections: ${config.collections.length}`, + ) // Validate bulk queue requirements + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Validating bulk queue requirements...', + ) let bulkIngestEnabled = false for (const poolName in pluginOptions.knowledgePools) { const dynamicConfig = pluginOptions.knowledgePools[poolName] if (dynamicConfig.embeddingConfig.bulkEmbeddingsFns) { bulkIngestEnabled = true + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Pool "${poolName}" has bulk embedding enabled`, + ) break } } if (bulkIngestEnabled && !pluginOptions.bulkQueueNames) { + console.error( + '[payloadcms-vectorize] payloadcmsVectorize: bulkQueueNames required but not provided', + ) throw new Error( '[payloadcms-vectorize] bulkQueueNames is required when any knowledge pool has bulk embedding configured (embeddingConfig.bulkEmbeddingsFns).', ) } + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Bulk queue validation passed (enabled: ${bulkIngestEnabled})`, + ) // Exit early if disabled, but keep embeddings collections present for migrations - if (pluginOptions.disabled) return config + if (pluginOptions.disabled) { + console.log('[payloadcms-vectorize] payloadcmsVectorize: Plugin disabled, exiting early') + return config + } // Register a single task using Payload Jobs that can handle any knowledge pool + console.log('[payloadcms-vectorize] payloadcmsVectorize: Registering Payload Jobs tasks...') const incomingJobs = config.jobs || { tasks: [] } const tasks = [...(config.jobs?.tasks || [])] + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Existing tasks count: ${tasks.length}`, + ) + console.log('[payloadcms-vectorize] payloadcmsVectorize: Creating vectorize task...') const vectorizeTask = createVectorizeTask({ knowledgePools: pluginOptions.knowledgePools, }) tasks.push(vectorizeTask) + console.log('[payloadcms-vectorize] payloadcmsVectorize: Vectorize task added') + + console.log('[payloadcms-vectorize] payloadcmsVectorize: Creating prepare bulk embed task...') const prepareBulkEmbedTask = createPrepareBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, }) tasks.push(prepareBulkEmbedTask) + console.log('[payloadcms-vectorize] payloadcmsVectorize: Prepare bulk embed task added') + + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Creating poll or complete bulk embed task...', + ) const pollOrCompleteBulkEmbedTask = createPollOrCompleteBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, }) tasks.push(pollOrCompleteBulkEmbedTask) + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Poll or complete bulk embed task added', + ) config.jobs = { ...incomingJobs, tasks, } + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Jobs configured. 
Total tasks: ${tasks.length}`, + ) const collectionToEmbedQueue = new Map< string, @@ -296,11 +477,23 @@ export const createVectorizeIntegration = >() // Extend configured collections with hooks + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Setting up hooks for ${collectionToPools.size} collection(s)...`, + ) for (const [collectionSlug, pools] of collectionToPools.entries()) { + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Setting up hooks for collection "${collectionSlug}" (${pools.length} pool(s))...`, + ) const collection = config.collections.find((c) => c.slug === collectionSlug) if (!collection) { + console.error( + `[payloadcms-vectorize] payloadcmsVectorize: Collection "${collectionSlug}" not found`, + ) throw new Error(`[payloadcms-vectorize] Collection ${collectionSlug} not found`) } + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Collection "${collectionSlug}" found, adding hooks...`, + ) const embedQueue = async (doc: any, payload: Payload, req?: PayloadRequest) => { // Queue vectorization jobs for ALL knowledge pools containing this collection @@ -329,6 +522,9 @@ export const createVectorizeIntegration = } collectionToEmbedQueue.set(collectionSlug, embedQueue) + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Embed queue function registered for "${collectionSlug}"`, + ) collection.hooks = { ...(collection.hooks || {}), @@ -386,17 +582,27 @@ export const createVectorizeIntegration = }, ], } + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Hooks configured for collection "${collectionSlug}"`, + ) } + console.log('[payloadcms-vectorize] payloadcmsVectorize: All collection hooks configured') + console.log('[payloadcms-vectorize] payloadcmsVectorize: Creating vector search handlers...') const vectorSearchHandlers = createVectorSearchHandlers(pluginOptions.knowledgePools) + console.log('[payloadcms-vectorize] payloadcmsVectorize: Vector search handlers created') // Create vectorized payload object factory that creates methods bound to a payload instance + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Creating vectorized payload object factory...', + ) const createVectorizedPayloadObject = (payload: Payload): VectorizedPayload => { return { _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { const poolConfig = pluginOptions.knowledgePools[knowledgePool] return !!poolConfig?.embeddingConfig?.bulkEmbeddingsFns }, + _staticConfigs: staticConfigs, search: (params: VectorSearchQuery) => vectorSearchHandlers.vectorSearch( payload, @@ -456,29 +662,80 @@ export const createVectorizeIntegration = } // Store factory in config.custom + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Storing vectorized payload factory in config.custom...', + ) config.custom = { ...(config.custom || {}), createVectorizedPayloadObject, } + console.log('[payloadcms-vectorize] payloadcmsVectorize: Factory stored in config.custom') + + // Register bin script for migration helper + console.log('[payloadcms-vectorize] payloadcmsVectorize: Registering bin script...') + const __filename = fileURLToPath(import.meta.url) + const __dirname = dirname(__filename) + const binScriptPath = resolve(__dirname, 'bin/vectorize-migrate.ts') + console.log(`[payloadcms-vectorize] payloadcmsVectorize: Bin script path: ${binScriptPath}`) + config.bin = [ + ...(config.bin || []), + { + key: 'vectorize:migrate', + scriptPath: binScriptPath, + }, + ] + console.log('[payloadcms-vectorize] payloadcmsVectorize: Bin script registered') + 
console.log('[payloadcms-vectorize] payloadcmsVectorize: Setting up onInit hook...') const incomingOnInit = config.onInit config.onInit = async (payload) => { - if (incomingOnInit) await incomingOnInit(payload) - // Ensure pgvector artifacts for each knowledge pool - for (const poolName in staticConfigs) { - const staticConfig = staticConfigs[poolName] - // Drizzle converts camelCase collection slugs to snake_case table names - await ensurePgvectorArtifacts({ - payload, + payload.logger.info( + '[payloadcms-vectorize] onInit: Starting pgvector artifacts verification', + ) + try { + if (incomingOnInit) { + payload.logger.info('[payloadcms-vectorize] onInit: Calling incoming onInit hook') + await incomingOnInit(payload) + payload.logger.info('[payloadcms-vectorize] onInit: Incoming onInit hook completed') + } + // Ensure pgvector artifacts for each knowledge pool + const poolNames = Object.keys(staticConfigs) + payload.logger.info( + `[payloadcms-vectorize] onInit: Verifying artifacts for ${poolNames.length} knowledge pool(s): ${poolNames.join(', ')}`, + ) + for (const poolName in staticConfigs) { + const staticConfig = staticConfigs[poolName] + const tableName = toSnakeCase(poolName) + payload.logger.info( + `[payloadcms-vectorize] onInit: Verifying artifacts for pool "${poolName}" (table: "${tableName}")`, + ) // Drizzle converts camelCase collection slugs to snake_case table names - tableName: toSnakeCase(poolName), - dims: staticConfig.dims, - ivfflatLists: staticConfig.ivfflatLists, - }) + await ensurePgvectorArtifacts({ + payload, + // Drizzle converts camelCase collection slugs to snake_case table names + tableName, + ivfflatLists: staticConfig.ivfflatLists, + }) + payload.logger.info( + `[payloadcms-vectorize] onInit: Artifacts verified for pool "${poolName}"`, + ) + } + payload.logger.info( + '[payloadcms-vectorize] onInit: All pgvector artifacts verified successfully', + ) + } catch (error) { + payload.logger.error( + `[payloadcms-vectorize] onInit: Error verifying pgvector artifacts: ${error instanceof Error ? 
error.message : String(error)}`, + ) + throw error } } + console.log('[payloadcms-vectorize] payloadcmsVectorize: onInit hook configured') if (pluginOptions.endpointOverrides?.enabled !== false) { + console.log( + '[payloadcms-vectorize] payloadcmsVectorize: Setting up vector search endpoint...', + ) const path = pluginOptions.endpointOverrides?.path || '/vector-search' const inputEndpoints = config.endpoints || [] const endpoints = [ @@ -506,8 +763,17 @@ export const createVectorizeIntegration = }, ] config.endpoints = endpoints + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Vector search endpoint registered at "${path}"`, + ) + } else { + console.log('[payloadcms-vectorize] payloadcmsVectorize: Vector search endpoint disabled') } + console.log('[payloadcms-vectorize] payloadcmsVectorize: Plugin initialization complete') + console.log( + `[payloadcms-vectorize] payloadcmsVectorize: Final collections count: ${config.collections.length}`, + ) return config } return { diff --git a/src/types.ts b/src/types.ts index f211516..9c48a29 100644 --- a/src/types.ts +++ b/src/types.ts @@ -46,6 +46,8 @@ export type RetryFailedBatchResult = export type VectorizedPayload = { /** Check if bulk embedding is enabled for a knowledge pool */ _isBulkEmbedEnabled: (knowledgePool: TPoolNames) => boolean + /** Static configs for migration helper access */ + _staticConfigs: Record search: (params: VectorSearchQuery) => Promise> queueEmbed: ( params: From 2c8238aae69ea8191bc79a5c135d464cc245dd98 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 17 Jan 2026 10:46:53 +0700 Subject: [PATCH 45/49] WIP --- dev/specs/chunkers.spec.ts | 22 ++- dev/specs/extensionFields.spec.ts | 19 ++- dev/specs/extensionFieldsVectorSearch.spec.ts | 23 ++- dev/specs/failedValidation.spec.ts | 27 +++- dev/specs/int.spec.ts | 21 ++- dev/specs/multipools.spec.ts | 12 +- dev/specs/queueName.spec.ts | 15 +- dev/specs/schemaName.spec.ts | 17 +- dev/specs/utils.ts | 152 +++++++++++++++++- dev/specs/vectorSearch.spec.ts | 20 ++- dev/specs/vectorizedPayload.spec.ts | 24 ++- src/endpoints/vectorSearch.ts | 1 - src/index.ts | 2 +- 13 files changed, 307 insertions(+), 48 deletions(-) diff --git a/dev/specs/chunkers.spec.ts b/dev/specs/chunkers.spec.ts index 454d1ea..aef7387 100644 --- a/dev/specs/chunkers.spec.ts +++ b/dev/specs/chunkers.spec.ts @@ -1,9 +1,8 @@ -import { getPayload } from 'payload' -import { beforeAll, describe, expect, test } from 'vitest' +import { describe, expect, test } from 'vitest' import { chunkText, chunkRichText } from 'helpers/chunkers.js' import { postgresAdapter } from '@payloadcms/db-postgres' import { buildDummyConfig, getInitialMarkdownContent, integration } from './constants.js' -import { createTestDb } from './utils.js' +import { createTestDb, initializePayloadWithMigrations, createTestMigrationsDir } from './utils.js' describe('Chunkers', () => { test('textChunker', () => { @@ -17,20 +16,27 @@ describe('Chunkers', () => { }) test('richTextChunker splits by H2', async () => { - beforeAll(async () => { - createTestDb({ dbName: 'chunkers_test' }) - }) + const dbName = 'chunkers_test' + await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + const cfg = await buildDummyConfig({ db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { - connectionString: 'postgresql://postgres:password@localhost:5433/chunkers_test', + 
connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), }) const markdownContent = await getInitialMarkdownContent(cfg) - const thisPayload = await getPayload({ config: cfg }) + + const thisPayload = await initializePayloadWithMigrations({ + config: cfg, + key: `chunkers-test-${Date.now()}`, + }) const chunks = await chunkRichText(markdownContent, thisPayload) expect(chunks.length).toBe(3) diff --git a/dev/specs/extensionFields.spec.ts b/dev/specs/extensionFields.spec.ts index 56ee27a..6fe4f7f 100644 --- a/dev/specs/extensionFields.spec.ts +++ b/dev/specs/extensionFields.spec.ts @@ -1,9 +1,13 @@ import type { Payload } from 'payload' -import { getPayload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { postgresAdapter } from '@payloadcms/db-postgres' import { buildDummyConfig, integration, plugin } from './constants.js' -import { createTestDb, waitForVectorizationJobs } from './utils.js' +import { + createTestDb, + waitForVectorizationJobs, + initializePayloadWithMigrations, + createTestMigrationsDir, +} from './utils.js' import { PostgresPayload } from '../../src/types.js' import { chunkText, chunkRichText } from 'helpers/chunkers.js' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' @@ -15,6 +19,8 @@ describe('Extension fields integration tests', () => { beforeAll(async () => { await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + const config = await buildDummyConfig({ jobs: { tasks: [], @@ -39,6 +45,8 @@ describe('Extension fields integration tests', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, @@ -104,7 +112,12 @@ describe('Extension fields integration tests', () => { }), ], }) - payload = await getPayload({ config, cron: true }) + + payload = await initializePayloadWithMigrations({ + config, + key: `extension-fields-test-${Date.now()}`, + cron: true, + }) }) test('extension fields are added to the embeddings table schema', async () => { diff --git a/dev/specs/extensionFieldsVectorSearch.spec.ts b/dev/specs/extensionFieldsVectorSearch.spec.ts index 1f81419..94b136f 100644 --- a/dev/specs/extensionFieldsVectorSearch.spec.ts +++ b/dev/specs/extensionFieldsVectorSearch.spec.ts @@ -1,8 +1,12 @@ -import { getPayload } from 'payload' import { describe, expect, test } from 'vitest' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { buildDummyConfig, DIMS, integration, plugin } from './constants.js' -import { createTestDb, waitForVectorizationJobs } from './utils.js' +import { + createTestDb, + waitForVectorizationJobs, + initializePayloadWithMigrations, + createTestMigrationsDir, +} from './utils.js' import { postgresAdapter } from '@payloadcms/db-postgres' import { chunkRichText, chunkText } from 'helpers/chunkers.js' import { createVectorSearchHandlers } from '../../src/endpoints/vectorSearch.js' @@ -11,7 +15,9 @@ import type { KnowledgePoolDynamicConfig } from 'payloadcms-vectorize' describe('extensionFields', () => { test('returns extensionFields in search results with correct types', async () => { // Create a new payload instance with extensionFields - await createTestDb({ dbName: 'endpoint_test_extension' }) + const dbName = 'endpoint_test_extension' + await createTestDb({ dbName }) + const { 
migrationsDir } = createTestMigrationsDir(dbName) const defaultKnowledgePool: KnowledgePoolDynamicConfig = { collections: { posts: { @@ -89,8 +95,10 @@ describe('extensionFields', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { - connectionString: 'postgresql://postgres:password@localhost:5433/endpoint_test_extension', + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ @@ -101,7 +109,12 @@ describe('extensionFields', () => { }), ], }) - const payloadWithExtensions = await getPayload({ config: configWithExtensions, cron: true }) + + const payloadWithExtensions = await initializePayloadWithMigrations({ + config: configWithExtensions, + key: `extension-fields-vector-search-test-${Date.now()}`, + cron: true, + }) // Create a post with extension field values const testQuery = 'Extension fields test content' diff --git a/dev/specs/failedValidation.spec.ts b/dev/specs/failedValidation.spec.ts index 79ef30e..8520a84 100644 --- a/dev/specs/failedValidation.spec.ts +++ b/dev/specs/failedValidation.spec.ts @@ -1,12 +1,17 @@ import { postgresAdapter } from '@payloadcms/db-postgres' import { buildConfig } from 'payload' -import { getPayload } from 'payload' import { describe, expect, test } from 'vitest' import { createVectorizeIntegration } from '../../src/index.js' -import { createTestDb, waitForVectorizationJobs } from './utils.js' +import { + createTestDb, + waitForVectorizationJobs, + initializePayloadWithMigrations, + createTestMigrationsDir, +} from './utils.js' const DIMS = 8 +const dbName = 'failed_validation_test' const embedDocs = async (texts: string[]) => texts.map(() => Array(DIMS).fill(0)) const embedQuery = async (_text: string) => Array(DIMS).fill(0) @@ -18,8 +23,7 @@ const { afterSchemaInitHook, payloadcmsVectorize } = createVectorizeIntegration( }, }) -const buildMalformedConfig = async () => { - await createTestDb({ dbName: 'failed_validation_test' }) +const buildMalformedConfig = async (migrationsDir: string) => { return buildConfig({ jobs: { tasks: [], @@ -39,10 +43,12 @@ const buildMalformedConfig = async () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { connectionString: process.env.DATABASE_URI || - 'postgresql://postgres:password@localhost:5433/failed_validation_test', + `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ @@ -70,8 +76,15 @@ const buildMalformedConfig = async () => { describe('Validation failures mark jobs as errored', () => { test('malformed chunk entry fails the vectorize job', async () => { - const config = await buildMalformedConfig() - const payload = await getPayload({ config, cron: true }) + await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + + const config = await buildMalformedConfig(migrationsDir) + const payload = await initializePayloadWithMigrations({ + config, + key: `failed-validation-test-${Date.now()}`, + cron: true, + }) await payload.create({ collection: 'posts', diff --git a/dev/specs/int.spec.ts b/dev/specs/int.spec.ts index bea7dab..cf4b657 100644 --- a/dev/specs/int.spec.ts +++ b/dev/specs/int.spec.ts @@ -14,9 +14,14 @@ import { $createHeadingNode } from '@payloadcms/richtext-lexical/lexical/rich-te import { PostgresPayload } from '../../src/types.js' import { editorConfigFactory, getEnabledNodes, lexicalEditor } from 
'@payloadcms/richtext-lexical' import { DIMS, getInitialMarkdownContent } from './constants.js' -import { createTestDb, waitForVectorizationJobs } from './utils.js' +import { + createTestDb, + waitForVectorizationJobs, + initializePayloadWithMigrations, + createTestMigrationsDir, +} from './utils.js' import { postgresAdapter } from '@payloadcms/db-postgres' -import { buildConfig, getPayload } from 'payload' +import { buildConfig } from 'payload' import { createVectorizeIntegration } from 'payloadcms-vectorize' const embedFn = makeDummyEmbedDocs(DIMS) @@ -32,6 +37,8 @@ describe('Plugin integration tests', () => { beforeAll(async () => { await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + // Create isolated integration for this test suite const integration = createVectorizeIntegration({ default: { @@ -55,6 +62,8 @@ describe('Plugin integration tests', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, // Prevent dev mode schema push - use migrations only pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, @@ -99,7 +108,13 @@ describe('Plugin integration tests', () => { }, }) - payload = await getPayload({ config, key: `int-test-${Date.now()}`, cron: true }) + // Initialize Payload with migrations + payload = await initializePayloadWithMigrations({ + config, + key: `int-test-${Date.now()}`, + cron: true, + }) + markdownContent = await getInitialMarkdownContent(config) }) diff --git a/dev/specs/multipools.spec.ts b/dev/specs/multipools.spec.ts index 8b9c30d..58a9ef6 100644 --- a/dev/specs/multipools.spec.ts +++ b/dev/specs/multipools.spec.ts @@ -1,11 +1,11 @@ import type { Payload, SanitizedConfig } from 'payload' -import { buildConfig, getPayload } from 'payload' +import { buildConfig } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { createVectorizeIntegration } from 'payloadcms-vectorize' import { lexicalEditor } from '@payloadcms/richtext-lexical' import { postgresAdapter } from '@payloadcms/db-postgres' -import { createTestDb } from './utils.js' +import { createTestDb, initializePayloadWithMigrations, createTestMigrationsDir } from './utils.js' import type { PostgresPayload } from '../../src/types.js' const DIMS_POOL1 = 8 @@ -18,6 +18,7 @@ describe('Multiple knowledge pools', () => { beforeAll(async () => { await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) const multiPoolIntegration = createVectorizeIntegration({ pool1: { @@ -60,6 +61,8 @@ describe('Multiple knowledge pools', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [multiPoolIntegration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, @@ -67,7 +70,10 @@ describe('Multiple knowledge pools', () => { plugins: [multiPoolIntegration.payloadcmsVectorize(multiPoolPluginOptions)], }) - payload = await getPayload({ config }) + payload = await initializePayloadWithMigrations({ + config, + key: `multipools-test-${Date.now()}`, + }) }) test('creates two embeddings collections with vector columns', async () => { diff --git a/dev/specs/queueName.spec.ts b/dev/specs/queueName.spec.ts index 887a1c0..7b6e7f0 100644 --- a/dev/specs/queueName.spec.ts +++ b/dev/specs/queueName.spec.ts @@ -1,11 +1,10 @@ import type { Payload, SanitizedConfig } from 'payload' -import { 
getPayload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { chunkText, chunkRichText } from 'helpers/chunkers.js' import type { SerializedEditorState } from '@payloadcms/richtext-lexical/lexical' import { postgresAdapter } from '@payloadcms/db-postgres' import { buildDummyConfig, getInitialMarkdownContent, integration, plugin } from './constants.js' -import { createTestDb } from './utils.js' +import { createTestDb, initializePayloadWithMigrations, createTestMigrationsDir } from './utils.js' describe('Queue tests', () => { let config: SanitizedConfig @@ -15,6 +14,8 @@ describe('Queue tests', () => { const dbName = 'queue_test' beforeAll(async () => { await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + config = await buildDummyConfig({ collections: [ { @@ -28,8 +29,10 @@ describe('Queue tests', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { - connectionString: 'postgresql://postgres:password@localhost:5433/queue_test', + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ @@ -65,7 +68,11 @@ describe('Queue tests', () => { }), ], }) - payload = await getPayload({ config }) + + payload = await initializePayloadWithMigrations({ + config, + key: `queue-test-${Date.now()}`, + }) markdownContent = await getInitialMarkdownContent(config) }) test('vectorization jobs are queued using the queueName', async () => { diff --git a/dev/specs/schemaName.spec.ts b/dev/specs/schemaName.spec.ts index 8ec7613..1af1725 100644 --- a/dev/specs/schemaName.spec.ts +++ b/dev/specs/schemaName.spec.ts @@ -3,13 +3,17 @@ import type { Payload } from 'payload' import { postgresAdapter } from '@payloadcms/db-postgres' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { Client } from 'pg' -import { getPayload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import type { PostgresPayload } from '../../src/types.js' import { buildDummyConfig, DIMS, integration, plugin } from './constants.js' -import { createTestDb, waitForVectorizationJobs } from './utils.js' +import { + createTestDb, + waitForVectorizationJobs, + initializePayloadWithMigrations, + createTestMigrationsDir, +} from './utils.js' import { createVectorSearchHandlers } from '../../src/endpoints/vectorSearch.js' import type { KnowledgePoolDynamicConfig } from 'payloadcms-vectorize' const CUSTOM_SCHEMA = 'custom' @@ -20,6 +24,7 @@ describe('Custom schemaName support', () => { beforeAll(async () => { await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) // Create the custom schema before Payload initializes const client = new Client({ @@ -42,6 +47,8 @@ describe('Custom schemaName support', () => { db: postgresAdapter({ afterSchemaInit: [integration.afterSchemaInitHook], extensions: ['vector'], + migrationDir: migrationsDir, + push: false, pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, @@ -85,7 +92,11 @@ describe('Custom schemaName support', () => { ], }) - payload = await getPayload({ config, cron: true }) + payload = await initializePayloadWithMigrations({ + config, + key: `schema-name-test-${Date.now()}`, + cron: true, + }) }) test('embeddings table is created in custom schema', async () => { diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index 214891d..bc0b433 100644 --- 
a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -2,6 +2,8 @@ import type { Payload, SanitizedConfig } from 'payload' import { buildConfig, getPayload } from 'payload' import { Client } from 'pg' +import { mkdirSync, rmSync } from 'fs' +import { join } from 'path' import { postgresAdapter } from '@payloadcms/db-postgres' import { lexicalEditor } from '@payloadcms/richtext-lexical' import { createVectorizeIntegration } from 'payloadcms-vectorize' @@ -9,6 +11,7 @@ import { BULK_EMBEDDINGS_RUNS_SLUG } from '../../src/collections/bulkEmbeddingsR import { BULK_EMBEDDINGS_INPUT_METADATA_SLUG } from '../../src/collections/bulkEmbeddingInputMetadata.js' import { BULK_EMBEDDINGS_BATCHES_SLUG } from '../../src/collections/bulkEmbeddingsBatches.js' import { makeDummyEmbedDocs } from '../helpers/embed.js' +import { script as vectorizeMigrateScript } from '../../src/bin/vectorize-migrate.js' import type { BulkEmbeddingsFns, BulkEmbeddingInput, @@ -27,6 +30,128 @@ export const createTestDb = async ({ dbName }: { dbName: string }) => { await client.end() } +/** + * Initialize Payload with migrations applied. + * This handles the full migration setup: + * 1. Get payload with disableOnInit to avoid ensurePgvectorArtifacts check + * 2. Create initial migration + * 3. Run vectorize:migrate to patch with IVFFLAT index + * 4. Apply migrations + * 5. Run onInit + * + * @param config - A pre-built SanitizedConfig (must have migrationDir and push: false in db config) + * @param key - Unique key for getPayload caching + * @param cron - Whether to enable cron jobs (default: true) + */ +export async function initializePayloadWithMigrations({ + config, + key, + cron = true, +}: { + config: SanitizedConfig + key: string + cron?: boolean +}): Promise { + // Get payload with disableOnInit to avoid ensurePgvectorArtifacts check before migrations + const payload = await getPayload({ config, key, cron, disableOnInit: true }) + + // Create initial migration (Payload's schema) + await payload.db.createMigration({ migrationName: 'initial', payload }) + + // Run vectorize:migrate to patch with IVFFLAT index + await vectorizeMigrateScript(config) + + // Apply migrations (forceAcceptWarning bypasses the dev mode prompt) + await (payload.db as any).migrate({ forceAcceptWarning: true }) + + // Now run onInit (it's still available on config, not destroyed by disableOnInit) + if (payload.config.onInit) { + await payload.config.onInit(payload) + } + + return payload +} + +/** + * Create a unique migration directory for a test. + * Returns the path and a cleanup function. + */ +export function createTestMigrationsDir(dbName: string): { + migrationsDir: string + cleanup: () => void +} { + const migrationsDir = join(process.cwd(), 'dev', `test-migrations-${dbName}`) + // Clean up any existing migration directory + rmSync(migrationsDir, { recursive: true, force: true }) + mkdirSync(migrationsDir, { recursive: true }) + + return { + migrationsDir, + cleanup: () => rmSync(migrationsDir, { recursive: true, force: true }), + } +} + +/** + * Create pgvector artifacts (extension + IVFFLAT index) for testing. + * This should be called after migrations are applied but before onInit runs, + * or used with disableOnInit to manually set up the test environment. 
+ */ +export const ensureTestPgvectorArtifacts = async ({ + dbName, + tableName = 'default', + dims = DEFAULT_DIMS, + ivfflatLists = 1, +}: { + dbName: string + tableName?: string + dims?: number + ivfflatLists?: number +}) => { + const client = new Client({ + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }) + await client.connect() + try { + // Ensure pgvector extension exists + await client.query('CREATE EXTENSION IF NOT EXISTS vector') + + // Check if table exists (it should be created by Payload's schema init) + const tableCheck = await client.query( + `SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = $1`, + [tableName], + ) + if (tableCheck.rowCount === 0) { + // Table doesn't exist yet - this is expected before migrations + // We'll skip index creation; it will be handled by migrations + return + } + + // Check if embedding column exists + const columnCheck = await client.query( + `SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = $1 AND column_name = 'embedding'`, + [tableName], + ) + if (columnCheck.rowCount === 0) { + // Column doesn't exist yet - skip index creation + return + } + + // Create IVFFLAT index if it doesn't exist + const indexName = `${tableName}_embedding_ivfflat` + const indexCheck = await client.query( + `SELECT 1 FROM pg_indexes WHERE schemaname = 'public' AND tablename = $1 AND indexname = $2`, + [tableName, indexName], + ) + if (indexCheck.rowCount === 0) { + await client.query( + `CREATE INDEX "${indexName}" ON "public"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = ${ivfflatLists})`, + ) + } + } finally { + await client.end() + } +} + async function waitForTasks( payload: Payload, taskSlugs: string[], @@ -190,6 +315,13 @@ export async function buildPayloadWithIntegration({ pluginOpts, key, }: BuildPayloadArgs): Promise<{ payload: Payload; config: SanitizedConfig }> { + // Create a unique migration directory for this test + const migrationsDir = join(process.cwd(), 'dev', `test-migrations-${dbName}`) + + // Clean up any existing migration directory + rmSync(migrationsDir, { recursive: true, force: true }) + mkdirSync(migrationsDir, { recursive: true }) + const integration = createVectorizeIntegration({ default: { dims: DEFAULT_DIMS, @@ -209,6 +341,8 @@ export async function buildPayloadWithIntegration({ db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, // Prevent dev mode schema push - use migrations only pool: { connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, @@ -237,7 +371,23 @@ export async function buildPayloadWithIntegration({ }) const payloadKey = key ?? 
`payload-${dbName}-${Date.now()}` - const payload = await getPayload({ config, key: payloadKey, cron: true }) + // Disable onInit to avoid ensurePgvectorArtifacts check before index exists + const payload = await getPayload({ config, key: payloadKey, cron: true, disableOnInit: true }) + + // Create initial migration (Payload's schema) + await payload.db.createMigration({ migrationName: 'initial', payload }) + + // Run vectorize:migrate to patch with IVFFLAT index + await vectorizeMigrateScript(config) + + // Apply migrations (forceAcceptWarning bypasses the dev mode prompt) + await (payload.db as any).migrate({ forceAcceptWarning: true }) + + // Now run onInit (it's still available on config, not destroyed by disableOnInit) + if (payload.config.onInit) { + await payload.config.onInit(payload) + } + return { payload, config } } diff --git a/dev/specs/vectorSearch.spec.ts b/dev/specs/vectorSearch.spec.ts index 2ac894a..2c79747 100644 --- a/dev/specs/vectorSearch.spec.ts +++ b/dev/specs/vectorSearch.spec.ts @@ -1,6 +1,5 @@ import type { Payload } from 'payload' -import { getPayload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { type SerializedEditorState } from '@payloadcms/richtext-lexical/lexical' @@ -10,6 +9,8 @@ import { createMockBulkEmbeddings, createTestDb, waitForVectorizationJobs, + initializePayloadWithMigrations, + createTestMigrationsDir, } from './utils.js' import { postgresAdapter } from '@payloadcms/db-postgres' import { chunkRichText, chunkText } from 'helpers/chunkers.js' @@ -77,9 +78,12 @@ describe('Search endpoint integration tests', () => { let payload: Payload let markdownContent: SerializedEditorState const titleAndQuery = 'My query is a title' + const dbName = 'endpoint_test' beforeAll(async () => { - await createTestDb({ dbName: 'endpoint_test' }) + await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + const config = await buildDummyConfig({ jobs: { tasks: [], @@ -102,8 +106,10 @@ describe('Search endpoint integration tests', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, // Prevent dev mode schema push - use migrations only pool: { - connectionString: 'postgresql://postgres:password@localhost:5433/endpoint_test', + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ @@ -189,7 +195,13 @@ describe('Search endpoint integration tests', () => { }), ], }) - payload = await getPayload({ config, cron: true }) + + // Initialize Payload with migrations + payload = await initializePayloadWithMigrations({ + config, + key: `vector-search-test-${Date.now()}`, + cron: true, + }) markdownContent = await getInitialMarkdownContent(config) }) diff --git a/dev/specs/vectorizedPayload.spec.ts b/dev/specs/vectorizedPayload.spec.ts index 6ea9539..ffe182d 100644 --- a/dev/specs/vectorizedPayload.spec.ts +++ b/dev/specs/vectorizedPayload.spec.ts @@ -1,10 +1,14 @@ import type { Payload } from 'payload' -import { getPayload } from 'payload' import { beforeAll, describe, expect, test } from 'vitest' import { getVectorizedPayload, VectorizedPayload } from '../../src/types.js' import { buildDummyConfig, DIMS, getInitialMarkdownContent } from './constants.js' -import { createTestDb, waitForVectorizationJobs } from './utils.js' +import { + createTestDb, + waitForVectorizationJobs, + 
initializePayloadWithMigrations, + createTestMigrationsDir, +} from './utils.js' import { postgresAdapter } from '@payloadcms/db-postgres' import { makeDummyEmbedDocs, makeDummyEmbedQuery, testEmbeddingVersion } from 'helpers/embed.js' import { chunkRichText, chunkText } from 'helpers/chunkers.js' @@ -30,9 +34,12 @@ describe('VectorizedPayload', () => { let payload: Payload let markdownContent: SerializedEditorState const titleAndQuery = 'VectorizedPayload Test Title' + const dbName = 'vectorized_payload_test' beforeAll(async () => { - await createTestDb({ dbName: 'vectorized_payload_test' }) + await createTestDb({ dbName }) + const { migrationsDir } = createTestMigrationsDir(dbName) + const config = await buildDummyConfig({ jobs: { tasks: [], @@ -55,8 +62,10 @@ describe('VectorizedPayload', () => { db: postgresAdapter({ extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, pool: { - connectionString: 'postgresql://postgres:password@localhost:5433/vectorized_payload_test', + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, }), plugins: [ @@ -89,7 +98,12 @@ describe('VectorizedPayload', () => { }), ], }) - payload = await getPayload({ config, cron: true }) + + payload = await initializePayloadWithMigrations({ + config, + key: `vectorized-payload-test-${Date.now()}`, + cron: true, + }) markdownContent = await getInitialMarkdownContent(config) }) diff --git a/src/endpoints/vectorSearch.ts b/src/endpoints/vectorSearch.ts index 8634eeb..274c618 100644 --- a/src/endpoints/vectorSearch.ts +++ b/src/endpoints/vectorSearch.ts @@ -100,7 +100,6 @@ async function performCosineSearch( throw new Error('Only works with Postgres') } - payload.db.createMigration // In PayloadCMS, payload.db IS the adapter, and drizzle is at payload.db.drizzle const adapter = payload.db if (!adapter) { diff --git a/src/index.ts b/src/index.ts index f5372e8..461ff2b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -675,7 +675,7 @@ export const createVectorizeIntegration = console.log('[payloadcms-vectorize] payloadcmsVectorize: Registering bin script...') const __filename = fileURLToPath(import.meta.url) const __dirname = dirname(__filename) - const binScriptPath = resolve(__dirname, 'bin/vectorize-migrate.ts') + const binScriptPath = resolve(__dirname, 'bin/vectorize-migrate.js') console.log(`[payloadcms-vectorize] payloadcmsVectorize: Bin script path: ${binScriptPath}`) config.bin = [ ...(config.bin || []), From a5abfdd53c56fb135ac8c2970379779c3d77f15f Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 17 Jan 2026 14:38:58 +0700 Subject: [PATCH 46/49] WIP --- dev/specs/migrationCli.spec.ts | 273 +++++++++++++-------------------- dev/specs/utils.ts | 97 +++--------- src/bin/vectorize-migrate.ts | 18 ++- src/index.ts | 170 -------------------- 4 files changed, 138 insertions(+), 420 deletions(-) diff --git a/dev/specs/migrationCli.spec.ts b/dev/specs/migrationCli.spec.ts index edb4473..a77c2b6 100644 --- a/dev/specs/migrationCli.spec.ts +++ b/dev/specs/migrationCli.spec.ts @@ -11,116 +11,66 @@ import { script as vectorizeMigrateScript } from '../../src/bin/vectorize-migrat import { readdirSync, statSync, existsSync, readFileSync, rmSync } from 'fs' import { join, resolve } from 'path' -describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { - const dbName = `migration_cli_test_${Date.now()}` - let payload: Payload - - beforeAll(async () => { - await 
createTestDb({ dbName }) - - const integration = createVectorizeIntegration({ - default: { - dims: DIMS, - ivfflatLists: 10, - }, - }) +describe('Migration CLI integration tests', () => { + describe('VectorizedPayload access', () => { + let payload: Payload + const dbName = `migration_cli_test_${Date.now()}` - const config = await buildConfig({ - secret: 'test-secret', - collections: [ - { - slug: 'posts', - fields: [{ name: 'title', type: 'text' }], - }, - ], - db: postgresAdapter({ - extensions: ['vector'], - afterSchemaInit: [integration.afterSchemaInitHook], - pool: { - connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + beforeAll(async () => { + await createTestDb({ dbName }) + + const integration = createVectorizeIntegration({ + default: { + dims: DIMS, + ivfflatLists: 10, }, - }), - plugins: [ - integration.payloadcmsVectorize({ - knowledgePools: { - default: { - collections: { - posts: { - toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }) + + const config = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integration.afterSchemaInitHook], + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, + }, + }), + plugins: [ + integration.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), }, - }, - embeddingConfig: { - version: testEmbeddingVersion, - queryFn: makeDummyEmbedQuery(DIMS), - realTimeIngestionFn: makeDummyEmbedDocs(DIMS), }, }, - }, - }), - ], - jobs: { - tasks: [], - autoRun: [ - { - cron: '*/5 * * * * *', - limit: 10, - }, + }), ], - }, - }) - - // Temporarily disable onInit for runtime behavior tests - // This prevents ensurePgvectorArtifacts from running before tests can set up their state - - payload = await getPayload({ - config, - cron: true, - disableOnInit: true, - key: `test-runtime-behavior-${Date.now()}`, - }) - }) - - describe('Runtime behavior', () => { - test('ensurePgvectorArtifacts is presence-only and does not rebuild index', async () => { - const postgresPayload = payload as PostgresPayload - const schemaName = postgresPayload.db.schemaName || 'public' - const tableName = 'default' - - // Manually create the index first (simulating a migration) - await postgresPayload.db.pool?.query( - `CREATE INDEX IF NOT EXISTS ${tableName}_embedding_ivfflat ON "${schemaName}"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = 10)`, - ) - - // Get initial index definition - const initialIndex = await postgresPayload.db.pool?.query( - `SELECT pg_get_indexdef(c.oid) as def - FROM pg_indexes i - JOIN pg_class c ON c.relname = i.indexname - JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = i.schemaname - WHERE i.schemaname = $1 AND i.tablename = $2 AND i.indexname = $3`, - [schemaName, tableName, `${tableName}_embedding_ivfflat`], - ) - const initialDef = initialIndex?.rows[0]?.def || '' - - // Call ensurePgvectorArtifacts (via onInit which should check presence) - // Since we already have the artifacts, it should pass without modifying - // Note: onInit calls ensurePgvectorArtifacts, but since artifacts exist, it should just verify - await 
payload.config.onInit?.(payload) - - // Verify index definition hasn't changed - const afterIndex = await postgresPayload.db.pool?.query( - `SELECT pg_get_indexdef(c.oid) as def - FROM pg_indexes i - JOIN pg_class c ON c.relname = i.indexname - JOIN pg_namespace n ON n.oid = c.relnamespace AND n.nspname = i.schemaname - WHERE i.schemaname = $1 AND i.tablename = $2 AND i.indexname = $3`, - [schemaName, tableName, `${tableName}_embedding_ivfflat`], - ) - const afterDef = afterIndex?.rows[0]?.def || '' + jobs: { + tasks: [], + autoRun: [ + { + cron: '*/5 * * * * *', + limit: 10, + }, + ], + }, + }) - // Index should still exist and be the same - expect(afterDef).toBeTruthy() - expect(afterDef).toBe(initialDef) + payload = await getPayload({ config, cron: true }) }) test('VectorizedPayload has _staticConfigs', async () => { @@ -133,25 +83,22 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { expect(vectorizedPayload?._staticConfigs.default.dims).toBe(DIMS) expect(vectorizedPayload?._staticConfigs.default.ivfflatLists).toBe(10) }) + }) + + describe('Error handling when migrations not run', () => { + let payload: Payload + const dbName = `migration_error_test_${Date.now()}` - test('ensurePgvectorArtifacts throws error when artifacts are missing (user has not run migrations)', async () => { - // Create a new database without any migrations applied - // This simulates the state when a user hasn't run migrations yet - const testDbName = `migration_cli_test_missing_${Date.now()}` - console.log('[TEST] Step 1: Creating test database:', testDbName) - await createTestDb({ dbName: testDbName }) - console.log('[TEST] Step 2: Database created') + beforeAll(async () => { + await createTestDb({ dbName }) - console.log('[TEST] Step 3: Creating integration') const integration = createVectorizeIntegration({ default: { dims: DIMS, ivfflatLists: 10, }, }) - console.log('[TEST] Step 4: Integration created') - console.log('[TEST] Step 5: Starting buildConfig...') const config = await buildConfig({ secret: 'test-secret', collections: [ @@ -164,8 +111,10 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { extensions: ['vector'], afterSchemaInit: [integration.afterSchemaInitHook], pool: { - connectionString: `postgresql://postgres:password@localhost:5433/${testDbName}`, + connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, }, + // Don't push schema changes - we want to test without migrations + push: false, }), plugins: [ integration.payloadcmsVectorize({ @@ -187,19 +136,47 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { ], jobs: { tasks: [], - autoRun: [], + autoRun: [ + { + cron: '*/5 * * * * *', + limit: 10, + }, + ], }, }) - console.log('[TEST] Step 6: buildConfig completed') - // Note: onInit will be called during getPayload and will throw because artifacts don't exist - // This simulates the real-world scenario where a user hasn't run migrations yet - // The error will be "Embedding column not found" (first check that fails) - console.log('[TEST] Step 7: Calling getPayload (should throw)...') + payload = await getPayload({ + config, + cron: false, // Disable cron to avoid background jobs + key: `migration-error-test-${Date.now()}`, + }) + }) + + test('vector search fails with descriptive error when embedding column missing', async () => { + const { getVectorizedPayload } = await import('payloadcms-vectorize') + const vectorizedPayload = getVectorizedPayload(payload) + + // Vector search 
should fail with a descriptive error await expect( - getPayload({ config, cron: true, key: `test-missing-artifacts-${Date.now()}` }), - ).rejects.toThrow('Embedding column not found') - console.log('[TEST] Step 8: getPayload threw as expected') + vectorizedPayload?.search({ + knowledgePool: 'default', + query: 'test query', + limit: 10, + }), + ).rejects.toThrow() + }) + + test('creating document fails when embedding table does not exist', async () => { + // Try to create a document that would trigger vectorization + // This should fail because the embedding table doesn't exist + await expect( + payload.create({ + collection: 'posts', + data: { + title: 'Test Post', + }, + }), + ).rejects.toThrow() }) }) @@ -283,17 +260,11 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { }, }) - // Temporarily disable onInit to avoid ensurePgvectorArtifacts check before migrations are applied - const savedOnInit = cliConfig.onInit - cliConfig.onInit = async () => { - // No-op: migrations haven't been applied yet - } - + // Get payload instance cliPayload = await getPayload({ config: cliConfig, cron: true, - key: `test-initial-setup-${Date.now()}`, - disableOnInit: true, + key: `migration-cli-test-${Date.now()}`, }) // Step 2: Create initial migration (this will include the embedding column via Drizzle) @@ -433,12 +404,6 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { } } - // Restore onInit and run it now that migrations are applied - cliConfig.onInit = savedOnInit - if (cliConfig.onInit) { - await cliConfig.onInit(cliPayload) - } - // Step 5: Verify index exists with correct lists parameter const postgresPayload = cliPayload as PostgresPayload const schemaName = postgresPayload.db.schemaName || 'public' @@ -516,17 +481,11 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { }, }) - // Temporarily disable onInit to avoid ensurePgvectorArtifacts check before migrations are applied - const savedOnInit = cliConfig.onInit - cliConfig.onInit = async () => { - // No-op: migrations haven't been applied yet - } - + // Get payload instance cliPayload = await getPayload({ config: cliConfig, cron: true, - key: `test-ivfflat-change-${Date.now()}`, - disableOnInit: true, + key: `migration-cli-test-${Date.now()}`, }) // Step 2: Run vectorize:migrate (should detect change and create migration) @@ -608,12 +567,6 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { } } - // Restore onInit and run it now that migrations are applied - if (savedOnInit) { - cliConfig.onInit = savedOnInit - await savedOnInit(cliPayload) - } - // Step 5: Verify index was rebuilt with new lists parameter const postgresPayload = cliPayload as PostgresPayload const schemaName = postgresPayload.db.schemaName || 'public' @@ -725,17 +678,11 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { }, }) - // Temporarily disable onInit to avoid ensurePgvectorArtifacts check before migrations are applied - const savedOnInitDims = cliConfig.onInit - cliConfig.onInit = async () => { - // No-op: migrations haven't been applied yet - } - + // Get payload instance cliPayload = await getPayload({ config: cliConfig, cron: true, - key: `test-dims-change-${Date.now()}`, - disableOnInit: true, + key: `migration-cli-test-${Date.now()}`, }) // Step 2: Run vectorize:migrate (should detect dims change) @@ -818,14 +765,6 @@ describe('Migration CLI and ensurePgvectorArtifacts integration tests', () => { } 
console.log('[TEST] Step 4.5: Migration applied successfully') - // Restore onInit and run it now that migrations are applied - console.log('[TEST] Step 4.6: Restoring onInit...') - if (savedOnInitDims) { - cliConfig.onInit = savedOnInitDims - await savedOnInitDims(cliPayload) - } - console.log('[TEST] Step 4.7: onInit restored and executed') - // Step 5: Verify column type changed and table was truncated console.log('[TEST] Step 5: Verifying column type and table state...') const postgresPayload = cliPayload as PostgresPayload diff --git a/dev/specs/utils.ts b/dev/specs/utils.ts index bc0b433..99c858b 100644 --- a/dev/specs/utils.ts +++ b/dev/specs/utils.ts @@ -23,24 +23,36 @@ export const createTestDb = async ({ dbName }: { dbName: string }) => { process.env.DATABASE_ADMIN_URI || 'postgresql://postgres:password@localhost:5433/postgres' // connect to 'postgres' const client = new Client({ connectionString: adminUri }) await client.connect() + + /* + // Drop and recreate the database to ensure a clean state + // First, terminate any existing connections to the database + await client.query(` + SELECT pg_terminate_backend(pg_stat_activity.pid) + FROM pg_stat_activity + WHERE pg_stat_activity.datname = $1 + AND pid <> pg_backend_pid() + `, [dbName])*/ + const exists = await client.query('SELECT 1 FROM pg_database WHERE datname = $1', [dbName]) if (exists.rowCount === 0) { await client.query(`CREATE DATABASE ${dbName}`) + //await client.query(`DROP DATABASE "${dbName}"`) } + //await client.query(`DROP DATABASE "${dbName}"`) await client.end() } /** * Initialize Payload with migrations applied. * This handles the full migration setup: - * 1. Get payload with disableOnInit to avoid ensurePgvectorArtifacts check + * 1. Get payload instance * 2. Create initial migration * 3. Run vectorize:migrate to patch with IVFFLAT index * 4. Apply migrations - * 5. Run onInit * * @param config - A pre-built SanitizedConfig (must have migrationDir and push: false in db config) - * @param key - Unique key for getPayload caching + * @param key - Unique key for getPayload caching (prevents instance collisions in tests) * @param cron - Whether to enable cron jobs (default: true) */ export async function initializePayloadWithMigrations({ @@ -49,11 +61,10 @@ export async function initializePayloadWithMigrations({ cron = true, }: { config: SanitizedConfig - key: string + key?: string cron?: boolean }): Promise { - // Get payload with disableOnInit to avoid ensurePgvectorArtifacts check before migrations - const payload = await getPayload({ config, key, cron, disableOnInit: true }) + const payload = await getPayload({ config, key, cron }) // Create initial migration (Payload's schema) await payload.db.createMigration({ migrationName: 'initial', payload }) @@ -64,11 +75,6 @@ export async function initializePayloadWithMigrations({ // Apply migrations (forceAcceptWarning bypasses the dev mode prompt) await (payload.db as any).migrate({ forceAcceptWarning: true }) - // Now run onInit (it's still available on config, not destroyed by disableOnInit) - if (payload.config.onInit) { - await payload.config.onInit(payload) - } - return payload } @@ -91,67 +97,6 @@ export function createTestMigrationsDir(dbName: string): { } } -/** - * Create pgvector artifacts (extension + IVFFLAT index) for testing. - * This should be called after migrations are applied but before onInit runs, - * or used with disableOnInit to manually set up the test environment. 
- */ -export const ensureTestPgvectorArtifacts = async ({ - dbName, - tableName = 'default', - dims = DEFAULT_DIMS, - ivfflatLists = 1, -}: { - dbName: string - tableName?: string - dims?: number - ivfflatLists?: number -}) => { - const client = new Client({ - connectionString: `postgresql://postgres:password@localhost:5433/${dbName}`, - }) - await client.connect() - try { - // Ensure pgvector extension exists - await client.query('CREATE EXTENSION IF NOT EXISTS vector') - - // Check if table exists (it should be created by Payload's schema init) - const tableCheck = await client.query( - `SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = $1`, - [tableName], - ) - if (tableCheck.rowCount === 0) { - // Table doesn't exist yet - this is expected before migrations - // We'll skip index creation; it will be handled by migrations - return - } - - // Check if embedding column exists - const columnCheck = await client.query( - `SELECT 1 FROM information_schema.columns WHERE table_schema = 'public' AND table_name = $1 AND column_name = 'embedding'`, - [tableName], - ) - if (columnCheck.rowCount === 0) { - // Column doesn't exist yet - skip index creation - return - } - - // Create IVFFLAT index if it doesn't exist - const indexName = `${tableName}_embedding_ivfflat` - const indexCheck = await client.query( - `SELECT 1 FROM pg_indexes WHERE schemaname = 'public' AND tablename = $1 AND indexname = $2`, - [tableName, indexName], - ) - if (indexCheck.rowCount === 0) { - await client.query( - `CREATE INDEX "${indexName}" ON "public"."${tableName}" USING ivfflat (embedding vector_cosine_ops) WITH (lists = ${ivfflatLists})`, - ) - } - } finally { - await client.end() - } -} - async function waitForTasks( payload: Payload, taskSlugs: string[], @@ -371,8 +316,7 @@ export async function buildPayloadWithIntegration({ }) const payloadKey = key ?? 
`payload-${dbName}-${Date.now()}` - // Disable onInit to avoid ensurePgvectorArtifacts check before index exists - const payload = await getPayload({ config, key: payloadKey, cron: true, disableOnInit: true }) + const payload = await getPayload({ config, key: payloadKey, cron: true }) // Create initial migration (Payload's schema) await payload.db.createMigration({ migrationName: 'initial', payload }) @@ -383,11 +327,6 @@ export async function buildPayloadWithIntegration({ // Apply migrations (forceAcceptWarning bypasses the dev mode prompt) await (payload.db as any).migrate({ forceAcceptWarning: true }) - // Now run onInit (it's still available on config, not destroyed by disableOnInit) - if (payload.config.onInit) { - await payload.config.onInit(payload) - } - return { payload, config } } diff --git a/src/bin/vectorize-migrate.ts b/src/bin/vectorize-migrate.ts index 6f17e59..d698a3d 100644 --- a/src/bin/vectorize-migrate.ts +++ b/src/bin/vectorize-migrate.ts @@ -345,11 +345,11 @@ function patchMigrationFile( * Bin script entry point for creating vector migrations */ export const script = async (config: SanitizedConfig): Promise => { - // Disable onInit to avoid ensurePgvectorArtifacts check - migrations may not be applied yet + // Use a unique key to ensure we get a fresh Payload instance with the correct config + // This is important when running in tests or when the config has been modified const payload = await getPayload({ config, - disableOnInit: true, - key: `vectorize-migrate-payload-instance-${Date.now()}`, + key: `vectorize-migrate-${Date.now()}`, }) const vectorizedPayload = getVectorizedPayload(payload) @@ -366,7 +366,17 @@ export const script = async (config: SanitizedConfig): Promise => { const poolNames = Object.keys(staticConfigs) const schemaName = (payload.db as any).schemaName || 'public' - const migrationsDir = (payload.db as any).migrationDir || resolve(process.cwd(), 'src/migrations') + + // Get migrations directory - the postgres adapter stores it on payload.db.migrationDir + // but this may be set to default before config is applied. Try multiple sources. + const dbMigrationDir = (payload.db as any).migrationDir + + // Debug: log migration directory detection + console.log('[payloadcms-vectorize] Debug: payload.db.migrationDir =', dbMigrationDir) + + // Use the payload.db.migrationDir - this is where Payload stores the resolved path + const migrationsDir = dbMigrationDir || resolve(process.cwd(), 'src/migrations') + console.log('[payloadcms-vectorize] Using migrations directory:', migrationsDir) console.log('[payloadcms-vectorize] Checking for configuration changes...') diff --git a/src/index.ts b/src/index.ts index 461ff2b..2083da9 100644 --- a/src/index.ts +++ b/src/index.ts @@ -78,131 +78,6 @@ export type { export { getVectorizedPayload } from './types.js' -/** - * Presence-only safety net: checks that pgvector artifacts exist. - * Does NOT create or modify them - migrations should handle that. - * This is a runtime check to fail fast if migrations haven't been applied. - */ -async function ensurePgvectorArtifacts(args: { - payload: Payload - tableName: string - ivfflatLists: number -}): Promise { - const { payload, tableName } = args - - payload.logger.info( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Starting verification for table "${tableName}"`, - ) - - if (!isPostgresPayload(payload)) { - throw new Error( - '[payloadcms-vectorize] This plugin requires the Postgres adapter. 
Please configure @payloadcms/db-postgres.', - ) - } - - // Now payload is typed as PostgresPayload - const postgresPayload = payload as PostgresPayload - const schemaName = postgresPayload.db.schemaName || 'public' - - payload.logger.info( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Using schema "${schemaName}" for table "${tableName}"`, - ) - - const runQuery = async (sql: string, params?: any[]): Promise => { - payload.logger.debug(`[payloadcms-vectorize] ensurePgvectorArtifacts: Executing query: ${sql}`) - if (postgresPayload.db.pool?.query) { - return postgresPayload.db.pool.query(sql, params) - } - if (postgresPayload.db.drizzle?.execute) { - return postgresPayload.db.drizzle.execute(sql) - } - throw new Error('[payloadcms-vectorize] No database query function available') - } - - try { - // Check extension exists - payload.logger.info( - '[payloadcms-vectorize] ensurePgvectorArtifacts: Checking pgvector extension...', - ) - const extensionCheck = await runQuery(`SELECT 1 FROM pg_extension WHERE extname = 'vector'`) - const extensionRows = Array.isArray(extensionCheck) - ? extensionCheck - : extensionCheck?.rows || [] - if (extensionRows.length === 0) { - payload.logger.error( - '[payloadcms-vectorize] ensurePgvectorArtifacts: pgvector extension not found', - ) - throw new Error( - `[payloadcms-vectorize] pgvector extension not found. Please ensure migrations have been applied or manually create the extension: CREATE EXTENSION IF NOT EXISTS vector;`, - ) - } - payload.logger.info('[payloadcms-vectorize] ensurePgvectorArtifacts: pgvector extension found') - - // Check column exists with correct dims - payload.logger.info( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Checking embedding column in "${schemaName}"."${tableName}"...`, - ) - const columnCheck = await runQuery( - `SELECT column_name, udt_name - FROM information_schema.columns - WHERE table_schema = $1 AND table_name = $2 AND column_name = 'embedding'`, - [schemaName, tableName], - ) - const columnRows = Array.isArray(columnCheck) ? columnCheck : columnCheck?.rows || [] - if (columnRows.length === 0) { - payload.logger.error( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Embedding column not found in "${schemaName}"."${tableName}"`, - ) - throw new Error( - `[payloadcms-vectorize] Embedding column not found in table "${schemaName}"."${tableName}". Please ensure migrations have been applied.`, - ) - } - payload.logger.info( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Embedding column found (type: ${columnRows[0]?.udt_name || 'unknown'})`, - ) - - // Check index exists (don't verify lists parameter - migrations handle that) - const indexName = `${tableName}_embedding_ivfflat` - payload.logger.info( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Checking IVFFLAT index "${indexName}"...`, - ) - const indexCheck = await runQuery( - `SELECT 1 - FROM pg_indexes - WHERE schemaname = $1 AND tablename = $2 AND indexname = $3`, - [schemaName, tableName, indexName], - ) - const indexRows = Array.isArray(indexCheck) ? indexCheck : indexCheck?.rows || [] - if (indexRows.length === 0) { - payload.logger.error( - `[payloadcms-vectorize] ensurePgvectorArtifacts: IVFFLAT index "${indexName}" not found on "${schemaName}"."${tableName}"`, - ) - throw new Error( - `[payloadcms-vectorize] IVFFLAT index not found on table "${schemaName}"."${tableName}". 
Please ensure migrations have been applied.`, - ) - } - payload.logger.info( - `[payloadcms-vectorize] ensurePgvectorArtifacts: IVFFLAT index "${indexName}" found`, - ) - - postgresPayload.logger.info( - `[payloadcms-vectorize] pgvector artifacts verified for table "${schemaName}"."${tableName}"`, - ) - } catch (err) { - payload.logger.error( - `[payloadcms-vectorize] ensurePgvectorArtifacts: Error occurred: ${err instanceof Error ? err.message : String(err)}`, - ) - if (err instanceof Error && err.message.includes('[payloadcms-vectorize]')) { - throw err - } - postgresPayload.logger.error( - '[payloadcms-vectorize] Failed checking pgvector artifacts', - err as Error, - ) - throw new Error(`[payloadcms-vectorize] Failed checking pgvector artifacts: ${err}`) - } -} - // ================== // Plugin entry point // ================== @@ -686,51 +561,6 @@ export const createVectorizeIntegration = ] console.log('[payloadcms-vectorize] payloadcmsVectorize: Bin script registered') - console.log('[payloadcms-vectorize] payloadcmsVectorize: Setting up onInit hook...') - const incomingOnInit = config.onInit - config.onInit = async (payload) => { - payload.logger.info( - '[payloadcms-vectorize] onInit: Starting pgvector artifacts verification', - ) - try { - if (incomingOnInit) { - payload.logger.info('[payloadcms-vectorize] onInit: Calling incoming onInit hook') - await incomingOnInit(payload) - payload.logger.info('[payloadcms-vectorize] onInit: Incoming onInit hook completed') - } - // Ensure pgvector artifacts for each knowledge pool - const poolNames = Object.keys(staticConfigs) - payload.logger.info( - `[payloadcms-vectorize] onInit: Verifying artifacts for ${poolNames.length} knowledge pool(s): ${poolNames.join(', ')}`, - ) - for (const poolName in staticConfigs) { - const staticConfig = staticConfigs[poolName] - const tableName = toSnakeCase(poolName) - payload.logger.info( - `[payloadcms-vectorize] onInit: Verifying artifacts for pool "${poolName}" (table: "${tableName}")`, - ) - // Drizzle converts camelCase collection slugs to snake_case table names - await ensurePgvectorArtifacts({ - payload, - tableName, - ivfflatLists: staticConfig.ivfflatLists, - }) - payload.logger.info( - `[payloadcms-vectorize] onInit: Artifacts verified for pool "${poolName}"`, - ) - } - payload.logger.info( - '[payloadcms-vectorize] onInit: All pgvector artifacts verified successfully', - ) - } catch (error) { - payload.logger.error( - `[payloadcms-vectorize] onInit: Error verifying pgvector artifacts: ${error instanceof Error ? 
error.message : String(error)}`, - ) - throw error - } - } - console.log('[payloadcms-vectorize] payloadcmsVectorize: onInit hook configured') - if (pluginOptions.endpointOverrides?.enabled !== false) { console.log( '[payloadcms-vectorize] payloadcmsVectorize: Setting up vector search endpoint...', From cc6d39dbac01dfa3132f44c10b1f5d5bbfba5db3 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 17 Jan 2026 17:55:00 +0700 Subject: [PATCH 47/49] WIP --- dev/specs/migrationCli.spec.ts | 276 +++++++++++++++++++++++++++++++++ src/bin/vectorize-migrate.ts | 32 +++- 2 files changed, 302 insertions(+), 6 deletions(-) diff --git a/dev/specs/migrationCli.spec.ts b/dev/specs/migrationCli.spec.ts index a77c2b6..97c6ec4 100644 --- a/dev/specs/migrationCli.spec.ts +++ b/dev/specs/migrationCli.spec.ts @@ -798,5 +798,281 @@ describe('Migration CLI integration tests', () => { expect(rowCount).toBe(0) console.log('[TEST] Test 4 completed successfully') }) + + test('5. Add new knowledgePool: CLI creates migration for new table', async () => { + console.log('[TEST] Starting test 5: Add new knowledgePool') + + // Step 1: Create integration with an additional knowledgePool "secondary" + const integrationWithSecondary = createVectorizeIntegration({ + default: { + dims: 10, // Keep same dims as test 4 + ivfflatLists: 20, // Keep same lists as test 4 + }, + secondary: { + dims: DIMS, + ivfflatLists: 5, + }, + }) + + cliConfig = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + { + slug: 'articles', + fields: [{ name: 'content', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integrationWithSecondary.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${cliDbName}`, + }, + }), + plugins: [ + integrationWithSecondary.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(10), + realTimeIngestionFn: makeDummyEmbedDocs(10), + }, + }, + secondary: { + collections: { + articles: { + toKnowledgePool: async (doc: any) => [{ chunk: doc.content || '' }], + }, + } as any, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(DIMS), + realTimeIngestionFn: makeDummyEmbedDocs(DIMS), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + // Get new payload instance + cliPayload = await getPayload({ + config: cliConfig, + cron: true, + key: `migration-cli-test-5-${Date.now()}`, + }) + + // Step 2: Create migration for new table + console.log('[TEST] Step 2: Creating migration for new knowledgePool...') + try { + await cliPayload.db.createMigration({ + migrationName: 'add_secondary_pool', + payload: cliPayload, + forceAcceptWarning: true, // Skip prompts in tests + }) + console.log('[TEST] Step 2.5: Migration created') + } catch (e) { + console.error('[TEST] Step 2 ERROR - createMigration failed:', e) + throw e + } + + // Step 3: Run vectorize:migrate to add IVFFLAT index for new pool + console.log('[TEST] Step 3: Running vectorize:migrate...') + try { + await vectorizeMigrateScript(cliConfig) + console.log('[TEST] Step 3.5: vectorize:migrate completed') + } catch (e) { + 
console.error('[TEST] Step 3 ERROR - vectorize:migrate failed:', e) + throw e + } + + // Step 4: Verify migration file contains secondary table creation and IVFFLAT index + const migrations = readdirSync(migrationsDir) + .filter( + (f) => (f.endsWith('.ts') || f.endsWith('.js')) && f !== 'index.ts' && f !== 'index.js', + ) + .map((f) => ({ + name: f, + path: join(migrationsDir, f), + mtime: statSync(join(migrationsDir, f)).mtime, + })) + .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) + + const newestMigration = migrations[0] + console.log(`[TEST] Step 4: Checking newest migration: ${newestMigration.name}`) + const migrationContent = readFileSync(newestMigration.path, 'utf-8') + + // Should contain secondary table creation + expect(migrationContent).toContain('secondary') + // Should contain IVFFLAT index for secondary pool + expect(migrationContent).toContain('secondary_embedding_ivfflat') + console.log('[TEST] Step 4.5: Migration file verification passed') + + // Step 5: Apply the migration + console.log('[TEST] Step 5: Applying migration...') + try { + await (cliPayload.db as any).migrate({ forceAcceptWarning: true }) + console.log('[TEST] Step 5.5: Migration applied') + } catch (e) { + console.error('[TEST] Step 5 ERROR - migrate failed:', e) + throw e + } + + // Step 6: Verify new table exists with IVFFLAT index + const postgresPayload = cliPayload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + + // Check table exists + const tableCheck = await postgresPayload.db.pool?.query( + `SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_schema = $1 AND table_name = 'secondary' + )`, + [schemaName], + ) + expect(tableCheck?.rows[0]?.exists).toBe(true) + console.log('[TEST] Step 6: Secondary table exists') + + // Check IVFFLAT index exists + const indexCheck = await postgresPayload.db.pool?.query( + `SELECT indexname FROM pg_indexes WHERE schemaname = $1 AND indexname = $2`, + [schemaName, 'secondary_embedding_ivfflat'], + ) + expect(indexCheck?.rows.length).toBeGreaterThan(0) + console.log('[TEST] Step 6.5: Secondary IVFFLAT index exists') + console.log('[TEST] Test 5 completed successfully') + }) + + test('6. Remove knowledgePool: Secondary table can be dropped manually', async () => { + console.log('[TEST] Starting test 6: Remove knowledgePool') + + // Note: Payload's migration system doesn't automatically generate DROP TABLE + // migrations when collections are removed. Users need to manually drop tables. + // This test verifies that after removing a pool, the vectorize plugin handles + // it gracefully and the table can be dropped manually. 
+ + // Step 1: Create integration with only 'default' pool (removing 'secondary') + const integrationWithoutSecondary = createVectorizeIntegration({ + default: { + dims: 10, + ivfflatLists: 20, + }, + }) + + cliConfig = await buildConfig({ + secret: 'test-secret', + collections: [ + { + slug: 'posts', + fields: [{ name: 'title', type: 'text' }], + }, + ], + db: postgresAdapter({ + extensions: ['vector'], + afterSchemaInit: [integrationWithoutSecondary.afterSchemaInitHook], + migrationDir: migrationsDir, + push: false, + pool: { + connectionString: `postgresql://postgres:password@localhost:5433/${cliDbName}`, + }, + }), + plugins: [ + integrationWithoutSecondary.payloadcmsVectorize({ + knowledgePools: { + default: { + collections: { + posts: { + toKnowledgePool: async (doc) => [{ chunk: doc.title || '' }], + }, + }, + embeddingConfig: { + version: testEmbeddingVersion, + queryFn: makeDummyEmbedQuery(10), + realTimeIngestionFn: makeDummyEmbedDocs(10), + }, + }, + }, + }), + ], + jobs: { + tasks: [], + autoRun: [ + { + cron: '*/5 * * * * *', + limit: 10, + }, + ], + }, + }) + + // Get new payload instance + cliPayload = await getPayload({ + config: cliConfig, + cron: true, + key: `migration-cli-test-6-${Date.now()}`, + }) + + // Step 2: Run vectorize:migrate - should detect no changes for default pool + // and not error out because secondary is no longer in config + console.log('[TEST] Step 2: Running vectorize:migrate with secondary pool removed...') + await vectorizeMigrateScript(cliConfig) + console.log('[TEST] Step 2.5: vectorize:migrate completed (no changes expected)') + + // Step 3: Verify secondary table still exists (Payload doesn't auto-drop) + const postgresPayload = cliPayload as PostgresPayload + const schemaName = postgresPayload.db.schemaName || 'public' + + const tableCheck = await postgresPayload.db.pool?.query( + `SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_schema = $1 AND table_name = 'secondary' + )`, + [schemaName], + ) + // Table should still exist since Payload doesn't auto-drop tables + expect(tableCheck?.rows[0]?.exists).toBe(true) + console.log('[TEST] Step 3: Secondary table still exists (as expected - manual drop required)') + + // Step 4: Manually drop the secondary table and its index + console.log('[TEST] Step 4: Manually dropping secondary table...') + await postgresPayload.db.pool?.query( + `DROP INDEX IF EXISTS "${schemaName}"."secondary_embedding_ivfflat"`, + ) + await postgresPayload.db.pool?.query(`DROP TABLE IF EXISTS "${schemaName}"."secondary" CASCADE`) + console.log('[TEST] Step 4.5: Secondary table dropped') + + // Step 5: Verify secondary table no longer exists + const tableCheckAfter = await postgresPayload.db.pool?.query( + `SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_schema = $1 AND table_name = 'secondary' + )`, + [schemaName], + ) + expect(tableCheckAfter?.rows[0]?.exists).toBe(false) + console.log('[TEST] Step 5: Secondary table no longer exists') + console.log('[TEST] Test 6 completed successfully') + }) }) }) diff --git a/src/bin/vectorize-migrate.ts b/src/bin/vectorize-migrate.ts index d698a3d..de0c3fc 100644 --- a/src/bin/vectorize-migrate.ts +++ b/src/bin/vectorize-migrate.ts @@ -88,14 +88,33 @@ function getPriorStateFromMigrations( ) } - // Check for dims in vector column definition (search full content as dims should be consistent) - const dimsMatch = content.match(new RegExp(`vector\\((\\d+)\\)`, 'i')) + // Check for dims in vector column definition + // Look for pool-specific patterns 
to avoid mixing up dims from different pools + // Use non-greedy .*? to match table-specific sections + const dimsMatch = + // ALTER TABLE specific to this table + content.match( + new RegExp(`ALTER\\s+TABLE[^;]*?"${tableName}"[^;]*?vector\\((\\d+)\\)`, 'is'), + ) || + // CREATE TABLE for this table (with non-greedy match to the table content) + content.match( + new RegExp(`CREATE\\s+TABLE[^;]*?"${tableName}"[^;]*?embedding[^;]*?vector\\((\\d+)\\)`, 'is'), + ) || + // Table definition in Drizzle format: "tableName" (...embedding vector(X)...) + content.match( + new RegExp(`"${tableName}"\\s*\\([^)]*embedding[^)]*vector\\((\\d+)\\)`, 'is'), + ) + if (dimsMatch && !state.get(poolName)?.dims) { const dims = parseInt(dimsMatch[1], 10) const current = state.get(poolName) || { dims: null, ivfflatLists: null } state.set(poolName, { ...current, dims }) console.log( - `[payloadcms-vectorize] Found prior dims=${dims} for pool "${poolName}" in ${file.name}`, + `[payloadcms-vectorize] Found prior dims=${dims} for pool "${poolName}" (table="${tableName}") in ${file.name}`, + ) + } else if (!state.get(poolName)?.dims) { + console.log( + `[payloadcms-vectorize] No dims found for pool "${poolName}" (table="${tableName}") in ${file.name}`, ) } } @@ -345,14 +364,14 @@ function patchMigrationFile( * Bin script entry point for creating vector migrations */ export const script = async (config: SanitizedConfig): Promise => { - // Use a unique key to ensure we get a fresh Payload instance with the correct config - // This is important when running in tests or when the config has been modified + // Get Payload instance for db operations and to access static configs via VectorizedPayload const payload = await getPayload({ config, key: `vectorize-migrate-${Date.now()}`, }) - const vectorizedPayload = getVectorizedPayload(payload) + // Get static configs from VectorizedPayload + const vectorizedPayload = getVectorizedPayload(payload) if (!vectorizedPayload) { throw new Error( '[payloadcms-vectorize] Vectorize plugin not found. 
Ensure payloadcmsVectorize is configured in your Payload config.', @@ -513,6 +532,7 @@ export const script = async (config: SanitizedConfig): Promise => { await payload.db.createMigration({ migrationName: 'vectorize-config', payload, + forceAcceptWarning: true, }) console.log('[payloadcms-vectorize] Migration created successfully') } catch (error) { From c5b7164fb9a151c6af522429f9354a7f1b1c6cd6 Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 17 Jan 2026 19:11:05 +0700 Subject: [PATCH 48/49] WIP --- src/bin/vectorize-migrate.ts | 146 ++--------------------------------- src/index.ts | 138 +-------------------------------- 2 files changed, 9 insertions(+), 275 deletions(-) diff --git a/src/bin/vectorize-migrate.ts b/src/bin/vectorize-migrate.ts index de0c3fc..105a711 100644 --- a/src/bin/vectorize-migrate.ts +++ b/src/bin/vectorize-migrate.ts @@ -36,15 +36,12 @@ function getPriorStateFromMigrations( })) .sort((a, b) => b.mtime.getTime() - a.mtime.getTime()) - console.log(`[payloadcms-vectorize] Found ${migrationFiles.length} migration file(s) to scan for prior state`) - // Read migration files to find vector config for (const file of migrationFiles) { try { const content = readFileSync(file.path, 'utf-8') // Extract only the UP function content to avoid matching values in DOWN function - // The DOWN function contains previous/rollback values which we don't want const upFunctionMatch = content.match( /export\s+async\s+function\s+up\s*\([^)]*\)[^{]*\{([\s\S]*?)(?=\}\s*(?:export\s+async\s+function\s+down|$))/i, ) @@ -55,52 +52,32 @@ function getPriorStateFromMigrations( const tableName = toSnakeCase(poolName) const indexName = `${tableName}_embedding_ivfflat` - // Check if this migration creates the index (only in UP function) - // The code format is: await db.execute(sql.raw(`CREATE INDEX "indexName" ... WITH (lists = 10)`)) - // We need to match the lists parameter in the template literal - // Use non-greedy .*? to match the FIRST occurrence const indexMatch = - // Match: db.execute(sql.raw(`...CREATE INDEX..."indexName"...WITH (lists = 10)...`)) upContent.match( new RegExp( `db\\.execute\\(sql\\.raw.*?CREATE INDEX.*?"${indexName}".*?WITH\\s*\\(lists\\s*=\\s*(\\d+)\\)`, 'is', ), ) || - // Match: CREATE INDEX "indexName" ... WITH (lists = 10) (in any context) upContent.match( new RegExp(`CREATE INDEX.*?"${indexName}".*?WITH\\s*\\(lists\\s*=\\s*(\\d+)\\)`, 'is'), ) || - // Match: lists = near ivfflat (non-greedy) upContent.match(new RegExp(`ivfflat.*?lists\\s*=\\s*(\\d+)`, 'is')) if (indexMatch && !state.get(poolName)?.ivfflatLists) { const lists = parseInt(indexMatch[1], 10) const current = state.get(poolName) || { dims: null, ivfflatLists: null } state.set(poolName, { ...current, ivfflatLists: lists }) - console.log( - `[payloadcms-vectorize] Found prior ivfflatLists=${lists} for pool "${poolName}" in ${file.name}`, - ) - } else if (!state.get(poolName)?.ivfflatLists) { - // Debug: log if we didn't find it - console.log( - `[payloadcms-vectorize] No ivfflatLists found for pool "${poolName}" in ${file.name}`, - ) } - // Check for dims in vector column definition - // Look for pool-specific patterns to avoid mixing up dims from different pools - // Use non-greedy .*? 
to match table-specific sections + // Check for dims in vector column definition (pool-specific patterns) const dimsMatch = - // ALTER TABLE specific to this table content.match( new RegExp(`ALTER\\s+TABLE[^;]*?"${tableName}"[^;]*?vector\\((\\d+)\\)`, 'is'), ) || - // CREATE TABLE for this table (with non-greedy match to the table content) content.match( new RegExp(`CREATE\\s+TABLE[^;]*?"${tableName}"[^;]*?embedding[^;]*?vector\\((\\d+)\\)`, 'is'), ) || - // Table definition in Drizzle format: "tableName" (...embedding vector(X)...) content.match( new RegExp(`"${tableName}"\\s*\\([^)]*embedding[^)]*vector\\((\\d+)\\)`, 'is'), ) @@ -109,13 +86,6 @@ function getPriorStateFromMigrations( const dims = parseInt(dimsMatch[1], 10) const current = state.get(poolName) || { dims: null, ivfflatLists: null } state.set(poolName, { ...current, dims }) - console.log( - `[payloadcms-vectorize] Found prior dims=${dims} for pool "${poolName}" (table="${tableName}") in ${file.name}`, - ) - } else if (!state.get(poolName)?.dims) { - console.log( - `[payloadcms-vectorize] No dims found for pool "${poolName}" (table="${tableName}") in ${file.name}`, - ) } } } catch (err) { @@ -183,9 +153,7 @@ function patchMigrationFile( schemaName: string, priorState: Map, ): void { - console.log(`[vectorize-migrate] Reading migration file: ${migrationPath}`) const content = readFileSync(migrationPath, 'utf-8') - console.log(`[vectorize-migrate] File read successfully, length: ${content.length} characters`) // Generate SQL code for each pool const vectorUpCode: string[] = [] @@ -273,17 +241,6 @@ function patchMigrationFile( /export\s+async\s+function\s+up\s*\([^)]*\)\s*:\s*Promise\s*\{/i, ) if (!upFunctionMatch) { - console.error( - `[vectorize-migrate] Could not find 'up' function in migration file: ${migrationPath}`, - ) - console.error(`[vectorize-migrate] File content length: ${content.length} characters`) - console.error(`[vectorize-migrate] File content (first 1000 chars):`) - console.error(content.substring(0, 1000)) - console.error(`[vectorize-migrate] File content (last 1000 chars):`) - console.error(content.substring(Math.max(0, content.length - 1000))) - console.error( - `[vectorize-migrate] Searching for pattern: /export\\s+async\\s+function\\s+up\\s*\\([^)]*\\)\\s*:\\s*Promise\\s*\\{/i`, - ) throw new Error(`Could not find 'up' function in migration file: ${migrationPath}`) } @@ -294,9 +251,6 @@ function patchMigrationFile( // Find the last closing brace before down function or end const upFunctionBody = content.substring(upFunctionStart, searchEnd) const lastBraceIndex = upFunctionBody.lastIndexOf('}') - console.log(`[vectorize-migrate] up function body length: ${upFunctionBody.length}`) - console.log(`[vectorize-migrate] lastBraceIndex in body: ${lastBraceIndex}`) - console.log(`[vectorize-migrate] up function body ends with: ${upFunctionBody.substring(Math.max(0, upFunctionBody.length - 200))}`) if (lastBraceIndex === -1) { throw new Error( `Could not find closing brace for 'up' function in migration file: ${migrationPath}`, @@ -306,21 +260,9 @@ function patchMigrationFile( // Insert our code before the closing brace const beforeBrace = content.substring(0, upFunctionStart + lastBraceIndex) const afterBrace = content.substring(upFunctionStart + lastBraceIndex) - console.log(`[vectorize-migrate] Insertion point: beforeBrace ends with: ${beforeBrace.substring(Math.max(0, beforeBrace.length - 100))}`) - console.log(`[vectorize-migrate] Insertion point: afterBrace starts with: ${afterBrace.substring(0, 100)}`) 
const codeToInsert = '\n' + vectorUpCode.join('\n') + '\n' - console.log(`[vectorize-migrate] Inserting ${vectorUpCode.length} line(s) of code into migration`) - console.log(`[vectorize-migrate] Code to insert:\n${codeToInsert}`) let newContent = beforeBrace + codeToInsert + afterBrace - console.log(`[vectorize-migrate] Migration file will be ${newContent.length} characters after patching (was ${content.length})`) - - // Verify insertion point looks correct - const insertionPointPreview = newContent.substring( - Math.max(0, beforeBrace.length - 50), - Math.min(newContent.length, beforeBrace.length + codeToInsert.length + 50), - ) - console.log(`[vectorize-migrate] Insertion point preview:\n${insertionPointPreview}`) // Handle down function if (downFunctionMatch) { @@ -349,15 +291,6 @@ function patchMigrationFile( } writeFileSync(migrationPath, newContent, 'utf-8') - console.log(`[vectorize-migrate] Migration file written successfully`) - // Verify the code was inserted - const verifyContent = readFileSync(migrationPath, 'utf-8') - const hasIvfflatCode = verifyContent.includes('ivfflat') && verifyContent.includes('lists =') - console.log(`[vectorize-migrate] Verification: migration contains IVFFLAT code: ${hasIvfflatCode}`) - if (!hasIvfflatCode && vectorUpCode.length > 0) { - console.error(`[vectorize-migrate] WARNING: IVFFLAT code was supposed to be inserted but not found in file!`) - console.error(`[vectorize-migrate] Expected to find: ${vectorUpCode.join(' | ')}`) - } } /** @@ -386,31 +319,12 @@ export const script = async (config: SanitizedConfig): Promise => { const poolNames = Object.keys(staticConfigs) const schemaName = (payload.db as any).schemaName || 'public' - // Get migrations directory - the postgres adapter stores it on payload.db.migrationDir - // but this may be set to default before config is applied. Try multiple sources. 
+ // Get migrations directory const dbMigrationDir = (payload.db as any).migrationDir - - // Debug: log migration directory detection - console.log('[payloadcms-vectorize] Debug: payload.db.migrationDir =', dbMigrationDir) - - // Use the payload.db.migrationDir - this is where Payload stores the resolved path const migrationsDir = dbMigrationDir || resolve(process.cwd(), 'src/migrations') - console.log('[payloadcms-vectorize] Using migrations directory:', migrationsDir) - - console.log('[payloadcms-vectorize] Checking for configuration changes...') // Get prior state from migrations const priorState = getPriorStateFromMigrations(migrationsDir, poolNames) - - // Debug: log prior state - console.log('[payloadcms-vectorize] Prior state from migrations:') - for (const [poolName, state] of priorState.entries()) { - console.log(`[payloadcms-vectorize] ${poolName}: dims=${state.dims}, ivfflatLists=${state.ivfflatLists}`) - } - console.log('[payloadcms-vectorize] Current static configs:') - for (const [poolName, config] of Object.entries(staticConfigs)) { - console.log(`[payloadcms-vectorize] ${poolName}: dims=${config.dims}, ivfflatLists=${config.ivfflatLists}`) - } // Check if any changes are needed let hasChanges = false @@ -419,13 +333,9 @@ export const script = async (config: SanitizedConfig): Promise => { const prior = priorState.get(poolName) || { dims: null, ivfflatLists: null } // Check if this is the first migration (no IVFFLAT index exists yet) - // Note: dims might be found from Drizzle schema, but ivfflatLists won't be found until we create the index if (prior.ivfflatLists === null) { isFirstMigration = true hasChanges = true - console.log( - `[payloadcms-vectorize] First migration detected for pool "${poolName}" (ivfflatLists not found in prior migrations)`, - ) break } @@ -435,40 +345,27 @@ export const script = async (config: SanitizedConfig): Promise => { (prior.ivfflatLists !== null && prior.ivfflatLists !== currentConfig.ivfflatLists) ) { hasChanges = true - console.log( - `[payloadcms-vectorize] Change detected for pool "${poolName}": dims ${prior.dims}→${currentConfig.dims}, ivfflatLists ${prior.ivfflatLists}→${currentConfig.ivfflatLists}`, - ) break } } - // If no changes detected, check if artifacts exist (idempotency) + // If no changes detected if (!hasChanges) { console.log('[payloadcms-vectorize] No configuration changes detected.') - console.log( - '[payloadcms-vectorize] If this is the first migration, ensure your initial migration creates the embedding columns via Drizzle schema.', - ) return } - - console.log('[payloadcms-vectorize] Changes detected.') // Determine if there are actual schema changes (dims change) or just index parameter changes (ivfflatLists) - // payload.db.createMigration only works when there are schema changes - // For index-only changes, we need to create the migration file manually let hasSchemaChanges = false for (const [poolName, currentConfig] of Object.entries(staticConfigs)) { const prior = priorState.get(poolName) || { dims: null, ivfflatLists: null } if (prior.dims !== null && prior.dims !== currentConfig.dims) { hasSchemaChanges = true - console.log(`[payloadcms-vectorize] Schema change detected for pool "${poolName}": dims ${prior.dims}→${currentConfig.dims}`) break } } if (isFirstMigration) { - console.log('[payloadcms-vectorize] This is the first migration - checking if we should patch existing migration or create new one') - // Check if there's a very recent migration file (created in last 10 seconds) that we should patch const 
recentMigrations = existsSync(migrationsDir) ? readdirSync(migrationsDir) @@ -486,24 +383,16 @@ export const script = async (config: SanitizedConfig): Promise => { if (recentMigrations.length > 0) { const recentMigration = recentMigrations[0] - console.log(`[payloadcms-vectorize] Found recent migration to patch: ${recentMigration.name}`) // Check if it already has IVFFLAT index code const recentContent = readFileSync(recentMigration.path, 'utf-8') const hasIvfflatCode = recentContent.includes('ivfflat') && (recentContent.includes('drizzle.execute') || recentContent.includes('CREATE INDEX')) if (!hasIvfflatCode) { - console.log(`[payloadcms-vectorize] Patching existing migration: ${recentMigration.path}`) patchMigrationFile(recentMigration.path, staticConfigs, schemaName, priorState) console.log('[payloadcms-vectorize] Migration patched successfully!') return - } else { - console.log(`[payloadcms-vectorize] Recent migration already has IVFFLAT code, creating new migration instead`) } } - - console.log('[payloadcms-vectorize] Creating new migration with IVFFLAT index setup') - } else { - console.log('[payloadcms-vectorize] Creating new migration for configuration change') } // Create migration using Payload's API OR create manually for index-only changes @@ -525,20 +414,12 @@ export const script = async (config: SanitizedConfig): Promise => { // If there are schema changes (dims changed), use Payload's createMigration // Otherwise (only ivfflatLists changed), create the migration file manually - // because Payload's createMigration hangs when there are no schema changes to detect if (hasSchemaChanges) { - console.log('[payloadcms-vectorize] Schema changes detected - using payload.db.createMigration...') - try { - await payload.db.createMigration({ - migrationName: 'vectorize-config', - payload, - forceAcceptWarning: true, - }) - console.log('[payloadcms-vectorize] Migration created successfully') - } catch (error) { - console.error('[payloadcms-vectorize] Error creating migration:', error) - throw error - } + await payload.db.createMigration({ + migrationName: 'vectorize-config', + payload, + forceAcceptWarning: true, + }) // Find the newest migration file (should be the one just created) const migrationsAfter = existsSync(migrationsDir) @@ -567,10 +448,6 @@ export const script = async (config: SanitizedConfig): Promise => { migrationPath = foundPath } else { // No schema changes (only ivfflatLists changed) - create migration file manually - // Payload's createMigration API doesn't support this case (it hangs when no schema changes detected) - console.log('[payloadcms-vectorize] No schema changes (only index parameter changes) - creating migration file manually...') - - // Generate timestamp for migration filename (format: YYYYMMDD_HHMMSS) const now = new Date() const timestamp = [ now.getFullYear(), @@ -585,7 +462,6 @@ export const script = async (config: SanitizedConfig): Promise => { const migrationFileName = `${timestamp}_vectorize_ivfflat_rebuild.ts` migrationPath = join(migrationsDir, migrationFileName) - // Create a minimal migration file that we'll patch with our IVFFLAT code const migrationTemplate = `import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres' export async function up({ db, payload, req }: MigrateUpArgs): Promise { @@ -598,18 +474,12 @@ export async function down({ db, payload, req }: MigrateDownArgs): Promise ` writeFileSync(migrationPath, migrationTemplate, 'utf-8') - console.log(`[payloadcms-vectorize] Created migration file: ${migrationPath}`) 
} - console.log(`[payloadcms-vectorize] Patching migration: ${migrationPath}`) - // Patch the migration file patchMigrationFile(migrationPath, staticConfigs, schemaName, priorState) console.log('[payloadcms-vectorize] Migration created and patched successfully!') - console.log( - '[payloadcms-vectorize] Review the migration file and apply it with: pnpm payload migrate', - ) // Only exit if not in test environment (when called from tests, just return) if (process.env.NODE_ENV !== 'test' && !process.env.VITEST) { diff --git a/src/index.ts b/src/index.ts index 2083da9..bf80c36 100644 --- a/src/index.ts +++ b/src/index.ts @@ -135,64 +135,28 @@ export const createVectorizeIntegration = const payloadcmsVectorize = (pluginOptions: PayloadcmsVectorizeConfig) => (config: Config): Config => { - console.log('[payloadcms-vectorize] payloadcmsVectorize: Plugin initialization started') - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Processing ${Object.keys(pluginOptions.knowledgePools).length} knowledge pool(s)`, - ) - // Ensure collections array exists config.collections = [...(config.collections || [])] - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Initial collections count: ${config.collections.length}`, - ) // Ensure bulk runs collection exists once - console.log('[payloadcms-vectorize] payloadcmsVectorize: Adding bulk runs collection...') const bulkRunsCollection = createBulkEmbeddingsRunsCollection() if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_RUNS_SLUG)) { config.collections.push(bulkRunsCollection) - console.log('[payloadcms-vectorize] payloadcmsVectorize: Bulk runs collection added') - } else { - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Bulk runs collection already exists', - ) } // Ensure bulk input metadata collection exists once - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Adding bulk input metadata collection...', - ) const bulkInputMetadataCollection = createBulkEmbeddingInputMetadataCollection() if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_INPUT_METADATA_SLUG)) { config.collections.push(bulkInputMetadataCollection) - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Bulk input metadata collection added', - ) - } else { - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Bulk input metadata collection already exists', - ) } // Ensure bulk batches collection exists once - console.log('[payloadcms-vectorize] payloadcmsVectorize: Adding bulk batches collection...') const bulkBatchesCollection = createBulkEmbeddingsBatchesCollection() if (!config.collections.find((c) => c.slug === BULK_EMBEDDINGS_BATCHES_SLUG)) { config.collections.push(bulkBatchesCollection) - console.log('[payloadcms-vectorize] payloadcmsVectorize: Bulk batches collection added') - } else { - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Bulk batches collection already exists', - ) } // Validate static/dynamic configs share the same pool names - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Validating static/dynamic config alignment...', - ) for (const poolName in pluginOptions.knowledgePools) { if (!staticConfigs[poolName]) { - console.error( - `[payloadcms-vectorize] payloadcmsVectorize: Knowledge pool "${poolName}" not found in static configs`, - ) throw new Error( `[payloadcms-vectorize] Knowledge pool "${poolName}" not found in static configs`, ) @@ -206,16 +170,10 @@ export const createVectorizeIntegration = } } if (unusedStaticPools.length > 0) { - console.error( - 
`[payloadcms-vectorize] payloadcmsVectorize: Static pools without dynamic config: ${unusedStaticPools.join(', ')}`, - ) throw new Error( `[payloadcms-vectorize] Static knowledge pool(s) ${unusedStaticPools.join(', ')} lack dynamic configuration`, ) } - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Static/dynamic config validation passed', - ) // Build reverse mapping: collectionSlug -> KnowledgePoolName[] const collectionToPools = new Map< @@ -227,124 +185,73 @@ export const createVectorizeIntegration = >() // Process each knowledge pool - console.log('[payloadcms-vectorize] payloadcmsVectorize: Processing knowledge pools...') for (const poolName in pluginOptions.knowledgePools) { - console.log(`[payloadcms-vectorize] payloadcmsVectorize: Processing pool "${poolName}"...`) const dynamicConfig = pluginOptions.knowledgePools[poolName] // Add the embeddings collection for this knowledge pool with extensionFields - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Creating embeddings collection for pool "${poolName}"...`, - ) const embeddingsCollection = createEmbeddingsCollection( poolName, dynamicConfig.extensionFields, ) if (!config.collections.find((c) => c.slug === poolName)) { config.collections.push(embeddingsCollection) - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Embeddings collection "${poolName}" added`, - ) - } else { - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Embeddings collection "${poolName}" already exists`, - ) } // Build reverse mapping for hooks const collectionSlugs = Object.keys(dynamicConfig.collections) - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Pool "${poolName}" maps to ${collectionSlugs.length} collection(s): ${collectionSlugs.join(', ')}`, - ) for (const collectionSlug of collectionSlugs) { if (!collectionToPools.has(collectionSlug)) { collectionToPools.set(collectionSlug, []) } collectionToPools.get(collectionSlug)!.push({ pool: poolName, dynamic: dynamicConfig }) } - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Pool "${poolName}" processing complete`, - ) } - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Knowledge pools processed. 
Total collections: ${config.collections.length}`, - ) // Validate bulk queue requirements - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Validating bulk queue requirements...', - ) let bulkIngestEnabled = false for (const poolName in pluginOptions.knowledgePools) { const dynamicConfig = pluginOptions.knowledgePools[poolName] if (dynamicConfig.embeddingConfig.bulkEmbeddingsFns) { bulkIngestEnabled = true - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Pool "${poolName}" has bulk embedding enabled`, - ) break } } if (bulkIngestEnabled && !pluginOptions.bulkQueueNames) { - console.error( - '[payloadcms-vectorize] payloadcmsVectorize: bulkQueueNames required but not provided', - ) throw new Error( '[payloadcms-vectorize] bulkQueueNames is required when any knowledge pool has bulk embedding configured (embeddingConfig.bulkEmbeddingsFns).', ) } - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Bulk queue validation passed (enabled: ${bulkIngestEnabled})`, - ) // Exit early if disabled, but keep embeddings collections present for migrations if (pluginOptions.disabled) { - console.log('[payloadcms-vectorize] payloadcmsVectorize: Plugin disabled, exiting early') return config } - // Register a single task using Payload Jobs that can handle any knowledge pool - console.log('[payloadcms-vectorize] payloadcmsVectorize: Registering Payload Jobs tasks...') + // Register tasks using Payload Jobs const incomingJobs = config.jobs || { tasks: [] } const tasks = [...(config.jobs?.tasks || [])] - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Existing tasks count: ${tasks.length}`, - ) - console.log('[payloadcms-vectorize] payloadcmsVectorize: Creating vectorize task...') const vectorizeTask = createVectorizeTask({ knowledgePools: pluginOptions.knowledgePools, }) tasks.push(vectorizeTask) - console.log('[payloadcms-vectorize] payloadcmsVectorize: Vectorize task added') - console.log('[payloadcms-vectorize] payloadcmsVectorize: Creating prepare bulk embed task...') const prepareBulkEmbedTask = createPrepareBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, }) tasks.push(prepareBulkEmbedTask) - console.log('[payloadcms-vectorize] payloadcmsVectorize: Prepare bulk embed task added') - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Creating poll or complete bulk embed task...', - ) const pollOrCompleteBulkEmbedTask = createPollOrCompleteBulkEmbeddingTask({ knowledgePools: pluginOptions.knowledgePools, pollOrCompleteQueueName: pluginOptions.bulkQueueNames?.pollOrCompleteQueueName, }) tasks.push(pollOrCompleteBulkEmbedTask) - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Poll or complete bulk embed task added', - ) config.jobs = { ...incomingJobs, tasks, } - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Jobs configured. 
Total tasks: ${tasks.length}`, - ) const collectionToEmbedQueue = new Map< string, @@ -352,23 +259,11 @@ export const createVectorizeIntegration = >() // Extend configured collections with hooks - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Setting up hooks for ${collectionToPools.size} collection(s)...`, - ) for (const [collectionSlug, pools] of collectionToPools.entries()) { - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Setting up hooks for collection "${collectionSlug}" (${pools.length} pool(s))...`, - ) const collection = config.collections.find((c) => c.slug === collectionSlug) if (!collection) { - console.error( - `[payloadcms-vectorize] payloadcmsVectorize: Collection "${collectionSlug}" not found`, - ) throw new Error(`[payloadcms-vectorize] Collection ${collectionSlug} not found`) } - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Collection "${collectionSlug}" found, adding hooks...`, - ) const embedQueue = async (doc: any, payload: Payload, req?: PayloadRequest) => { // Queue vectorization jobs for ALL knowledge pools containing this collection @@ -397,9 +292,6 @@ export const createVectorizeIntegration = } collectionToEmbedQueue.set(collectionSlug, embedQueue) - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Embed queue function registered for "${collectionSlug}"`, - ) collection.hooks = { ...(collection.hooks || {}), @@ -457,20 +349,11 @@ export const createVectorizeIntegration = }, ], } - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Hooks configured for collection "${collectionSlug}"`, - ) } - console.log('[payloadcms-vectorize] payloadcmsVectorize: All collection hooks configured') - console.log('[payloadcms-vectorize] payloadcmsVectorize: Creating vector search handlers...') const vectorSearchHandlers = createVectorSearchHandlers(pluginOptions.knowledgePools) - console.log('[payloadcms-vectorize] payloadcmsVectorize: Vector search handlers created') // Create vectorized payload object factory that creates methods bound to a payload instance - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Creating vectorized payload object factory...', - ) const createVectorizedPayloadObject = (payload: Payload): VectorizedPayload => { return { _isBulkEmbedEnabled: (knowledgePool: TPoolNames): boolean => { @@ -537,21 +420,15 @@ export const createVectorizeIntegration = } // Store factory in config.custom - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Storing vectorized payload factory in config.custom...', - ) config.custom = { ...(config.custom || {}), createVectorizedPayloadObject, } - console.log('[payloadcms-vectorize] payloadcmsVectorize: Factory stored in config.custom') // Register bin script for migration helper - console.log('[payloadcms-vectorize] payloadcmsVectorize: Registering bin script...') const __filename = fileURLToPath(import.meta.url) const __dirname = dirname(__filename) const binScriptPath = resolve(__dirname, 'bin/vectorize-migrate.js') - console.log(`[payloadcms-vectorize] payloadcmsVectorize: Bin script path: ${binScriptPath}`) config.bin = [ ...(config.bin || []), { @@ -559,12 +436,8 @@ export const createVectorizeIntegration = scriptPath: binScriptPath, }, ] - console.log('[payloadcms-vectorize] payloadcmsVectorize: Bin script registered') if (pluginOptions.endpointOverrides?.enabled !== false) { - console.log( - '[payloadcms-vectorize] payloadcmsVectorize: Setting up vector search endpoint...', - ) const path = pluginOptions.endpointOverrides?.path || 
'/vector-search' const inputEndpoints = config.endpoints || [] const endpoints = [ @@ -592,17 +465,8 @@ export const createVectorizeIntegration = }, ] config.endpoints = endpoints - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Vector search endpoint registered at "${path}"`, - ) - } else { - console.log('[payloadcms-vectorize] payloadcmsVectorize: Vector search endpoint disabled') } - console.log('[payloadcms-vectorize] payloadcmsVectorize: Plugin initialization complete') - console.log( - `[payloadcms-vectorize] payloadcmsVectorize: Final collections count: ${config.collections.length}`, - ) return config } return { From 35db4c0f74fba533fb06538feaf2e49d196adc7b Mon Sep 17 00:00:00 2001 From: techiejd <62455039+techiejd@users.noreply.github.com> Date: Sat, 17 Jan 2026 21:19:05 +0700 Subject: [PATCH 49/49] WIP --- .gitignore | 1 + dev/.env.test | 3 ++- src/bin/vectorize-migrate.ts | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 3d5eadc..d2757c1 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ # testing /coverage +/dev/test-migrations-* # next.js .next/ diff --git a/dev/.env.test b/dev/.env.test index 7337099..8e37b53 100644 --- a/dev/.env.test +++ b/dev/.env.test @@ -1,2 +1,3 @@ DIMS=8 -IVFFLATLISTS=1 \ No newline at end of file +IVFFLATLISTS=1 +TEST_ENV=1 \ No newline at end of file diff --git a/src/bin/vectorize-migrate.ts b/src/bin/vectorize-migrate.ts index 105a711..1c3631e 100644 --- a/src/bin/vectorize-migrate.ts +++ b/src/bin/vectorize-migrate.ts @@ -300,7 +300,8 @@ export const script = async (config: SanitizedConfig): Promise => { // Get Payload instance for db operations and to access static configs via VectorizedPayload const payload = await getPayload({ config, - key: `vectorize-migrate-${Date.now()}`, + // In test environment, use unique key and enable cron for job processing + ...(process.env.TEST_ENV ? { key: `vectorize-migrate-${Date.now()}`, cron: true } : {}), }) // Get static configs from VectorizedPayload
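
Notes (illustrative, not part of the patches above): with the debug logging removed by PATCH 48, the prior-state scan in `getPriorStateFromMigrations` is easiest to follow as a condensed sketch. The helper below is hypothetical (name, signature, and the `.ts` file filter are assumptions), but it reuses the same `up`-function extraction and `lists` regexes that remain in the script, along with the `${tableName}_embedding_ivfflat` index naming shown above.

```ts
import { readdirSync, readFileSync, statSync } from 'fs'
import { join } from 'path'

// Hypothetical helper: recover the most recently migrated ivfflat `lists` value for one pool table.
export function scanPriorIvfflatLists(migrationsDir: string, tableName: string): number | null {
  const indexName = `${tableName}_embedding_ivfflat`
  const files = readdirSync(migrationsDir)
    .filter((name) => name.endsWith('.ts')) // assumption: TypeScript migration files
    .map((name) => {
      const path = join(migrationsDir, name)
      return { name, path, mtime: statSync(path).mtime }
    })
    // Newest first, so the latest applied configuration wins.
    .sort((a, b) => b.mtime.getTime() - a.mtime.getTime())

  for (const file of files) {
    const content = readFileSync(file.path, 'utf-8')
    // Only inspect the `up` function so rollback values in `down` are ignored.
    const upMatch = content.match(
      /export\s+async\s+function\s+up\s*\([^)]*\)[^{]*\{([\s\S]*?)(?=\}\s*(?:export\s+async\s+function\s+down|$))/i,
    )
    if (!upMatch) continue
    const up = upMatch[1]
    const listsMatch =
      up.match(
        new RegExp(`CREATE INDEX.*?"${indexName}".*?WITH\\s*\\(lists\\s*=\\s*(\\d+)\\)`, 'is'),
      ) || up.match(/ivfflat.*?lists\s*=\s*(\d+)/is)
    if (listsMatch) return parseInt(listsMatch[1], 10)
  }
  return null
}
```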
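For the index-only path (only `ivfflatLists` changed), the script writes the minimal migration template shown earlier and then patches it via `patchMigrationFile`. A rough sketch of what such a patched migration could contain, assuming a pool table `main` in the `public` schema and the pgvector `vector_cosine_ops` operator class; the exact SQL emitted by `patchMigrationFile` may differ:

```ts
import { MigrateUpArgs, MigrateDownArgs, sql } from '@payloadcms/db-postgres'

export async function up({ db }: MigrateUpArgs): Promise<void> {
  // Rebuild the IVFFLAT index with the new lists value (20 here is only an example).
  await db.execute(sql.raw(`DROP INDEX IF EXISTS "public"."main_embedding_ivfflat"`))
  await db.execute(
    sql.raw(
      `CREATE INDEX "main_embedding_ivfflat" ON "public"."main" USING ivfflat ("embedding" vector_cosine_ops) WITH (lists = 20)`,
    ),
  )
}

export async function down({ db }: MigrateDownArgs): Promise<void> {
  // Restore the prior lists value found by the prior-state scan (10 in this example).
  await db.execute(sql.raw(`DROP INDEX IF EXISTS "public"."main_embedding_ivfflat"`))
  await db.execute(
    sql.raw(
      `CREATE INDEX "main_embedding_ivfflat" ON "public"."main" USING ivfflat ("embedding" vector_cosine_ops) WITH (lists = 10)`,
    ),
  )
}
```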