Tweak vector search
- generate_all_embeddings.js +68 -0
- generate_embeddings_simple.js +55 -0
generate_all_embeddings.js
ADDED
@@ -0,0 +1,68 @@
// Using built-in fetch (Node.js 18+)

async function generateAllEmbeddings() {
  console.log('🚀 Starting to generate embeddings for all documents...');

  try {
    // Get all documents
    const response = await fetch('http://localhost:5000/api/documents');
    const documents = await response.json();

    console.log(`📚 Found ${documents.length} documents`);

    // Generate embeddings for each document
    for (const doc of documents) {
      console.log(`\n📄 Processing: ${doc.title} (ID: ${doc.id})`);

      // Generate embedding using the content
      const embeddingResponse = await fetch('http://localhost:5000/api/embeddings', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ input: doc.content.substring(0, 8000) }) // Limit content length
      });

      if (embeddingResponse.ok) {
        const embeddingResult = await embeddingResponse.json();
        const embedding = embeddingResult.data[0].embedding;

        console.log(`✅ Generated embedding with ${embedding.length} dimensions`);

        // Update document with embedding
        const updateResponse = await fetch(`http://localhost:5000/api/documents/process/${doc.id}`, {
          method: 'POST',
          headers: { 'Content-Type': 'application/json' },
          body: JSON.stringify({
            operations: ['generate_embedding'],
            embedding: embedding
          })
        });

        if (updateResponse.ok) {
          console.log(`✅ Updated document ${doc.id} with embedding`);
        } else {
          console.log(`❌ Failed to update document ${doc.id}`);
        }

      } else {
        console.log(`❌ Failed to generate embedding for ${doc.title}`);
      }

      // Small delay to avoid overwhelming the API
      await new Promise(resolve => setTimeout(resolve, 1000));
    }

    console.log('\n🎉 Embedding generation completed!');
    console.log('\n🔍 Now you can test vector search with these queries:');
    console.log('- "attention mechanism transformer architecture"');
    console.log('- "multimodal language model GPT"');
    console.log('- "constitutional AI safety alignment"');
    console.log('- "mixtral mixture of experts"');
    console.log('- "retrieval augmented generation knowledge"');

  } catch (error) {
    console.error('❌ Error:', error.message);
  }
}

// Run the function
generateAllEmbeddings().catch(console.error);
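Once the embeddings are stored, the suggested queries above can be exercised end to end. The sketch below assumes a hypothetical GET /api/search?q=... route that embeds the query server-side and returns the nearest documents; that endpoint is not part of this diff, so the path and response shape would need to match whatever the app actually exposes.

// Minimal sketch for exercising vector search after embeddings are stored.
// NOTE: the /api/search route and its response shape are assumptions —
// this commit only adds the embedding scripts, not a search endpoint.
async function testVectorSearch(query) {
  const res = await fetch(
    `http://localhost:5000/api/search?q=${encodeURIComponent(query)}`
  );
  if (!res.ok) {
    console.log(`❌ Search failed for "${query}" (status ${res.status})`);
    return;
  }
  const results = await res.json();
  console.log(`🔍 "${query}" →`, results);
}

testVectorSearch('attention mechanism transformer architecture').catch(console.error);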
generate_embeddings_simple.js
ADDED
@@ -0,0 +1,55 @@
// Simple script to generate embeddings for key documents
async function generateEmbeddings() {
  const keyDocuments = [
    {
      id: 1,
      title: "Attention Is All You Need",
      content: "We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train."
    },
    {
      id: 11,
      title: "Mixtral of Experts",
      content: "We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, but each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs."
    },
    {
      id: 2,
      title: "GPT-4 Technical Report",
      content: "We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks."
    }
  ];

  console.log('🚀 Generating embeddings for key documents...');

  for (const doc of keyDocuments) {
    try {
      console.log(`\n📄 Processing: ${doc.title}`);

      // Generate embedding
      const response = await fetch('http://localhost:5000/api/embeddings', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ input: doc.content })
      });

      if (response.ok) {
        const result = await response.json();
        console.log(`✅ Generated embedding for ${doc.title} (${result.data[0].embedding.length} dimensions)`);
      } else {
        console.log(`❌ Failed to generate embedding for ${doc.title}`);
      }

      // Small delay
      await new Promise(resolve => setTimeout(resolve, 1000));

    } catch (error) {
      console.log(`❌ Error processing ${doc.title}: ${error.message}`);
    }
  }

  console.log('\n✅ Embedding generation test completed!');
  console.log('\n📝 Note: These embeddings were generated but not stored in the database.');
  console.log('To actually store embeddings, we need to use the proper document update endpoint.');
}

// Run the function
generateEmbeddings().catch(console.error);
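Both scripts are run directly (node generate_all_embeddings.js or node generate_embeddings_simple.js) while the API server is listening on port 5000, and they rely on the global fetch that ships with Node.js 18+. For older Node versions, a small shim at the top of either script keeps the rest unchanged — a sketch, assuming the node-fetch package is installed:

// Fallback for Node < 18, where global fetch is unavailable.
// Assumes `npm install node-fetch` has been run (node-fetch v3 is
// ESM-only, hence the dynamic import).
if (typeof fetch === 'undefined') {
  globalThis.fetch = (...args) =>
    import('node-fetch').then(({ default: f }) => f(...args));
}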