generate embeddings
Browse files- generate_embeddings.js +69 -0
generate_embeddings.js
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// Quick script to generate embeddings for existing documents
|
2 |
+
import fs from 'fs';
|
3 |
+
|
4 |
+
/**
 * Generate embeddings for a fixed set of sample documents.
 *
 * Each document's `content` is POSTed as JSON (`{ input }`) to the embeddings
 * endpoint, one document at a time. Progress and failures are logged with
 * `console.log`; a failure on one document never aborts the remaining ones.
 * The embeddings themselves are not persisted anywhere yet.
 *
 * @param {object} [options]
 * @param {string} [options.endpoint='http://localhost:5000/api/embeddings']
 *   URL the embedding requests are sent to.
 * @param {number} [options.delayMs=1000]
 *   Pause between two consecutive requests, in milliseconds. Values <= 0
 *   disable the pause.
 * @returns {Promise<{succeeded: number, failed: number}>}
 *   How many documents produced an embedding and how many did not.
 */
async function generateEmbeddings({
  endpoint = 'http://localhost:5000/api/embeddings',
  delayMs = 1000,
} = {}) {
  // Document contents to generate embeddings for
  const documents = [
    {
      id: 1,
      title: "Attention Is All You Need",
      content: "The Transformer, a model architecture eschewing recurrence and instead relying entirely on an attention mechanism to draw global dependencies between input and output. The Transformer allows for significantly more parallelization and can reach a new state of the art in translation quality.",
    },
    {
      id: 2,
      title: "GPT-4 Technical Report",
      content: "We report the development of GPT-4, a large-scale, multimodal model which can accept image and text inputs and produce text outputs. While less capable than humans in many real-world scenarios, GPT-4 exhibits human-level performance on various professional and academic benchmarks.",
    },
    {
      id: 3,
      title: "Constitutional AI",
      content: "As AI systems become more capable, we would like to enlist their help to supervise other AI systems. We experiment with methods for training a harmless AI assistant through self-improvement, without any human labels identifying harmful outputs.",
    },
    {
      id: 4,
      title: "Retrieval-Augmented Generation",
      content: "Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely manipulate knowledge is still limited.",
    },
  ];

  let succeeded = 0;
  let failed = 0;

  console.log('Generating embeddings for documents...');

  for (const [index, doc] of documents.entries()) {
    try {
      console.log(`Processing document ${doc.id}: ${doc.title}`);

      // Generate embedding
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ input: doc.content }),
      });

      if (response.ok) {
        const result = await response.json();
        const embedding = result?.data?.[0]?.embedding;

        // Validate the shape explicitly so a malformed payload is reported as
        // such instead of surfacing as an opaque TypeError message.
        if (Array.isArray(embedding)) {
          succeeded += 1;
          console.log(`✅ Generated embedding for ${doc.title} (${embedding.length} dimensions)`);

          // Note: In a real implementation, you would update the database here
          // For now, just log success
        } else {
          failed += 1;
          console.log(`❌ Unexpected response for ${doc.title}: missing data[0].embedding`);
        }
      } else {
        failed += 1;
        console.log(`❌ Failed to generate embedding for ${doc.title} (HTTP ${response.status})`);
      }
    } catch (error) {
      failed += 1;
      console.log(`❌ Error processing ${doc.title}: ${error.message}`);
    }

    // Small delay to avoid overwhelming the API. It sits outside the try block
    // so it also applies after a failed request, and is skipped after the last
    // document where there is nothing left to throttle.
    const isLast = index === documents.length - 1;
    if (!isLast && delayMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  if (failed === 0) {
    console.log('✅ Embedding generation completed!');
  } else {
    console.log(`⚠️ Embedding generation completed with ${failed} failure(s) out of ${documents.length} documents`);
  }
  console.log('\n🔍 Now you can test vector search with these queries:');
  console.log('- "attention mechanism transformer architecture"');
  console.log('- "multimodal language model GPT"');
  console.log('- "constitutional AI safety alignment"');
  console.log('- "retrieval augmented generation knowledge"');

  return { succeeded, failed };
}
|
67 |
+
|
68 |
+
// Script entry point: start the run and hand any rejection to console.error.
generateEmbeddings().catch((error) => {
  console.error(error);
});
|