GabrielJuan349 committed on
Commit
659d842
·
1 Parent(s): 8ddaa0b

Uploading info to qdrant

Browse files
Files changed (2) hide show
  1. metadata.jsonl +0 -0
  2. upload_data.py +100 -0
metadata.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
upload_data.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from qdrant_client import QdrantClient
5
+ from qdrant_client.models import Distance, VectorParams, PointStruct
6
+ from langchain_huggingface import HuggingFaceEmbeddings
7
+ from langchain.schema import Document
8
+
9
# Load variables from a .env file into the process environment.
load_dotenv()

# Qdrant client; endpoint URL and API key are read from the environment.
qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_SERVICE_KEY"),
    timeout=60,
)

# Embedding model, configured to run on the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/static-similarity-mrl-multilingual-v1",
    model_kwargs={"device": "cpu"},
)

# Name of the Qdrant collection this script creates and writes to.
collection_name = "documents"
25
+
26
def create_collection():
    """Create the Qdrant collection if it does not already exist.

    Operates on the module-level ``qdrant`` client and ``collection_name``.
    Existence is checked explicitly instead of treating any exception from
    ``get_collection`` as "missing", so connection, authentication or timeout
    errors propagate to the caller rather than triggering a creation attempt.
    """
    if qdrant.collection_exists(collection_name):
        print(f"Colección '{collection_name}' ya existe")
        return

    print(f"Creando colección '{collection_name}'...")
    qdrant.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # Vector dimension of the collection; it has to equal the length
            # of the embedding vectors that are upserted into it.
            size=1024,
            distance=Distance.COSINE,
        ),
    )
    print("Colección creada exitosamente")
41
+
42
def upload_embeddings_from_jsonl(file_path: str):
    """Embed the records of a JSONL file and upsert them into Qdrant.

    Every non-blank line of the file is parsed as a JSON object that must
    contain the keys ``Question``, ``Final answer`` and ``task_id``. For each
    record a text of the form ``Question : ...\\n\\nFinal answer : ...`` is
    built, embedded with the module-level ``embeddings`` model and stored as
    one point in the module-level ``collection_name`` collection. Point ids
    are the 0-based position of the record among the non-blank lines, so
    running the script again overwrites the same ids.

    Args:
        file_path: Path of the JSONL file to read.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
        Exception: Any error raised by the Qdrant upsert; it is printed and
            then re-raised so that callers do not report a failed upload as
            a success.
    """
    points = []
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # Tolerate blank lines such as a trailing newline at EOF.
                continue
            sample = json.loads(line)
            content = (
                f"Question : {sample['Question']}\n\n"
                f"Final answer : {sample['Final answer']}"
            )
            points.append(
                PointStruct(
                    id=len(points),
                    vector=embeddings.embed_query(content),
                    payload={
                        "content": content,
                        "metadata": {"source": sample["task_id"]},
                    },
                )
            )

    print(f"Subiendo {len(points)} documentos a Qdrant...")
    if not points:
        # Nothing to send; avoid issuing an empty upsert request.
        print("No hay documentos para subir")
        return

    try:
        response = qdrant.upsert(
            collection_name=collection_name,
            points=points,
            wait=True,
        )
    except Exception as exception:
        print("Error inserting data into Qdrant:", exception)
        raise
    print(response)
83
+
84
+
85
def main():
    """Ensure the collection exists, then upload the JSONL data to it."""
    create_collection()

    data_path = "./metadata.jsonl"  # adjust the path if necessary
    if not os.path.exists(data_path):
        print(f"Archivo {data_path} no encontrado")
        return

    print(f"Subiendo embeddings desde {data_path}...")
    upload_embeddings_from_jsonl(data_path)
    print("¡Embeddings subidos exitosamente!")


if __name__ == "__main__":
    main()