# Final_Assignment_Template / upload_data.py
# GabrielJuan349
# Update upload_data.py
# bcbeae5
import json
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
# Load environment variables via python-dotenv before they are read below.
load_dotenv()

# Qdrant client; endpoint URL and API key come from the environment.
qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_SERVICE_KEY"),
    timeout=60,
)

# Embedding model used to vectorise documents, configured to run on CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/static-similarity-mrl-multilingual-v1",
    model_kwargs={"device": "cpu"},
)

# Name of the Qdrant collection used by the functions in this script.
collection_name = "documents"
def create_collection():
    """Create the Qdrant collection named by ``collection_name`` if it is missing.

    Existence is checked explicitly instead of treating *any* exception from
    ``get_collection`` as "not found": connection, timeout or authentication
    failures now propagate to the caller rather than being misreported as a
    missing collection followed by a confusing create attempt.

    Raises:
        Exception: whatever the Qdrant client raises when the server cannot
            be reached or the create request is rejected.
    """
    if qdrant.collection_exists(collection_name):
        print(f"Colección '{collection_name}' ya existe")
        return

    print(f"Creando colección '{collection_name}'...")
    qdrant.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # Vector dimension for every point in the collection; the original
            # author marked 1024 as the correct size — presumably the output
            # dimension of the embedding model, TODO confirm.
            size=1024,
            distance=Distance.COSINE,
        ),
    )
    print("Colección creada exitosamente")
def _read_jsonl_records(file_path: str) -> list:
    """Parse ``file_path`` as JSON Lines and return the decoded objects.

    Blank lines (e.g. a trailing newline at end of file) are skipped instead
    of being handed to ``json.loads``, which would raise on them.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        return [json.loads(line) for line in jsonl_file if line.strip()]


def upload_embeddings_from_jsonl(file_path: str):
    """Embed every record of a JSONL file and upsert the vectors into Qdrant.

    Each record must provide the keys ``Question``, ``Final answer`` and
    ``task_id``. The text ``"Question : ...\\n\\nFinal answer : ..."`` is
    embedded with the module-level ``embeddings`` model and stored as one
    point whose id is the record's zero-based position in the file, so
    re-running the upload overwrites the same ids.

    Args:
        file_path: Path to the JSON Lines file to upload.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
        Exception: any error raised by the Qdrant upsert is printed and then
            re-raised, so callers do not report success after a failed upload.
    """
    records = _read_jsonl_records(file_path)
    if not records:
        # Nothing to embed; avoid sending an empty upsert request.
        print(f"No se encontraron documentos en {file_path}; nada que subir")
        return

    points = []
    for idx, sample in enumerate(records):
        content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
        points.append(
            PointStruct(
                id=idx,
                vector=embeddings.embed_query(content),
                payload={
                    "page_content": content,
                    "metadata": {"source": sample["task_id"]},
                },
            )
        )

    print(f"Subiendo {len(points)} documentos a Qdrant...")
    try:
        response = qdrant.upsert(
            collection_name=collection_name,
            points=points,
            wait=True,
        )
    except Exception as exception:
        # Keep the original diagnostic, but propagate the failure.
        print("Error inserting data into Qdrant:", exception)
        raise
    print(response)
def main():
    """Ensure the collection exists, then upload ``./metadata.jsonl`` if present."""
    create_collection()

    source_path = "./metadata.jsonl"  # adjust the path if needed
    if not os.path.exists(source_path):
        print(f"Archivo {source_path} no encontrado")
        return

    print(f"Subiendo embeddings desde {source_path}...")
    upload_embeddings_from_jsonl(source_path)
    print("¡Embeddings subidos exitosamente!")


if __name__ == "__main__":
    main()