import json
import os

from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

load_dotenv()

# Configure the Qdrant client from the QDRANT_URL / QDRANT_SERVICE_KEY
# environment variables (loaded from a .env file by load_dotenv above).
qdrant = QdrantClient(
    url=os.environ.get("QDRANT_URL"),
    api_key=os.environ.get("QDRANT_SERVICE_KEY"),
    timeout=60
)

# Configure the embedding model; inference is pinned to the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/static-similarity-mrl-multilingual-v1",
    model_kwargs={'device': 'cpu'}
)

collection_name = "documents"


def create_collection():
    """Create the Qdrant collection named by ``collection_name`` if it is missing.

    Existence is probed with ``qdrant.get_collection``; any exception raised by
    that probe is treated as "collection does not exist" and triggers creation.
    An exception raised by the creation call itself is not caught here and
    propagates to the caller.
    """
    try:
        qdrant.get_collection(collection_name)
        print(f"Colección '{collection_name}' ya existe")
    except Exception:
        print(f"Creando colección '{collection_name}'...")
        qdrant.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(
                # Marked by the original author as the correct dimension;
                # presumably the embedding model's output size — confirm
                # before swapping models.
                size=1024,
                distance=Distance.COSINE
            )
        )
        print("Colección creada exitosamente")


def upload_embeddings_from_jsonl(file_path: str) -> bool:
    """Embed every record of a JSONL file and upsert the result into Qdrant.

    Each non-blank line of the file must be a JSON object containing the keys
    ``Question``, ``Final answer`` and ``task_id``. The question and answer are
    combined into one text, embedded, and stored as a point whose payload holds
    that text (``page_content``) and the task id (``metadata.source``). Point
    ids are the zero-based index of the record among the non-blank lines.

    Args:
        file_path: Path to the JSONL file to read (decoded as UTF-8).

    Returns:
        True if the upsert call completed without raising; False if the file
        held no records or the upsert raised (the error is printed).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Explicit UTF-8 so non-ASCII text does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        samples = [json.loads(line) for line in jsonl_file if line.strip()]

    if not samples:
        print(f"No hay documentos para subir en {file_path}")
        return False

    contents = [
        f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
        for sample in samples
    ]
    # Embed all texts in one batch instead of one model call per record.
    vectors = embeddings.embed_documents(contents)

    points = [
        PointStruct(
            id=idx,
            vector=vector,
            payload={
                "page_content": content,
                "metadata": {"source": sample['task_id']},
            },
        )
        for idx, (sample, content, vector) in enumerate(
            zip(samples, contents, vectors)
        )
    ]

    print(f"Subiendo {len(points)} documentos a Qdrant...")
    try:
        response = qdrant.upsert(
            collection_name=collection_name,
            points=points,
            wait=True
        )
    except Exception as exception:
        # Best-effort: report the failure and signal it through the return
        # value rather than crashing the script.
        print("Error inserting data into Qdrant:", exception)
        return False

    print(response)
    return True


def main():
    """Ensure the collection exists, then upload embeddings from ./metadata.jsonl.

    The success message is printed only when the upload reports success; a
    failed upload has already printed its own error.
    """
    # Create the collection first (same order as before: this happens even
    # when the input file turns out to be missing).
    create_collection()

    jsonl_file = "./metadata.jsonl"  # Adjust the path if necessary
    if not os.path.exists(jsonl_file):
        print(f"Archivo {jsonl_file} no encontrado")
        return

    print(f"Subiendo embeddings desde {jsonl_file}...")
    if upload_embeddings_from_jsonl(jsonl_file):
        print("¡Embeddings subidos exitosamente!")


if __name__ == "__main__":
    main()