File size: 2,956 Bytes
659d842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bcbeae5
659d842
 
 
 
 
 
 
 
 
 
 
 
 
 
bcbeae5
659d842
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import json
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.schema import Document

# Load environment variables through python-dotenv before they are read below.
load_dotenv()

# Qdrant client; the endpoint URL and API key are taken from the environment.
qdrant = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_SERVICE_KEY"),
    timeout=60,
)

# Embedding model used to vectorize the documents, pinned to the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/static-similarity-mrl-multilingual-v1",
    model_kwargs={"device": "cpu"},
)

# Name of the Qdrant collection this script creates and writes to.
collection_name = "documents"

def create_collection():
    """Create the target Qdrant collection unless it already exists.

    Operates on the module-level ``qdrant`` client and ``collection_name``.
    Existence is checked explicitly rather than by catching any exception
    from ``get_collection``: that way connection, timeout or authentication
    failures propagate to the caller instead of being mistaken for
    "collection not found" and triggering a creation attempt.

    Raises:
        Exception: whatever the Qdrant client raises when the server cannot
            be reached or rejects the request.
    """
    if qdrant.collection_exists(collection_name):
        print(f"Colección '{collection_name}' ya existe")
        return

    print(f"Creando colección '{collection_name}'...")
    qdrant.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            # Vector size of the collection; it has to equal the length of
            # the vectors produced by the embedding model used for upload.
            size=1024,
            distance=Distance.COSINE,
        ),
    )
    print("Colección creada exitosamente")

def _load_jsonl_records(file_path: str) -> list:
    """Parse a JSON Lines file into a list of objects, skipping blank lines."""
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as jsonl_file:
        return [json.loads(line) for line in jsonl_file if line.strip()]


def upload_embeddings_from_jsonl(file_path: str) -> bool:
    """Embed the question/answer records of a JSONL file and upsert them.

    Each non-blank line of ``file_path`` must be a JSON object with the keys
    ``Question``, ``Final answer`` and ``task_id``. For every record a text of
    the form ``"Question : ...\\n\\nFinal answer : ..."`` is embedded with the
    module-level ``embeddings`` model and stored in ``collection_name`` as a
    point whose id is the record's position in the file, so uploading the
    same file again overwrites the same points.

    Args:
        file_path: Path to the JSON Lines file to upload.

    Returns:
        True when the upsert succeeded, False when there was nothing to
        upload or the Qdrant request failed (the error is printed).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks one of the required keys.
    """
    records = _load_jsonl_records(file_path)
    if not records:
        # Nothing to embed; avoid sending an empty upsert request.
        print(f"No hay documentos para subir en {file_path}")
        return False

    docs = []
    for sample in records:
        content = f"Question : {sample['Question']}\n\nFinal answer : {sample['Final answer']}"
        docs.append(
            {
                "page_content": content,
                "metadata": {"source": sample["task_id"]},
                "embedding": embeddings.embed_query(content),
            }
        )

    print(f"Subiendo {len(docs)} documentos a Qdrant...")
    try:
        points = [
            PointStruct(
                id=idx,
                vector=doc["embedding"],
                payload={
                    "page_content": doc["page_content"],
                    "metadata": doc["metadata"],
                },
            )
            for idx, doc in enumerate(docs)
        ]
        response = qdrant.upsert(
            collection_name=collection_name,
            points=points,
            wait=True,
        )
    except Exception as exception:
        # Best effort: report the failure and tell the caller via the return
        # value instead of aborting the whole script.
        print("Error inserting data into Qdrant:", exception)
        return False

    print(response)
    return True


def main(jsonl_file: str = "./metadata.jsonl"):
    """Ensure the collection exists, then upload the embeddings of a JSONL file.

    Args:
        jsonl_file: Path of the JSON Lines file to upload. Defaults to
            ``./metadata.jsonl`` in the current working directory.
    """
    # The collection is created (if needed) before the input file is checked.
    create_collection()

    if not os.path.exists(jsonl_file):
        print(f"Archivo {jsonl_file} no encontrado")
        return

    print(f"Subiendo embeddings desde {jsonl_file}...")
    # Only an explicit False is treated as failure, so an uploader that
    # returns nothing (None) still counts as success.
    if upload_embeddings_from_jsonl(jsonl_file) is False:
        print("No se pudieron subir los embeddings")
        return

    print("¡Embeddings subidos exitosamente!")

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()