File size: 7,623 Bytes
c3ce070
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
from langchain.schema import Document
import json
import os
import pickle
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_huggingface import HuggingFaceEndpoint
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from typing import Optional

# Lazily-constructed RetrievalQA chain; populated by init_rag() and consumed
# by query_rag(). None until initialization succeeds.
_qa_chain: Optional[RetrievalQA] = None
# Guard so init_rag() only performs its expensive index/LLM setup once.
_initialized = False


def _load_json_documents(path, default_source_type=None):
    """Load RAG chunks from *path* into a list of langchain Documents.

    Each JSON item is expected to be a dict with "content" and "metadata"
    keys. If ``default_source_type`` is given, it is written into the item's
    metadata when no "source_type" is already present.

    Returns an empty list (with a warning) when the file is missing.
    """
    docs = []
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"⚠️  {path} not found. Please upload data files.")
        return docs

    print(f"✅ Loaded {len(data)} chunks from {path}")
    for item in data:
        metadata = item.setdefault("metadata", {})
        if default_source_type is not None:
            # Only fill in source_type when the chunk doesn't declare its own.
            metadata.setdefault("source_type", default_source_type)
        docs.append(
            Document(page_content=item.get("content", ""), metadata=metadata)
        )
    return docs


def _load_or_build_faiss(documents, embedding_model):
    """Load the persisted FAISS index, or build it from *documents* and save it."""
    faiss_index_path = "faiss_index"

    if os.path.exists(faiss_index_path):
        print(f"✅ Found existing FAISS index at {faiss_index_path} — loading...")
        try:
            vector_db = FAISS.load_local(
                faiss_index_path, embedding_model, allow_dangerous_deserialization=True
            )
            print("✅ FAISS index loaded successfully!")
            return vector_db
        except Exception as e:
            # A corrupt/incompatible index is recoverable: rebuild from scratch.
            print(f"❌ Failed to load FAISS index: {e}")
            print("🔄 Creating new FAISS index...")
    else:
        print("🔄 Creating new FAISS index (this may take a few minutes)...")

    vector_db = FAISS.from_documents(documents, embedding_model)
    vector_db.save_local(faiss_index_path)
    print(f"✅ FAISS index created and saved to {faiss_index_path}")
    return vector_db


def _load_or_build_bm25(documents):
    """Load the pickled BM25 retriever, or build it from *documents* and save it.

    NOTE(review): pickle.load is only safe on trusted, locally-written cache
    files — never point this at untrusted input.
    """
    bm25_path = "bm25_retriever.pkl"

    if os.path.exists(bm25_path):
        print(f"✅ Loading existing BM25 index from {bm25_path}...")
        with open(bm25_path, "rb") as f:
            bm25_retriever = pickle.load(f)
        print("✅ BM25 index loaded successfully!")
    else:
        print("🔄 Creating new BM25 index...")
        bm25_retriever = BM25Retriever.from_documents(documents)
        with open(bm25_path, "wb") as f:
            pickle.dump(bm25_retriever, f)
        print(f"✅ BM25 index created and saved to {bm25_path}")

    # Enforce retrieval depth regardless of what the pickle carried.
    bm25_retriever.k = 5
    return bm25_retriever


def init_rag():
    """Initialize the global RAG pipeline (idempotent).

    Loads the Turkish notary-law corpora, builds/loads the FAISS and BM25
    indexes, spins up the HuggingFace LLM, and wires everything into a
    RetrievalQA chain stored in the module-global ``_qa_chain``. Sets
    ``_initialized`` to True only on full success; on failure it prints a
    diagnostic and returns, leaving the module safe to retry.
    """
    global _qa_chain, _initialized

    if _initialized:
        return

    HF_TOKEN = os.getenv("HF_TOKEN")
    if not HF_TOKEN:
        print(
            "⚠️  HF_TOKEN not found in environment variables. Set it in Spaces secrets or .env file"
        )

    # Genelge chunks are tagged with source_type="genelge" when untagged;
    # kanun chunks keep whatever metadata they ship with.
    documents = _load_json_documents(
        "tnb_genelgeler_rag.json", default_source_type="genelge"
    )
    documents += _load_json_documents("noterlik_kanunu_rag.json")

    if not documents:
        print("❌ No documents loaded. Please prepare data files first.")
        _initialized = False
        return

    print(f"📚 Total documents loaded: {len(documents)}")

    print("🔄 Initializing embedding model (multilingual-e5-base)...")
    embedding_model = HuggingFaceEmbeddings(
        model_name="intfloat/multilingual-e5-base", encode_kwargs={"batch_size": 32}
    )
    print("✅ Embedding model initialized")

    vector_db = _load_or_build_faiss(documents, embedding_model)
    bm25_retriever = _load_or_build_bm25(documents)

    vector_retriever = vector_db.as_retriever(search_kwargs={"k": 5})

    # Hybrid retrieval: equal weighting of lexical (BM25) and dense (FAISS).
    ensemble_retriever = EnsembleRetriever(
        retrievers=[bm25_retriever, vector_retriever], weights=[0.5, 0.5]
    )

    print("🔄 Initializing HuggingFace LLM (Mistral-7B-Instruct)...")
    try:
        llm = HuggingFaceEndpoint(
            repo_id="mistralai/Mistral-7B-Instruct-v0.2",
            huggingfacehub_api_token=HF_TOKEN,
            temperature=0.3,
            max_new_tokens=1024,
            top_p=0.95,
            repetition_penalty=1.1,
        )
        print("✅ HuggingFace LLM initialized (Mistral-7B-Instruct-v0.2)")
    except Exception as e:
        print(f"❌ Failed to initialize LLM: {e}")
        print(f"   HF_TOKEN is {'set' if HF_TOKEN else 'NOT set'}")
        _initialized = False
        return

    turkish_legal_prompt = """Sen Türk Noter Hukuku konusunda uzman bir yapay zeka asistanısın. Görevin, Noterlik Kanunu ve Türkiye Noterler Birliği genelgelerinden yararlanarak kullanıcının sorusunu doğru ve eksiksiz yanıtlamaktır.

    BAĞLAM BİLGİLERİ (Kanun ve Genelgelerden):
    {context}

    KULLANICI SORUSU: {question}

    YANITLAMA STRATEJİSİ:
    1. **KAYNAK ÖNCELİĞİ**: 
       - Noterlik Kanunu → Temel yasal çerçeve ve genel kurallar
       - TNB Genelgeleri → Kanunun uygulanmasına ilişkin özel düzenlemeler ve açıklamalar
       - Her iki kaynağı da kontrol et ve ilgili olanları kullan

    2. **HİBRİT YANITLAMA**: 
       - Kanun maddeleri varsa bunları temel al
       - Genelgelerdeki uygulama detayları varsa ekle
       - Kaynak belirtmeyi unutma!

    3. **KAYNAK BELİRTME**:
       - Kanundan alınan bilgi → "Noterlik Kanunu Madde X'e göre..."
       - Genelgelerden alınan bilgi → "Genelge X, Madde Y'ye göre..."
       - Genel bilgi → "Genel olarak..." veya "Türk Hukuku'nda..."

    4. **KALİTE KURALLARI**:
       - Yanıtını net, anlaşılır ve yapılandırılmış şekilde sun
       - Hukuki terminolojiyi doğru kullan
       - Kesin olmadığın konularda varsayımda bulunma
       - Hem kanunu hem genelgeleri kaynak olarak kullanabilirsin

    YANITINIZ:"""

    prompt_template = PromptTemplate(
        template=turkish_legal_prompt, input_variables=["context", "question"]
    )

    _qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=ensemble_retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt_template, "document_separator": "\n---\n"},
        return_source_documents=True,
        verbose=False,
    )

    if _qa_chain is None:
        print("❌ QA Chain creation failed silently")
        return

    print("✅ RAG system initialized successfully!\n")
    _initialized = True


def query_rag(question: str):
    """Answer *question* through the global RAG chain, initializing on demand.

    Args:
        question: The user's natural-language question.

    Returns:
        The RetrievalQA result dict (answer plus source documents), or None
        when the RAG system could not be initialized or the query raised.
    """
    global _qa_chain, _initialized

    if not _initialized:
        init_rag()

    if not _initialized or _qa_chain is None:
        print("❌ RAG system is not properly initialized. Chain or data missing.")
        return None

    try:
        # RetrievalQA expects the user input under the "query" key.
        return _qa_chain.invoke({"query": question})
    except Exception as e:
        print(f"❌ Error querying RAG: {e}")
        import traceback
        traceback.print_exc()
        return None