feat: use nvidia llama with optional reasoning
app.py CHANGED
@@ -16,7 +16,7 @@ with open("vector_db/metadata.pkl", "rb") as f:
 
 ST = SentenceTransformer("BAAI/bge-large-en-v1.5")
 
-model_id = "
+model_id = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
 bnb = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_use_double_quant=True,
@@ -24,6 +24,7 @@ bnb = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+tokenizer.pad_token_id = tokenizer.eos_token_id
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     quantization_config=bnb,
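Taken together, the first two hunks give roughly the load path below. This is a sketch, not the file itself: `bnb_4bit_quant_type` and `device_map` are not visible in this diff and are assumptions. The new `pad_token_id` line matters because Llama-family tokenizers ship without a pad token, so setting it up front keeps padding and `generate()` well-defined.

```python
# Sketch of the load path implied by hunks 1-2; quant_type and device_map are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

bnb = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",              # assumption: not shown in this diff
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bf16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Llama tokenizers define no pad token; reusing EOS gives padding and generate()
# a valid pad_token_id.
tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb,
    device_map="auto",          # assumption: the remaining kwargs sit outside the hunk
    trust_remote_code=True,
)
```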
@@ -33,36 +34,35 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 
 SYS = (
-    "You are a legal AI assistant specialized in GDPR/EDPB. "
-    "If you cannot find an answer in the context, reply 'I do not know.'
+    "You are a legal AI assistant specialized in GDPR/EDPB. "
+    "If you cannot find an answer in the context, reply 'I do not know.' "
+    "Answer this Question:"
 )
 
 def retrieve(q, k=3):
     emb = ST.encode(q)
     D, I = index.search(np.array([emb], dtype="float32"), k)
-
-    docs = []
-    file_sources = []
-
+    docs, file_sources = [], []
     for i in I[0]:
         chunk = chunks[i]
-
-        docs.append({
-
-            "pages": chunk
-        })
-        file_sources.append(metadata["source"])
-
+        meta = metadata_dict[i]
+        docs.append({"title": chunk, "pages": chunk})
+        file_sources.append(meta["source"])
     return docs, file_sources
 
-
-
-
+
+def make_prompt(q, docs, reasoning_mode):
+    context = "\n\n".join(f"Title: {d['title']}\nPages: {d['pages']}" for d in docs)
+    prompt = f"detailed thinking {reasoning_mode}\n"
+    if reasoning_mode == "off":
+        prompt += "eager_mode on\n"
+    prompt += f"Instruct: {SYS} {q} based on the following documents:\n{context}\nOutput:"
+    return prompt
 
 @spaces.GPU()
-def qa_fn(question, top_k, temperature, max_tokens):
+def qa_fn(question, reasoning_mode, top_k, temperature, max_tokens):
     docs, file_sources = retrieve(question, top_k)
-    prompt = make_prompt(question, docs)[:8000]
+    prompt = make_prompt(question, docs, reasoning_mode)[:8000]
     inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
     inputs = {k: v.to(model.device) for k, v in inputs.items()}
     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
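To make the new prompt format concrete, here is roughly what `make_prompt` produces for a toy question with reasoning disabled. The document dict is made up for illustration; `detailed thinking on/off` is the Nemotron reasoning toggle this commit wires to the new UI switch.

```python
# Hypothetical inputs, only to show the shape of the prompt string.
docs = [{"title": "EDPB Guidelines 07/2020", "pages": "Concepts of controller and processor ..."}]
print(make_prompt("Who qualifies as a data controller?", docs, "off"))
# detailed thinking off
# eager_mode on
# Instruct: You are a legal AI assistant specialized in GDPR/EDPB. If you cannot find an
#   answer in the context, reply 'I do not know.' Answer this Question: Who qualifies as a
#   data controller? based on the following documents:
# Title: EDPB Guidelines 07/2020
# Pages: Concepts of controller and processor ...
# Output:
```

(The `Instruct:` portion is a single line in practice; it is wrapped above for readability.)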
@@ -78,11 +78,8 @@ def qa_fn(question, top_k, temperature, max_tokens):
     output = ""
     for tok in streamer:
         output += tok
-
-
-    if think_tag_index != -1:
-        output = output[think_tag_index + len("</think>"):].strip()
-
+    if "</think>" in output:
+        output = output.split("</think>", 1)[1].strip()
     return output, file_sources
 
 outputs_answer = gr.Textbox(label="Answer")
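The hunk above only shows the tail of `qa_fn`; the call that feeds the `TextIteratorStreamer` lives in the elided lines 69-77. It presumably follows the standard background-thread pattern sketched below (the exact `generate()` kwargs are not visible in this diff).

```python
# Assumed shape of the elided generation call; do_sample=True is a guess.
from threading import Thread

gen_kwargs = dict(
    **inputs,
    streamer=streamer,
    max_new_tokens=max_tokens,   # "Max Answer Length" slider
    temperature=temperature,     # "Temperature" slider
    do_sample=True,
)
# generate() blocks until decoding finishes, so it runs on a background thread
# while the loop shown in the hunk drains the streamer token by token.
Thread(target=model.generate, kwargs=gen_kwargs).start()
```

With `detailed thinking on`, Nemotron emits its chain of thought inside `<think>...</think>`, which is why the new code keeps only the text after the closing tag.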
@@ -92,9 +89,10 @@ demo = gr.Interface(
     fn=qa_fn,
     inputs=[
         gr.Textbox(lines=2, label="Your Question"),
-        gr.
-        gr.Slider(
-        gr.Slider(
+        gr.Radio(["on", "off"], value="off", label="Reasoning Mode"),
+        gr.Slider(1, 7, value=4, step=1, label="Top-K Documents"),
+        gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Temperature"),
+        gr.Slider(64, 1024, value=512, step=64, label="Max Answer Length")
     ],
     outputs=[outputs_answer, outputs_sources],
     title="GDPR Legal Assistant",
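Gradio passes the `inputs` list to `qa_fn` positionally, so the new Radio lands in the second slot and lines up with the added `reasoning_mode` parameter. A quick way to sanity-check the wiring without the UI (the question text is just an example):

```python
# Same positional order as the inputs list:
# question, reasoning_mode, top_k, temperature, max_tokens
answer, sources = qa_fn(
    "Does a small company need a Data Protection Officer?",  # Textbox
    "off",  # Radio "Reasoning Mode"
    4,      # Slider "Top-K Documents" (default)
    0.6,    # Slider "Temperature" (default)
    512,    # Slider "Max Answer Length" (default)
)
print(sources)
print(answer)
```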