arsiba committed on
Commit
2bfc379
·
1 Parent(s): f0f3243

feat: add pdf links

Browse files
Files changed (1) hide show
  1. app.py +14 -2
app.py CHANGED
@@ -15,6 +15,7 @@ with open("vector_db/metadata.pkl", "rb") as f:
15
  metadata_dict = pickle.load(f)
16
 
17
  ST = SentenceTransformer("BAAI/bge-large-en-v1.5")
 
18
 
19
  model_id = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
20
  bnb = BitsAndBytesConfig(
@@ -59,9 +60,18 @@ def make_prompt(q, docs, reasoning_mode):
59
  prompt += f"Instruct: {SYS} {q} based on the following documents:\n{context}\nOutput:"
60
  return prompt
61
 
 
 
 
 
 
 
 
 
62
  @spaces.GPU()
63
  def qa_fn(question, reasoning_mode, top_k, temperature, max_tokens):
64
  docs, file_sources = retrieve(question, top_k)
 
65
  prompt = make_prompt(question, docs, reasoning_mode)[:8000]
66
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
67
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
@@ -80,10 +90,12 @@ def qa_fn(question, reasoning_mode, top_k, temperature, max_tokens):
80
  output += tok
81
  if "</think>" in output:
82
  output = output.split("</think>", 1)[1].strip()
83
- return output, file_sources
84
 
85
  outputs_answer = gr.Textbox(label="Answer")
86
  outputs_sources = gr.JSON(label="Sources (Used Files)")
 
 
87
 
88
  demo = gr.Interface(
89
  fn=qa_fn,
@@ -94,7 +106,7 @@ demo = gr.Interface(
94
  gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Temperature"),
95
  gr.Slider(64, 1024, value=512, step=64, label="Max Answer Length")
96
  ],
97
- outputs=[outputs_answer, outputs_sources],
98
  title="GDPR Legal Assistant",
99
  description="Ask any question about GDPR or EDPB documents. The response includes used files and chunks.",
100
  allow_flagging="never"
 
15
  metadata_dict = pickle.load(f)
16
 
17
  ST = SentenceTransformer("BAAI/bge-large-en-v1.5")
18
+ github_base_url = "https://github.com/arsiba/EDPB-AI/blob/main/"
19
 
20
  model_id = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
21
  bnb = BitsAndBytesConfig(
 
60
  prompt += f"Instruct: {SYS} {q} based on the following documents:\n{context}\nOutput:"
61
  return prompt
62
 
63
+ def build_markdown_links(file_input):
64
+ lines = []
65
+ for idx, item in enumerate(file_input, start=1):
66
+ url = f"{github_base_url}/{item['directory']}/{item['source']}"
67
+ line = f"**Source {idx}:** [{item['source']}]({url}) on page {item['page']}"
68
+ lines.append(line)
69
+ return "\n\n".join(lines)
70
+
71
  @spaces.GPU()
72
  def qa_fn(question, reasoning_mode, top_k, temperature, max_tokens):
73
  docs, file_sources = retrieve(question, top_k)
74
+ file_links = build_markdown_links(file_sources)
75
  prompt = make_prompt(question, docs, reasoning_mode)[:8000]
76
  inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
77
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
 
90
  output += tok
91
  if "</think>" in output:
92
  output = output.split("</think>", 1)[1].strip()
93
+ return output, file_sources, file_links
94
 
95
  outputs_answer = gr.Textbox(label="Answer")
96
  outputs_sources = gr.JSON(label="Sources (Used Files)")
97
+ outputs_link = gr.Markdown(label="Source Link")
98
+
99
 
100
  demo = gr.Interface(
101
  fn=qa_fn,
 
106
  gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Temperature"),
107
  gr.Slider(64, 1024, value=512, step=64, label="Max Answer Length")
108
  ],
109
+ outputs=[outputs_answer, outputs_sources, outputs_link],
110
  title="GDPR Legal Assistant",
111
  description="Ask any question about GDPR or EDPB documents. The response includes used files and chunks.",
112
  allow_flagging="never"