File size: 5,250 Bytes
4b5a1de
baadb7f
a550123
baadb7f
 
45cea24
4789e1c
baadb7f
7568c7e
fe562e2
2c2e382
ff30c53
 
 
 
ae88fb1
ff30c53
ae88fb1
a550123
ff30c53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c2e382
ff30c53
7d022f0
e94e758
 
5608476
7d022f0
b8a20df
8d1e3a7
7d7514a
f0f3243
2c2e382
f6cf353
f0f3243
7d7514a
f6cf353
c65b0b0
 
7d7514a
4b5a1de
2c2e382
21c3afc
f6cf353
21c3afc
5608476
f6cf353
0d615a1
2bfc379
 
 
 
 
 
fc337a7
b2e555c
2c2e382
b1ac4ea
 
 
 
 
 
 
fc337a7
b1ac4ea
f0f3243
596e351
 
2bfc379
b1ac4ea
21c3afc
5608476
7d7514a
7568c7e
5608476
7d7514a
 
 
 
 
 
 
5608476
7568c7e
 
 
21c3afc
 
5608476
45cea24
5608476
b2e555c
5608476
d9ee889
b2e555c
 
2e62836
b2e555c
 
 
2c2e382
4eb4fdb
b2e555c
4d9e44c
3f8e148
b2e555c
 
 
4d9e44c
b314e98
d9ee889
b314e98
d9ee889
b314e98
d9ee889
b314e98
 
b2e555c
6c7b7ca
b2e555c
b314e98
b2e555c
2e62836
b2e555c
ae88fb1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import spaces
import pickle
import numpy as np
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer
from sentence_transformers import SentenceTransformer
import gradio as gr
from threading import Thread

# Load the prebuilt FAISS index plus the pickled chunk texts and per-chunk
# metadata from the local vector_db/ directory. Both pickled containers are
# looked up with the ids returned by index.search (see retrieve).
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe while vector_db/*.pkl are trusted artifacts shipped with the app.
index = faiss.read_index("vector_db/index.faiss")
with open("vector_db/chunks.pkl", "rb") as f:
    chunks = pickle.load(f)
with open("vector_db/metadata.pkl", "rb") as f:
    metadata_dict = pickle.load(f)

# Sentence-embedding model used to encode search queries for the FAISS lookup.
# NOTE(review): presumably the same model that produced the indexed vectors — confirm.
ST = SentenceTransformer("BAAI/bge-large-en-v1.5")
# URL prefix used by build_markdown_links when rendering source links.
github_base_url = "https://github.com/arsiba/EDPB-AI/blob/main/" 

# Hugging Face model id of the causal language model that writes the answers.
model_id = "HuggingFaceH4/zephyr-7b-beta"
# bitsandbytes quantization settings: 4-bit NF4 weights with double
# quantization, using bfloat16 as the compute dtype.
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Reuse the end-of-sequence token id as the padding token id.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Load the quantized model at import time; device_map={"": 0} places the whole
# model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)

# Instruction text that make_prompt places in front of the user's question.
# The pieces are adjacent string literals, which Python joins without any
# separator, so every sentence carries its own trailing space.
SYS = (
    "You are a legal AI assistant specialized in GDPR/EDPB. "
    "If you cannot find an answer in the context, it's okay to speculate. But if so, make it clear. "
    "Answer this Question:"
)

@spaces.GPU()
def retrieve(q, k=3):
    """Return the stored chunks closest to a query, with their metadata.

    Args:
        q: Query text; it is embedded with the module-level ``ST`` model.
        k: Number of neighbours to request from the FAISS index. Coerced to
            ``int`` because UI sliders may deliver the value as a float.

    Returns:
        A ``(docs, file_sources)`` tuple. ``docs`` is a list of
        ``{"title": metadata, "pages": chunk}`` dicts and ``file_sources`` is
        the list of the same metadata entries, both in search-result order.
        Fewer than ``k`` entries are returned when the index cannot supply
        ``k`` neighbours.
    """
    emb = ST.encode(q)
    query = np.array([emb], dtype="float32")
    _, neighbours = index.search(query, int(k))
    docs, file_sources = [], []
    for i in neighbours[0]:
        # FAISS pads the result row with -1 when fewer than k vectors are
        # available; using it as an index would silently pick the last chunk.
        if i < 0:
            continue
        meta = metadata_dict[i]
        docs.append({"title": meta, "pages": chunks[i]})
        file_sources.append(meta)
    return docs, file_sources


def make_prompt(q, docs):
    """Assemble the generation prompt from the question and retrieved docs.

    Each doc contributes a "Title: ... / Pages: ..." section; the sections are
    separated by blank lines and embedded after the ``SYS`` instruction and
    the question. The prompt ends with the "Output:" cue.
    """
    sections = []
    for doc in docs:
        sections.append(f"Title: {doc['title']}\nPages: {doc['pages']}")
    context = "\n\n".join(sections)

    header = "detailed thinking off\n"
    body = f"Instruct: {SYS} {q} based on the following documents:\n{context}\nOutput:"
    return header + body

def build_markdown_links(file_input, base_url=None):
    """Render one Markdown source link per metadata entry.

    Args:
        file_input: Iterable of mappings providing the ``"directory"``,
            ``"source"`` and ``"page"`` keys.
        base_url: Prefix for the generated links. Defaults to the
            module-level ``github_base_url``.

    Returns:
        The link lines joined by blank lines; an empty string when
        ``file_input`` is empty.
    """
    if base_url is None:
        base_url = github_base_url
    # The configured base already ends in "/"; strip it so that joining the
    # path segments with "/" does not produce ".../main//dir/file" URLs.
    root = base_url.rstrip("/")
    lines = []
    for idx, item in enumerate(file_input, start=1):
        url = f"{root}/{item['directory']}/{item['source']}"
        lines.append(f"**Source {idx}:** [{item['source']}]({url}) on page {item['page']}")
    return "\n\n".join(lines)


def build_markdown_chunks(docs):
    """Format the retrieved chunks as numbered Markdown sections.

    Each entry of ``docs`` must provide ``doc['title']['source']``,
    ``doc['title']['page']`` and ``doc['pages']``. Sections are numbered from
    1 and separated by blank lines; empty input yields an empty string.
    """
    def render(position, doc):
        meta = doc['title']
        return f"**Chunk {position}:** {meta['source']} on page {meta['page']}\n\n{doc['pages']}"

    return "\n\n".join(render(n, doc) for n, doc in enumerate(docs, start=1))

@spaces.GPU()
def qa_fn(faiss_search, question, top_k, temperature, max_tokens):
    """Retrieve supporting chunks and generate an answer to the question.

    Args:
        faiss_search: Free-text description used for the FAISS document search.
        question: Question inserted into the generation prompt.
        top_k: Number of chunks to retrieve (coerced to ``int``).
        temperature: Sampling temperature. Values above 0 enable sampling;
            otherwise greedy decoding is used.
        max_tokens: Upper bound on newly generated tokens (coerced to ``int``).

    Returns:
        Six strings matching the interface outputs: the answer heading, the
        generated answer, the documents heading, the Markdown source links,
        the context heading and the Markdown rendering of the used chunks.
    """
    max_prompt_chars = 8000
    # Cue that make_prompt appends at the very end of the prompt.
    prompt_suffix = "\nOutput:"

    docs, file_sources = retrieve(faiss_search, int(top_k))
    file_links = build_markdown_links(file_sources)
    markdown_chunks = build_markdown_chunks(docs)

    prompt = make_prompt(question, docs)
    if len(prompt) > max_prompt_chars:
        # Cutting the tail blindly would drop the "Output:" cue; trim the
        # context instead and re-attach the cue.
        prompt = prompt[:max_prompt_chars - len(prompt_suffix)] + prompt_suffix

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # skip_prompt=True makes the streamer yield only newly generated text, so
    # the answer does not have to be located by searching for "Output:",
    # which could also occur inside the retrieved documents.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # temperature/top_p only take effect in sampling mode; with
    # do_sample=False the Temperature slider would be ignored.
    sample = temperature > 0
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": int(max_tokens),
        "do_sample": sample,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if sample:
        generation_kwargs["temperature"] = temperature
        generation_kwargs["top_p"] = 0.9

    worker = Thread(target=model.generate, kwargs=generation_kwargs)
    worker.start()
    output = "".join(streamer).strip()
    worker.join()
    return "\n# Generated Answer\n", output,"\n# Used Documents\n",  file_links, "\n# Used Context\n", markdown_chunks

# Output components referenced by the Interface's `outputs` list. The
# heading_* Markdown components receive the "# ..." section titles returned by
# qa_fn; the outputs_* components receive the answer, links and chunks.
heading_answer  = gr.Markdown(label="Answer Heading")
outputs_answer = gr.Textbox(label="Answer")
heading_links  = gr.Markdown(label="Links Heading")
heading_chunks = gr.Markdown(label="Chunks Heading")
outputs_link = gr.Markdown(label="Source Link")
outputs_chunks = gr.Markdown(label="Used Chunks")

# Gradio UI: two text inputs (document search text and question), three
# advanced sliders, and six outputs. Inputs followed by additional_inputs are
# passed to qa_fn positionally, and its six return values fill `outputs` in
# list order.
demo = gr.Interface(
    fn=qa_fn,
    inputs=[
        gr.Textbox(lines=4, label="What Documents are you looking for?", placeholder="Please change to get proper results:\nDocuments covering the EDPB’s stance on automated decision-making, particularly profiling, under the GDPR. Guidelines on how organizations should inform data subjects about automated decisions and the rights of individuals to object to such decisions."),
        gr.Textbox(lines=1, label="What is your question?", placeholder="Please change to get proper results:\nWhat does the EDPB recommend regarding automated decision-making and profiling under the GDPR, and what rights do individuals have in relation to such decisions?"),
    ],
    additional_inputs=[
        gr.Slider(1, 10, value=7, step=1, label="Top-K Documents"),
        gr.Slider(0.1, 1.0, value=0.6, step=0.05, label="Temperature"),
        gr.Slider(64, 1024, value=512, step=64, label="Max Answer Length")
    ],
    additional_inputs_accordion="Advanced Options",
    outputs=[
        heading_answer,
        outputs_answer,
        heading_links,
        outputs_link,
        heading_chunks,
        outputs_chunks
    ],
    title="GDPR Legal Assistant",
    description="Ask any question about GDPR or EDPB documents.",
    allow_flagging="never",
    fill_width=True,
)

# Start the Gradio app only when this file is executed as a script;
# share=True asks Gradio to also create a public share link.
if __name__ == "__main__":
    demo.launch(share=True)