# import os
# # Set CUDA device dynamically
# os.environ["CUDA_VISIBLE_DEVICES"] = "5"
import spaces
import torch
import transformers
import gradio as gr
from ragatouille import RAGPretrainedModel
from huggingface_hub import InferenceClient
import re
from datetime import datetime
import json
import os
import arxiv

from utils import get_md_text_abstract, search_cleaner, get_arxiv_live_search, make_demo, make_doc_prompt, load_llama_guard, moderate, LLM

# Lazily-loaded LLM and the name of the model currently loaded into it
MODEL, CURRENT_MODEL = None, None

retrieve_results = 10
show_examples = True
llm_models_to_choose = ['Trust-Align-Qwen2.5', "meta-llama/Meta-Llama-3-8B-Instruct", 'None']

llm_location_map = {
    "Trust-Align-Qwen2.5": os.getenv("MODEL_NAME"),
    "meta-llama/Meta-Llama-3-8B-Instruct": "meta-llama/Meta-Llama-3-8B-Instruct",  # "Qwen/Qwen2.5-7B-Instruct"
    "None": None,
}

generate_kwargs = dict(
    temperature=0.1,
    max_new_tokens=512,
    top_p=1.0,
    do_sample=True,
)

# Load Llama Guard for prompt moderation
llama_guard, llama_guard_tokenizer, UNSAFE_TOKEN_ID = load_llama_guard("meta-llama/Llama-Guard-3-1B")

## RAG retriever (ColBERT index over arXiv abstracts)
RAG = RAGPretrainedModel.from_index("colbert/indexes/arxiv_colbert", n_gpu=0)
try:
    gr.Info("Setting up retriever, please wait...")
    rag_initial_output = RAG.search("what is Mistral?", k=1)
    gr.Info("Retriever working successfully!")
except Exception:
    gr.Warning("Retriever not working!")


def choose_llm(chosen_llm):
    global MODEL, CURRENT_MODEL
    try:
        gr.Info("Setting up LLM, please wait...")
        MODEL = LLM(llm_location_map[chosen_llm], use_vllm=False)
        CURRENT_MODEL = chosen_llm
        gr.Info("LLM working successfully!")
    except Exception as e:
        raise RuntimeError("Failed to load the LLM model.") from e


choose_llm(llm_models_to_choose[0])

# Prompt templates used for generation
try:
    with open("rejection_full.json") as f:
        prompt_data = json.load(f)
except FileNotFoundError:
    raise RuntimeError("Prompt data file 'rejection_full.json' not found.")
except json.JSONDecodeError:
    raise RuntimeError("Failed to decode 'rejection_full.json'.")

## Header
mark_text = '# 🔍 Search Results\n'
header_text = "# 🤖 Trust-Align: Measuring and Enhancing Trustworthiness of LLMs in RAG through Grounded Attributions and Learning to Refuse\n \n"

try:
    with open("README.md", "r") as f:
        mdfile = f.read()
    date_pattern = r'Index Last Updated : \d{4}-\d{2}-\d{2}'
    match = re.search(date_pattern, mdfile)
    date = match.group().split(': ')[1]
    formatted_date = datetime.strptime(date, '%Y-%m-%d').strftime('%d %b %Y')
    header_text += f'Index Last Updated: {formatted_date}\n'
    index_info = f"Semantic Search - up to {formatted_date}"
except Exception:
    index_info = "Semantic Search"

database_choices = [index_info, 'Arxiv Search - Latest - (EXPERIMENTAL)']

## Arxiv API
arx_client = arxiv.Client()
is_arxiv_available = True
check_arxiv_result = get_arxiv_live_search("What is Mistral?", arx_client, retrieve_results)
if len(check_arxiv_result) == 0:
    is_arxiv_available = False
    print("Arxiv search not working, switching to default search ...")
    database_choices = [index_info]

## Sample outputs shown as placeholders in the UI
if show_examples:
    with open("sample_outputs.json", "r") as f:
        sample_outputs = json.load(f)
    output_placeholder = sample_outputs['output_placeholder']
    md_text_initial = sample_outputs['search_placeholder']
else:
    output_placeholder = None
    md_text_initial = ''


def rag_cleaner(inp):
    rank = inp['rank']
    title = inp['document_metadata']['title']
    content = inp['content']
    date = inp['document_metadata']['_time']
    return f"{rank}. {title} \n Date : {date} \n Abstract: {content}"


def get_references(question, retriever, k=retrieve_results):
    rag_out = retriever.search(query=question, k=k)
    return rag_out


def get_rag(message):
    return get_references(message, RAG)


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    header = gr.Markdown(header_text)

    with gr.Group():
        msg = gr.Textbox(label='Search', placeholder='What is Mistral?')

        with gr.Accordion("Advanced Settings", open=False):
            with gr.Row(equal_height=True):
                llm_model = gr.Dropdown(choices=llm_models_to_choose, value='Trust-Align-Qwen2.5', label='LLM Model')
                llm_results = gr.Slider(minimum=1, maximum=retrieve_results, value=3, step=1, interactive=True, label="Top n results as context")
            database_src = gr.Dropdown(choices=database_choices, value=index_info, label='Search Source')
            stream_results = gr.Checkbox(value=True, label="Stream output", visible=False)

    output_text = gr.Textbox(show_label=True, container=True, label='LLM Answer', visible=True, placeholder=output_placeholder)
    input = gr.Textbox(visible=False)  # hidden field that carries the constructed prompt between callbacks
    gr_md = gr.Markdown(mark_text + md_text_initial)

    # @spaces.GPU(duration=60)
    def update_with_rag_md(message, llm_results_use=5, database_choice=index_info, llm_model_picked='Trust-Align-Qwen2.5'):
        chat_round = [
            {
                "role": "user",
                "content": [{"type": "text", "text": message}],
            }
        ]
        # Llama Guard safety check on the user query
        prompt_safety = moderate(chat_round, llama_guard, llama_guard_tokenizer, UNSAFE_TOKEN_ID)['generated_text']
        # prompt_safety = "safe"  # debug override: uncomment to bypass the safety check

        if prompt_safety == "safe":
            docs = []
            database_to_use = database_choice
            if database_choice == index_info:
                rag_out = get_rag(message)
            else:
                arxiv_search_success = True
                try:
                    rag_out = get_arxiv_live_search(message, arx_client, retrieve_results)
                    if len(rag_out) == 0:
                        arxiv_search_success = False
                except Exception:
                    arxiv_search_success = False

                if not arxiv_search_success:
                    gr.Warning("Arxiv Search not working, switching to semantic search ...")
                    rag_out = get_rag(message)
                    database_to_use = index_info

            md_text_updated = mark_text
            # Live arXiv search may return fewer hits than retrieve_results
            for i in range(min(retrieve_results, len(rag_out))):
                rag_answer = rag_out[i]
                if i < llm_results_use:
                    md_text_paper, doc = get_md_text_abstract(rag_answer, source=database_to_use, return_prompt_formatting=True)
                    docs.append(doc)
                    md_text_paper = md_text_paper.strip("###")
                    md_text_updated += f"### [{i+1}] {md_text_paper}"
                # else:
                #     md_text_paper = get_md_text_abstract(rag_answer, source=database_to_use)
                #     md_text_updated += md_text_paper

            infer_item = {
                "question": message,
                "docs": docs,
            }
            prompt = make_demo(
                infer_item,
                prompt=prompt_data["demo_prompt"],
                ndoc=llm_results_use,
                doc_prompt=prompt_data["doc_prompt"],
                instruction=prompt_data["instruction"],
                test=True,
            )
        else:
            md_text_updated = mark_text + "### Invalid search query!"
            prompt = ""

        return md_text_updated, prompt

    @spaces.GPU(duration=60)
    def ask_llm(prompt, llm_model_picked='Trust-Align-Qwen2.5', stream_outputs=False):
        model_disabled_text = "LLM model is disabled"
        output = ""

        if llm_model_picked == 'None':
            if stream_outputs:
                for out in model_disabled_text:
                    output += out
                    yield output
            else:
                # This function is a generator, so the message must be yielded
                # (a value returned from a generator is discarded by Gradio).
                yield model_disabled_text
            return

        global MODEL
        if llm_model_picked != CURRENT_MODEL:
            # Free the previously loaded model before switching
            del MODEL
            import gc
            gc.collect()
            torch.cuda.empty_cache()
            choose_llm(llm_model_picked)

        try:
            stream = MODEL.generate(prompt, generate_kwargs["max_new_tokens"])
        except Exception:
            gr.Warning("LLM inference failed, try again later!")
            yield ""
            return

        if stream_outputs:
            for response in stream:
                output += response
                yield output
        else:
            # Consume the stream fully, then yield the complete answer once
            for response in stream:
                output += response
            yield output

    msg.submit(update_with_rag_md, [msg, llm_results, database_src, llm_model], [gr_md, input]).success(ask_llm, [input, llm_model, stream_results], output_text)

demo.queue().launch()