|
import pandas as pd |
|
import requests |
|
from pydantic import Field, BaseModel |
|
|
|
from omegaconf import OmegaConf |
|
|
|
from vectara_agentic.agent import Agent |
|
from vectara_agentic.tools import ToolsFactory, VectaraToolFactory |
|
|
|
# Greeting text. It is not referenced by the functions visible in this file;
# presumably the UI layer shows it as the assistant's opening message --
# confirm against the caller.
initial_prompt = "How can I help you today?"

# Custom summarization prompt, passed as `vectara_prompt_text` when the
# `ask_publications` RAG tool is created in create_assistant_tools().
# The `#foreach` / `#if` / `#end` directives and the `$vectara...` / `$qResult`
# variables are not expanded by this module; they are sent as-is.
# NOTE(review): this looks like the Velocity-style template syntax used by
# Vectara custom prompts -- confirm against the Vectara prompt documentation
# before editing any directive or variable name.
# The text is runtime data: whitespace and punctuation inside the literal are
# part of what is sent, so edit with care.
prompt = """
[
{"role": "system", "content": "You are an AI assistant that forms a coherent answer to a user query based on search results that are provided to you." },
{"role": "user", "content": "
[INSTRUCTIONS]
If the search results are irrelevant to the question respond with *** I do not have enough information to answer this question.***
Search results may include tables in a markdown format. When answering a question using a table be careful about which rows and columns contain the answer and include all relevant information from the relevant rows and columns that the query is asking about.
Do not base your response on information or knowledge that is not in the search results.
Make sure your response is answering the query asked. If the query is related to an entity (such as a person or place), make sure you use search results related to that entity.
Consider that each search result is a partial segment from a bigger text, and may be incomplete.
Your output should always be in a single language - the $vectaraLangName language. Check spelling and grammar for the $vectaraLangName language.
Search results for the query *** $vectaraQuery***, are listed below, some are text, some MAY be tables in markdown format.
#foreach ($qResult in $vectaraQueryResultsDeduped)
[$esc.java($foreach.index + 1)]
#if($qResult.hasTable())
Table Title: $qResult.getTable().title() || Table Description: $qResult.getTable().description() || Table Data:
$qResult.getTable().markdown()
#else
$qResult.getText()
#end
#end
Generate a coherent response (but no more than $vectaraOutChars characters) to the query *** $vectaraQuery *** using information and facts in the search results provided.
Give a slight preference to search results that appear earlier in the list.
Include statistical and numerical evidence to support and contextualize your response.
Only cite relevant search results in your answer following these specific instructions: $vectaraCitationInstructions
If the search results are irrelevant to the query, respond with ***I do not have enough information to answer this question.***. Respond always in the $vectaraLangName language, and only in that language."}
]
"""
|
|
|
def create_assistant_tools(cfg):
    """Build the list of tools given to the publications agent.

    Two Vectara-backed tools are created from ``cfg.api_key`` and
    ``cfg.corpus_key``: ``ask_publications`` (via ``create_rag_tool``) and
    ``search_publications`` (via ``create_search_tool``). Both use the same
    chained reranker and sentence-context settings.

    Args:
        cfg: Configuration object exposing ``api_key`` and ``corpus_key``
            attributes.

    Returns:
        The result of ``ToolsFactory().standard_tools()`` concatenated with
        ``[ask_publications, search_publications]``.
    """

    class QueryPublicationsArgs(BaseModel):
        # NOTE(review): the second example reads as garbled ("was use on the
        # and"). It is kept verbatim because it is part of the schema text
        # handed to the tool factory.
        query: str = Field(
            ...,
            description="The user query, always in the form of a question",
            examples=[
                "what are the risks reported?",
                "which drug was use on the and how big was the population?",
            ],
        )

    def retrieval_options():
        # Returns fresh objects on every call so the two tools never share
        # one mutable reranker configuration.
        return {
            "reranker": "chain",
            "rerank_k": 100,
            "rerank_chain": [
                {"type": "multilingual_reranker_v1"},
                {"type": "mmr", "diversity_bias": 0.2, "limit": 50},
            ],
            "n_sentences_before": 2,
            "n_sentences_after": 2,
            "lambda_val": 0.005,
        }

    vectara_tools = VectaraToolFactory(
        vectara_api_key=cfg.api_key,
        vectara_corpus_key=cfg.corpus_key,
    )
    summarizer_name = 'vectara-summary-table-md-query-ext-jan-2025-gpt-4o'

    # RAG tool: retrieval plus generated answer using the module-level prompt.
    rag_tool = vectara_tools.create_rag_tool(
        tool_name="ask_publications",
        tool_description=(
            "\nResponds to an user question about a particular result, "
            "based on the publications.\n"
        ),
        tool_args_schema=QueryPublicationsArgs,
        **retrieval_options(),
        summary_num_results=15,
        vectara_summarizer=summarizer_name,
        include_citations=True,
        vectara_prompt_text=prompt,
        save_history=True,
        verbose=False,
    )

    # Search tool: retrieval only, same ranking settings as the RAG tool.
    search_tool = vectara_tools.create_search_tool(
        tool_name="search_publications",
        tool_description="\nReturns matching publications to a user query.\n",
        tool_args_schema=QueryPublicationsArgs,
        **retrieval_options(),
        save_history=True,
        verbose=True,
    )

    standard = ToolsFactory().standard_tools()
    return standard + [rag_tool, search_tool]
|
|
|
def initialize_agent(_cfg, agent_progress_callback=None):
    """Create the drug-trial publications agent.

    Builds the tool list with ``create_assistant_tools(_cfg)``, constructs an
    ``Agent`` with the topic "Drug trials publications" and the custom
    instructions below, calls ``report()`` on it, and returns it.

    Args:
        _cfg: Configuration object forwarded unchanged to
            ``create_assistant_tools``.
        agent_progress_callback: Optional callback forwarded unchanged to
            ``Agent``; defaults to ``None``.

    Returns:
        The constructed ``Agent`` instance.
    """
    # NOTE(review): the instruction text is runtime data and is kept verbatim,
    # including the "retreive" spelling.
    instructions = """
- You are an expert statistician and clinical trial data analyst with extensive experience in designing, analyzing, and interpreting clinical research data.
- Your responses should be technically rigorous, data-driven, and written for an audience familiar with advanced statistical methodologies, regulatory standards, and the nuances of clinical trial design.
- Call the ask_publications tool to retreive information to answer the user query.
If the initial query lacks comprehensive data, continue to query ask_publications with refined search parameters until you retrieve all necessary numerical details
- Call the search_publications tool to retreive a list of publications that may contain the information needed to answer the user query.
The results include the document_id of each publication, and metadata.
- When responding to queries:
1) Use precise statistical terminology (e.g., randomization, blinding, intention-to-treat, type I/II error, p-values, confidence intervals, Bayesian methods, etc.)
and reference common methodologies or guidelines where applicable (e.g., CONSORT, FDA, EMA).
2) Your responses must include contextual information such as sample size and population characteristics. This nuance is crucial in clinical trial analysis.
When considering or reporting sample sizes, consider participants who were eligible for the study, those who were randomized, and those who completed the study.
If it's unclear which one is being referred to, clarify this in your response or ask the user for clarification.
3) Provide clear explanations of statistical concepts, including assumptions, potential biases, and limitations in the context of clinical trial data.
4) Ensure that your analysis is evidence-based and reflects current best practices in the field of clinical research and data analysis.
5) Before finalizing your answer, review the analysis to ensure that all relevant data has been incorporated and that your conclusions are well-supported by the evidence.
6) Provide sources and citations for all data and statistical information included in your responses, as provided in the response from the tools.
"""

    toolset = create_assistant_tools(_cfg)
    assistant = Agent(
        tools=toolset,
        topic="Drug trials publications",
        custom_instructions=instructions,
        agent_progress_callback=agent_progress_callback,
    )
    assistant.report()
    return assistant