Spaces:

OrganizedProgrammers
/

kig_test

Sleeping

App Files Community

adrienbrdne committed on Apr 15

Commit

dbe6919

verified ·

1 Parent(s): 79c56bc

Update ki_gen/data_retriever.py

Browse files

Files changed (1) hide show

ki_gen/data_retriever.py +44 -36

ki_gen/data_retriever.py CHANGED Viewed

@@ -7,9 +7,9 @@ from random import shuffle, sample
 from langgraph.checkpoint.sqlite import SqliteSaver
 # Remove ChatGroq import
-# from langchain_groq import ChatGroq
 # Add ChatGoogleGenerativeAI import
-from langchain_google_genai import ChatGoogleGenerativeAI
 import os # Add os import
 from langchain_openai import ChatOpenAI
@@ -21,21 +21,20 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.pydantic_v1 import Field
 from pydantic import BaseModel
-from neo4j import GraphDatabase
 from langgraph.graph import StateGraph
 from llmlingua import PromptCompressor
 from ki_gen.prompts import (
-    CYPHER_GENERATION_PROMPT,
     CONCEPT_SELECTION_PROMPT,
     BINARY_GRADER_PROMPT,
     SCORE_GRADER_PROMPT,
     RELEVANT_CONCEPTS_PROMPT,
 )
 # Import get_model which now handles Gemini
-from ki_gen.utils import ConfigSchema, DocRetrieverState, get_model, format_doc
 # ... (extract_cypher remains the same)
@@ -99,7 +98,7 @@ def get_concepts(graph: Neo4jGraph):
 def get_related_concepts(graph: Neo4jGraph, question: str):
     concepts = get_concepts(graph)
     # Use get_model
-    llm = get_model()
     print(f"this is the llm variable : {llm}")
     def parse_answer(llm_answer : str):
         try:
@@ -113,7 +112,7 @@ def get_related_concepts(graph: Neo4jGraph, question: str):
     print(f"This is the question of the user : {question}")
     print(f"This is the concepts of the user : {concepts}")
     # Remove specific Groq error handling block
     try:
         related_concepts_raw = related_concepts_chain.invoke({"user_query" : question, "concepts" : '\n'.join(concepts)})
@@ -148,7 +147,7 @@ def build_concept_string(graph: Neo4jGraph, concept_list: list[str]):
 MATCH (c:Concept {{name: "{concept}" }}) RETURN c.description
 """
         concept_description = graph.query(concept_description_query)[0]['c.description']
-        concept_string += f"name: {concept}\ndescription: {concept_description}\n\n"
     return concept_string
 def get_global_concepts(graph: Neo4jGraph):
@@ -167,12 +166,20 @@ def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
     """
     The node where the cypher is generated
     """
-    #graph = config["configurable"].get("graph")
-    NEO4J_URI = "neo4j+s://4985272f.databases.neo4j.io"
-    NEO4J_USERNAME = "neo4j"
-    NEO4J_PASSWORD = os.getenv("neo4j_password")
-    graph = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
-    question = state['query']
     related_concepts = get_related_concepts(graph, question)
     cyphers = []
@@ -183,15 +190,18 @@ def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
             "question": question,
             "concepts": related_concepts
         })
     # Remove specific Groq error handling block
     try:
         if config["configurable"].get("cypher_gen_method") == 'guided':
             concept_selection_chain = get_concept_selection_chain()
             print(f"Concept selection chain is : {concept_selection_chain}")
             selected_topic = concept_selection_chain.invoke({"question" : question, "concepts": get_concepts(graph)})
             print(f"Selected topic are : {selected_topic}")
-            cyphers = [generate_cypher_from_topic(selected_topic, state['current_plan_step'])]
             print(f"Cyphers are : {cyphers}")
     except Exception as e:
@@ -205,7 +215,7 @@ def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
             corrector_schema = [Schema(el["start"], el["type"], el["end"]) for el in graph.structured_schema.get("relationships", [])]
             cypher_corrector = CypherQueryCorrector(corrector_schema)
             # Apply corrector only if cyphers were generated
-            if cyphers:
                 try:
                     cyphers = [cypher_corrector(cypher) for cypher in cyphers]
                 except Exception as corr_e:
@@ -214,9 +224,10 @@ def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
         else:
             print("Warning: Cypher validation skipped, graph or schema unavailable.")
-    graph.close()
     return {"cyphers" : cyphers}
 # ... (generate_cypher_from_topic, get_docs remain the same)
 def generate_cypher_from_topic(selected_concept: str, plan_step: int):
     """
@@ -232,25 +243,21 @@ def generate_cypher_from_topic(selected_concept: str, plan_step: int):
             cypher_el = "(rp:ResearchPaper) RETURN rp.title, rp.abstract"
         case 2:
             cypher_el = "(ki:KeyIssue) RETURN ki.description"
-    return f"MATCH (c:Concept {{name:'{selected_concept}'}})-[:RELATED_TO]-{cypher_el}"
 def get_docs(state:DocRetrieverState, config:ConfigSchema):
     """
     This node retrieves docs from the graph using the generated cypher
     """
-    #graph = config["configurable"].get("graph")
-    NEO4J_URI = "neo4j+s://4985272f.databases.neo4j.io"
-    NEO4J_USERNAME = "neo4j"
-    NEO4J_PASSWORD = os.getenv("neo4j_password")
-    graph = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
     output = []
     if graph is not None and state.get("cyphers"): # Check if cyphers exist
         for cypher in state["cyphers"]:
             try:
                 output = graph.query(cypher)
                 # Assuming the first successful query is sufficient
-                if output:
-                    break
             except Exception as e:
                 print(f"Failed to retrieve docs with cypher '{cypher}': {e}")
                 # Continue to try next cypher if one fails
@@ -264,13 +271,13 @@ def get_docs(state:DocRetrieverState, config:ConfigSchema):
             for key in doc:
                 if isinstance(doc[key], dict):
                     # If a value is a dict, treat it as a separate document
-                    all_docs.append(doc[key])
                 else:
                     unwinded_doc.update({key: doc[key]})
         # Add the unwinded parts if any keys were not dictionaries
-        if unwinded_doc:
             all_docs.append(unwinded_doc)
     filtered_docs = []
     seen_docs = set() # Use a set for faster duplicate checking based on a unique identifier
@@ -278,7 +285,7 @@ def get_docs(state:DocRetrieverState, config:ConfigSchema):
          # Create a tuple of items to check for duplicates, assuming dicts are hashable
          # If dicts contain unhashable types (like lists), convert them to strings or use a primary key
         try:
-            doc_tuple = tuple(sorted(doc.items()))
             if doc_tuple not in seen_docs:
                 filtered_docs.append(doc)
                 seen_docs.add(doc_tuple)
@@ -290,7 +297,7 @@ def get_docs(state:DocRetrieverState, config:ConfigSchema):
                   filtered_docs.append(doc)
                   seen_docs.add(doc_str)
-    graph.close()
     return {"docs": filtered_docs}
@@ -385,13 +392,13 @@ def eval_doc(doc, query, method="binary", threshold=0.7, eval_model="gemini-2.0-
 # Update default model
 def eval_docs(state: DocRetrieverState, config: ConfigSchema):
     """
-    This node performs evaluation of the retrieved docs and
     """
     eval_method =  config["configurable"].get("eval_method") or "binary"
     MAX_DOCS = config["configurable"].get("max_docs") or 15
     # Update default model name
-    eval_model_name = config["configurable"].get("eval_model") or "gemini-2.0-flash"
     valid_doc_scores = []
     # Ensure 'docs' exists and is a list
@@ -419,7 +426,7 @@ def eval_docs(state: DocRetrieverState, config: ConfigSchema):
         score = eval_doc(
                         doc=formatted_doc_str,
-                        query=state["query"],
                         method=eval_method,
                         threshold=config["configurable"].get("eval_threshold") or 0.7,
                         eval_model=eval_model_name # Pass the eval_model name
@@ -431,7 +438,7 @@ def eval_docs(state: DocRetrieverState, config: ConfigSchema):
         else:
              print(f"Warning: Received non-numeric score ({score}) for doc {doc}, skipping.")
     if eval_method == 'score':
         # Get at most MAX_DOCS items with the highest score if score method was used
         valid_docs_sorted = sorted(valid_doc_scores, key=lambda x: x[1], reverse=True) # Sort descending
@@ -454,7 +461,7 @@ def build_data_retriever_graph(memory):
     """
     Builds the data_retriever graph
     """
-    #with SqliteSaver.from_conn_string(":memory:") as memory :
     graph_builder_doc_retriever = StateGraph(DocRetrieverState)
@@ -469,6 +476,7 @@ def build_data_retriever_graph(memory):
     graph_builder_doc_retriever.add_edge("eval_docs", "__end__")
     graph_doc_retriever = graph_builder_doc_retriever.compile(checkpointer=memory)
     return graph_doc_retriever
 # Remove Groq specific error handling function

 from langgraph.checkpoint.sqlite import SqliteSaver
 # Remove ChatGroq import
+# from langchain_groq import ChatGroq
 # Add ChatGoogleGenerativeAI import
+from langchain_google_genai import ChatGoogleGenerativeAI
 import os # Add os import
 from langchain_openai import ChatOpenAI
 from langchain_core.pydantic_v1 import Field
 from pydantic import BaseModel
 from langgraph.graph import StateGraph
 from llmlingua import PromptCompressor
 from ki_gen.prompts import (
+    CYPHER_GENERATION_PROMPT,
     CONCEPT_SELECTION_PROMPT,
     BINARY_GRADER_PROMPT,
     SCORE_GRADER_PROMPT,
     RELEVANT_CONCEPTS_PROMPT,
 )
 # Import get_model which now handles Gemini
+from ki_gen.utils import ConfigSchema, DocRetrieverState, get_model, format_doc
 # ... (extract_cypher remains the same)
 def get_related_concepts(graph: Neo4jGraph, question: str):
     concepts = get_concepts(graph)
     # Use get_model
+    llm = get_model()
     print(f"this is the llm variable : {llm}")
     def parse_answer(llm_answer : str):
         try:
     print(f"This is the question of the user : {question}")
     print(f"This is the concepts of the user : {concepts}")
     # Remove specific Groq error handling block
     try:
         related_concepts_raw = related_concepts_chain.invoke({"user_query" : question, "concepts" : '\n'.join(concepts)})
 MATCH (c:Concept {{name: "{concept}" }}) RETURN c.description
 """
         concept_description = graph.query(concept_description_query)[0]['c.description']
+        concept_string += f"name: {concept}\ndescription: {concept_description}\n\n"
     return concept_string
 def get_global_concepts(graph: Neo4jGraph):
     """
     The node where the cypher is generated
     """
+    graph = config["configurable"].get("graph")
+    # --- Correction Applied Here ---
+    # Use .get() for safer access to 'query'
+    question = state.get('query')
+    if not question:
+        # Handle the case where query is missing
+        print("Error: 'query' key not found in state for generate_cypher node.")
+        # Return an empty list or appropriate error state
+        # This prevents the KeyError and stops processing for this branch if query is missing
+        return {"cyphers": []}
+    # --- End of Correction ---
     related_concepts = get_related_concepts(graph, question)
     cyphers = []
             "question": question,
             "concepts": related_concepts
         })
     # Remove specific Groq error handling block
     try:
         if config["configurable"].get("cypher_gen_method") == 'guided':
             concept_selection_chain = get_concept_selection_chain()
             print(f"Concept selection chain is : {concept_selection_chain}")
+            # Ensure 'current_plan_step' is also safely accessed if needed here, though it's used later
             selected_topic = concept_selection_chain.invoke({"question" : question, "concepts": get_concepts(graph)})
             print(f"Selected topic are : {selected_topic}")
+            # Safely get 'current_plan_step', defaulting to 0 if not found
+            current_plan_step = state.get('current_plan_step', 0)
+            cyphers = [generate_cypher_from_topic(selected_topic, current_plan_step)]
             print(f"Cyphers are : {cyphers}")
     except Exception as e:
             corrector_schema = [Schema(el["start"], el["type"], el["end"]) for el in graph.structured_schema.get("relationships", [])]
             cypher_corrector = CypherQueryCorrector(corrector_schema)
             # Apply corrector only if cyphers were generated
+            if cyphers:
                 try:
                     cyphers = [cypher_corrector(cypher) for cypher in cyphers]
                 except Exception as corr_e:
         else:
             print("Warning: Cypher validation skipped, graph or schema unavailable.")
     return {"cyphers" : cyphers}
 # ... (generate_cypher_from_topic, get_docs remain the same)
 def generate_cypher_from_topic(selected_concept: str, plan_step: int):
     """
             cypher_el = "(rp:ResearchPaper) RETURN rp.title, rp.abstract"
         case 2:
             cypher_el = "(ki:KeyIssue) RETURN ki.description"
+    return f"MATCH (c:Concept {{name:'{selected_concept}'}})-[:RELATED_TO]-{cypher_el}"
 def get_docs(state:DocRetrieverState, config:ConfigSchema):
     """
     This node retrieves docs from the graph using the generated cypher
     """
+    graph = config["configurable"].get("graph")
     output = []
     if graph is not None and state.get("cyphers"): # Check if cyphers exist
         for cypher in state["cyphers"]:
             try:
                 output = graph.query(cypher)
                 # Assuming the first successful query is sufficient
+                if output:
+                    break
             except Exception as e:
                 print(f"Failed to retrieve docs with cypher '{cypher}': {e}")
                 # Continue to try next cypher if one fails
             for key in doc:
                 if isinstance(doc[key], dict):
                     # If a value is a dict, treat it as a separate document
+                    all_docs.append(doc[key])
                 else:
                     unwinded_doc.update({key: doc[key]})
         # Add the unwinded parts if any keys were not dictionaries
+        if unwinded_doc:
             all_docs.append(unwinded_doc)
     filtered_docs = []
     seen_docs = set() # Use a set for faster duplicate checking based on a unique identifier
          # Create a tuple of items to check for duplicates, assuming dicts are hashable
          # If dicts contain unhashable types (like lists), convert them to strings or use a primary key
         try:
+            doc_tuple = tuple(sorted(doc.items()))
             if doc_tuple not in seen_docs:
                 filtered_docs.append(doc)
                 seen_docs.add(doc_tuple)
                   filtered_docs.append(doc)
                   seen_docs.add(doc_str)
     return {"docs": filtered_docs}
 # Update default model
 def eval_docs(state: DocRetrieverState, config: ConfigSchema):
     """
+    This node performs evaluation of the retrieved docs and
     """
     eval_method =  config["configurable"].get("eval_method") or "binary"
     MAX_DOCS = config["configurable"].get("max_docs") or 15
     # Update default model name
+    eval_model_name = config["configurable"].get("eval_model") or "gemini-2.0-flash"
     valid_doc_scores = []
     # Ensure 'docs' exists and is a list
         score = eval_doc(
                         doc=formatted_doc_str,
+                        query=state["query"], # This line assumes "query" exists in state
                         method=eval_method,
                         threshold=config["configurable"].get("eval_threshold") or 0.7,
                         eval_model=eval_model_name # Pass the eval_model name
         else:
              print(f"Warning: Received non-numeric score ({score}) for doc {doc}, skipping.")
     if eval_method == 'score':
         # Get at most MAX_DOCS items with the highest score if score method was used
         valid_docs_sorted = sorted(valid_doc_scores, key=lambda x: x[1], reverse=True) # Sort descending
     """
     Builds the data_retriever graph
     """
+    #with SqliteSaver.from_conn_string(":memory:") as memory :
     graph_builder_doc_retriever = StateGraph(DocRetrieverState)
     graph_builder_doc_retriever.add_edge("eval_docs", "__end__")
     graph_doc_retriever = graph_builder_doc_retriever.compile(checkpointer=memory)
     return graph_doc_retriever
 # Remove Groq specific error handling function