#!/usr/bin/env python
# coding: utf-8

import re
import time
from random import shuffle, sample
from langgraph.checkpoint.sqlite import SqliteSaver

# Remove ChatGroq import
# from langchain_groq import ChatGroq
# Add ChatGoogleGenerativeAI import
from langchain_google_genai import ChatGoogleGenerativeAI
import os # Add os import

from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langchain_community.graphs import Neo4jGraph
from langchain_community.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import Field
from pydantic import BaseModel


from langgraph.graph import StateGraph

from llmlingua import PromptCompressor

from ki_gen.prompts import (
    CYPHER_GENERATION_PROMPT,
    CONCEPT_SELECTION_PROMPT,
    BINARY_GRADER_PROMPT,
    SCORE_GRADER_PROMPT,
    RELEVANT_CONCEPTS_PROMPT,
)
# Import get_model which now handles Gemini
from ki_gen.utils import ConfigSchema, DocRetrieverState, get_model, format_doc


# Helper that pulls candidate Cypher queries out of an LLM answer.
def extract_cypher(text: str) -> list[str]:
    """Extract candidate Cypher queries from a text.

    Args:
        text: Text to extract Cypher code from (typically an LLM answer).

    Returns:
        A list of exactly three candidates, in this order:
        the content of the first fenced block tagged ``cypher``,
        the content of the first untagged fenced block,
        and the raw ``text`` itself.
        A candidate whose fence is not found is replaced by ``text``.
    """
    # Only the first occurrence of each fence style is of interest.
    tagged_fence = re.search(r"```cypher\n(.*?)```", text, re.DOTALL)
    plain_fence = re.search(r"```\n(.*?)```", text, re.DOTALL)
    return [
        tagged_fence.group(1) if tagged_fence else text,
        plain_fence.group(1) if plain_fence else text,
        text,
    ]

# Update default model and use get_model
def get_cypher_gen_chain(model: str = "gemini-2.0-flash"):
    """
    Build the Cypher generation pipeline backed by the given model.

    The pipeline is: generation prompt -> model -> plain string -> list of
    candidate Cypher queries (see extract_cypher).
    It is used when the 'auto' cypher generation method has been configured.
    """
    return (
        CYPHER_GENERATION_PROMPT
        | get_model(model)
        | StrOutputParser()
        | extract_cypher
    )

# Update default model and use get_model
def get_concept_selection_chain(model: str = "gemini-2.0-flash"):
    """
    Build the pipeline that asks the given model to pick the most relevant topic.

    The pipeline is: concept selection prompt -> model -> plain string.
    It is used when the 'guided' cypher generation method has been configured.
    """
    selector_llm = get_model(model)
    print(f"FOUND LLM TOPIC SELECTION FOR THE CONCEPT SELECTION PROMPT : {selector_llm}")
    return CONCEPT_SELECTION_PROMPT | selector_llm | StrOutputParser()

# Concept lookup helpers.
def get_concepts(graph: Neo4jGraph):
    """
    Return the names of all Concept nodes.

    When `graph` is a Neo4jGraph the nodes are read from the database.
    Otherwise the rows are read interactively from the console, in the same
    shape a graph query would return (a list of {'c': {'name': ...}} dicts).
    """
    if not isinstance(graph, Neo4jGraph):
        # NOTE(security): eval() executes whatever is typed at the prompt.
        # Acceptable only for local debugging with trusted input; prefer
        # ast.literal_eval if this fallback is ever exposed to users.
        rows = eval(input("Topics : "))
    else:
        rows = graph.query("MATCH (c:Concept) return c")

    return [row['c']['name'] for row in rows]


# Update to use get_model, remove Groq error handling
def get_related_concepts(graph: Neo4jGraph, question: str):
    """
    Ask the LLM which known concepts are related to `question`.

    Args:
        graph: Graph the concept names are read from (see get_concepts).
        question: The user query.

    Returns:
        A list of concept names that exist in the graph, in the order the LLM
        listed them and without duplicates. The list is empty when the LLM
        call or the parsing of its answer fails.
    """
    concepts = get_concepts(graph)
    llm = get_model()
    print(f"this is the llm variable : {llm}")

    def parse_answer(llm_answer : str):
        """Turn the numbered list following 'Concepts:' into a list of names."""
        try:
            print(f"This the llm_answer : {llm_answer}")
            numbered_list = llm_answer.split("Concepts:")[1]
            # Raw string: "\d" and "\s" are regex escapes, not string escapes.
            items = re.split(r"\n(?:\d)+\.\s", numbered_list)[1:]
            # The last item usually carries a trailing newline; strip so it can
            # still be matched against the concept names.
            return [item.strip() for item in items]
        except Exception as e:
            print(f"Error parsing LLM concept answer: {e}")
            return []  # Return empty list on parsing error

    related_concepts_chain = RELEVANT_CONCEPTS_PROMPT | llm | StrOutputParser() | parse_answer

    print(f"This is the question of the user : {question}")
    print(f"This is the concepts of the user : {concepts}")

    try:
        related_concepts_raw = related_concepts_chain.invoke({"user_query" : question, "concepts" : '\n'.join(concepts)})
        print(f"related_concepts_raw : {related_concepts_raw}")
    except Exception as e:
        print(f"Error invoking related concepts chain: {e}")
        related_concepts_raw = []  # Assign empty list on error

    # Keep only names that really exist, in case the LLM hallucinated some.
    related_concepts_cleaned = []
    for related_concept in related_concepts_raw:
        if not related_concept:
            # An empty string is a substring of every concept name and would
            # wrongly select the first concept.
            continue
        if related_concept in concepts:
            matched = related_concept
        else:
            # The LLM sometimes drops a few words from the concept name:
            # accept the first known concept that contains the generated text.
            matched = next(
                (concept for concept in concepts if related_concept in concept),
                None,
            )
        if matched is not None and matched not in related_concepts_cleaned:
            related_concepts_cleaned.append(matched)

    # TODO : Add concepts found via similarity search
    return related_concepts_cleaned

# Helpers that describe concepts and list global concepts.
def build_concept_string(graph: Neo4jGraph, concept_list: list[str]):
    """
    Build a text block listing each concept with its description.

    Args:
        graph: Graph queried for the `description` property of each Concept node.
        concept_list: Names of the concepts to describe.

    Returns:
        The concatenation of "name: ...\\ndescription: ...\\n\\n" sections, one
        per concept found in the graph. Concepts with no matching node are
        skipped with a warning.
    """
    # The name is passed as a query parameter rather than interpolated, so
    # quotes in a concept name can neither break nor alter the query.
    description_query = "MATCH (c:Concept {name: $name}) RETURN c.description"

    sections = []
    for concept in concept_list:
        rows = graph.query(description_query, params={"name": concept})
        if not rows:
            print(f"Warning: no Concept node named '{concept}' found, skipping.")
            continue
        concept_description = rows[0]['c.description']
        sections.append(f"name: {concept}\ndescription: {concept_description}\n\n")
    return "".join(sections)

def get_global_concepts(graph: Neo4jGraph):
    """
    Return the names of all GlobalConcept nodes.

    When `graph` is a Neo4jGraph the nodes are read from the database.
    Otherwise the rows are read interactively from the console, in the same
    shape a graph query would return (a list of {'gc': {'name': ...}} dicts).
    """
    if not isinstance(graph, Neo4jGraph):
        # NOTE(security): eval() executes whatever is typed at the prompt.
        # Acceptable only for local debugging with trusted input; prefer
        # ast.literal_eval if this fallback is ever exposed to users.
        rows = eval(input("Topics : "))
    else:
        rows = graph.query("MATCH (gc:GlobalConcept) return gc")

    return [row['gc']['name'] for row in rows]

# Update concept selection error handling
def generate_cypher(state: DocRetrieverState, config: ConfigSchema):
    """
    The node where the cypher is generated.

    Reads from config["configurable"]:
        graph: the graph to query.
        cypher_gen_method: 'auto' (LLM writes the Cypher from the schema and
            the related concepts) or 'guided' (LLM picks a concept and the
            query is built by generate_cypher_from_topic).
        validate_cypher: when truthy, run the queries through
            CypherQueryCorrector built from the graph's relationships.

    Returns:
        {"cyphers": [...]} with the candidate queries; the list is empty when
        the state holds no query or when generation fails.
    """
    settings = config["configurable"]
    graph = settings.get("graph")

    question = state.get('query')
    if not question:
        # Without a query there is nothing to generate for this branch.
        print("Error: 'query' key not found in state for generate_cypher node.")
        return {"cyphers": []}

    method = settings.get("cypher_gen_method")
    cyphers = []

    # Both generation methods call the LLM; a failure in either one yields an
    # empty list so the downstream nodes can still run.
    try:
        if method == 'auto':
            # Related concepts are only needed by the 'auto' prompt, so the
            # extra LLM call is made here rather than for every method.
            related_concepts = get_related_concepts(graph, question)
            cypher_gen_chain = get_cypher_gen_chain()
            cyphers = cypher_gen_chain.invoke({
                "schema": graph.schema,
                "question": question,
                "concepts": related_concepts
            })
        elif method == 'guided':
            concept_selection_chain = get_concept_selection_chain()
            print(f"Concept selection chain is : {concept_selection_chain}")
            selected_topic = concept_selection_chain.invoke({"question" : question, "concepts": get_concepts(graph)})
            print(f"Selected topic are : {selected_topic}")
            # Default to the first plan step when the state does not carry one.
            current_plan_step = state.get('current_plan_step', 0)
            cyphers = [generate_cypher_from_topic(selected_topic, current_plan_step)]
            print(f"Cyphers are : {cyphers}")
    except Exception as e:
        print(f"Error during {method} cypher generation: {e}")
        cyphers = []  # Assign empty list on error

    if settings.get("validate_cypher"):
        if graph and hasattr(graph, 'structured_schema'):
            corrector_schema = [Schema(el["start"], el["type"], el["end"]) for el in graph.structured_schema.get("relationships", [])]
            cypher_corrector = CypherQueryCorrector(corrector_schema)
            # Apply corrector only if cyphers were generated
            if cyphers:
                try:
                    cyphers = [cypher_corrector(cypher) for cypher in cyphers]
                except Exception as corr_e:
                    # The uncorrected queries are kept when correction fails.
                    print(f"Error during cypher correction: {corr_e}")
        else:
            print("Warning: Cypher validation skipped, graph or schema unavailable.")

    return {"cyphers" : cyphers}


# Guided-mode query builder and the document retrieval node.
def generate_cypher_from_topic(selected_concept: str, plan_step: int):
    """
    Helper function used when the 'guided' cypher generation method has been configured.

    Args:
        selected_concept: Name of the Concept node to start from (usually the
            raw answer of the concept selection LLM).
        plan_step: Index of the current plan step; selects which kind of node
            related to the concept is returned:
            0 -> TechnicalSpecification, 1 -> ResearchPaper, 2 -> KeyIssue,
            anything else -> any related node.

    Returns:
        A Cypher query string matching nodes RELATED_TO the concept.
    """
    print(f"L.176 PLAN STEP : {plan_step}")

    target_by_step = {
        0: "(ts:TechnicalSpecification) RETURN ts.title, ts.scope, ts.description",
        1: "(rp:ResearchPaper) RETURN rp.title, rp.abstract",
        2: "(ki:KeyIssue) RETURN ki.description",
    }
    cypher_el = target_by_step.get(plan_step, "(n) return n.title, n.description")

    # LLM answers often carry surrounding whitespace, and a concept name may
    # contain quotes: strip it and escape it so the string literal stays valid.
    concept_literal = (
        selected_concept.strip().replace("\\", "\\\\").replace("'", "\\'")
    )
    return f"MATCH (c:Concept {{name:'{concept_literal}'}})-[:RELATED_TO]-{cypher_el}"

def get_docs(state:DocRetrieverState, config:ConfigSchema):
    """
    This node retrieves docs from the graph using the generated cypher.

    The cyphers are tried in order and the first non-empty result is kept.
    Each result row is then flattened: dict values become documents of their
    own, the remaining key/value pairs of the row form one more document.
    Finally duplicates are dropped, keeping the first occurrence.

    Returns {"docs": [...]} with the unique documents.
    """
    graph = config["configurable"].get("graph")

    rows = []
    if graph is not None and state.get("cyphers"):
        for cypher in state["cyphers"]:
            try:
                rows = graph.query(cypher)
            except Exception as e:
                # A failing query is reported and the next one is tried.
                print(f"Failed to retrieve docs with cypher '{cypher}': {e}")
                continue
            if rows:
                # The first query that returns something is considered enough.
                break

    # Flatten the rows: nested dicts are documents, loose values are grouped.
    flattened = []
    for row in rows:
        loose_fields = {}
        if isinstance(row, dict):
            for field, value in row.items():
                if isinstance(value, dict):
                    flattened.append(value)
                else:
                    loose_fields[field] = value
        if loose_fields:
            flattened.append(loose_fields)

    # Drop duplicates, identifying a document by its sorted items.
    unique_docs = []
    fingerprints = set()
    for candidate in flattened:
        try:
            fingerprint = tuple(sorted(candidate.items()))
            is_new = fingerprint not in fingerprints
        except TypeError:
            # Unhashable values (lists, dicts): fall back to the string form,
            # which is a less reliable but usable identity.
            fingerprint = str(sorted(candidate.items()))
            is_new = fingerprint not in fingerprints
        if is_new:
            unique_docs.append(candidate)
            fingerprints.add(fingerprint)

    return {"docs": unique_docs}


# Data model
class GradeDocumentsBinary(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Schema handed to with_structured_output() by get_binary_grader; eval_doc
    # reads `binary_score` and treats 'yes' (case-insensitive) as relevant.
    # NOTE(review): `Field` is imported from langchain_core.pydantic_v1 while
    # `BaseModel` comes from pydantic directly — confirm both resolve to the
    # same pydantic major version, otherwise the description may be ignored.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Update default model and use get_model
def get_binary_grader(model="gemini-2.0-flash"):
    """
    Build the yes/no relevance grader backed by the given model.

    The grader answers with a GradeDocumentsBinary when the model supports
    structured output; otherwise the raw answer is parsed as JSON.
    It is used when the 'binary' evaluation method has been configured.
    """
    grader_llm = get_model(model)
    try:
        # Preferred path: let the model produce the schema directly.
        return BINARY_GRADER_PROMPT | grader_llm.with_structured_output(GradeDocumentsBinary)
    except NotImplementedError:
        print(f"Warning: Model {model} may not support structured output directly for binary grading. Falling back.")
        # Fallback path: parse the plain answer as JSON. The prompt may need
        # to ask for JSON explicitly for this to work.
        from langchain_core.output_parsers import SimpleJsonOutputParser
        return BINARY_GRADER_PROMPT | grader_llm | SimpleJsonOutputParser()


class GradeDocumentsScore(BaseModel):
    """Score for relevance check on retrieved documents."""

    # Schema handed to with_structured_output() by get_score_grader; eval_doc
    # compares `score` against its threshold.
    # NOTE(review): `Field` is imported from langchain_core.pydantic_v1 while
    # `BaseModel` comes from pydantic directly — confirm both resolve to the
    # same pydantic major version, otherwise the description may be ignored.
    score: float = Field(
        description="Documents are relevant to the question, score between 0 (completely irrelevant) and 1 (perfectly relevant)"
    )

# Update default model and use get_model
def get_score_grader(model="gemini-2.0-flash"):
    """
    Build the numeric relevance grader backed by the given model.

    The grader answers with a GradeDocumentsScore when the model supports
    structured output; otherwise the raw answer is parsed as JSON.
    It is used when the 'score' evaluation method has been configured.
    """
    grader_llm = get_model(model)
    try:
        # Preferred path: let the model produce the schema directly.
        return SCORE_GRADER_PROMPT | grader_llm.with_structured_output(GradeDocumentsScore)
    except NotImplementedError:
        print(f"Warning: Model {model} may not support structured output directly for score grading. Falling back.")
        # Fallback path: parse the plain answer as JSON. The prompt may need
        # to ask for JSON explicitly for this to work.
        from langchain_core.output_parsers import SimpleJsonOutputParser
        return SCORE_GRADER_PROMPT | grader_llm | SimpleJsonOutputParser()

# Update default model
def eval_doc(doc, query, method="binary", threshold=0.7, eval_model="gemini-2.0-flash"):
    """
    Grade the relevance of one document to a query with an LLM.

    doc : the document to evaluate
    query : the query to which the doc should be relevant
    method : "binary" or "score"
    threshold : for "score" method, score above which a doc is considered relevant
    eval_model : name of the model used for grading

    Returns 1 (relevant) or 0 (irrelevant) for the "binary" method; for the
    "score" method, the score as a float when it reaches `threshold`, else 0.
    A score that is missing from the answer counts as relevant (1); any error
    raised while grading counts as irrelevant (0).

    Raises ValueError when `method` is neither "binary" nor "score".
    """
    # Validate before the try block: the broad handler below would otherwise
    # swallow this error and silently grade every document as irrelevant.
    if method not in ("binary", "score"):
        raise ValueError("Invalid method")

    try:
        if method == "binary":
            retrieval_grader_binary = get_binary_grader(model=eval_model)
            result = retrieval_grader_binary.invoke({"question": query, "document":doc})
            # The grader returns either the structured model or a parsed dict.
            binary_score = result.binary_score if isinstance(result, GradeDocumentsBinary) else result.get("binary_score", "no")
            return 1 if str(binary_score).strip().lower() == 'yes' else 0

        retrieval_grader_score = get_score_grader(model=eval_model)
        # NOTE(review): this prompt is fed "query" while the binary one is fed
        # "question" — confirm against SCORE_GRADER_PROMPT's input variables.
        result = retrieval_grader_score.invoke({"query": query, "document":doc})
        # The grader returns either the structured model or a parsed dict.
        score = result.score if isinstance(result, GradeDocumentsScore) else result.get("score")
        if score is None:
            print("Warning: Couldn't parse score, marking document as relevant by default.")
            return 1  # Default to relevant if score parsing fails
        # A JSON-parsed score may be a string; return it as a number so the
        # caller's numeric check does not discard it.
        numeric_score = float(score)
        return numeric_score if numeric_score >= threshold else 0
    except Exception as e:
        print(f"Error evaluating document: {e}")
        return 0  # Default to irrelevant on error

# Graph node that grades the retrieved documents.
def eval_docs(state: DocRetrieverState, config: ConfigSchema):
    """
    This node performs evaluation of the retrieved docs and keeps the relevant ones.

    Reads from config["configurable"] (falling back to the given default when
    the value is missing or falsy):
        eval_method: "binary" (default) or "score".
        max_docs: maximum number of docs kept (default 15).
        eval_model: model used for grading (default "gemini-2.0-flash").
        eval_threshold: minimum score for the "score" method (default 0.7).

    At most 25 randomly sampled docs are graded. With the "score" method the
    best-scored docs are kept; with the "binary" method a random selection of
    the relevant docs is kept.

    Returns {"valid_docs": [...]}: the newly kept docs followed by the
    valid docs already present in the state.
    """
    settings = config["configurable"]
    eval_method = settings.get("eval_method") or "binary"
    max_docs = settings.get("max_docs") or 15
    eval_model_name = settings.get("eval_model") or "gemini-2.0-flash"
    # Read once: this value does not change from one document to the next.
    eval_threshold = settings.get("eval_threshold") or 0.7
    valid_doc_scores = []

    # Ensure 'docs' exists and is a list
    docs_to_evaluate = state.get("docs", [])
    if not isinstance(docs_to_evaluate, list):
        print("Warning: 'docs' is not a list, skipping evaluation.")
        docs_to_evaluate = []

    # Documents cannot be graded without a query to grade them against.
    query = state.get("query")
    if docs_to_evaluate and not query:
        print("Warning: 'query' key not found in state, skipping evaluation.")
        docs_to_evaluate = []

    # Sample safely
    sample_size = min(25, len(docs_to_evaluate))
    sampled_docs = sample(docs_to_evaluate, sample_size) if sample_size > 0 else []

    for doc in sampled_docs:
        if doc is None:
            print("Warning: Encountered None document during evaluation, skipping.")
            continue

        formatted_doc_str = format_doc(doc)
        if not formatted_doc_str.strip():
            print(f"Warning: Skipping empty formatted document: {doc}")
            continue

        score = eval_doc(
            doc=formatted_doc_str,
            query=query,
            method=eval_method,
            threshold=eval_threshold,
            eval_model=eval_model_name,
        )
        if not isinstance(score, (int, float)):
            print(f"Warning: Received non-numeric score ({score}) for doc {doc}, skipping.")
            continue
        if score > 0:  # Only keep relevant docs (score > 0 or binary score == 1)
            valid_doc_scores.append((doc, score))

    if eval_method == 'score':
        # Get at most max_docs items with the highest score if score method was used
        valid_docs_sorted = sorted(valid_doc_scores, key=lambda x: x[1], reverse=True)
        valid_docs = [valid_doc[0] for valid_doc in valid_docs_sorted[:max_docs]]
    else:
        # Get at most max_docs items at random if binary method was used
        shuffle(valid_doc_scores)
        valid_docs = [valid_doc[0] for valid_doc in valid_doc_scores[:max_docs]]

    # Ensure existing valid_docs is a list before concatenating
    existing_valid_docs = state.get("valid_docs", [])
    if not isinstance(existing_valid_docs, list):
        existing_valid_docs = []

    return {"valid_docs": valid_docs + existing_valid_docs}


def build_data_retriever_graph(memory):
    """
    Builds the data_retriever graph.

    The graph is a straight pipeline:
    generate_cypher -> get_docs -> eval_docs.
    `memory` is passed to compile() as the checkpointer.
    """
    builder = StateGraph(DocRetrieverState)

    # Register the three nodes of the pipeline.
    steps = (
        ("generate_cypher", generate_cypher),
        ("get_docs", get_docs),
        ("eval_docs", eval_docs),
    )
    for step_name, step_fn in steps:
        builder.add_node(step_name, step_fn)

    # Chain them linearly between the start and end markers.
    route = ("__start__", "generate_cypher", "get_docs", "eval_docs", "__end__")
    for origin, destination in zip(route, route[1:]):
        builder.add_edge(origin, destination)

    return builder.compile(checkpointer=memory)

# NOTE: the Groq-specific error handler (error_concept_groq) was removed when
# the module switched to Gemini models obtained through get_model.