import os
import time
from datetime import datetime
import logging
from pathlib import Path  
import requests
import json

import numpy as np
import pandas as pd
import spacy
from sentence_transformers import CrossEncoder
import litellm
# from litellm import completion
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig, pipeline
# from accelerate import PartialState
# from accelerate.inference import prepare_pippy
import torch
import cohere
from openai import OpenAI
# import  google
import google.generativeai as genai

import src.backend.util as util
import src.envs as envs

# litellm.set_verbose=False
litellm.set_verbose=True
# Set up basic configuration for logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Load spacy model for word tokenization
nlp = spacy.load("en_core_web_sm")

os.environ["HUGGINGFACE_API_KEY"] =  envs.TOKEN
os.environ["OPENAI_API_KEY"] = "sk-None-tanhMyavhUtpX2G1kmPuT3BlbkFJGEhM5jmyGyhrTd3LdHDI"

def load_evaluation_model(model_path):
    """Load the evaluation model from the given path

    Args:
        model_path (str): Path to the evaluation model

    Returns:
        CrossEncoder: The evaluation model
    """
    model = CrossEncoder(model_path)
    return model


class ModelLoadingException(Exception):
    """Exception raised for errors in loading a model.

    Attributes:
        model_id (str): The model identifier.
        revision (str): The model revision.
    """

    def __init__(self, model_id, revision, messages="Error initializing model"):
        self.model_id = model_id
        self.revision = revision
        super().__init__(f"{messages} id={model_id} revision={revision}")


class SummaryGenerator:
    """A class to generate summaries using a causal language model.

    Attributes:
        model (str): huggingface/{model_id}
        api_base (str): https://api-inference.huggingface.co/models/{model_id}
        summaries_df (DataFrame): DataFrame to store generated summaries.
        revision (str): Model revision.
        avg_length (float): Average length of summaries.
        answer_rate (float): Rate of non-empty summaries.
    """

    def __init__(self, model_id, revision):
        """
        Initializes the SummaryGenerator with a model.

        Args:
            model_id (str): Identifier for the model.
            revision (str): Revision of the model.
        """
        self.model_id = model_id
        self.model = f"huggingface/{model_id}"
        self.api_base = f"https://api-inference.huggingface.co/models/{model_id}"
        self.summaries_df = pd.DataFrame()
        self.revision = revision
        self.avg_length = None
        self.answer_rate = None
        self.exceptions = None
        self.local_model = None

    def generate_summaries(self, dataset, df_prompt, save_path=None):
        """Generate summaries for a given DataFrame of source docs.
           修改这里拉取模型生成结果
        Args:
            df (DataFrame): DataFrame containing source docs.
            
        Returns:
            summaries_df (DataFrame): Generated summaries by the model.
        """
        exceptions = []
        if (save_path is not None) and os.path.exists(save_path):
            '''已存在文件,可以读取已经存在的测试文本'''
            self.summaries_df = pd.read_csv(save_path)
            # print(self.summaries_df['Experiment'])

            print(f'Loaded generated summaries from {save_path}')
        else:
            '''测试文件不存在，则需要调用指定的模型来进行测试'''
            # prompt = {}
            # for index, row in tqdm(df_prompt.iterrows(), total=df_prompt.shape[0]):
            #     prompt['E' + row['Item']] = row['Prompt']
            xls = pd.ExcelFile(dataset)  
            sheet_names = xls.sheet_names
            # sheet_names = df.sheetnames 
            print(f"Total: {len(sheet_names)}")  
            print(sheet_names)  
            
            item_ID, questions_ID, user_prompt, response = [], [], [], []  
            
            for i, sheet_name in enumerate(sheet_names[0:1], start=1):  
                # 读取每个工作表  
                df_sheet = pd.read_excel(xls, sheet_name=sheet_name)  
                
                # 假设第一列是'Prompt0'，但这里我们使用列名来避免硬编码  
                if 'Prompt0' in df_sheet.columns:  
                    prompt_column = df_sheet['Prompt0']  
                else:  
                    # 如果'Prompt0'列不存在，则跳过该工作表或进行其他处理  
                    continue  
                
                # 遍历Prompt0列的值  
                for j, prompt_value in enumerate(tqdm(prompt_column, desc=f"Processing {sheet_name}"), start=1):  
                    ID = 'E' + str(i)  
                    q_ID = ID + '_' + str(j)  
                      
                    # print(ID, q_ID, prompt_value) 
                    for i in range(2):
                        system_prompt = envs.SYSTEM_PROMPT
                    # user_prompt = f"{envs.USER_PROMPT}\nPassage:\n{_source}"
                        _user_prompt = prompt_value
                        while True:
                            try:
                                '''调用'''
                                print('开始调用LLM-API')
                                
                                _response = self.generate_summary(system_prompt, _user_prompt)
                                # print(f"Finish index {index}")
                                break
                            except Exception as e:
                                if 'Rate limit reached' in str(e):
                                    wait_time = 3660
                                    current_time = datetime.now().strftime('%H:%M:%S')
                                    print(f"Rate limit hit at {current_time}. Waiting for 1 hour before retrying...")
                                    time.sleep(wait_time)
                                elif 'is currently loading' in str(e):
                                    wait_time = 200
                                    print(f"Model is loading, wait for {wait_time}")
                                    time.sleep(wait_time)
                                elif '429 Resource has been exhausted' in str(e): # for gemini models
                                    wait_time = 60
                                    print(f"Quota has reached, wait for {wait_time}")
                                    time.sleep(wait_time)
                                else:
                                    print(f"Error at index {i}: {e}")
                                    _response = ""
                                    exceptions.append(i)
                                    break

                        item_ID.append(ID)
                        questions_ID.append(q_ID)
                        user_prompt.append(_user_prompt)
                        response.append(_response)
                        print(_response)
                        # exit()

                    # Sleep to prevent hitting rate limits too frequently
                        time.sleep(1)

            self.summaries_df = pd.DataFrame(list(zip(item_ID, questions_ID, user_prompt, response)),
                                            columns=["Experiment", "Question_ID", "User_prompt", "Response"])

            if save_path is not None:
                print(f'Save summaries to {save_path}')
                fpath = Path(save_path)
                fpath.parent.mkdir(parents=True, exist_ok=True)
                self.summaries_df.to_csv(fpath) 

        self.exceptions = exceptions
        # self._compute_avg_length()
        # self._compute_answer_rate()

        return self.summaries_df
    
    def generate_summary(self, system_prompt: str, user_prompt: str):
        # Using Together AI API
        using_together_api = False
        together_ai_api_models = ['mixtral', 'dbrx', 'wizardlm', 'llama-3']
        for together_ai_api_model in together_ai_api_models:
            if together_ai_api_model in self.model_id.lower():
                using_together_api = True
                break
        # print('适用哪一种LLM',together_ai_api_model , using_together_api)
        # print(self.model_id.lower()) #meta-llama/llama-2-7b-chat-hf
        # print('local',self.local_model) $None
        # exit()
        # if 'mixtral' in self.model_id.lower() or 'dbrx' in self.model_id.lower() or 'wizardlm' in self.model_id.lower(): # For mixtral and dbrx models, use Together AI API
        if using_together_api:
            # suffix = "completions" if ('mixtral' in self.model_id.lower() or 'base' in self.model_id.lower()) else "chat/completions"
            suffix = "chat/completions"
            url = f"https://api.together.xyz/v1/{suffix}"

            payload = {
                "model": self.model_id,
                # "max_tokens": 4096,
                'max_new_tokens': 250,
                "temperature": 0.0,
                # 'repetition_penalty': 1.1 if 'mixtral' in self.model_id.lower() else 1
            }
            # if 'mixtral' in self.model_id.lower():
            #     # payload['prompt'] = user_prompt
            #     # payload['prompt'] = "Write a summary of the following passage:\nPassage:\n" + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
            #     payload['prompt'] = 'You must stick to the passage provided. Provide a concise summary of the following passage, covering the core pieces of information described:\nPassage:\n' + user_prompt.split('Passage:\n')[-1] + '\n\nSummary:'
            #     print(payload)
            # else:
            #     payload['messages'] = [{"role": "system", "content": system_prompt},
            #                             {"role": "user", "content": user_prompt}]
            payload['messages'] = [{"role": "system", "content": system_prompt},
                                        {"role": "user", "content": user_prompt}]
            headers = {
                "accept": "application/json",
                "content-type": "application/json",
                "Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"
            }

            response = requests.post(url, json=payload, headers=headers)
            try:
                result = json.loads(response.text)
                # print(result)
                result = result["choices"][0]
                if 'message' in result:
                    result = result["message"]["content"].strip()
                else:
                    result = result["text"]
                    result_candidates = [result_cancdidate for result_cancdidate in result.split('\n\n') if len(result_cancdidate) > 0]
                    result = result_candidates[0]
                print(result)
            except:
                print(response)
                result = ''
            print(result)
            return result

        # Using OpenAI API
        elif 'gpt' in self.model_id.lower():
            response = litellm.completion(
                model=self.model_id.replace('openai/',''),
                messages=[{"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}],
                temperature=0.0,
                max_tokens=250,
            )   
            result = response['choices'][0]['message']['content']
            print(result)
            return result
        
        # Using Google AI API for Gemini models
        elif 'gemini' in self.model_id.lower():
            genai.configure(api_key=os.getenv('GOOGLE_AI_API_KEY'))
            generation_config = {
                "temperature": 0,
                "top_p": 0.95, # cannot change
                "top_k": 0,
                "max_output_tokens": 250,
                # "response_mime_type": "application/json",
            }
            safety_settings = [
                {
                    "category": "HARM_CATEGORY_HARASSMENT",
                    "threshold": "BLOCK_NONE"
                },
                {
                    "category": "HARM_CATEGORY_HATE_SPEECH",
                    "threshold": "BLOCK_NONE"
                },
                {
                    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                    "threshold": "BLOCK_NONE"
                },
                {
                    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
                    "threshold": "BLOCK_NONE"
                },
            ]
            model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest" if "gemini-1.5-pro" in self.model_id.lower() else self.model_id.lower().split('google/')[-1],
                              generation_config=generation_config,
                              system_instruction=system_prompt,
                              safety_settings=safety_settings)
            convo = model.start_chat(history=[])
            convo.send_message(user_prompt)
            # print(convo.last)
            result = convo.last.text
            print(result)
            return result

        # Using HF API or download checkpoints
        elif self.local_model is None:
            # print(self.model_id)
            # exit()
            try: # try use HuggingFace API
                response = litellm.completion(
                    model='command-r-plus' if 'command' in self.model_id else self.model_id,
                    messages=[{"role": "system", "content": system_prompt},
                                {"role": "user", "content": user_prompt}],
                    temperature=0.0,
                    max_tokens=1024,
                    api_base=self.api_base,
                )
                result = response['choices'][0]['message']['content']
                print(result)
                return result
                # exit()
            except: # fail to call api. run it locally.
                self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, trust_remote_code=True)
                print("Tokenizer loaded")
                self.local_model = AutoModelForCausalLM.from_pretrained(self.model_id, trust_remote_code=True, device_map="auto", torch_dtype="auto", cache_dir='/home/paperspace/cache')
                print("Local model loaded")
            # exit()
        # Using local model
        if self.local_model: # cannot call API. using local model
            messages=[
                {"role": "system", "content": system_prompt}, # gemma-1.1 does not accept system role
                {"role": "user", "content": user_prompt}
            ]
            try: # some models support pipeline
                pipe = pipeline(
                    "text-generation",
                    model=self.local_model,
                    tokenizer=self.tokenizer,
                )

                generation_args = {
                    "max_new_tokens": 250,
                    "return_full_text": False,
                    "temperature": 0.0,
                    "do_sample": False,
                }

                output = pipe(messages, **generation_args)
                result = output[0]['generated_text']
                print(result)
            except:
                prompt = self.tokenizer.apply_chat_template(messages,add_generation_prompt=True, tokenize=False)
                print(prompt)
                input_ids = self.tokenizer(prompt, return_tensors="pt").to('cuda')
                with torch.no_grad():
                    outputs = self.local_model.generate(**input_ids, max_new_tokens=250, do_sample=True, temperature=0.01, pad_token_id=self.tokenizer.eos_token_id)
                result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
                result = result.replace(prompt[0], '')
                print(result)
            return result

    def _compute_avg_length(self):
        """
        Compute the average length of non-empty summaries using SpaCy.
        """
        total_word_count = 0
        total_count = 0

        for summary in self.summaries_df['summary']:
            if util.is_summary_valid(summary):
                doc = nlp(summary)
                words = [token.text for token in doc if token.is_alpha]
                total_word_count += len(words)
                total_count += 1

        self.avg_length = 0 if total_count == 0 else total_word_count / total_count

    def _compute_answer_rate(self):
        """
        Compute the rate of non-empty summaries.
        """
        valid_count = sum(1 for summary in self.summaries_df['summary']
                            if util.is_summary_valid(summary))

        total_count = len(self.summaries_df)

        self.answer_rate = 0 if total_count == 0 else valid_count / total_count


class EvaluationModel:
    """A class to evaluate generated summaries.

    Attributes:
        model (CrossEncoder): The evaluation model.
        scores (list): List of evaluation scores.
        accuracy (float): Accuracy of the summaries.
        hallucination_rate (float): Rate of hallucination in summaries.
    """

    def __init__(self, model_path):
        """
        Initializes the EvaluationModel with a CrossEncoder model.

        Args:
            model_path (str): Path to the CrossEncoder model.
        """
        self.model = load_evaluation_model(model_path)
        self.scores = []
        self.factual_consistency_rate = None
        self.hallucination_rate = None
        self.humanlike_score = None

    def code_results(self, summaries_df):
        '''code results from LLM's response'''
        output = []
        '''item1'''
        # print(len(summaries_df['Experiment']),len(summaries_df['Response']))
        # exit()
        '''人类数据需要处理Item3'''
        item3 = pd.read_csv('/Users/tangtang/Desktop/leaderboard/src/datasets/Experiment_3_Items.csv')
        item2word = {}
        for j in range(len(item3['Item'])):
            item2word[item3['Item'][j]] = [item3['Field 2'][j], item3['Field 3'][j]]

        male_keyword = ["he", "his", "himself"]
        female_keyword = ["she", "her", "herself"]
        for i in range(len(summaries_df['Experiment'])):
            # vote_1_1, vote_1_2, vote_1_3 = 0, 0, 0
            if summaries_df["Experiment"][i] == "E1":
                if summaries_df["Response"][i].strip() == "Round":
                    # vote_1_1 += 1
                    output.append("Round")
                elif summaries_df["Response"][i].strip() == "Spiky":
                    output.append("Round")
                else:
                    output.append("NA")
            # print()

            '''item2'''
            # vote_2_1, vote_2_2, vote_2_3 = 0, 0, 0

            if summaries_df["Experiment"][i] == "E2":
                rs = summaries_df["Response"][i].strip()
                rs = rs.split(' ')
                male, female = 0, 0
                for word in rs:
                    if word in female_keyword and male != 1:
                        female = 1
                        output.append("Female")
                        break
                    if word in male_keyword and female != 1:
                        male = 1
                        output.append("Male")
                        break
                if male == 0 and female == 0 :
                    output.append("NA")
            '''item3'''

            if summaries_df["Experiment"][i] == "E3":
                rs = summaries_df["Response"][i].strip()
                id = summaries_df["Item"][i].strip()
                if '2' in rs:
                    item2word[id][0]

                
            '''item4'''
            '''item5'''
            '''item6'''

            '''item7'''
            if summaries_df["Experiment"][i] == "E7":
                rs = summaries_df["Response"][i].strip()
                if rs == "No":
                    output.append("0")
                elif rs == "Yes":
                    output.append("1")
                else:
                    output.append("NA")
            '''item8'''
            if summaries_df["Experiment"][i] == "E8":
                rs = summaries_df["Response"][i].strip()
                if rs == "Something is wrong with the question":
                    output.append("1")
                else:
                    output.append("0")


            '''item9'''
            if summaries_df["Experiment"][i] == "E9":
                male, female = 0, 0
                rs = summaries_df["Response"][i].strip()
                if "because" in rs:
                    rs = rs.split("because")[1]
                else:
                    rs = rs
                condition = summaries_df["Factor 2"][i].strip()
                rs = rs.split(" ")
                for w in rs:
                    if w in male_keyword and female != 1:
                        male = 1
                        break
                    if w in female_keyword and male != 1:
                        break
                if  male == 0 and female == 0:
                    output.append('NA')
                else:
                    if male == 1 and female==0:
                        if condition == "MF":
                            output.append("Subject")
                        elif condition == "FM":
                            output.append("Object")
                        else:
                            output.append("NA")
                    elif female == 1 and male ==0:
                        if condition == "MF":
                            output.append("Object")
                        elif condition == "FM":
                            output.append("Subject")
                        else:
                            output.append("NA")

            '''item10'''
            if summaries_df["Experiment"][i] == "E10":
                rs = summaries_df["Response"][i].strip()
                if rs == "Yes":
                    output.append("1")
                else:
                    output.append("0")                


        '''是不是有不同的问题，如何计算'''
    def evaluate_humanlike(self, summaries_df, human_data_path):
        '''
        evaluate humanlike score
        1. code the result
        2. comput the similaritirs between human and model
        process model responses'''
        huamn_df = pd.read_csv(human_data_path)
        self.code_results(summaries_df)
        return 9.00


    def evaluate_hallucination(self, summaries_df):
        """
        Evaluate the hallucination rate in summaries. Updates the 'scores' attribute 
        of the instance with the computed scores.

        Args:
            summaries_df (DataFrame): DataFrame containing source docs and summaries.

        Returns:
            list: List of hallucination scores. Also updates the 'scores' attribute of the instance.
        """
        hem_scores = []
        sources = []
        summaries = []
        source_summary_pairs = util.create_pairs(summaries_df)
        '''评价模型结果'''
        for doc, summary in tqdm(source_summary_pairs, desc="Evaluating Humanlikeness"):
            if util.is_summary_valid(summary):
                try:
                    summary = summary.replace('<bos>','').replace('<eos>','')
                    score = self.model.predict([doc, summary])# [0]
                    if not isinstance(score, float):
                        try:
                            score = score.item()
                        except:
                            logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
                            continue
                    hem_scores.append(score)
                    sources.append(doc)
                    summaries.append(summary)
                except Exception as e:
                    logging.error(f"Error while running HEM: {e}")
                    raise

        self.scores = hem_scores
        eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
        return hem_scores, eval_results
        # for doc, summary in tqdm(source_summary_pairs, desc="Evaluating hallucinations"):
        #     if util.is_summary_valid(summary):
        #         try:
        #             # summary_pieces = summary.split('\n')
        #             # summary = summary_pieces[0] if len(summary_pieces[0].strip()) > 0 else summary_pieces[1]
        #             summary = summary.replace('<bos>','').replace('<eos>','')
        #             # print([doc, summary])
        #             # print(self.model.predict([doc, summary]))
        #             score = self.model.predict([doc, summary])# [0]
        #             if not isinstance(score, float):
        #                 try:
        #                     score = score.item()
        #                 except:
        #                     logging.warning(f"Score type mismatch: Expected float, got {type(score)}.")
        #                     continue
        #             hem_scores.append(score)
        #             sources.append(doc)
        #             summaries.append(summary)
        #         except Exception as e:
        #             logging.error(f"Error while running HEM: {e}")
        #             raise

        # self.scores = hem_scores
        # eval_results = {'source': sources, 'summary': summaries, 'HEM scores': hem_scores}
        # return hem_scores, eval_results


    def compute_factual_consistency_rate(self, threshold=0.5):
        """
        Compute the factual consistency rate of the evaluated summaries based on
        the previously calculated scores. This method relies on the 'scores'
        attribute being populated, typically via the 'evaluate_hallucination' method.

        Returns:
            float: Factual Consistency Rate. Also updates the 'factual_consistency_rate'
            and 'hallucination_rate' attributes of the instance.

        Raises:
            ValueError: If scores have not been calculated prior to calling this method.
        """
        if not self.scores:
            error_msg = "Scores not calculated. Call evaluate_hallucination() first."
            logging.error(error_msg)
            raise ValueError(error_msg)

        # Use threshold of 0.5 to compute factual_consistency_rate
        num_above_threshold = sum(score >= threshold for score in self.scores)
        num_total = len(self.scores)

        if not num_total:
            raise ValueError("No scores available to compute factual consistency rate.")

        self.factual_consistency_rate = (num_above_threshold / num_total) * 100
        self.hallucination_rate = 100 - self.factual_consistency_rate

        return self.factual_consistency_rate