import argparse
import json
import os

import tiktoken
from huggingface_hub import hf_hub_download
from openai import OpenAI
from tqdm import tqdm


def gpt_4o(input_text):
    """Send a single prompt to GPT-4o and return its raw JSON-string response."""
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [{"type": "text", "text": input_text}]}
        ],
        response_format={"type": "json_object"},
        temperature=0,
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content


def run_gpt4_event_extraction(data_dir, icl_path=None, max_tokens=100000):
    """Summarize pledge-relevant events from each claim's evidence documents."""
    all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
    output_dir = os.path.join(data_dir, "gpt4_event_extraction")
    os.makedirs(output_dir, exist_ok=True)

    # Use a local ICL prompt if one was supplied; otherwise fetch the hosted copy.
    if icl_path is None:
        icl_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="icl.txt",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"],
        )
    with open(icl_path, "r") as f:
        ICL = f.read()

    # One JSON object per line: a claim plus its evidence documents.
    with open(all_info_path, "r") as f:
        all_info = f.readlines()

    enc = tiktoken.encoding_for_model("gpt-4o")

    output_path = None
    for i, line in enumerate(all_info):
        urls = []
        results = []
        data = json.loads(line)
        docs = data["evidence"]
        claim = data["claim"]

        output_path = os.path.join(output_dir, f"gpt4o_results_{i}_claim.json")
        if os.path.exists(output_path):
            # Skip claims that were already processed, without overwriting them.
            print(f"Already exists: {output_path}")
            continue

        for doc in tqdm(docs):
            # Each evidence URL is summarized at most once per claim.
            if doc["url"] in urls:
                continue
            text = " ".join(doc["text"])
            input_text = (
                f"{ICL}\nNow please only summarize events that are useful for "
                f"verifying the pledge '{claim}', and their dates in the JSON format."
                f"\n\nInput:\n\nTitle: {doc['metadata']['title']}\n"
                f"Date: {doc['metadata']['date']}\nArticle: {text}\nPledge: {claim}\n\n"
                f"Output:\n"
            )
            urls.append(doc["url"])

            # Truncate over-long prompts to the configured token budget.
            text_tokens = enc.encode(input_text)
            if len(text_tokens) > max_tokens:
                input_text = enc.decode(text_tokens[:max_tokens])

            try:
                output = gpt_4o(input_text)
                results.append({
                    "url": doc["url"],
                    "title": doc["metadata"]["title"],
                    "date": doc["metadata"]["date"],
                    "article": text,
                    "output": json.loads(output),
                })
            except Exception as e:
                print(f"Error processing doc: {e}")
                continue

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)

    return output_path


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
    parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
    parser.add_argument("--icl_path", type=str, default=None,
                        help="Path to ICL prompt file (downloaded from the Hub if omitted)")
    parser.add_argument("--max_tokens", type=int, default=100000,
                        help="Maximum token limit for input")
    args = parser.parse_args()

    run_gpt4_event_extraction(
        data_dir=args.data_dir,
        icl_path=args.icl_path,
        max_tokens=args.max_tokens,
    )
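
# Usage sketch. This assumes the OAI and HF_TOKEN environment variables hold
# valid OpenAI and Hugging Face tokens, that this file is saved as
# gpt4_event_extraction.py (the filename is illustrative), and that --data_dir
# contains the all_info_with_txt.json produced by an earlier pipeline step:
#
#   export OAI=sk-...
#   export HF_TOKEN=hf_...
#   python gpt4_event_extraction.py --data_dir ./data --max_tokens 100000
#
# Expected shape of each line of all_info_with_txt.json, inferred from the
# field accesses above; treat the exact schema as an assumption:
#
#   {"claim": "...",
#    "evidence": [{"url": "...",
#                  "text": ["sentence 1", "sentence 2"],
#                  "metadata": {"title": "...", "date": "..."}}]}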