import json
import os
import argparse

from tqdm import tqdm
import tiktoken
from openai import OpenAI
from huggingface_hub import hf_hub_download
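
# Overview: this script reads per-claim evidence documents from
# all_info_with_txt.json, prompts GPT-4o with an in-context-learning (ICL)
# prompt fetched from the Hugging Face Hub, extracts pledge-relevant events
# as JSON, and writes one result file per claim under gpt4_event_extraction/.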


def gpt_4o(input_text):
    """Query GPT-4o deterministically and return its JSON-formatted reply."""
    client = OpenAI(api_key=os.environ.get("OAI"))
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": [{"type": "text", "text": input_text}]}
        ],
        response_format={"type": "json_object"},  # force a JSON object reply
        temperature=0,
        max_tokens=4096,
        top_p=0,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return response.choices[0].message.content
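
# Illustrative usage (hypothetical prompt; requires the OAI environment
# variable to hold a valid OpenAI API key):
#
#   reply = gpt_4o('Summarize the events below and return a JSON object.')
#   events = json.loads(reply)
#
# Because response_format={"type": "json_object"} is set, the reply is
# expected to parse cleanly with json.loads(), which is why the caller
# below can decode it without further validation.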


def run_gpt4_event_extraction(data_dir, max_tokens=100000):
    all_info_path = os.path.join(data_dir, "all_info_with_txt.json")
    output_dir = os.path.join(data_dir, "gpt4_event_extraction")
    os.makedirs(output_dir, exist_ok=True)

    # Fetch the in-context-learning prompt from the Hugging Face Hub.
    icl_path = hf_hub_download(
        repo_id="PledgeTracker/demo_feedback",
        filename="icl.txt",
        repo_type="dataset",
        token=os.environ["HF_TOKEN"],
    )
    with open(icl_path, "r") as f:
        ICL = f.read()
    with open(all_info_path, "r") as f:
        all_info = f.readlines()

    enc = tiktoken.encoding_for_model("gpt-4o")

    output_path = None
    for i, line in enumerate(all_info):
        urls = []  # de-duplicate evidence documents by URL
        results = []
        data = json.loads(line)
        docs = data["evidence"]
        claim = data["claim"]
        output_path = os.path.join(output_dir, f"gpt4o_results_{i}_claim.json")
        if os.path.exists(output_path):
            print(f"Already exists: {output_path}")
            continue
        for doc in tqdm(docs):
            if doc["url"] in urls:
                continue
            text = " ".join(doc["text"])
            input_text = (
                f"{ICL}\nNow please only summarize events that are useful for "
                f"verifying the pledge '{claim}', and their dates in the JSON format."
                f"\n\nInput:\n\nTitle: {doc['metadata']['title']}\n"
                f"Date: {doc['metadata']['date']}\nArticle: {text}\nPledge: {claim}\n\n"
                f"Output:\n"
            )
            urls.append(doc["url"])
            # Truncate over-long prompts to the model's input budget.
            text_tokens = enc.encode(input_text)
            if len(text_tokens) > max_tokens:
                input_text = enc.decode(text_tokens[:max_tokens])
            try:
                output = gpt_4o(input_text)
                results.append({
                    "url": doc["url"],
                    "title": doc["metadata"]["title"],
                    "date": doc["metadata"]["date"],
                    "article": text,
                    "output": json.loads(output),
                })
            except Exception as e:
                print(f"Error processing doc: {e}")
                continue
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=4)
    return output_path
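
# Each gpt4o_results_<i>_claim.json written above is a list of records
# shaped roughly like this (field values illustrative; the "output" keys
# depend on the ICL prompt):
#
#   [
#     {
#       "url": "https://example.com/article",
#       "title": "...",
#       "date": "...",
#       "article": "full article text",
#       "output": {...}
#     }
#   ]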

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run GPT-4o event extraction")
    parser.add_argument("--data_dir", type=str, required=True, help="Root data directory")
    parser.add_argument("--max_tokens", type=int, default=100000, help="Maximum token limit for input")
    args = parser.parse_args()

    # The ICL prompt is downloaded from the Hub inside
    # run_gpt4_event_extraction, so no --icl_path flag is needed.
    run_gpt4_event_extraction(
        data_dir=args.data_dir,
        max_tokens=args.max_tokens,
    )
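
# Example invocation (assumes OAI and HF_TOKEN are set in the environment;
# the script filename is illustrative):
#
#   OAI=sk-... HF_TOKEN=hf_... python gpt4_event_extraction.py --data_dir data/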