Spaces:

PledgeTracker
/

Pledge_Tracker

Sleeping

File size: 5,429 Bytes

35b3f62

import json
import os
import subprocess
import sys
from datetime import datetime

from huggingface_hub import hf_hub_download

def run_hero_reranking(pipeline_base_dir, suggestion_meta):
    """Run retrieval and reranking against the augmented data store.

    When ``suggestion_meta`` is truthy, ``manifesto_icl_hyde_fc.json`` is
    downloaded from the ``PledgeTracker/demo_feedback`` dataset repo and the
    single entry at ``suggestion_meta["index"]`` is written, wrapped in a
    one-element list, to ``<pipeline_base_dir>/hero/manifesto_icl_hyde_fc.json``.
    When it is falsy, whatever file already sits at that path is used as-is.

    Each step is skipped when its output file already exists.

    Args:
        pipeline_base_dir: Base working directory of the pipeline run.
        suggestion_meta: Mapping providing an ``"index"`` key, or a falsy
            value to skip the download step.

    Returns:
        dict with the keys ``"hyde"``, ``"retrieved"`` and ``"reranked"``,
        each mapping to the corresponding file path under ``hero/``.

    Raises:
        KeyError: ``HF_TOKEN`` is not set in the environment or
            ``suggestion_meta`` has no ``"index"`` key (download path only).
        subprocess.CalledProcessError: A step exited with a non-zero status.
        subprocess.TimeoutExpired: A step exceeded its timeout.
    """
    base_dir = str(pipeline_base_dir)
    hero_dir = os.path.join(base_dir, "hero")
    os.makedirs(hero_dir, exist_ok=True)
    hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")

    # Launch child scripts with the interpreter running this process so they
    # see the same environment; fall back to PATH lookup if it is unknown.
    python_exe = sys.executable or "python"

    if suggestion_meta:
        hyde_path = hf_hub_download(
            repo_id="PledgeTracker/demo_feedback",
            filename="manifesto_icl_hyde_fc.json",
            repo_type="dataset",
            token=os.environ["HF_TOKEN"],
        )
        with open(hyde_path, "r", encoding="utf-8") as f:
            all_hyde_data = json.load(f)

        # Downstream steps expect a list, so keep the selected entry wrapped.
        single_hyde = [all_hyde_data[suggestion_meta["index"]]]
        with open(hyde_output, "w", encoding="utf-8") as f:
            json.dump(single_hyde, f, indent=2)
        # NOTE(review): the HyDE file is rewritten on every call with a
        # suggestion, but existing retrieval/rerank outputs below are reused.
        # Confirm callers use a fresh pipeline_base_dir per suggestion.

    def safe_run(cmd, timeout=600):
        """Run ``cmd``, log failures and timeouts, then re-raise them."""
        # Render once with str() so logging can never raise on a non-string
        # argument and hide the real subprocess error.
        printable = " ".join(str(part) for part in cmd)
        print(f"👉 Running: {printable}")
        try:
            subprocess.run(cmd, check=True, timeout=timeout)
        except subprocess.CalledProcessError as e:
            print(f"[❌ ERROR] Subprocess failed: {e}")
            # stderr is only populated when output is captured; it may be
            # bytes or str depending on how the process was started.
            stderr = e.stderr
            if stderr:
                if isinstance(stderr, bytes):
                    stderr = stderr.decode(errors="replace")
                print("[stderr]:", stderr)
            raise
        except subprocess.TimeoutExpired:
            print(f"[❌ TIMEOUT] Command timed out: {printable}")
            raise

    # Step 3.2: retrieval
    print("🔍 Step 3.2: Retrieval from knowledge store ...")
    knowledge_store_dir = os.path.join(base_dir, "augmented_data_store")
    retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k_QA.json")

    if not os.path.exists(retrieval_output):
        safe_run([
            python_exe, "system/baseline/retrieval_optimized.py",
            "--knowledge_store_dir", knowledge_store_dir,
            "--target_data", hyde_output,
            "--json_output", retrieval_output,
        ])

    # Step 3.3: reranking
    print("🏷️ Step 3.3: Reranking retrieved evidence ...")
    rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k_QA.json")

    if not os.path.exists(rerank_output):
        safe_run([
            python_exe, "system/baseline/reranking_optimized.py",
            "--target_data", retrieval_output,
            "--json_output", rerank_output,
            "--top_k", str(50),
        ])

    return {
        "hyde": hyde_output,
        "retrieved": retrieval_output,
        "reranked": rerank_output,
    }


def run_hero_pipeline(pipeline_base_dir):
    """Run the HyDE, retrieval, reranking and QA-generation steps in order.

    Every step is a script under ``system/baseline/`` launched as a child
    process. A step is skipped when its output file already exists under
    ``<pipeline_base_dir>/hero/``, so an interrupted run can be resumed.

    Args:
        pipeline_base_dir: Base working directory of the pipeline run. The
            HyDE step reads ``claim.json`` from it and the retrieval step
            reads the ``initial_data_store`` directory inside it.

    Returns:
        dict with the keys ``"hyde"``, ``"retrieved"``, ``"reranked"`` and
        ``"qa_pairs"``, each mapping to the corresponding file path under
        ``hero/``.

    Raises:
        subprocess.CalledProcessError: A step exited with a non-zero status.
        subprocess.TimeoutExpired: A step exceeded its timeout.
    """
    base_dir = str(pipeline_base_dir)
    hero_dir = os.path.join(base_dir, "hero")
    os.makedirs(hero_dir, exist_ok=True)

    target_data = os.path.join(base_dir, "claim.json")
    hyde_output = os.path.join(hero_dir, "manifesto_icl_hyde_fc.json")

    # Launch child scripts with the interpreter running this process so they
    # see the same environment; fall back to PATH lookup if it is unknown.
    python_exe = sys.executable or "python"

    def safe_run(cmd, timeout=600):
        """Run ``cmd``, log failures and timeouts, then re-raise them."""
        # Render once with str() so logging can never raise on a non-string
        # argument and hide the real subprocess error.
        printable = " ".join(str(part) for part in cmd)
        print(f"👉 Running: {printable}")
        try:
            subprocess.run(cmd, check=True, timeout=timeout)
        except subprocess.CalledProcessError as e:
            print(f"[❌ ERROR] Subprocess failed: {e}")
            # stderr is only populated when output is captured; it may be
            # bytes or str depending on how the process was started.
            stderr = e.stderr
            if stderr:
                if isinstance(stderr, bytes):
                    stderr = stderr.decode(errors="replace")
                print("[stderr]:", stderr)
            raise
        except subprocess.TimeoutExpired:
            print(f"[❌ TIMEOUT] Command timed out: {printable}")
            raise

    # Step 3.1: hyde_fc_generation
    if not os.path.exists(hyde_output):
        print("🧠 Step 3.1: HyDE ICL generation ...")
        safe_run([
            python_exe, "system/baseline/hyde_fc_generation_optimized.py",
            "--target_data", target_data,
            "--json_output", hyde_output,
        ])

    # Step 3.2: retrieval
    print("🔍 Step 3.2: Retrieval from knowledge store ...")
    knowledge_store_dir = os.path.join(base_dir, "initial_data_store")
    retrieval_output = os.path.join(hero_dir, "manifesto_icl_retrieval_top_k.json")

    if not os.path.exists(retrieval_output):
        safe_run([
            python_exe, "system/baseline/retrieval_optimized.py",
            "--knowledge_store_dir", knowledge_store_dir,
            "--target_data", hyde_output,
            "--json_output", retrieval_output,
        ])

    # Step 3.3: reranking
    print("🏷️ Step 3.3: Reranking retrieved evidence ...")
    rerank_output = os.path.join(hero_dir, "manifesto_icl_reranking_top_k.json")

    if not os.path.exists(rerank_output):
        safe_run([
            python_exe, "system/baseline/reranking_optimized.py",
            "--target_data", retrieval_output,
            "--json_output", rerank_output,
        ])

    # Step 3.4: question generation
    print("❓ Step 3.4: Generating QA pairs ...")
    reference_corpus = "system/baseline/train.json"
    qa_output = os.path.join(hero_dir, "manifesto_icl_top_k_qa.json")

    if not os.path.exists(qa_output):
        safe_run([
            python_exe, "system/baseline/question_generation_optimized.py",
            "--reference_corpus", reference_corpus,
            "--top_k_target_knowledge", rerank_output,
            "--output_questions", qa_output,
            "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct",
        ])

    return {
        "hyde": hyde_output,
        "retrieved": retrieval_output,
        "reranked": rerank_output,
        "qa_pairs": qa_output,
    }