"""Map FAISS index entries to the WebDataset shard that contains each image.

Reads the original index-to-local-path CSV, derives the WebDataset key for each
image path, scans every shard for its keys, and writes a new CSV that adds the
key and its shard path.
"""
import pandas as pd
from pathlib import Path
import webdataset as wds


def convert_path_to_key(img_path: Path) -> str:
    # 1. Get relative path from root
    relative = img_path.relative_to("data/database")
    # 2. Remove suffix (.jpg)
    no_suffix = relative.with_suffix('')
    # 3. Convert to POSIX-style string and flatten it
    flat = no_suffix.as_posix().replace('/', '_')
    # 4. Replace . with , to match the target key format
    key = flat.replace('.', ',')
    return key


def update_mapping_csv(original_csv, webdataset_dir, new_csv_path):
    df = pd.read_csv(original_csv)
    webdataset_dir = Path(webdataset_dir)
    shards = list(webdataset_dir.glob("*.tar"))

    # Create mapping: key -> shard_path by scanning every shard once
    key_to_shard = {}
    for shard in shards:
        dataset = wds.WebDataset(str(shard), empty_check=False)
        for sample in dataset:
            key = sample["__key__"]
            key_to_shard[key] = str(shard)

    # Derive the WebDataset key for each local path, then look up its shard
    df["key"] = df["local_path"].apply(lambda p: convert_path_to_key(Path(p)))
    df["shard_path"] = df["key"].map(key_to_shard)

    # ❗ Raise an error if any shard_path is NaN
    if df["shard_path"].isna().any():
        missing_keys = df[df["shard_path"].isna()]["key"].tolist()
        raise ValueError(
            f"Missing shard paths for the following keys: {missing_keys[:10]}... "
            "(and possibly more)"
        )

    df.to_csv(new_csv_path, index=False)


if __name__ == "__main__":
    update_mapping_csv(
        original_csv="faiss_index/faiss_index_to_local_path.csv",
        webdataset_dir="data/webdataset_shards",
        new_csv_path="faiss_index/faiss_index_webdataset.csv",
    )
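

# --- Hypothetical downstream usage (illustrative sketch, not part of the original script) ---
# Shows how the generated CSV might be used to fetch a single sample by its key.
# `fetch_sample_by_key` is an assumed helper name, not an existing API; it reuses
# the pandas/webdataset imports at the top of this file and the column names
# ("key", "shard_path") written by update_mapping_csv above.
def fetch_sample_by_key(mapping_csv: str, key: str) -> dict:
    """Return the raw WebDataset sample whose __key__ matches `key`."""
    mapping = pd.read_csv(mapping_csv)
    # Look up which shard holds this key
    row = mapping.loc[mapping["key"] == key].iloc[0]
    # Iterate only that shard until the key is found
    for sample in wds.WebDataset(row["shard_path"], empty_check=False):
        if sample["__key__"] == key:
            return sample
    raise KeyError(f"Key {key!r} not found in shard {row['shard_path']}")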