import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights
from PIL import Image
from torchvision.transforms import v2

# Pick the compute device once at import time: CUDA when PyTorch reports it
# as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("\n🚀 Using device:", device)

# Module-level tokenizer shared by get_bert_embedding(). It is loaded from the
# same "bert-base-uncased" checkpoint name that BERTResNetClassifier uses for
# its text encoder.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def get_bert_embedding(text, max_length=80):
    """Tokenize ``text`` and return its token ids and attention mask.

    Despite the name, no embedding is computed here: the function only
    prepares the integer inputs that a BERT model consumes. It relies on
    the module-level ``tokenizer``.

    Args:
        text: The string to tokenize.
        max_length: Fixed output length in tokens. Longer inputs are
            truncated and shorter ones are padded up to this length.
            Defaults to 80.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of PyTorch tensors with
        the leading batch dimension of size 1 squeezed away.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # deprecated ``encode_plus`` method and takes the same keyword options.
    encoded = tokenizer(
        text,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=max_length,
        truncation=True,
        padding='max_length',
    )
    return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

class SelfAttentionFusion(nn.Module):
    """Blend a text vector and an image vector with learned mixing weights.

    A single linear layer scores the concatenation of both inputs and
    produces two values per sample. After a softmax these become the
    weights applied to the text and image vectors, which are then summed.
    """

    def __init__(self, embed_dim):
        """Build the scoring layer.

        Args:
            embed_dim: Feature size of each input vector. The scoring
                layer takes ``2 * embed_dim`` features and emits 2 scores.
        """
        super().__init__()
        self.attn = nn.Linear(2 * embed_dim, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_text, x_img):
        """Return the softmax-weighted sum of ``x_text`` and ``x_img``.

        Args:
            x_text: Text features, expected as ``(batch, embed_dim)``.
            x_img: Image features with the same shape as ``x_text``.

        Returns:
            The fused features, with the same shape as each input.
        """
        # Two scores per sample, normalised so they sum to one.
        joint = torch.cat((x_text, x_img), dim=1)
        mix = self.softmax(self.attn(joint))

        # Put both modalities on a new axis 1 so the weights broadcast
        # over the feature axis, then collapse that axis by summing.
        pair = torch.stack((x_text, x_img), dim=1)
        weighted = mix.unsqueeze(2) * pair
        return weighted.sum(dim=1)

class BERTResNetClassifier(nn.Module):
    """Two-branch classifier over an image and a piece of text.

    The image branch runs a ResNet-50 and projects its 1000-wide output
    to 512 features; the text branch runs BERT and projects the hidden
    state at sequence position 0 to 512 features. The two 512-wide
    vectors are merged by ``SelfAttentionFusion`` and mapped to class
    logits by a final linear layer.
    """

    def __init__(self, num_classes=2):
        """Create both encoders, their projections, and the output head.

        Args:
            num_classes: Number of logits produced by the final layer.
        """
        super().__init__()

        # Image branch.
        self.image_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        self.fc_image = nn.Linear(1000, 512)
        self.drop_img = nn.Dropout(0.3)

        # Text branch; the projection width follows the BERT config.
        self.text_model = BertModel.from_pretrained("bert-base-uncased")
        bert_width = self.text_model.config.hidden_size
        self.fc_text = nn.Linear(bert_width, 512)
        self.drop_text = nn.Dropout(0.3)

        # Fusion of the two 512-wide vectors and the classification head.
        self.fusion = SelfAttentionFusion(512)
        self.fc_final = nn.Linear(512, num_classes)

    def forward(self, image, input_ids, attention_mask):
        """Compute class logits for a batch of image/text pairs.

        Args:
            image: Image batch passed straight to the ResNet-50.
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.

        Returns:
            The output of the final linear layer (unnormalised logits).
        """
        # Dropout is applied to the raw encoder output before projecting.
        img_feat = self.fc_image(self.drop_img(self.image_model(image)))

        bert_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # First output element, sliced at sequence position 0.
        first_token = bert_out[0][:, 0, :]
        txt_feat = self.fc_text(self.drop_text(first_token))

        return self.fc_final(self.fusion(txt_feat, img_feat))

def remove_module_prefix(state_dict):
    """Return a copy of ``state_dict`` with leading ``module.`` stripped from keys.

    Checkpoints saved from a wrapped model (for example one wrapped in
    ``torch.nn.DataParallel``) carry a ``module.`` prefix on every key,
    which prevents loading them into the bare model.

    Only *leading* occurrences are removed. Keys that merely contain the
    text ``module.`` elsewhere (``"submodule.bias"``,
    ``"encoder.module.weight"``) are left untouched, whereas a plain
    ``str.replace`` would corrupt them. Repeated leading prefixes
    (``"module.module.x"``) are all stripped.

    Args:
        state_dict: Mapping from parameter names to values.

    Returns:
        A new ``OrderedDict`` with the same values and insertion order,
        keyed by the cleaned names. The input mapping is not modified.
    """
    from collections import OrderedDict

    prefix = 'module.'
    cleaned = OrderedDict()
    for key, value in state_dict.items():
        name = key
        while name.startswith(prefix):
            name = name[len(prefix):]
        cleaned[name] = value
    return cleaned

# Restore the trained classifier from disk and put it in inference mode.
print("📦 Loading model weights...")
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source; consider passing weights_only=True if the checkpoint is a
# plain tensor state dict. map_location places the loaded tensors on `device`.
state_dict = torch.load("state_dict.pth", map_location=device)
# Strip "module." from the keys so they match the bare model's parameter names.
clean_state_dict = remove_module_prefix(state_dict)

model = BERTResNetClassifier(num_classes=2)
model.load_state_dict(clean_state_dict)
# nn.Module.to() moves the module's parameters in place, so no re-binding.
model.to(device)
# eval() turns off the Dropout layers used inside the classifier.
model.eval()
print("✅ Model loaded successfully.")

# Example inputs for a single prediction.
text = "The Traditionalists - Whole Roasted Kitten"
image_address = "./image.png"

# Open the image in a context manager so the file handle is released as soon
# as the pixel data has been copied into the RGB image.
with Image.open(image_address) as raw_image:
    image = raw_image.convert("RGB")

# Resize to 256x256, convert to a float32 tensor scaled to [0, 1], then
# normalise each channel (these are the usual ImageNet mean/std values).
transform = v2.Compose([
    v2.Resize((256, 256)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Add a leading batch dimension of size 1 to every model input.
image = transform(image).unsqueeze(0)
input_ids, attention_mask = get_bert_embedding(text)
input_ids = input_ids.unsqueeze(0)
attention_mask = attention_mask.unsqueeze(0)

# Tensor.to() returns a new tensor rather than moving the tensor in place, so
# the result has to be re-bound. Without this the inputs stay on the CPU and
# the forward pass fails whenever the model lives on a CUDA device.
image = image.to(device)
attention_mask = attention_mask.to(device)
input_ids = input_ids.to(device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    output = model(image, input_ids, attention_mask)

# Report the per-class probabilities and the predicted label.
# NOTE(review): assumes logit index 0 is "Fake" and index 1 is "Real" --
# confirm against the label encoding used at training time.
classes = ["Fake", "Real"]

probabilities = torch.nn.functional.softmax(output, dim=1)
prob_values = [f"{prob:.2%}" for prob in probabilities[0].tolist()]
print("Probabilities:", prob_values)

prediction_id = torch.argmax(output, dim=1).item()
print("Prediction:", classes[prediction_id])