"""Run a single text+image prediction with a BERT + ResNet-50 fusion classifier.

The script loads fine-tuned weights from ``state_dict.pth``, preprocesses one
caption and one image, and prints the class probabilities and the predicted
label ("Fake" or "Real").
"""

from collections import OrderedDict

import torch
import torch.nn as nn
from PIL import Image
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.transforms import v2
from transformers import BertTokenizer, BertModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("\nšŸš€ Using device:", device)

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def get_bert_embedding(text):
    """Tokenize *text* for BERT and return ``(input_ids, attention_mask)``.

    Despite the name, this returns token ids and the attention mask, not
    embeddings. The text is truncated/padded to exactly 80 tokens (special
    tokens included), and the batch dimension added by ``return_tensors='pt'``
    is squeezed away, so each returned tensor is 1-D.
    """
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        return_tensors='pt',
        max_length=80,
        truncation=True,
        padding='max_length',
    )
    return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)


class SelfAttentionFusion(nn.Module):
    """Fuse a text vector and an image vector with learned softmax weights.

    A linear layer looks at the concatenation of both vectors and produces two
    scores; their softmax is used to form a weighted sum of the two inputs.
    """

    def __init__(self, embed_dim):
        super().__init__()
        # Two scores per sample: one for the text branch, one for the image.
        self.attn = nn.Linear(embed_dim * 2, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x_text, x_img):
        """Return the weighted sum of ``x_text`` and ``x_img``.

        Both inputs are expected to be ``(batch, embed_dim)``; the result has
        the same shape.
        """
        stacked = torch.stack([x_text, x_img], dim=1)        # (batch, 2, dim)
        scores = self.attn(torch.cat([x_text, x_img], dim=1))  # (batch, 2)
        attn_weights = self.softmax(scores).unsqueeze(2)     # (batch, 2, 1)
        fused = (attn_weights * stacked).sum(dim=1)
        return fused


class BERTResNetClassifier(nn.Module):
    """Two-branch classifier: ResNet-50 for the image, BERT for the text.

    Each branch is projected to 512 features, the two projections are combined
    by :class:`SelfAttentionFusion`, and a final linear layer produces
    ``num_classes`` logits.

    NOTE: attribute names double as state-dict keys; renaming any of them
    breaks loading of existing checkpoints.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        self.image_model = resnet50(weights=ResNet50_Weights.IMAGENET1K_V1)
        # The ResNet classification head is kept, so its 1000 outputs are
        # what gets projected down to 512.
        self.fc_image = nn.Linear(1000, 512)
        self.drop_img = nn.Dropout(0.3)

        self.text_model = BertModel.from_pretrained("bert-base-uncased")
        self.fc_text = nn.Linear(self.text_model.config.hidden_size, 512)
        self.drop_text = nn.Dropout(0.3)

        self.fusion = SelfAttentionFusion(512)
        self.fc_final = nn.Linear(512, num_classes)

    def forward(self, image, input_ids, attention_mask):
        """Return class logits for a batch of (image, text) pairs."""
        x_img = self.image_model(image)
        x_img = self.drop_img(x_img)
        x_img = self.fc_image(x_img)

        # [0] is the last hidden state; position 0 is the first token of each
        # sequence (the [CLS] token when special tokens are added).
        x_text = self.text_model(
            input_ids=input_ids, attention_mask=attention_mask
        )[0][:, 0, :]
        x_text = self.drop_text(x_text)
        x_text = self.fc_text(x_text)

        x_fused = self.fusion(x_text, x_img)
        return self.fc_final(x_fused)


def remove_module_prefix(state_dict):
    """Return a copy of *state_dict* with leading ``module.`` prefixes removed.

    Checkpoints saved from a wrapped model (e.g. ``nn.DataParallel``) prefix
    every key with ``module.``. Only *leading* prefixes are stripped, so a key
    that merely contains the substring (such as ``submodule.weight``) is left
    intact. Repeated leading prefixes from nested wrappers are all removed.
    Key order is preserved.
    """
    prefix = 'module.'
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        name = key
        while name.startswith(prefix):
            name = name[len(prefix):]
        new_state_dict[name] = value
    return new_state_dict


print("šŸ“¦ Loading model weights...")
# torch.load is pickle-based; weights_only=True restricts unpickling to
# tensors and plain containers, which is all a state dict needs.
state_dict = torch.load("state_dict.pth", map_location=device, weights_only=True)
clean_state_dict = remove_module_prefix(state_dict)

model = BERTResNetClassifier(num_classes=2)
model.load_state_dict(clean_state_dict)
model.to(device)
model.eval()
print("āœ… Model loaded successfully.")

text = "The Traditionalists - Whole Roasted Kitten"
image_address = "./image.png"

# Use a context manager so the underlying file handle is closed promptly;
# convert() returns a fully loaded copy that outlives the file.
with Image.open(image_address) as raw_image:
    image = raw_image.convert("RGB")

transform = v2.Compose([
    v2.Resize((256, 256)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
image = transform(image).unsqueeze(0)

input_ids, attention_mask = get_bert_embedding(text)
input_ids = input_ids.unsqueeze(0)
attention_mask = attention_mask.unsqueeze(0)

# Tensor.to() returns a new tensor rather than moving in place, so the result
# must be reassigned or the inputs stay on the CPU while the model is on GPU.
image = image.to(device)
attention_mask = attention_mask.to(device)
input_ids = input_ids.to(device)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    output = model(image, input_ids, attention_mask)

# PRINT OUTPUT
classes = ["Fake", "Real"]
probabilities = torch.nn.functional.softmax(output, dim=1)
prob_values = [f"{prob:.2%}" for prob in probabilities[0].tolist()]
print("Probabilities:", prob_values)

prediction_id = torch.argmax(output, dim=1).item()
print("Prediction:", classes[prediction_id])