Spaces:
Sleeping
Sleeping
| # import torch | |
| # import torch.nn as nn | |
| # from PIL import Image | |
| # import numpy as np | |
| # import matplotlib.pyplot as plt | |
| # import torchvision.transforms as transforms | |
| # from doctr.io import DocumentFile | |
| # from doctr.models import recognition_predictor | |
| # character_num = "0123456789-" | |
| # character_letter = ''' "()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^''' #"()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^" | |
| # model_dev_digits_path = "models/devnagri_digits_20k_v2.pth" | |
| # model_roman_digits_path = "models/roman_digits_20k_v5.pth" | |
| # dev_letter_path = "models/small_devnagari_letter.pth" | |
| # device = torch.device("cuda" if torch.cuda.is_available() else "cpu") | |
| # # Define the CRNN model | |
| # class CRNN(nn.Module): | |
| # def __init__(self, num_classes, input_size=(1, 64, 256)): | |
| # super(CRNN, self).__init__() | |
| # self.conv_block = nn.Sequential( | |
| # nn.Conv2d(input_size[0], 64, kernel_size=3, stride=1, padding=1), | |
| # nn.BatchNorm2d(64), | |
| # nn.ReLU(), | |
| # nn.MaxPool2d(kernel_size=2, stride=2), # 64x128 | |
| # nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1), | |
| # nn.BatchNorm2d(128), | |
| # nn.ReLU(), | |
| # nn.MaxPool2d(kernel_size=2, stride=2), # 32x64 | |
| # nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1), | |
| # nn.BatchNorm2d(256), | |
| # nn.ReLU(), | |
| # nn.MaxPool2d(kernel_size=2, stride=2), # 16x32 | |
| # nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1), | |
| # nn.BatchNorm2d(512), | |
| # nn.ReLU(), | |
| # nn.MaxPool2d(kernel_size=2, stride=2) # 8x16 | |
| # ) | |
| # # Dimensions after conv: batch x 512 x 8 x 16 | |
| # feature_height = input_size[1] // 16 # 64 -> 4 pools → 64/2^4 = 4 | |
| # self.rnn = nn.LSTM( | |
| # input_size=512 * feature_height, # 512 * 4 = 2048 | |
| # hidden_size=128, | |
| # num_layers=1, | |
| # bidirectional=True, | |
| # dropout=0.3, | |
| # batch_first=True | |
| # ) | |
| # self.fc = nn.Linear(256, num_classes) # 256*2 = 512 | |
| # def forward(self, x): | |
| # x = self.conv_block(x) # (B, 512, H=4, W=16) | |
| # b, c, h, w = x.size() | |
| # x = x.permute(0, 3, 1, 2) # (B, W, C, H) | |
| # x = x.contiguous().view(b, w, c * h) # (B, seq_len, input_size) | |
| # x, _ = self.rnn(x) # (B, seq_len, 512) | |
| # x = self.fc(x) # (B, seq_len, num_classes) | |
| # return x | |
| # # Initialize the model | |
| # def model_init(character, model_path): | |
| # # Initialize the model with the number of classes | |
| # model = CRNN(num_classes=len(character)) | |
| # model.load_state_dict(torch.load(model_path, map_location=device)) | |
| # model = model.to(device) | |
| # return model | |
| # def predict_image(image_path,character, model_path): | |
| # image = Image.open(image_path).convert('L') | |
| # # if value < 128, set to 0, else set to 255 | |
| # if model_path != dev_letter_path: | |
| # image = image.point(lambda x: 0 if x < 128 else 255, 'L') | |
| # image = image.resize((256, 64)) # Resize to match the input size of the model | |
| # image = np.array(image) | |
| # image = np.expand_dims(image, axis=0)[0] # Add channel dimension | |
| # # to pil image | |
| # # print(image) | |
| # image = Image.fromarray(image).convert('L') | |
| # if model_path == dev_letter_path: | |
| # image = Image.eval(image, lambda x: 255 - x) | |
| # # plt.imshow(image, cmap='gray') | |
| # # plt.axis('off') | |
| # # plt.show() | |
| # transform = transforms.Compose([ | |
| # transforms.Resize((64, 256)), | |
| # transforms.ToTensor(), | |
| # transforms.Normalize((0.5,), (0.5,)) | |
| # ]) | |
| # image = transform(image).unsqueeze(0).to(device) # Add batch dimension and move to GPU | |
| # # Load the model weights | |
| # model = model_init(character, model_path) | |
| # # token to string | |
| # # tokens to ids | |
| # id_to_char = {i: c for i, c in enumerate(character)} | |
| # def get_string_from_token(token): | |
| # """ | |
| # Convert a list of character IDs back to the corresponding string. | |
| # """ | |
| # return ''.join([id_to_char[i] for i in token]) | |
| # with torch.no_grad(): | |
| # output = model(image) | |
| # output = output.permute(1, 0, 2) # (seq_len, batch_size, num_classes) | |
| # _, predicted = output.max(2) | |
| # predicted = predicted.permute(1, 0) # (batch_size, seq_len) | |
| # predicted_str = get_string_from_token(predicted[0].cpu().numpy()) | |
| # return predicted_str | |
| # def dev_number(image): | |
| # # Load the model | |
| # model_path = model_dev_digits_path | |
| # character = character_num | |
| # # Predict the image | |
| # predicted_str = predict_image(image, character, model_path) | |
| # return predicted_str | |
| # def roman_number(image): | |
| # # Load the model | |
| # model_path = model_roman_digits_path | |
| # character = character_num | |
| # # Predict the image | |
| # predicted_str = predict_image(image, character, model_path) | |
| # return predicted_str | |
| # def dev_letter(image): | |
| # # Load the model | |
| # model_path = dev_letter_path | |
| # character = character_letter | |
| # # Predict the image | |
| # predicted_str = predict_image(image, character, model_path) | |
| # return predicted_str | |
| # # roman_letter | |
| # # Load OCR model once at startup | |
| # model = recognition_predictor(pretrained=True) | |
| # def roman_letter(image): | |
| # # Load image using doctr | |
| # img = DocumentFile.from_images(image) | |
| # # Perform OCR | |
| # result = model(img) | |
| # # Return result as JSON | |
| # return result | |
| import torch | |
| import torch.nn as nn | |
| from PIL import Image | |
| import numpy as np | |
| import torchvision.transforms as transforms | |
| from doctr.io import DocumentFile | |
| from torchvision import models | |
| from doctr.models import recognition_predictor | |
| import os | |
| from functools import lru_cache | |
| import pickle | |
| # Character sets | |
| CHARACTER_NUM = "0123456789-" | |
| CHARACTER_LETTER = ''' "()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^''' #"()-./0123456789:?ABCDEFGHIKLMNOPQRSTUWYabcdefghijklmnoprstuvwyँंःअआइईउऊऋऌऍऎएऐऑऒओऔकखगघङचछजझञटठडढणतथदधनऩपफबभमयरऱलळऴवशषसह़ऽािीुूृॄॅॆेैॉॊोौ्ॐ॒॑॓॔क़ख़ग़ज़ड़ढ़फ़य़ॠॢ।॥०१२३४५६७८९॰ॱॲॻॼॽॾ^" | |
# Checkpoint file for each model type; the paths are relative, so they are
# resolved against the process working directory. These should eventually be
# made configurable.
MODEL_PATHS = dict(
    dev_digits="models/devnagri_digits_20k_v2.pth",
    roman_digits="models/roman_digits_20k_v5.pth",
    dev_letter="models/small_devnagari_letter.pth",
    classify_ne="models/nepali_english_classifier.pth",
)

# Run on the GPU when CUDA is available, otherwise on the CPU.
DEVICE = (
    torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
)
class ResNetClassifier(nn.Module):
    """ResNet-50 backbone with a small replacement classification head.

    Every parameter of the backbone is frozen. The backbone's final fully
    connected layer is then replaced by a two-layer head
    (features -> 128 -> ``num_classes``); the head is created after the
    freeze, so its parameters remain trainable.

    Args:
        num_classes: Number of output classes produced by the head.
        weights: Forwarded to ``torchvision.models.resnet50``. The default
            keeps the original behaviour (ImageNet ``IMAGENET1K_V2``
            weights). Pass ``None`` to skip loading pretrained weights, for
            example when a complete checkpoint is loaded with
            ``load_state_dict`` immediately afterwards.
    """

    def __init__(self, num_classes=2, weights='IMAGENET1K_V2'):
        super().__init__()
        self.base_model = models.resnet50(weights=weights)

        # Freeze the backbone; only the head defined below is left trainable.
        for param in self.base_model.parameters():
            param.requires_grad = False

        num_ftrs = self.base_model.fc.in_features
        self.base_model.fc = nn.Sequential(
            nn.Linear(num_ftrs, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        """Return the class scores the wrapped ResNet produces for ``x``."""
        return self.base_model(x)
| # Define the CRNN model | |
class CRNN(nn.Module):
    """Convolutional-recurrent network producing per-column class scores.

    Four conv / batch-norm / ReLU / max-pool stages halve the height and
    width four times (a factor of 16 overall). Each remaining image column is
    flattened into one time step of a single-layer bidirectional LSTM, and a
    linear layer scores every time step.

    Args:
        num_classes: Size of the last dimension of the output.
        input_size: ``(channels, height, width)`` of the expected input. Only
            the channel count and the height are used: the LSTM input size is
            derived from the height, so inputs must have this height, while
            the width only determines the sequence length.
    """

    def __init__(self, num_classes, input_size=(1, 64, 256)):
        super().__init__()

        # Build the four stages in a loop. The layer order (and therefore the
        # Sequential indices 0..15 used as state-dict keys) is the same as
        # when the layers are listed one by one, so existing checkpoints load.
        layers = []
        in_channels = input_size[0]
        for out_channels in (64, 128, 256, 512):
            layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
            ])
            in_channels = out_channels
        # Default 64x256 input: 32x128 -> 16x64 -> 8x32 -> 4x16 after each pool.
        self.conv_block = nn.Sequential(*layers)

        # Four 2x poolings divide the height by 16 (64 -> 4 by default).
        feature_height = input_size[1] // 16

        # No dropout argument: nn.LSTM applies dropout only between stacked
        # layers, so with num_layers=1 it would have no effect and would only
        # make PyTorch emit a warning on construction.
        self.rnn = nn.LSTM(
            input_size=512 * feature_height,  # 512 * 4 = 2048 by default
            hidden_size=128,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.fc = nn.Linear(256, num_classes)  # 2 directions * 128 hidden units

    def forward(self, x):
        """Map images ``(B, C, H, W)`` to scores ``(B, W // 16, num_classes)``."""
        x = self.conv_block(x)  # (B, 512, H/16, W/16)
        b, c, h, w = x.size()
        x = x.permute(0, 3, 1, 2)  # (B, W/16, 512, H/16)
        x = x.contiguous().view(b, w, c * h)  # one feature vector per column
        x, _ = self.rnn(x)  # (B, seq_len, 256)
        x = self.fc(x)  # (B, seq_len, num_classes)
        return x
class OCRModelManager:
    """Singleton that owns the OCR models so each one is loaded only once.

    There is no ``__init__``; the instance attributes are created in
    ``__new__`` the first time the class is instantiated:

    * ``models``: model type -> loaded ``CRNN`` (eval mode, on ``DEVICE``).
    * ``char_maps``: model type -> ``{'id_to_char': ..., 'char_to_id': ...}``.
    * ``transforms``: name -> torchvision transform pipeline.
    * ``roman_letter_model``: doctr recognition predictor (pretrained).
    """

    _instance = None

    def __new__(cls):
        if cls._instance is None:
            instance = super().__new__(cls)
            instance.models = {}
            instance.char_maps = {}
            instance.transforms = {}
            instance.initialize_transforms()
            # The doctr model is created once, together with the singleton.
            instance.roman_letter_model = recognition_predictor(pretrained=True)
            # Publish the instance only after it is fully built, so a failure
            # above cannot leave a half-initialised singleton behind.
            cls._instance = instance
        return cls._instance

    def initialize_transforms(self):
        """Register the transform pipelines shared by the CRNN models."""
        self.transforms['standard'] = transforms.Compose([
            transforms.Resize((64, 256)),
            transforms.ToTensor(),
            transforms.Normalize((0.5,), (0.5,)),
        ])

    def get_model(self, model_type, character_set):
        """Return ``(model, char_map)`` for ``model_type``, loading on first use.

        Args:
            model_type: Key into ``MODEL_PATHS``.
            character_set: Sequence of characters; its length is the number of
                output classes and its order defines the id <-> character
                mapping. It is only consulted the first time a model type is
                loaded; later calls return the cached model and mapping.

        Returns:
            The cached model and a dict with ``'id_to_char'`` and
            ``'char_to_id'`` mappings.

        Raises:
            ValueError: If ``model_type`` is not a key of ``MODEL_PATHS``.
        """
        if model_type not in self.models:
            if model_type not in MODEL_PATHS:
                raise ValueError(f"Unknown model type: {model_type}")

            model = CRNN(num_classes=len(character_set))
            # torch.load unpickles the checkpoint, so MODEL_PATHS must only
            # point at trusted files.
            state_dict = torch.load(MODEL_PATHS[model_type], map_location=DEVICE)
            model.load_state_dict(state_dict)
            model.eval()  # inference only
            model = model.to(DEVICE)

            # Cache both entries only once loading has succeeded.
            self.char_maps[model_type] = {
                'id_to_char': {i: c for i, c in enumerate(character_set)},
                'char_to_id': {c: i for i, c in enumerate(character_set)},
            }
            self.models[model_type] = model

        return self.models[model_type], self.char_maps[model_type]

    def preprocess_image(self, image_path, model_type):
        """Load an image and turn it into a ``(1, 1, 64, 256)`` tensor on DEVICE.

        Every model type except ``'dev_letter'`` gets a hard threshold at 128
        before resizing; ``'dev_letter'`` images are colour-inverted after
        resizing instead.
        """
        # convert() returns an independent image, so the file handle can be
        # released as soon as the block exits.
        with Image.open(image_path) as source:
            image = source.convert('L')

        is_letter_model = model_type == 'dev_letter'
        if not is_letter_model:
            # Binarise: pixels below 128 become 0, all others 255.
            image = image.point(lambda x: 0 if x < 128 else 255, 'L')

        # PIL takes (width, height).
        image = image.resize((256, 64))

        if is_letter_model:
            image = Image.eval(image, lambda x: 255 - x)

        tensor_image = self.transforms['standard'](image).unsqueeze(0).to(DEVICE)
        return tensor_image

    def predict(self, image_path, model_type, character_set):
        """Recognise the text in ``image_path`` with the given CRNN model.

        The highest-scoring class of every time step is mapped to its
        character and the characters are concatenated as-is.

        NOTE(review): repeated characters and any blank symbol are not
        collapsed here; confirm whether the checkpoints expect CTC-style
        post-processing.
        """
        model, char_map = self.get_model(model_type, character_set)
        tensor_image = self.preprocess_image(image_path, model_type)

        with torch.no_grad():
            output = model(tensor_image)  # (batch, seq_len, num_classes)
        # Best class per time step for the single image in the batch.
        token_ids = output.argmax(dim=2)[0].tolist()

        id_to_char = char_map['id_to_char']
        return ''.join(id_to_char[i] for i in token_ids)

    def predict_roman_letter(self, image_path):
        """Run the doctr recognition model on ``image_path``.

        Returns the first element of the first result entry, presumably the
        recognised text of a (text, confidence) pair; confirm against doctr.
        """
        img = DocumentFile.from_images(image_path)
        result = self.roman_letter_model(img)
        return result[0][0]
| # Shared OCRModelManager instance used by the wrapper functions below. It is created at import time, which also loads the doctr recognition model. | |
| ocr_manager = OCRModelManager() | |
| # Simplified API functions | |
def dev_number(image_path):
    """Return the digit string the 'dev_digits' model reads from the image."""
    recognised = ocr_manager.predict(
        image_path=image_path,
        model_type='dev_digits',
        character_set=CHARACTER_NUM,
    )
    return recognised
def roman_number(image_path):
    """Return the digit string the 'roman_digits' model reads from the image."""
    recognised = ocr_manager.predict(
        image_path=image_path,
        model_type='roman_digits',
        character_set=CHARACTER_NUM,
    )
    return recognised
def dev_letter(image_path):
    """Return the text the 'dev_letter' model reads from the image."""
    recognised = ocr_manager.predict(
        image_path=image_path,
        model_type='dev_letter',
        character_set=CHARACTER_LETTER,
    )
    return recognised
def roman_letter(image_path):
    """Return the doctr-based Roman-letter recognition result for the image."""
    recognised = ocr_manager.predict_roman_letter(image_path=image_path)
    return recognised
@lru_cache(maxsize=None)  # keyed by device name, so only a handful of entries
def _load_ne_classifier(device_name):
    """Build the 4-class ResNet classifier and load its checkpoint once per device."""
    device = torch.device(device_name)
    model = ResNetClassifier(num_classes=4).to(device)
    # torch.load unpickles the checkpoint; it must be a trusted local file.
    # NOTE(review): MODEL_PATHS['classify_ne'] names a different file than the
    # one loaded here - confirm which checkpoint is intended.
    model.load_state_dict(
        torch.load('models/dev_roman_classifier.pth', map_location=device)
    )
    model.eval()
    return model


@lru_cache(maxsize=1)
def _load_ne_label_encoder():
    """Unpickle the label encoder that maps class indices back to labels."""
    # pickle executes arbitrary code on load; this must be a trusted local file.
    with open('models/dev_roman_label_encoder.pkl', 'rb') as f:
        return pickle.load(f)


def predict_ne(image_path, device="cpu"):
    """Classify the image at ``image_path`` and return its decoded label.

    The classifier and the label encoder are loaded on first use and then
    reused, so changes to the files on disk are not picked up until the
    process restarts.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        device: Device to run inference on, as a string or ``torch.device``.
            If a CUDA device is requested but CUDA is unavailable, the CPU is
            used instead.

    Returns:
        The first element of ``inverse_transform([predicted_index])`` from
        the pickled label encoder.
    """
    target = torch.device(device)
    if target.type == "cuda" and not torch.cuda.is_available():
        target = torch.device("cpu")

    model = _load_ne_classifier(str(target))
    le = _load_ne_label_encoder()

    transform = transforms.Compose([
        transforms.Resize(256),  # resize shorter side to 256
        transforms.CenterCrop(224),  # crop the central 224x224 patch
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])

    # convert() returns an independent image, so the file can be closed at once.
    with Image.open(image_path) as source:
        image = source.convert('RGB')
    image_tensor = transform(image).unsqueeze(0).to(target)

    with torch.no_grad():
        output = model(image_tensor)
    _, predicted = torch.max(output, 1)
    return le.inverse_transform([predicted.item()])[0]