# Hugging Face Space: winamnd/ocr-llm-test (status when captured: Sleeping)
# File size: 2,895 bytes
# Commits contributing to this file: 9c1923d, a4bd204

import gradio as gr 
import cv2
import easyocr
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Download necessary NLTK data
nltk.data.path.append("/usr/local/lib/nltk_data")
nltk.download('punkt')
# Recent NLTK releases make word_tokenize() look up the 'punkt_tab' resource
# instead of the pickled 'punkt' models; fetch both so tokenization works
# regardless of the installed NLTK version. nltk.download() reports failures
# by return value rather than raising, so this is harmless on older versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

"""
EasyOCR for Text Extraction
"""
# Shared EasyOCR reader, created on first use. Building a Reader sets up the
# OCR models, so it is done once instead of on every request.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        _EASYOCR_READER = easyocr.Reader(['en'])
    return _EASYOCR_READER


def ocr_with_easy(img):
    """Extract the text contained in an image using EasyOCR.

    Args:
        img: The image as a NumPy array, or None when no image was supplied.
            Colour images are expected in RGB (or RGBA) channel order, which
            is what Gradio's Image component delivers by default; a 2-D
            array is treated as already being grayscale.

    Returns:
        The recognised text fragments joined with single spaces, or an empty
        string when ``img`` is None or nothing was recognised.
    """
    # Gradio passes None when the form is submitted without an image.
    if img is None:
        return ""

    # Convert image to grayscale, honouring the actual channel layout.
    if img.ndim == 2:
        gray_scale_image = img
    elif img.shape[2] == 4:
        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGBA2GRAY)
    else:
        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Hand the array straight to EasyOCR rather than round-tripping through a
    # fixed 'image.png' on disk, which concurrent requests would overwrite.
    # NOTE: the previous code passed paragraph="False"; a non-empty string is
    # truthy, so paragraph grouping was effectively enabled. That behaviour is
    # kept here, but spelled as a real boolean.
    bounds = _get_easyocr_reader().readtext(
        gray_scale_image, paragraph=True, detail=0
    )
    return ' '.join(bounds)

"""
Text Preprocessing for Spam Classification
"""
# English stop words, read from the NLTK corpus on first use and then reused;
# preprocess_text() runs once per dataset row, so re-reading the corpus on
# every call is wasted work.
_STOP_WORDS = None


def preprocess_text(text):
    """Normalise text for the spam classifier.

    The text is lower-cased and tokenized; tokens that are not purely
    alphanumeric or that are English stop words are dropped, and the
    remaining tokens are reduced to their Porter stems.

    Args:
        text: The raw text. Non-string values (for example a missing cell in
            the dataset) and blank strings are treated as empty text.

    Returns:
        The stemmed tokens joined with single spaces; an empty string when
        no token survives filtering.
    """
    global _STOP_WORDS

    # Nothing to tokenize: avoids calling .lower() on None/NaN and skips the
    # tokenizer for blank input, for which the result is empty anyway.
    if not isinstance(text, str) or not text.strip():
        return ''

    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))

    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in _STOP_WORDS
    )

"""
Load and Train Spam Classifier
"""
# Read the labelled corpus: 'v1' is the ham/spam label column and 'v2' is the
# message text column.
data = pd.read_csv('spam.csv', encoding='latin-1')

# Turn the textual labels into integers (ham -> 0, spam -> 1).
data['v1'] = data['v1'].map({'ham': 0, 'spam': 1})

# Normalise every message with the same routine used at prediction time.
data['v2'] = data['v2'].apply(preprocess_text)

# Learn the TF-IDF vocabulary from the cleaned messages and vectorize them.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])

# Fit a Random Forest on the TF-IDF features; the fixed seed keeps the
# trained model reproducible between runs.
rf_classifier = RandomForestClassifier(random_state=42).fit(
    tfidf_matrix, data['v1']
)

"""
OCR and Spam Classification Pipeline
"""
def ocr_and_classify_spam(img):
    """Run OCR on an image and classify the recognised text as spam or not.

    Args:
        img: The image to read, passed through to ``ocr_with_easy``.

    Returns:
        A ``(extracted_text, spam_result)`` tuple. ``spam_result`` is
        "SPAM" or "NOT SPAM" when text was recognised, and
        "No text found in the image." otherwise.
    """
    text = ocr_with_easy(img)

    # Nothing recognised: report that instead of running the classifier.
    if not text:
        return text, "No text found in the image."

    # Apply the training-time preprocessing and vectorization, then predict.
    features = tfidf_vectorizer.transform([preprocess_text(text)])
    label = rf_classifier.predict(features)[0]
    verdict = "SPAM" if label == 1 else "NOT SPAM"
    return text, verdict

"""
Create User Interface with Gradio
"""
# Input component: the image to run OCR on.
image = gr.Image()

# Output components: the recognised text and the classifier's verdict, in the
# same order as the tuple returned by ocr_and_classify_spam.
output_text = gr.Textbox(label="Extracted Text")
output_classification = gr.Textbox(label="Spam Classification")

# Wire the pipeline function to the components.
demo = gr.Interface(
    title="OCR and Spam Classifier",
    description=(
        "Upload an image with text. "
        "The text will be extracted using EasyOCR "
        "and then classified as SPAM or NOT SPAM."
    ),
    fn=ocr_and_classify_spam,
    inputs=image,
    outputs=[output_text, output_classification],
    css=".gradio-container {background-color: lightgray}",
)

# Start the web app.
demo.launch()