Spaces:
Sleeping
Sleeping
File size: 2,895 Bytes
9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d a4bd204 9c1923d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import gradio as gr
import cv2
import easyocr
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# Make the system-wide NLTK data directory searchable, then fetch the corpora
# this script relies on: tokenizer models for word_tokenize and the stop-word
# lists. NLTK >= 3.9 loads 'punkt_tab' instead of the legacy pickled 'punkt'
# models, so both are requested to keep the script working on old and new
# NLTK releases alike.
nltk.data.path.append("/usr/local/lib/nltk_data")
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
"""
EasyOCR for Text Extraction
"""
# Shared EasyOCR reader, created on first use. Building a Reader is expensive
# (it sets up the detection/recognition models), so it is not rebuilt per image.
_OCR_READER = None


def _get_ocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _OCR_READER
    if _OCR_READER is None:
        _OCR_READER = easyocr.Reader(['en'])
    return _OCR_READER


def ocr_with_easy(img):
    """Extract the text visible in an image using EasyOCR.

    Args:
        img: Image as a NumPy array in RGB channel order (the default output
            of ``gr.Image()``), an already-grayscale 2-D array, or ``None``
            when no image was provided.

    Returns:
        The recognised text fragments joined by single spaces, or an empty
        string when ``img`` is ``None`` or nothing was recognised.
    """
    # Gradio hands over None when the user submits without an image.
    if img is None:
        return ''

    # The array comes from Gradio in RGB order, so use the RGB conversion;
    # the BGR variant would swap the red and blue luminance weights.
    if img.ndim == 2:
        gray_scale_image = img
    else:
        gray_scale_image = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # readtext accepts a NumPy array directly, which avoids writing a shared
    # 'image.png' that concurrent requests would overwrite.
    # paragraph must be the boolean False: the string "False" is truthy and
    # would silently enable paragraph mode.
    bounds = _get_ocr_reader().readtext(
        gray_scale_image, paragraph=False, detail=0
    )
    return ' '.join(bounds)
"""
Text Preprocessing for Spam Classification
"""
# Cached (stop_words, stemmer) pair. Loading the stop-word list and building a
# stemmer on every call is wasted work, since this function runs once per
# dataset row during training and again for every classified image.
_TEXT_RESOURCES = None


def _get_text_resources():
    """Return the cached ``(stop_words, stemmer)`` pair, creating it on first use."""
    global _TEXT_RESOURCES
    if _TEXT_RESOURCES is None:
        # Assigned as one tuple so a reader never sees a half-initialised cache.
        _TEXT_RESOURCES = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
    return _TEXT_RESOURCES


def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised; tokens that are not purely
    alphanumeric or that are English stop words are dropped; the remaining
    tokens are Porter-stemmed.

    Args:
        text: The raw string to normalise.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    stop_words, stemmer = _get_text_resources()
    return ' '.join(
        stemmer.stem(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    )
"""
Load and Train Spam Classifier
"""
# Read the labelled corpus: column 'v1' holds the ham/spam label and column
# 'v2' the message text.
data = pd.read_csv('spam.csv', encoding='latin-1')

# Normalise every message in place with the same preprocessing that is later
# applied to OCR output, so training and inference share one vocabulary.
data['v2'] = data['v2'].apply(preprocess_text)

# Turn the cleaned messages into TF-IDF feature vectors; the fitted vectorizer
# is kept at module level for reuse at prediction time.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2'])

# Encode the string labels numerically: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['v1'] = data['v1'].map(label_codes)

# Fit a random forest on the TF-IDF features (fixed seed for reproducibility).
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(tfidf_matrix, data['v1'])
"""
OCR and Spam Classification Pipeline
"""
def ocr_and_classify_spam(img):
    """Run OCR on an image and label the recognised text as spam or not.

    Args:
        img: The image supplied by the Gradio image input, or ``None`` when
            the user submitted the form without an image.

    Returns:
        A ``(extracted_text, spam_result)`` tuple. ``spam_result`` is
        ``"SPAM"`` or ``"NOT SPAM"`` when text was recognised, otherwise
        ``"No text found in the image."`` (with an empty ``extracted_text``
        when no image was given).
    """
    no_text_message = "No text found in the image."

    # Gradio passes None when nothing was uploaded; bail out before OCR so the
    # user gets a message instead of an OpenCV error.
    if img is None:
        return "", no_text_message

    # Step 1: extract text from the image using EasyOCR.
    extracted_text = ocr_with_easy(img)
    if not extracted_text:
        return extracted_text, no_text_message

    # Step 2: apply the training-time preprocessing, vectorise with the fitted
    # TF-IDF model, and classify (label 1 means spam).
    processed_text = preprocess_text(extracted_text)
    input_tfidf = tfidf_vectorizer.transform([processed_text])
    prediction = rf_classifier.predict(input_tfidf)
    spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM"
    return extracted_text, spam_result
"""
Create User Interface with Gradio
"""
# Input widget: the uploaded image that is handed to the pipeline.
image = gr.Image()

# Output widgets, listed in the order the pipeline returns its two values.
output_text = gr.Textbox(label="Extracted Text")
output_classification = gr.Textbox(label="Spam Classification")

# Interface settings gathered in one mapping and expanded into gr.Interface.
interface_options = {
    "fn": ocr_and_classify_spam,
    "inputs": image,
    "outputs": [output_text, output_classification],
    "title": "OCR and Spam Classifier",
    "description": "Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.",
    "css": ".gradio-container {background-color: lightgray}",
}
demo = gr.Interface(**interface_options)

# Start the web app.
demo.launch()
|