import gradio as gr
import cv2
import easyocr
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Download the NLTK data needed for tokenization and stopword removal
nltk.data.path.append("/usr/local/lib/nltk_data")
nltk.download('punkt')
nltk.download('stopwords')
| """ | |
| EasyOCR for Text Extraction | |
| """ | |
| def ocr_with_easy(img): | |
| # Convert image to grayscale | |
| gray_scale_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) | |
| cv2.imwrite('image.png', gray_scale_image) | |
| # Use EasyOCR to read text from the image | |
| reader = easyocr.Reader(['en']) | |
| bounds = reader.readtext('image.png', paragraph="False", detail=0) | |
| extracted_text = ' '.join(bounds) | |
| return extracted_text | |
| """ | |
| Text Preprocessing for Spam Classification | |
| """ | |
| def preprocess_text(text): | |
| tokens = word_tokenize(text.lower()) | |
| stop_words = set(stopwords.words('english')) | |
| filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words] | |
| stemmer = PorterStemmer() | |
| stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens] | |
| return ' '.join(stemmed_tokens) | |
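
# Illustrative example (assuming NLTK's default English stopword list):
#   preprocess_text("WIN a FREE prize now!!!")  ->  "win free prize"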
| """ | |
| Load and Train Spam Classifier | |
| """ | |
| # Load the dataset | |
| data = pd.read_csv('spam.csv', encoding='latin-1') | |
| data['v2'] = data['v2'].apply(preprocess_text) | |
| # Feature Extraction (TF-IDF) | |
| tfidf_vectorizer = TfidfVectorizer() | |
| tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2']) | |
| # Label Encoding | |
| data['v1'] = data['v1'].map({'ham': 0, 'spam': 1}) | |
| # Create a Random Forest classifier | |
| rf_classifier = RandomForestClassifier(random_state=42) | |
| rf_classifier.fit(tfidf_matrix, data['v1']) | |
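
# Optional sanity check (not part of the original pipeline): evaluate the same
# TF-IDF + Random Forest setup on a held-out split so the startup logs show a rough
# accuracy figure. The classifier used by the app above is still trained on all data.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, data['v1'], test_size=0.2, random_state=42, stratify=data['v1']
)
eval_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print(f"Hold-out accuracy (sanity check): {accuracy_score(y_test, eval_classifier.predict(X_test)):.3f}")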
| """ | |
| OCR and Spam Classification Pipeline | |
| """ | |
| def ocr_and_classify_spam(img): | |
| # Step 1: Extract text from the image using EasyOCR | |
| extracted_text = ocr_with_easy(img) | |
| # Step 2: Preprocess and classify the extracted text | |
| if extracted_text: | |
| processed_text = preprocess_text(extracted_text) | |
| input_tfidf = tfidf_vectorizer.transform([processed_text]) | |
| prediction = rf_classifier.predict(input_tfidf) | |
| spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM" | |
| else: | |
| spam_result = "No text found in the image." | |
| return extracted_text, spam_result | |
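
# Illustrative call (assumes `sample_img` is an RGB numpy array containing readable text):
#   text, label = ocr_and_classify_spam(sample_img)   # e.g. ("Congratulations you won ...", "SPAM")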
| """ | |
| Create User Interface with Gradio | |
| """ | |
| image = gr.Image() | |
| output_text = gr.Textbox(label="Extracted Text") | |
| output_classification = gr.Textbox(label="Spam Classification") | |
| demo = gr.Interface( | |
| fn=ocr_and_classify_spam, | |
| inputs=image, | |
| outputs=[output_text, output_classification], | |
| title="OCR and Spam Classifier", | |
| description="Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.", | |
| css=".gradio-container {background-color: lightgray}" | |
| ) | |
| demo.launch() | |