import gradio as gr
import cv2
import easyocr
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Download the NLTK data needed for tokenization and stopword removal
nltk.data.path.append("/usr/local/lib/nltk_data")
nltk.download('punkt')
nltk.download('stopwords')
| """ | |
| EasyOCR for Text Extraction | |
| """ | |
| def ocr_with_easy(img): | |
| # Convert image to grayscale | |
| gray_scale_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) | |
| cv2.imwrite('image.png', gray_scale_image) | |
| # Use EasyOCR to read text from the image | |
| reader = easyocr.Reader(['en']) | |
| bounds = reader.readtext('image.png', paragraph="False", detail=0) | |
| extracted_text = ' '.join(bounds) | |
| return extracted_text | |
| """ | |
| Text Preprocessing for Spam Classification | |
| """ | |
| def preprocess_text(text): | |
| tokens = word_tokenize(text.lower()) | |
| stop_words = set(stopwords.words('english')) | |
| filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words] | |
| stemmer = PorterStemmer() | |
| stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens] | |
| return ' '.join(stemmed_tokens) | |
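
# Illustrative example (assuming NLTK's default English stopword list):
#   preprocess_text("WIN a FREE prize now!!!")  ->  "win free prize"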
| """ | |
| Load and Train Spam Classifier | |
| """ | |
| # Load the dataset | |
| data = pd.read_csv('spam.csv', encoding='latin-1') | |
| data['v2'] = data['v2'].apply(preprocess_text) | |
| # Feature Extraction (TF-IDF) | |
| tfidf_vectorizer = TfidfVectorizer() | |
| tfidf_matrix = tfidf_vectorizer.fit_transform(data['v2']) | |
| # Label Encoding | |
| data['v1'] = data['v1'].map({'ham': 0, 'spam': 1}) | |
| # Create a Random Forest classifier | |
| rf_classifier = RandomForestClassifier(random_state=42) | |
| rf_classifier.fit(tfidf_matrix, data['v1']) | |
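
# Optional sanity check (not part of the original pipeline): evaluate the same
# TF-IDF + Random Forest setup on a held-out split so the startup logs show a rough
# accuracy figure. The classifier used by the app above is still trained on all data.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix, data['v1'], test_size=0.2, random_state=42, stratify=data['v1']
)
eval_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print(f"Hold-out accuracy (sanity check): {accuracy_score(y_test, eval_classifier.predict(X_test)):.3f}")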
| """ | |
| OCR and Spam Classification Pipeline | |
| """ | |
| def ocr_and_classify_spam(img): | |
| # Step 1: Extract text from the image using EasyOCR | |
| extracted_text = ocr_with_easy(img) | |
| # Step 2: Preprocess and classify the extracted text | |
| if extracted_text: | |
| processed_text = preprocess_text(extracted_text) | |
| input_tfidf = tfidf_vectorizer.transform([processed_text]) | |
| prediction = rf_classifier.predict(input_tfidf) | |
| spam_result = "SPAM" if prediction[0] == 1 else "NOT SPAM" | |
| else: | |
| spam_result = "No text found in the image." | |
| return extracted_text, spam_result | |
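
# Illustrative call (assumes `sample_img` is an RGB numpy array containing readable text):
#   text, label = ocr_and_classify_spam(sample_img)   # e.g. ("Congratulations you won ...", "SPAM")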
| """ | |
| Create User Interface with Gradio | |
| """ | |
| image = gr.Image() | |
| output_text = gr.Textbox(label="Extracted Text") | |
| output_classification = gr.Textbox(label="Spam Classification") | |
| demo = gr.Interface( | |
| fn=ocr_and_classify_spam, | |
| inputs=image, | |
| outputs=[output_text, output_classification], | |
| title="OCR and Spam Classifier", | |
| description="Upload an image with text. The text will be extracted using EasyOCR and then classified as SPAM or NOT SPAM.", | |
| css=".gradio-container {background-color: lightgray}" | |
| ) | |
| demo.launch() | |