vova631 committed
Commit e5c9a77 · verified · 1 Parent(s): 55a3995

Upload app.py

Files changed (1): app.py +412 -0
app.py ADDED
@@ -0,0 +1,412 @@
# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""

import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet'
}

# Load the training set from HuggingFace Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
df.head()

"""This code loads the training split of the GoEmotions dataset (simplified version) directly from the HuggingFace Hub using the hf:// path.
We use the pandas library and the read_parquet() method to read the data into a table (DataFrame).
Then, we display the first few rows using df.head() to make sure the data was loaded correctly.
This gives us a starting point for the next step: Exploratory Data Analysis (EDA).
"""

#Import necessary libraries
import pandas as pd

#View dataset shape
print("Dataset shape:", df.shape)

#View basic column information
print("\nColumn names:", df.columns.tolist())

#View detailed info
df.info()

"""In this step, we check how many rows and columns the dataset has, and examine the names and data types of all columns.
This gives us an overview of what kind of data we’re dealing with (text, numbers, labels, etc.).
It helps us understand what preprocessing may be needed next.
"""

#Check for missing values

print("Missing values per column:")
print(df.isnull().sum())

#Check for duplicated rows (convert unhashable columns to string)

print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())

#Check how many unique combinations of emotion labels exist

print("\nNumber of unique label combinations:")
print(df["labels"].apply(lambda x: tuple(x)).nunique())

#Compute text lengths in number of words

df["text_length"] = df["text"].apply(lambda x: len(x.split()))

#Plot histogram of text lengths

import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()

"""Most texts in the dataset are short—under 30 words—which helps us choose the proper maximum length for tokenization later.
"""
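
# Editorial sketch (not part of the original notebook): one simple way to turn the length
# histogram into a concrete tokenizer setting is to take a high percentile of the word
# counts, so that nearly all texts fit without truncation. The 95th percentile is an
# assumed choice here, not a rule from the notebook.
import numpy as np
suggested_max_length = int(np.percentile(df["text_length"], 95))
print("Suggested tokenizer max_length (95th percentile of word counts):", suggested_max_length)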

#Count how many emotion labels each text has

df["num_labels"] = df["labels"].apply(len)

#Plot distribution

plt.figure(figsize=(8,5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()

"""Most samples are annotated with a single emotion label, and very few have multiple labels. This indicates that the dataset is mostly suitable for single-label classification tasks, although a multi-label approach could still capture additional nuance for rare cases."""
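
# Editorial sketch (added for illustration, not in the original notebook): if the task is
# treated as single-label classification, one straightforward option is to keep only the
# first annotated emotion per sample as a "primary" label.
df["primary_label"] = df["labels"].apply(lambda ls: ls[0] if len(ls) > 0 else None)
print(df["primary_label"].value_counts().head())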

# Count frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting
import pandas as pd
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15,5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()

"""This bar chart illustrates how often each emotion label appears across the dataset.
We observe a strong imbalance: label 27 (“neutral”) dominates with over 14,000 occurrences,
while others such as 16 (“grief”), 21 (“pride”), or 23 (“relief”) are very rare.
This highlights the need to consider class imbalance when training models.
"""
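
# Editorial sketch of one common mitigation (not part of the original notebook): weight each
# class inversely to its frequency. The total / (n_classes * count) formula mirrors
# scikit-learn's "balanced" heuristic and is shown purely as an illustration.
total_labels = sum(label_counts.values())
n_classes = len(label_counts)
class_weights = {label: total_labels / (n_classes * count) for label, count in label_counts.items()}
# The most frequent labels receive the smallest weights:
print(sorted(class_weights.items(), key=lambda kv: kv[1])[:3])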

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create a binary matrix for emotions
# Get the maximum label ID from all label lists
num_labels = max([max(l.tolist()) if len(l) > 0 else 0 for l in df["labels"]]) + 1

emotion_matrix = np.zeros((len(df), num_labels), dtype=int)

for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute co-occurrence matrix
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()

"""This heatmap visualizes how frequently pairs of emotion labels co-occur within the same text. Darker shades indicate more frequent co-occurrences, helping identify emotions that often appear together."""
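
# Editorial sketch (not in the original notebook): the diagonal of the co-occurrence matrix
# only counts how often each label occurs at all, so zeroing it out and listing the largest
# off-diagonal entries makes the genuinely co-occurring label pairs easier to read.
co_no_diag = co_occurrence.copy()
np.fill_diagonal(co_no_diag, 0)
pairs = [(i, j, co_no_diag[i, j]) for i in range(num_labels) for j in range(i + 1, num_labels)]
for i, j, count in sorted(pairs, key=lambda t: t[2], reverse=True)[:5]:
    print(f"Labels {i} and {j} co-occur {count} times")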

# View random samples of texts and their corresponding emotion labels

# Display 5 random rows
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])

"""This step is meant to get a qualitative sense of the dataset by inspecting real examples. It helps verify whether:

- the texts are understandable and relevant;
- the assigned emotion labels make sense;
- there are any noisy, overly short, or unclear samples.
"""

# Define emotion label ID to name mapping manually (based on GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral'
]

# Define a function to convert a list of label IDs into label names
def decode_labels(label_ids):
    return [id2label[i] for i in label_ids]

# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5)
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])

"""Sample Texts with Emotion Labels

The table displays five random text samples from the dataset along with their decoded emotion labels. Most of the examples are labeled as “neutral,” highlighting its dominance in the dataset.
"""
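
# Editorial sketch quantifying the "neutral" dominance mentioned above (not in the original
# notebook): the share of training samples whose label set contains the 'neutral' label.
neutral_id = id2label.index('neutral')
neutral_share = df["labels"].apply(lambda ls: neutral_id in list(ls)).mean()
print(f"Share of samples tagged 'neutral': {neutral_share:.1%}")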

# Import library for word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine all text data into one string
all_text = " ".join(df["text"])

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

# Plot word cloud
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()

"""This word cloud displays the most commonly used words across all text samples in the dataset. Larger words appear more frequently, offering insights into prevalent vocabulary and themes used by users expressing various emotions."""
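
# Editorial sketch (an added illustration, not the notebook's code): a per-emotion word
# cloud can be more informative than the global one, because it surfaces vocabulary that is
# specific to that emotion. 'gratitude' is an arbitrary example label.
gratitude_id = id2label.index('gratitude')
gratitude_texts = df[df["labels"].apply(lambda ls: gratitude_id in list(ls))]["text"]
wordcloud_gratitude = WordCloud(width=800, height=400, background_color='white').generate(" ".join(gratitude_texts))
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud_gratitude, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in 'gratitude' Samples")
plt.show()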

# Step: Text Preprocessing - clean the text data
import re
import string

# Define a function to clean each text entry
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove [NAME], [URL], and other placeholders
    text = re.sub(r"\[.*?\]", "", text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning to the text column
df["clean_text"] = df["text"].apply(clean_text)

# Preview cleaned text
print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

"""
This preprocessing step standardizes text inputs by converting to lowercase, removing brackets like [NAME], punctuation, digits, and extra spaces — which helps downstream models focus on meaningful content.
"""
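
# Quick sanity check of clean_text on a made-up example (editorial addition, not part of
# the original notebook).
assert clean_text("I LOVE this!!! [NAME] gave me 5 stars") == "i love this gave me stars"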

# Plot label distribution

# Flatten all label lists into a single list
all_labels = [label for sublist in df["labels"] for label in sublist]

# Count frequency of each label
from collections import Counter
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

# Plot bar chart
plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

"""This bar chart shows how often each emotion label appears across all samples. Labels with higher frequency indicate more common emotions in the dataset.

## 2. Embeddings
"""

# Install the sentence-transformers library (if not already installed).
# The notebook cell ran "!pip install -q sentence-transformers"; that shell syntax is not
# valid in a plain .py file, so the dependency should be installed separately (for example
# via requirements.txt) and the command is kept here only as a comment:
# !pip install -q sentence-transformers

# Import required libraries
from sentence_transformers import SentenceTransformer
import torch

# Choose a small and fast model for generating sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Optional: move model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Subset the dataset to 2000 samples for efficiency
sample_df = df.sample(n=2000, random_state=42).reset_index(drop=True)

# Generate embeddings for the 'clean_text' column
# This might take 1-2 minutes
embeddings = model.encode(
    sample_df['clean_text'].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    device=device
)

# Store embeddings as a list inside the dataframe
sample_df['embedding'] = embeddings.cpu().numpy().tolist()

# Preview the result
sample_df[['clean_text', 'embedding']].head()

"""
We use the all-MiniLM-L6-v2 model from SentenceTransformers to convert each cleaned text into a dense vector representation, capturing semantic meaning for further clustering and visualization.
"""
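
# Editorial illustration (not part of the original notebook) of what these embeddings
# capture: semantically related sentences score noticeably higher on cosine similarity
# than unrelated ones. The example sentences below are made up for this sketch.
from sentence_transformers import util
related = model.encode(["I am so anxious about tomorrow", "I feel nervous before my exam"], convert_to_tensor=True)
unrelated = model.encode("The cake recipe needs two eggs", convert_to_tensor=True)
print("related pair  :", util.cos_sim(related[0], related[1]).item())
print("unrelated pair:", util.cos_sim(related[0], unrelated).item())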

from tqdm.notebook import tqdm

sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)

embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True)

sample_df["embedding"] = embeddings.tolist()

"""This step uses the all-MiniLM-L6-v2 model from the sentence-transformers library to convert each text sample into a dense vector (embedding). To improve efficiency, a random sample of 3,000 examples is selected, encoded in batches, and saved into a new "embedding" column."""

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Convert list of embeddings to a NumPy array
X = np.array(sample_df["embedding"].tolist())

# Reduce the embedding dimensions to 2D using t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)

# Add 2D coordinates to the dataframe
sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

# Visualize the 2D embeddings using a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()

"""t-SNE projection of sentence embeddings onto 2D space. Each point represents a high-dimensional text embedding reduced to two components for visualization. This helps reveal potential clustering structures and the distribution of semantic similarities."""

from sklearn.cluster import KMeans

# Define the number of clusters (you can try different values like 5, 10, etc.)
num_clusters = 8

# Apply K-Means clustering to the embeddings
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
sample_df["cluster"] = kmeans.fit_predict(X)

# Visualize the clusters on the t-SNE projection
plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()

"""K-Means clustering (k=8) applied to sentence embeddings, visualized using t-SNE. Each color represents a distinct cluster, indicating groups of semantically similar text samples based on their embedding vectors."""
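
# Editorial sketch linking clusters back to emotions (an added illustration, not part of
# the original notebook): for each K-Means cluster, list the most common decoded emotion
# labels among its samples.
from collections import Counter
for c in range(num_clusters):
    cluster_emotions = [id2label[label] for labels in sample_df.loc[sample_df["cluster"] == c, "labels"] for label in labels]
    print(f"Cluster {c}: {Counter(cluster_emotions).most_common(3)}")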

"""## 3. Inputs & Outputs"""

from sentence_transformers import util
import torch

# Ensure sample_df contains the 'embedding' column
EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)

# Define the recommendation function
def recommend_similar_emotions(user_input):
    if not user_input.strip():
        return "Please enter some text."

    # Encode the user input into an embedding
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)

    # Compute cosine similarity between user input and all stored embeddings
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]

    # Format the top 5 most similar results
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {row['labels']}")

    return "\n\n".join(results)

print(recommend_similar_emotions("I'm feeling nervous before my exam"))

"""Core recommendation logic for matching user input text to the most similar texts in the dataset using sentence embeddings and cosine similarity.
Returns the top 5 results with their associated emotion labels.
"""
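
# Editorial variant (a hedged sketch, not the notebook's original behaviour): mapping the
# label IDs to names with the decode_labels helper defined earlier usually reads better in
# the UI than raw numeric IDs.
def recommend_similar_emotions_named(user_input):
    if not user_input.strip():
        return "Please enter some text."
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {', '.join(decode_labels(row['labels']))}")
    return "\n\n".join(results)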

import gradio as gr

# Create Gradio interface
demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels."
)

demo.launch()

"""Set up the Gradio web app for entering text and viewing recommendations"""