vova631 committed
Commit e5c9a77 · verified · 1 Parent(s): 55a3995

Upload app.py

Files changed (1): app.py +412 -0
app.py ADDED
@@ -0,0 +1,412 @@
# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""

import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet'
}

# Load the training set from HuggingFace Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
df.head()

"""This code loads the training split of the GoEmotions dataset (simplified version) directly from the HuggingFace Hub using the hf:// path.
We use the pandas library and the read_parquet() method to read the data into a table (DataFrame).
Then, we display the first few rows using df.head() to make sure the data was loaded correctly.
This gives us a starting point for the next step: Exploratory Data Analysis (EDA).
"""

#Import necessary libraries
import pandas as pd

#View dataset shape
print("Dataset shape:", df.shape)

#View basic column information
print("\nColumn names:", df.columns.tolist())

#View detailed info
df.info()

"""In this step, we check how many rows and columns the dataset has, and examine the names and data types of all columns.
This gives us an overview of what kind of data we’re dealing with (text, numbers, labels, etc.).
It helps us understand what preprocessing may be needed next.
"""

#Check for missing values

print("Missing values per column:")
print(df.isnull().sum())

#Check for duplicated rows (convert unhashable columns to string)

print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())

#Check how many unique combinations of emotion labels exist

print("\nNumber of unique label combinations:")
print(df["labels"].apply(lambda x: tuple(x)).nunique())

#Compute text lengths in number of words

df["text_length"] = df["text"].apply(lambda x: len(x.split()))

#Plot histogram of text lengths

import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()

"""Most texts in the dataset are short—under 30 words—which helps us choose the proper maximum length for tokenization later.
"""
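
# Editorial sketch (not part of the original notebook): one simple way to turn the length
# histogram into a concrete tokenizer setting is to take a high percentile of the word
# counts, so that nearly all texts fit without truncation. The 95th percentile is an
# assumed choice here, not a rule from the notebook.
import numpy as np
suggested_max_length = int(np.percentile(df["text_length"], 95))
print("Suggested tokenizer max_length (95th percentile of word counts):", suggested_max_length)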

#Count how many emotion labels each text has

df["num_labels"] = df["labels"].apply(len)

#Plot distribution

plt.figure(figsize=(8,5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()

"""Most samples are annotated with a single emotion label, and very few have multiple labels. This indicates that the dataset is mostly suitable for single-label classification tasks, although a multi-label approach could still capture additional nuance for rare cases."""
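
# Editorial sketch (added for illustration, not in the original notebook): if the task is
# treated as single-label classification, one straightforward option is to keep only the
# first annotated emotion per sample as a "primary" label.
df["primary_label"] = df["labels"].apply(lambda ls: ls[0] if len(ls) > 0 else None)
print(df["primary_label"].value_counts().head())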

# Count frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting
import pandas as pd
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15,5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()

"""This bar chart illustrates how often each emotion label appears across the dataset.
We observe a strong imbalance: label 27 (“neutral”) dominates with over 14,000 occurrences,
while others such as 16 (“grief”), 21 (“pride”), or 23 (“relief”) are very rare.
This highlights the need to consider class imbalance when training models.
"""
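
# Editorial sketch of one common mitigation (not part of the original notebook): weight each
# class inversely to its frequency. The total / (n_classes * count) formula mirrors
# scikit-learn's "balanced" heuristic and is shown purely as an illustration.
total_labels = sum(label_counts.values())
n_classes = len(label_counts)
class_weights = {label: total_labels / (n_classes * count) for label, count in label_counts.items()}
# The most frequent labels receive the smallest weights:
print(sorted(class_weights.items(), key=lambda kv: kv[1])[:3])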

# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create a binary matrix for emotions
# Get the maximum label ID from all label lists
num_labels = max([max(l.tolist()) if len(l) > 0 else 0 for l in df["labels"]]) + 1

emotion_matrix = np.zeros((len(df), num_labels), dtype=int)

for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute co-occurrence matrix
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()

"""This heatmap visualizes how frequently pairs of emotion labels co-occur within the same text. Darker shades indicate more frequent co-occurrences, helping identify emotions that often appear together."""
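
# Editorial sketch (not in the original notebook): the diagonal of the co-occurrence matrix
# only counts how often each label occurs at all, so zeroing it out and listing the largest
# off-diagonal entries makes the genuinely co-occurring label pairs easier to read.
co_no_diag = co_occurrence.copy()
np.fill_diagonal(co_no_diag, 0)
pairs = [(i, j, co_no_diag[i, j]) for i in range(num_labels) for j in range(i + 1, num_labels)]
for i, j, count in sorted(pairs, key=lambda t: t[2], reverse=True)[:5]:
    print(f"Labels {i} and {j} co-occur {count} times")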

# View random samples of texts and their corresponding emotion labels

# Display 5 random rows
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])

"""This step is meant to get a qualitative sense of the dataset by inspecting real examples. It helps verify whether:

- the texts are understandable and relevant;
- the assigned emotion labels make sense;
- there are any noisy, overly short, or unclear samples.
"""

# Define emotion label ID to name mapping manually (based on GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral'
]

# Define a function to convert a list of label IDs into label names
def decode_labels(label_ids):
    return [id2label[i] for i in label_ids]

# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5)
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])

"""Sample Texts with Emotion Labels

The table displays five random text samples from the dataset along with their decoded emotion labels. Most of the examples are labeled as “neutral,” highlighting its dominance in the dataset.
"""
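
# Editorial sketch quantifying the "neutral" dominance mentioned above (not in the original
# notebook): the share of training samples whose label set contains the 'neutral' label.
neutral_id = id2label.index('neutral')
neutral_share = df["labels"].apply(lambda ls: neutral_id in list(ls)).mean()
print(f"Share of samples tagged 'neutral': {neutral_share:.1%}")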

# Import library for word cloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine all text data into one string
all_text = " ".join(df["text"])

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

# Plot word cloud
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()

"""This word cloud displays the most commonly used words across all text samples in the dataset. Larger words appear more frequently, offering insights into prevalent vocabulary and themes used by users expressing various emotions."""
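
# Editorial sketch (an added illustration, not the notebook's code): a per-emotion word
# cloud can be more informative than the global one, because it surfaces vocabulary that is
# specific to that emotion. 'gratitude' is an arbitrary example label.
gratitude_id = id2label.index('gratitude')
gratitude_texts = df[df["labels"].apply(lambda ls: gratitude_id in list(ls))]["text"]
wordcloud_gratitude = WordCloud(width=800, height=400, background_color='white').generate(" ".join(gratitude_texts))
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud_gratitude, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in 'gratitude' Samples")
plt.show()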

# Step: Text Preprocessing - clean the text data
import re
import string

# Define a function to clean each text entry
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove [NAME], [URL], and other placeholders
    text = re.sub(r"\[.*?\]", "", text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Remove extra whitespaces
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply cleaning to the text column
df["clean_text"] = df["text"].apply(clean_text)

# Preview cleaned text
print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

"""
This preprocessing step standardizes text inputs by converting to lowercase, removing brackets like [NAME], punctuation, digits, and extra spaces — which helps downstream models focus on meaningful content.
"""
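
# Quick sanity check of clean_text on a made-up example (editorial addition, not part of
# the original notebook).
assert clean_text("I LOVE this!!! [NAME] gave me 5 stars") == "i love this gave me stars"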

# Plot label distribution

# Flatten all label lists into a single list
all_labels = [label for sublist in df["labels"] for label in sublist]

# Count frequency of each label
from collections import Counter
label_counts = Counter(all_labels)

# Convert to DataFrame for plotting
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

# Plot bar chart
plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

"""This bar chart shows how often each emotion label appears across all samples. Labels with higher frequency indicate more common emotions in the dataset.

## 2. Embeddings
"""

# Install the sentence-transformers library (if not already installed).
# The notebook cell ran "!pip install -q sentence-transformers"; that shell syntax is not
# valid in a plain .py file, so the dependency should be installed separately (for example
# via requirements.txt) and the command is kept here only as a comment:
# !pip install -q sentence-transformers

# Import required libraries
from sentence_transformers import SentenceTransformer
import torch

# Choose a small and fast model for generating sentence embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Optional: move model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

# Subset the dataset to 2000 samples for efficiency
sample_df = df.sample(n=2000, random_state=42).reset_index(drop=True)

# Generate embeddings for the 'clean_text' column
# This might take 1-2 minutes
embeddings = model.encode(
    sample_df['clean_text'].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True,
    device=device
)

# Store embeddings as a list inside the dataframe
sample_df['embedding'] = embeddings.cpu().numpy().tolist()

# Preview the result
sample_df[['clean_text', 'embedding']].head()

"""
We use the all-MiniLM-L6-v2 model from SentenceTransformers to convert each cleaned text into a dense vector representation, capturing semantic meaning for further clustering and visualization.
"""
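
# Editorial illustration (not part of the original notebook) of what these embeddings
# capture: semantically related sentences score noticeably higher on cosine similarity
# than unrelated ones. The example sentences below are made up for this sketch.
from sentence_transformers import util
related = model.encode(["I am so anxious about tomorrow", "I feel nervous before my exam"], convert_to_tensor=True)
unrelated = model.encode("The cake recipe needs two eggs", convert_to_tensor=True)
print("related pair  :", util.cos_sim(related[0], related[1]).item())
print("unrelated pair:", util.cos_sim(related[0], unrelated).item())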

from tqdm.notebook import tqdm

sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)

embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True)

sample_df["embedding"] = embeddings.tolist()

"""This step uses the all-MiniLM-L6-v2 model from the sentence-transformers library to convert each text sample into a dense vector (embedding). To improve efficiency, a random sample of 3,000 examples is selected, encoded in batches, and saved into a new "embedding" column."""

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np

# Convert list of embeddings to a NumPy array
X = np.array(sample_df["embedding"].tolist())

# Reduce the embedding dimensions to 2D using t-SNE
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)

# Add 2D coordinates to the dataframe
sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

# Visualize the 2D embeddings using a scatter plot
plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()

"""t-SNE projection of sentence embeddings onto 2D space. Each point represents a high-dimensional text embedding reduced to two components for visualization. This helps reveal potential clustering structures and the distribution of semantic similarities."""

from sklearn.cluster import KMeans

# Define the number of clusters (you can try different values like 5, 10, etc.)
num_clusters = 8

# Apply K-Means clustering to the embeddings
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
sample_df["cluster"] = kmeans.fit_predict(X)

# Visualize the clusters on the t-SNE projection
plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()

"""K-Means clustering (k=8) applied to sentence embeddings, visualized using t-SNE. Each color represents a distinct cluster, indicating groups of semantically similar text samples based on their embedding vectors."""
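
# Editorial sketch linking clusters back to emotions (an added illustration, not part of
# the original notebook): for each K-Means cluster, list the most common decoded emotion
# labels among its samples.
from collections import Counter
for c in range(num_clusters):
    cluster_emotions = [id2label[label] for labels in sample_df.loc[sample_df["cluster"] == c, "labels"] for label in labels]
    print(f"Cluster {c}: {Counter(cluster_emotions).most_common(3)}")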

"""## 3. Inputs & Outputs"""

from sentence_transformers import util
import torch

# Ensure sample_df contains the 'embedding' column
EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)

# Define the recommendation function
def recommend_similar_emotions(user_input):
    if not user_input.strip():
        return "Please enter some text."

    # Encode the user input into an embedding
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)

    # Compute cosine similarity between user input and all stored embeddings
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]

    # Format the top 5 most similar results
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {row['labels']}")

    return "\n\n".join(results)

print(recommend_similar_emotions("I'm feeling nervous before my exam"))

"""Core recommendation logic for matching user input text to the most similar texts in the dataset using sentence embeddings and cosine similarity.
Returns the top 5 results with their associated emotion labels.
"""
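
# Editorial variant (a hedged sketch, not the notebook's original behaviour): mapping the
# label IDs to names with the decode_labels helper defined earlier usually reads better in
# the UI than raw numeric IDs.
def recommend_similar_emotions_named(user_input):
    if not user_input.strip():
        return "Please enter some text."
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {', '.join(decode_labels(row['labels']))}")
    return "\n\n".join(results)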

import gradio as gr

# Create Gradio interface
demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels."
)

demo.launch()

"""Set up the Gradio web app for entering text and viewing recommendations"""