Alon Albalak committed
Commit 57be184 · 1 Parent(s): 7e52249

major update: all data saved on HF (prompts, results), unified utilities
Files changed:
- data/prompts.jsonl +0 -0
- src/config/settings.py +3 -3
- src/models/data_manager.py +13 -18
- src/models/llm_manager.py +17 -22
- src/scoring/scorer.py +8 -5
- src/ui/page_handlers.py +1 -2
- src/utils/__init__.py +0 -0
- src/utils/hf_data_manager.py +43 -0
- src/utils/model_loader.py +39 -0
- src/utils/prompt_formatter.py +28 -0
data/prompts.jsonl (DELETED)

The diff for this file is too large to render. See the raw diff.
src/config/settings.py (CHANGED)

@@ -17,9 +17,9 @@ DEFAULT_SIMILARITY_MODEL = "all-MiniLM-L6-v2"
 # Token limits
 MAX_USER_TOKENS = 5
 
-#
-
-
+# Huggingface data repositories
+HF_PROMPTS_REPO = "alon-albalak/collaborative-decoding-prompts"
+HF_RESULTS_REPO = "alon-albalak/collaborative-decoding-results"
 
 # Server configuration
 DEFAULT_SERVER_NAME = "127.0.0.1"
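The two new constants point at the prompts and results dataset repos. A minimal sketch of what a prompts pull looks like with these settings, assuming the `datasets` library is installed and the repository is readable (the project's own loading path goes through src/utils/hf_data_manager.py below):

```python
# Sketch: pull the prompts dataset straight from the new settings constant.
# A private repo would additionally need `huggingface-cli login`.
from datasets import load_dataset

HF_PROMPTS_REPO = "alon-albalak/collaborative-decoding-prompts"

prompts = load_dataset(HF_PROMPTS_REPO, split="train").to_list()
print(f"Loaded {len(prompts)} prompts")
```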
src/models/data_manager.py (CHANGED)

@@ -1,22 +1,21 @@
 """Data loading and saving functionality"""
 
 import json
-import os
 import random
 import datetime
 import uuid
 from pathlib import Path
-from datasets import load_dataset
 from huggingface_hub import CommitScheduler
 
-
+from src.config.settings import HF_RESULTS_REPO, HF_PROMPTS_REPO
+from src.utils.hf_data_manager import HFDataManager
 
-JSON_DATASET_DIR = Path("results")
+JSON_DATASET_DIR = Path("testing/data/results")
 JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
 JSON_DATASET_PATH = JSON_DATASET_DIR / f"results_{uuid.uuid4()}.json"
 
 scheduler = CommitScheduler(
-    repo_id=
+    repo_id=HF_RESULTS_REPO,
     repo_type="dataset",
     folder_path=JSON_DATASET_DIR.as_posix(),
     path_in_repo="data",
@@ -30,10 +29,11 @@ class DataManager:
         self.prompts_data = []
         self.results = None
 
-    def load_prompts_data(self
-        """Load prompts data
-
-
+    def load_prompts_data(self):
+        """Load prompts data"""
+        self.prompts_data = self.load_from_hf(HF_PROMPTS_REPO)
+        if not self.prompts_data:
+            raise RuntimeError("No prompts data loaded from Hugging Face.")
 
     def get_random_prompt(self):
         """Get a random prompt from loaded data"""
@@ -44,7 +44,7 @@ class DataManager:
     def get_results(self):
         """Get all results data, loading if not already loaded."""
         if self.results is None:
-            self.results = self.
+            self.results = self.load_from_hf(HF_RESULTS_REPO)
         return self.results
 
     def add_results(self, new_results):
@@ -53,14 +53,9 @@ class DataManager:
             raise RuntimeError("Results not loaded. Call get_results() first.")
         self.results.extend(new_results)
 
-    def 
-        """Load 
-
-            dataset = load_dataset(hf_repo, split="train")
-            return dataset.to_list()
-        except Exception as e:
-            print(f"Error loading dataset from Hugging Face: {e}")
-            return []
+    def load_from_hf(self, hf_repo):
+        """Load data from Hugging Face dataset repository."""
+        return HFDataManager.load_from_hf(hf_repo)
 
     def save_interaction_to_hf(self, prompt_data, user_continuation, generated_response,
                                cosine_distance, session_id, num_user_tokens):
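A hypothetical usage sketch of the refactored DataManager, built only from the methods visible in this diff; the no-argument constructor is an assumption:

```python
# Hypothetical usage sketch (method names come from the diff above).
from src.models.data_manager import DataManager

dm = DataManager()
dm.load_prompts_data()            # pulls HF_PROMPTS_REPO, raises RuntimeError if empty
prompt = dm.get_random_prompt()   # random prompt from the loaded data
results = dm.get_results()        # lazily loads HF_RESULTS_REPO on first call
```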
src/models/llm_manager.py (CHANGED)

@@ -2,7 +2,8 @@
 
 import os
 import torch
-from 
+from src.utils.model_loader import ModelLoader
+from src.utils.prompt_formatter import PromptFormatter
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
@@ -12,28 +13,12 @@ class LLMManager:
     def __init__(self):
         self.model = None
         self.tokenizer = None
-
-        if torch.cuda.is_available():
-            device = "cuda"
-            dtype = torch.float16
-        elif torch.backends.mps.is_available():
-            device = "mps"
-            dtype = torch.float16
-        else:
-            device = "cpu"
-            dtype = torch.float32
-
-        self.device = device
-        self.dtype = dtype
+        self.device, self.dtype = ModelLoader.get_device_and_dtype()
 
     def load_models(self, model_name="meta-llama/Llama-3.2-1B-Instruct"):
         """Load the LLM model and tokenizer"""
-        self.
-        self.model
-        self.model = self.model.to(self.device)
-
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model_name = model_name
+        self.model, self.tokenizer, self.device, self.dtype = ModelLoader.load_model_and_tokenizer(model_name)
 
     def validate_user_input(self, user_input, max_tokens=5):
         """Validate that user input is within token limits"""
@@ -63,13 +48,22 @@ class LLMManager:
             token_texts.append(token_text)
         return tokens, token_texts
 
+    def extract_assistant_response(self, full_response: str) -> str:
+        """Extract the assistant's response from the full generated text"""
+        return PromptFormatter.extract_assistant_response(self.model_name, full_response)
+
+    def format_prompt(self, prompt: str, partial_response: str, continuation: str) -> str:
+        """Format the full prompt for generation"""
+        return PromptFormatter.format_prompt(self.model_name, prompt, partial_response, continuation)
+
+
     def generate_response_from_user_input(self, prompt, partial_response, user_continuation):
         """Generate a full response from user's continuation"""
        if not self.model or not self.tokenizer:
             raise RuntimeError("Models not loaded. Call load_models() first.")
 
         # TODO: make this more robust for multiple models, needs to be formatted correctly
-        full_prompt = 
+        full_prompt = self.format_prompt(prompt, partial_response, user_continuation)
 
         inputs = self.tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -86,5 +80,6 @@
         )
 
         full_response = self.tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
-
+
+        assistant_part = self.extract_assistant_response(full_response)
         return assistant_part
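A hypothetical end-to-end sketch of the slimmed-down LLMManager, using the method and parameter names from this diff; note the default meta-llama checkpoint is gated on the Hub and requires accepting its license:

```python
# Hypothetical usage sketch; signatures are taken from the diff above.
from src.models.llm_manager import LLMManager

llm = LLMManager()
llm.load_models()  # delegates to ModelLoader.load_model_and_tokenizer
response = llm.generate_response_from_user_input(
    prompt="Explain photosynthesis in one sentence.",
    partial_response="Photosynthesis is ",
    user_continuation="the process",
)
print(response)
```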
src/scoring/scorer.py (CHANGED)

@@ -33,12 +33,15 @@ class Scorer:
         else:
             return "Hard", user_token_count
 
-    def calculate_rank_and_percentile(self, user_score, prompt_results, user_tokens):
+    def calculate_rank_and_percentile(self, user_score, prompt_results, user_tokens, separate_by_token_count=False):
         """Calculate user's rank and percentile among users with same prompt and token count."""
         # Filter to only same prompt and same token count
-
-
-
+        if separate_by_token_count:
+            comparable_scores = [r["cosine_distance"] for r in prompt_results
+                                 if r["num_user_tokens"] == user_tokens]
+        else:
+            comparable_scores = [r["cosine_distance"] for r in prompt_results]
+
         if not comparable_scores:
             return None, None
 
@@ -117,7 +120,7 @@ class Scorer:
         html_content += f"""
         <div class="score-metric">
             <div class="metric-value">#{rank}</div>
-            <div class="metric-label">Rank out of {same_category_attempts
+            <div class="metric-label">Rank out of {same_category_attempts}</div>
         </div>
         <div class="score-metric">
             <div class="metric-value">{percentile:.1f}%</div>
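A small sketch of the behavior change: by default the ranking pool is now every attempt on the prompt, and the old same-token-count filtering is opt-in via separate_by_token_count. The result rows mirror the fields read above; constructing Scorer with no arguments is an assumption:

```python
from src.scoring.scorer import Scorer

prompt_results = [
    {"cosine_distance": 0.12, "num_user_tokens": 3},
    {"cosine_distance": 0.25, "num_user_tokens": 5},
    {"cosine_distance": 0.40, "num_user_tokens": 5},
]
scorer = Scorer()

# New default: rank against every attempt on this prompt.
rank_all, pct_all = scorer.calculate_rank_and_percentile(0.20, prompt_results, user_tokens=5)

# Opt-in: only rank against attempts with the same token count (previous behavior).
rank_same, pct_same = scorer.calculate_rank_and_percentile(
    0.20, prompt_results, user_tokens=5, separate_by_token_count=True
)
```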
src/ui/page_handlers.py (CHANGED)

@@ -404,8 +404,7 @@ class PageHandlers:
 
         # Create enhanced score display with progress bars and metrics
         user_tokens = self.app.llm_manager.count_tokens(user_text)
-        same_category_attempts = len(
-            if r["num_user_tokens"] == user_tokens]) if prompt_results else 1
+        same_category_attempts = len(prompt_results)
 
         score_text = self.app.scorer.create_enhanced_score_display(
             cosine_distance, rank, percentile, user_tokens, same_category_attempts
src/utils/__init__.py (ADDED)

(Empty file.)
src/utils/hf_data_manager.py (ADDED)

@@ -0,0 +1,43 @@
+"""Unified HuggingFace dataset operations"""
+
+import json
+from pathlib import Path
+from datasets import Dataset, load_dataset
+
+class HFDataManager:
+    """Handles all HuggingFace dataset loading and saving operations"""
+
+    @staticmethod
+    def load_from_hf(hf_repo):
+        """Load data from HuggingFace dataset repository"""
+        try:
+            dataset = load_dataset(hf_repo, split="train")
+            return dataset.to_list()
+        except Exception:
+            # Return empty list if dataset doesn't exist or can't be loaded
+            return []
+
+    @staticmethod
+    def push_to_hf(data, repo_id, private=True):
+        """Push data to HuggingFace dataset repository"""
+        dataset = Dataset.from_list(data)
+        dataset.push_to_hub(repo_id, private=private)
+
+    @staticmethod
+    def save_to_jsonl(data, file_path):
+        """Save data to local JSONL file"""
+        file_path = Path(file_path)
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(file_path, "w") as f:
+            for item in data:
+                f.write(json.dumps(item) + "\n")
+
+    @staticmethod
+    def load_from_jsonl(file_path):
+        """Load data from local JSONL file"""
+        data = []
+        with open(file_path, "r") as f:
+            for line in f:
+                data.append(json.loads(line.strip()))
+        return data
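A minimal sketch exercising the new utility locally, assuming the package is importable as src.utils.hf_data_manager; the push_to_hf call is left commented out because it writes to the Hub and needs authentication, and the repo id it names is a placeholder:

```python
from src.utils.hf_data_manager import HFDataManager

records = [{"prompt": "Hello", "cosine_distance": 0.1}]

# Local JSONL round trip (directories are created as needed).
HFDataManager.save_to_jsonl(records, "testing/data/results/example.jsonl")
assert HFDataManager.load_from_jsonl("testing/data/results/example.jsonl") == records

# Returns [] instead of raising if the repo is missing or unreachable.
prompts = HFDataManager.load_from_hf("alon-albalak/collaborative-decoding-prompts")

# Writes to the Hub; requires auth. "your-username/your-results-repo" is a placeholder.
# HFDataManager.push_to_hf(records, "your-username/your-results-repo", private=True)
```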
src/utils/model_loader.py (ADDED)

@@ -0,0 +1,39 @@
+"""Unified model loading and device management"""
+
+import os
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+class ModelLoader:
+    """Handles device detection and model/tokenizer loading"""
+
+    @staticmethod
+    def get_device_and_dtype():
+        """Determine the best available device and dtype"""
+        if torch.cuda.is_available():
+            return "cuda", torch.float16
+        elif torch.backends.mps.is_available():
+            return "mps", torch.float16
+        else:
+            return "cpu", torch.float32
+
+    @staticmethod
+    def load_model_and_tokenizer(model_name="meta-llama/Llama-3.2-1B-Instruct"):
+        """Load model and tokenizer with optimal device/dtype settings"""
+        device, dtype = ModelLoader.get_device_and_dtype()
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            dtype=dtype,
+            low_cpu_mem_usage=True
+        )
+        model = model.to(device)
+
+        # Set pad token if needed
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        return model, tokenizer, device, dtype
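A quick usage sketch, assuming the package is importable as src.utils.model_loader. Device probing is cheap; the full load downloads weights from the Hub (the gated meta-llama default may require accepting the model license). Note that the dtype= keyword on from_pretrained is the newer spelling of torch_dtype=, so this module appears to assume a recent transformers release:

```python
from src.utils.model_loader import ModelLoader

# Cheap: just checks which accelerator is available.
device, dtype = ModelLoader.get_device_and_dtype()
print(f"Detected device={device}, dtype={dtype}")

# Heavy: downloads and loads the checkpoint, sets pad token, moves to device.
model, tokenizer, device, dtype = ModelLoader.load_model_and_tokenizer(
    "meta-llama/Llama-3.2-1B-Instruct"
)
```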
src/utils/prompt_formatter.py (ADDED)

@@ -0,0 +1,28 @@
+"""Unified prompt formatting and response extraction"""
+
+import re
+
+class PromptFormatter:
+    """Handles prompt formatting and assistant response extraction"""
+
+    @staticmethod
+    def format_prompt(model_name, prompt, partial_response, continuation):
+        """Format the full prompt for generation"""
+        if "meta-llama" in model_name:
+            return f"{prompt}\n\nAssistant: {partial_response}{continuation}"
+        else:
+            raise NotImplementedError(f"Prompt formatting not implemented for model: {model_name}")
+
+    @staticmethod
+    def extract_assistant_response(model_name, full_response):
+        """Extract the assistant's response from the full generated text"""
+        if "meta-llama" in model_name:
+            # Check if we have multiple assistant tags and get the last one
+            assistant_tags = re.findall(r"Assistant:\s*", full_response)
+            if len(assistant_tags) > 1:
+                print(f"Found multiple assistant tags ({len(assistant_tags)})\nFull response:\n{full_response}\n**")
+
+            # Only split on the first assistant tag
+            return full_response.split("Assistant:", maxsplit=1)[-1].strip()
+        else:
+            raise NotImplementedError(f"Response extraction not implemented for model: {model_name}")
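A round-trip sketch of the formatter that needs no model; the "meta-llama" check only requires that substring in the model name, and the decoded string below is a stand-in for real model output:

```python
from src.utils.prompt_formatter import PromptFormatter

model_name = "meta-llama/Llama-3.2-1B-Instruct"
full_prompt = PromptFormatter.format_prompt(
    model_name,
    prompt="Explain photosynthesis in one sentence.",
    partial_response="Photosynthesis is ",
    continuation="the process",
)
# -> "Explain photosynthesis in one sentence.\n\nAssistant: Photosynthesis is the process"

# Pretend this is the decoded generation; extraction keeps everything after "Assistant:".
decoded = full_prompt + " by which plants convert light into chemical energy."
print(PromptFormatter.extract_assistant_response(model_name, decoded))
# -> "Photosynthesis is the process by which plants convert light into chemical energy."
```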