Alon Albalak committed
Commit 57be184 · 1 Parent(s): 7e52249

major update: all data saved on HF (prompts, results), unified utilities

data/prompts.jsonl DELETED
The diff for this file is too large to render.
 
src/config/settings.py CHANGED
@@ -17,9 +17,9 @@ DEFAULT_SIMILARITY_MODEL = "all-MiniLM-L6-v2"
 # Token limits
 MAX_USER_TOKENS = 5
 
-# Data file paths
-PROMPTS_DATA_PATH = "data/prompts.jsonl"
-RESULTS_DATA_PATH = "data/results.jsonl"
+# Huggingface data repositories
+HF_PROMPTS_REPO = "alon-albalak/collaborative-decoding-prompts"
+HF_RESULTS_REPO = "alon-albalak/collaborative-decoding-results"
 
 # Server configuration
 DEFAULT_SERVER_NAME = "127.0.0.1"
src/models/data_manager.py CHANGED
@@ -1,22 +1,21 @@
 """Data loading and saving functionality"""
 
 import json
-import os
 import random
 import datetime
 import uuid
 from pathlib import Path
-from datasets import load_dataset
 from huggingface_hub import CommitScheduler
 
-HF_REPO_ID = "alon-albalak/collaborative-decoding-results"
+from src.config.settings import HF_RESULTS_REPO, HF_PROMPTS_REPO
+from src.utils.hf_data_manager import HFDataManager
 
-JSON_DATASET_DIR = Path("results")
+JSON_DATASET_DIR = Path("testing/data/results")
 JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
 JSON_DATASET_PATH = JSON_DATASET_DIR / f"results_{uuid.uuid4()}.json"
 
 scheduler = CommitScheduler(
-    repo_id=HF_REPO_ID,
+    repo_id=HF_RESULTS_REPO,
     repo_type="dataset",
     folder_path=JSON_DATASET_DIR.as_posix(),
     path_in_repo="data",
@@ -30,10 +29,11 @@ class DataManager:
         self.prompts_data = []
         self.results = None
 
-    def load_prompts_data(self, filepath="data/prompts.jsonl"):
-        """Load prompts data from JSONL file"""
-        with open(filepath, "r") as f:
-            self.prompts_data = [json.loads(line) for line in f]
+    def load_prompts_data(self):
+        """Load prompts data"""
+        self.prompts_data = self.load_from_hf(HF_PROMPTS_REPO)
+        if not self.prompts_data:
+            raise RuntimeError("No prompts data loaded from Hugging Face.")
 
     def get_random_prompt(self):
         """Get a random prompt from loaded data"""
@@ -44,7 +44,7 @@ class DataManager:
     def get_results(self):
         """Get all results data, loading if not already loaded."""
         if self.results is None:
-            self.results = self.load_results_from_hf()
+            self.results = self.load_from_hf(HF_RESULTS_REPO)
         return self.results
 
     def add_results(self, new_results):
@@ -53,14 +53,9 @@
             raise RuntimeError("Results not loaded. Call get_results() first.")
         self.results.extend(new_results)
 
-    def load_results_from_hf(self, hf_repo=HF_REPO_ID):
-        """Load results data from Hugging Face dataset repository."""
-        try:
-            dataset = load_dataset(hf_repo, split="train")
-            return dataset.to_list()
-        except Exception as e:
-            print(f"Error loading dataset from Hugging Face: {e}")
-            return []
+    def load_from_hf(self, hf_repo):
+        """Load data from Hugging Face dataset repository."""
+        return HFDataManager.load_from_hf(hf_repo)
 
     def save_interaction_to_hf(self, prompt_data, user_continuation, generated_response,
                                cosine_distance, session_id, num_user_tokens):
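
Note: the body of save_interaction_to_hf falls outside these hunks. With huggingface_hub's CommitScheduler, local writes made while holding scheduler.lock are committed to the results repo in the background; a minimal sketch of that append pattern follows (the record fields are illustrative assumptions, not the method's actual schema):

# Sketch only: append one interaction record to the local file that the
# CommitScheduler above periodically pushes to HF_RESULTS_REPO.
import datetime
import json

def append_interaction(record: dict) -> None:
    with scheduler.lock:  # avoid writing while a background commit is running
        with JSON_DATASET_PATH.open("a") as f:
            f.write(json.dumps(record) + "\n")

append_interaction({
    "timestamp": datetime.datetime.now().isoformat(),
    "cosine_distance": 0.42,  # illustrative values
    "num_user_tokens": 3,
})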
src/models/llm_manager.py CHANGED
@@ -2,7 +2,8 @@
 
 import os
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from src.utils.model_loader import ModelLoader
+from src.utils.prompt_formatter import PromptFormatter
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
@@ -12,28 +13,12 @@ class LLMManager:
     def __init__(self):
         self.model = None
         self.tokenizer = None
-
-        if torch.cuda.is_available():
-            device = "cuda"
-            dtype = torch.float16
-        elif torch.backends.mps.is_available():
-            device = "mps"
-            dtype = torch.float16
-        else:
-            device = "cpu"
-            dtype = torch.float32
-
-        self.device = device
-        self.dtype = dtype
+        self.device, self.dtype = ModelLoader.get_device_and_dtype()
 
     def load_models(self, model_name="meta-llama/Llama-3.2-1B-Instruct"):
         """Load the LLM model and tokenizer"""
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModelForCausalLM.from_pretrained(model_name, dtype=self.dtype, low_cpu_mem_usage=True)
-        self.model = self.model.to(self.device)
-
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.model_name = model_name
+        self.model, self.tokenizer, self.device, self.dtype = ModelLoader.load_model_and_tokenizer(model_name)
 
     def validate_user_input(self, user_input, max_tokens=5):
         """Validate that user input is within token limits"""
@@ -63,13 +48,22 @@
             token_texts.append(token_text)
         return tokens, token_texts
 
+    def extract_assistant_response(self, full_response: str) -> str:
+        """Extract the assistant's response from the full generated text"""
+        return PromptFormatter.extract_assistant_response(self.model_name, full_response)
+
+    def format_prompt(self, prompt: str, partial_response: str, continuation: str) -> str:
+        """Format the full prompt for generation"""
+        return PromptFormatter.format_prompt(self.model_name, prompt, partial_response, continuation)
+
+
     def generate_response_from_user_input(self, prompt, partial_response, user_continuation):
         """Generate a full response from user's continuation"""
        if not self.model or not self.tokenizer:
             raise RuntimeError("Models not loaded. Call load_models() first.")
 
         # TODO: make this more robust for multiple models, needs to be formatted correctly
-        full_prompt = f"{prompt}\n\nAssistant: {partial_response}{user_continuation}"
+        full_prompt = self.format_prompt(prompt, partial_response, user_continuation)
 
         inputs = self.tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
         inputs = {k: v.to(self.device) for k, v in inputs.items()}
@@ -86,5 +80,6 @@
         )
 
         full_response = self.tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
-        assistant_part = full_response.split("Assistant: ")[-1]
+
+        assistant_part = self.extract_assistant_response(full_response)
         return assistant_part
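
The refactored LLMManager now delegates device selection, loading, and prompt handling to the new utilities; a minimal usage sketch (the prompt strings are illustrative):

from src.models.llm_manager import LLMManager

llm = LLMManager()
llm.load_models("meta-llama/Llama-3.2-1B-Instruct")  # ModelLoader picks device/dtype

response = llm.generate_response_from_user_input(
    prompt="Explain cosine similarity in one sentence.",
    partial_response="Cosine similarity measures",
    user_continuation=" the angle between",
)
print(response)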
src/scoring/scorer.py CHANGED
@@ -33,12 +33,15 @@ class Scorer:
         else:
             return "Hard", user_token_count
 
-    def calculate_rank_and_percentile(self, user_score, prompt_results, user_tokens):
+    def calculate_rank_and_percentile(self, user_score, prompt_results, user_tokens, separate_by_token_count=False):
         """Calculate user's rank and percentile among users with same prompt and token count."""
         # Filter to only same prompt and same token count
-        comparable_scores = [r["cosine_distance"] for r in prompt_results
-                             if r["num_user_tokens"] == user_tokens]
-
+        if separate_by_token_count:
+            comparable_scores = [r["cosine_distance"] for r in prompt_results
+                                 if r["num_user_tokens"] == user_tokens]
+        else:
+            comparable_scores = [r["cosine_distance"] for r in prompt_results]
+
         if not comparable_scores:
             return None, None
 
@@ -117,7 +120,7 @@
         html_content += f"""
         <div class="score-metric">
             <div class="metric-value">#{rank}</div>
-            <div class="metric-label">Rank out of {same_category_attempts+1}</div>
+            <div class="metric-label">Rank out of {same_category_attempts}</div>
         </div>
         <div class="score-metric">
             <div class="metric-value">{percentile:.1f}%</div>
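
For reference, how the new separate_by_token_count flag changes the comparison pool; the rank/percentile arithmetic below is an illustrative assumption (that part of the method is outside these hunks), with lower cosine distance treated as better:

prompt_results = [
    {"cosine_distance": 0.10, "num_user_tokens": 5},
    {"cosine_distance": 0.30, "num_user_tokens": 3},
    {"cosine_distance": 0.25, "num_user_tokens": 5},
]
user_score, user_tokens = 0.20, 5

# Default (separate_by_token_count=False): compare against every attempt on the prompt
pool = [r["cosine_distance"] for r in prompt_results]
# separate_by_token_count=True would instead keep only same-token-budget attempts:
# pool = [r["cosine_distance"] for r in prompt_results if r["num_user_tokens"] == user_tokens]

rank = 1 + sum(s < user_score for s in pool)                       # assumed 1-based rank
percentile = 100 * sum(s >= user_score for s in pool) / len(pool)  # assumed definition
print(rank, round(percentile, 1))  # 2 66.7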
src/ui/page_handlers.py CHANGED
@@ -404,8 +404,7 @@ class PageHandlers:
 
         # Create enhanced score display with progress bars and metrics
         user_tokens = self.app.llm_manager.count_tokens(user_text)
-        same_category_attempts = len([r for r in prompt_results
-                                      if r["num_user_tokens"] == user_tokens]) if prompt_results else 1
+        same_category_attempts = len(prompt_results)
 
         score_text = self.app.scorer.create_enhanced_score_display(
            cosine_distance, rank, percentile, user_tokens, same_category_attempts
src/utils/__init__.py ADDED
(empty file)
src/utils/hf_data_manager.py ADDED
@@ -0,0 +1,43 @@
+"""Unified HuggingFace dataset operations"""
+
+import json
+from pathlib import Path
+from datasets import Dataset, load_dataset
+
+class HFDataManager:
+    """Handles all HuggingFace dataset loading and saving operations"""
+
+    @staticmethod
+    def load_from_hf(hf_repo):
+        """Load data from HuggingFace dataset repository"""
+        try:
+            dataset = load_dataset(hf_repo, split="train")
+            return dataset.to_list()
+        except Exception:
+            # Return empty list if dataset doesn't exist or can't be loaded
+            return []
+
+    @staticmethod
+    def push_to_hf(data, repo_id, private=True):
+        """Push data to HuggingFace dataset repository"""
+        dataset = Dataset.from_list(data)
+        dataset.push_to_hub(repo_id, private=private)
+
+    @staticmethod
+    def save_to_jsonl(data, file_path):
+        """Save data to local JSONL file"""
+        file_path = Path(file_path)
+        file_path.parent.mkdir(parents=True, exist_ok=True)
+
+        with open(file_path, "w") as f:
+            for item in data:
+                f.write(json.dumps(item) + "\n")
+
+    @staticmethod
+    def load_from_jsonl(file_path):
+        """Load data from local JSONL file"""
+        data = []
+        with open(file_path, "r") as f:
+            for line in f:
+                data.append(json.loads(line.strip()))
+        return data
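
A quick usage sketch of the new helper (the local cache path is illustrative):

from src.utils.hf_data_manager import HFDataManager

# Load all results from the dataset repo (returns [] if the repo is empty or missing)
results = HFDataManager.load_from_hf("alon-albalak/collaborative-decoding-results")

# Round-trip through a local JSONL cache
HFDataManager.save_to_jsonl(results, "testing/data/results_cache.jsonl")
cached = HFDataManager.load_from_jsonl("testing/data/results_cache.jsonl")

# Pushing back to the Hub requires write access to the target repo
# HFDataManager.push_to_hf(cached, "your-username/results-backup", private=True)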
src/utils/model_loader.py ADDED
@@ -0,0 +1,39 @@
+"""Unified model loading and device management"""
+
+import os
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+class ModelLoader:
+    """Handles device detection and model/tokenizer loading"""
+
+    @staticmethod
+    def get_device_and_dtype():
+        """Determine the best available device and dtype"""
+        if torch.cuda.is_available():
+            return "cuda", torch.float16
+        elif torch.backends.mps.is_available():
+            return "mps", torch.float16
+        else:
+            return "cpu", torch.float32
+
+    @staticmethod
+    def load_model_and_tokenizer(model_name="meta-llama/Llama-3.2-1B-Instruct"):
+        """Load model and tokenizer with optimal device/dtype settings"""
+        device, dtype = ModelLoader.get_device_and_dtype()
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            dtype=dtype,
+            low_cpu_mem_usage=True
+        )
+        model = model.to(device)
+
+        # Set pad token if needed
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
+
+        return model, tokenizer, device, dtype
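
Usage sketch for the loader, using the repo's default checkpoint:

from src.utils.model_loader import ModelLoader

device, dtype = ModelLoader.get_device_and_dtype()
print(f"Selected device={device}, dtype={dtype}")

model, tokenizer, device, dtype = ModelLoader.load_model_and_tokenizer(
    "meta-llama/Llama-3.2-1B-Instruct"
)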
src/utils/prompt_formatter.py ADDED
@@ -0,0 +1,28 @@
+"""Unified prompt formatting and response extraction"""
+
+import re
+
+class PromptFormatter:
+    """Handles prompt formatting and assistant response extraction"""
+
+    @staticmethod
+    def format_prompt(model_name, prompt, partial_response, continuation):
+        """Format the full prompt for generation"""
+        if "meta-llama" in model_name:
+            return f"{prompt}\n\nAssistant: {partial_response}{continuation}"
+        else:
+            raise NotImplementedError(f"Prompt formatting not implemented for model: {model_name}")
+
+    @staticmethod
+    def extract_assistant_response(model_name, full_response):
+        """Extract the assistant's response from the full generated text"""
+        if "meta-llama" in model_name:
+            # Warn if multiple assistant tags appear; everything after the first one is kept
+            assistant_tags = re.findall(r"Assistant:\s*", full_response)
+            if len(assistant_tags) > 1:
+                print(f"Found multiple assistant tags ({len(assistant_tags)})\nFull response:\n{full_response}\n**")
+
+            # Only split on the first assistant tag
+            return full_response.split("Assistant:", maxsplit=1)[-1].strip()
+        else:
+            raise NotImplementedError(f"Response extraction not implemented for model: {model_name}")
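
And a short round-trip example for the formatter (the strings are illustrative):

from src.utils.prompt_formatter import PromptFormatter

model_name = "meta-llama/Llama-3.2-1B-Instruct"

full_prompt = PromptFormatter.format_prompt(
    model_name, "Describe the ocean.", "The ocean is", " vast and"
)
# -> "Describe the ocean.\n\nAssistant: The ocean is vast and"

generated = "Describe the ocean.\n\nAssistant: The ocean is vast and deep."
print(PromptFormatter.extract_assistant_response(model_name, generated))
# -> "The ocean is vast and deep."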