Doc strings + comments

modeling_auristream.py CHANGED (+28 -14)
@@ -240,7 +240,8 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
     AuriStream speech language model.
 
     A GPT-like transformer model for cochlear token prediction with optional
-    multi-token prediction (MTP) heads for
+    multi-token prediction (MTP) heads for improved representation learning and
+    novel inference capabilities.
 
     Developed by Greta Tuckute and Klemen Kotar.
     """
@@ -266,7 +267,7 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         else:
             self.future_heads = None
 
-        #
+        # "Standard" LM output head
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
 
         # Initialize weights
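Side note for readers of this hunk: only the no-MTP branch (`self.future_heads = None`) and the standard `lm_head` are visible here; the actual head construction is not part of this diff. As a rough, hypothetical sketch of how a multi-token-prediction head stack of this shape could look (assuming each future head is a bias-free `nn.Linear` like `lm_head`, and an assumed `n_future` count):

```python
import torch
import torch.nn as nn


class MTPHeadsSketch(nn.Module):
    """Illustrative only; not the AuriStream implementation."""

    def __init__(self, n_embd: int, vocab_size: int, n_future: int = 0):
        super().__init__()
        # Standard next-token head (predicts token i+1), mirroring the line in the diff.
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        # Hypothetical MTP heads: head k predicts token i+2+k from the same hidden state.
        self.future_heads = (
            nn.ModuleList(nn.Linear(n_embd, vocab_size, bias=False) for _ in range(n_future))
            if n_future > 0
            else None
        )

    def forward(self, hidden: torch.Tensor) -> list[torch.Tensor]:
        logits = [self.lm_head(hidden)]  # element 0: next-token logits
        if self.future_heads is not None:
            logits.extend(head(hidden) for head in self.future_heads)  # i+2, i+3, ...
        return logits
```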
@@ -305,16 +306,27 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         Args:
             input_ids: Input token IDs of shape (batch_size, seq_len)
             labels: Target token IDs for computing loss
-            output_logits: Whether to return all logits (including from future heads)
-
-
-
+            output_logits: Whether to return all logits (including from future heads).
+                The first element corresponds to the standard next-token head (prediction of i+1);
+                subsequent elements correspond to future heads predicting tokens i+2, i+3, etc.
+            output_hidden_states: Whether to return all hidden states, including the input
+                embedding state and the final pre-ln_f state. Matches the HuggingFace GPT style.
+            return_dict: Whether to return a dict or a tuple. If True, return a CausalLMOutput dict;
+                otherwise return a tuple.
+            up_until_layer: If set, stop the forward pass after this transformer block
+                (inclusive) and return intermediate activations. Useful for saving compute.
             normalize_embeddings: 'l2' or 'learned' to normalize hidden states
-            seq: Legacy argument (alias for input_ids)
-            tgt: Legacy argument (alias for labels)
+            seq: Legacy argument (alias for input_ids, kept for backward compatibility)
+            tgt: Legacy argument (alias for labels, kept for backward compatibility)
 
         Returns:
-
+            If return_dict is True:
+                CausalLMOutput with fields:
+                    • loss (optional): Scalar training loss
+                    • logits: Tensor or list of tensors of prediction logits
+                    • hidden_states (optional): Tuple of hidden states
+            Otherwise:
+                Tuple of (logits or list of logits, loss).
         """
         # Handle legacy arguments
         if seq is not None:
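Taken together, the expanded docstring implies a call pattern along these lines. This is a hedged usage sketch, not code from the repository: `inspect_outputs` is a hypothetical helper, `model` is assumed to be an already-instantiated `AuriStreamModel`, and the returned object is assumed to expose `loss` / `logits` / `hidden_states` as attributes, as the Returns section describes.

```python
import torch


def inspect_outputs(model, input_ids: torch.LongTensor):
    """Hypothetical helper showing how the documented forward() arguments fit together."""
    out = model(
        input_ids=input_ids,
        output_logits=True,          # also return future-head logits when MTP heads are present
        output_hidden_states=True,   # embedding state, per-block states, final pre-ln_f state
        return_dict=True,            # CausalLMOutput-style object instead of a tuple
        normalize_embeddings="l2",   # or "learned" to route states through the learned RMSNorm
    )
    # Per the docstring, logits is a single tensor (next-token head only) or a list
    # whose element k predicts token i+1+k.
    heads = out.logits if isinstance(out.logits, (list, tuple)) else [out.logits]
    for k, head_logits in enumerate(heads):
        greedy = head_logits[:, -1].argmax(dim=-1)  # greedy pick at the last position
        print(f"head {k} (predicts token i+{k + 1}):", greedy.tolist())
    if out.hidden_states is not None:
        print("hidden states returned:", len(out.hidden_states))
    return out
```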
@@ -343,16 +355,18 @@ class AuriStreamModel(AuriStreamPreTrainedModel):
         # Normalize hidden states if requested
         hs_to_return = all_hidden_states
         if output_hidden_states and normalize_embeddings is not None:
-            if normalize_embeddings == 'l2':
-                hs_to_return = [F.normalize(h, p=2, dim=-1) for h in all_hidden_states]
-            elif normalize_embeddings == 'learned':
+            if normalize_embeddings == 'l2':  # Preserve direction, get rid of magnitude
+                hs_to_return = [F.normalize(h, p=2, dim=-1) for h in all_hidden_states]  # Dim -1 is the hidden-state dim;
+                # after normalization torch.norm(h_norm, p=2, dim=-1) will be 1, i.e. for every token the hidden-state norm is 1.
+            elif normalize_embeddings == 'learned':  # Use the learned RMSNorm (the first one, which prepares embeddings for attn),
+                # i.e. these are the representations the model actually computes on.
                 hs_to_return = []
                 L = len(self.h)
                 for i, h in enumerate(all_hidden_states):
                     if i < L:
-                        hs_to_return.append(self.h[i].norm1(h))
+                        hs_to_return.append(self.h[i].norm1(h))
                     else:
-                        hs_to_return.append(self.ln_f(h))
+                        hs_to_return.append(self.ln_f(h))  # Final layer norm (after the main blocks, before the LM head(s))
 
         # If only hidden states requested (not logits), return early
         if output_hidden_states and not output_logits and labels is None:
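The 'l2' branch above uses only `torch.nn.functional.normalize`. A quick standalone check with toy shapes (nothing model-specific) confirms the property the new comment states, namely that every per-token hidden vector ends up with unit L2 norm:

```python
import torch
import torch.nn.functional as F

h = torch.randn(2, 5, 16)               # toy (batch, seq_len, n_embd) hidden states
h_norm = F.normalize(h, p=2, dim=-1)    # normalize over the hidden-state dimension

per_token_norms = torch.norm(h_norm, p=2, dim=-1)
print(per_token_norms)                                    # ~1.0 for every token
print(torch.allclose(per_token_norms, torch.ones(2, 5)))  # True
```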