klemenk committed
Commit 79296a4 · verified · 1 Parent(s): 1d352e0

Update modeling_auristream.py

Files changed (1)
  1. modeling_auristream.py +20 -26
modeling_auristream.py CHANGED
@@ -242,26 +242,35 @@ class AuriStream(PreTrainedModel):
         return sampled
 
     @torch.no_grad()
-    def generate(self, seq: torch.Tensor, n_tokens: int = 1, temp=1.0,
-                 top_k=500, top_p=0.5, seed=None):
+    def generate(
+        self,
+        seq: torch.Tensor,
+        n_tokens: int = 1,
+        temp: float = 1.0,
+        top_k: int = None,
+        top_p: float = None,
+        seed: int = None,
+    ):
         """
         Parameters:
-            seq: torch.Tensor of shape (b, t, n_freq_bins)
-                Input cochleagram to use for generation
+            seq: torch.Tensor of shape (b, t)
+                Input cochlear tokens to condition the generation
             n_tokens: int
-                Number of time bins to predict
+                Number of future tokens (5ms time bins) to predict
             temp: float
                 Temperature for sampling logits
+            top_k: int
+                Restrict sampling to k tokens with highest probability (sample from all tokens if None)
+            top_p: float
+                Restrict sampling to most probable tokens with cumulative probability of p (sample form all tokens if None)
             seed: int
                 Random seed for sampling
 
         Returns:
-            pred_coch: torch.Tensor of shape (b, t, n_freq_bins)
-                The predicted cochleagram
-            all_logits: (optional if return_logits is True) torch.Tensor of shape (b, n_tokens, n_freq_bins)
-                The logits for each time step
-            all_embs: (optional if return_embs is not None) list of torch.Tensor
-                The embeddings for each transformer block
+            pred_coch: torch.Tensor of shape (b, t)
+                The generated cochlear tokens
+            all_logits: torch.Tensor of shape (b, n_tokens, vocab_size)
+                The logits at each time step
         """
 
         # Set seed if provided
@@ -277,14 +286,6 @@ class AuriStream(PreTrainedModel):
         # grab shape of the cochleagram
         b, t = seq.size()
 
-        # TODO: double check this works then delete the block bellow:
-        # pass the given input through the model to get the predictions and cache
-        # the k and v values for each transformer block in the process
-        # pos = torch.arange(0, t, dtype=torch.long, device=device)
-        # tok_emb = self.transformer.wte(seq) # token embeddings of shape (b, t, n_embd)
-        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        # x = self.transformer.drop(tok_emb + pos_emb)
-
         #### Embed conditioning sequence into KV cache
 
         tok_emb = self.transformer.wte(seq) # token embeddings of shape (b, t, n_embd)
@@ -322,13 +323,6 @@ class AuriStream(PreTrainedModel):
         # using the last embedding of the input
         for i in range(n_tokens-1):
 
-            # TODO: double check this works then delete the block bellow:
-            # # Get the emb and pos embedding of just the last token
-            # pos = torch.arange(t+i, t+i+1, dtype=torch.long, device=device) # shape (t)
-            # tok_emb = self.transformer.wte(predictions[-1]) # token embeddings of shape (b, t, n_embd)
-            # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-            # x = self.transformer.drop(tok_emb + pos_emb)
-
             # Get the emb and pos embedding of just the last token
             tok_emb = self.transformer.wte(predictions[-1]) # token embeddings of shape (b, t, n_embd)
             # if wpe exists in self.transformer apply leanred positional embedding
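
Note: the sketch below shows how the updated generate() signature could be called, assuming the method returns the (pred_coch, all_logits) pair listed in the new docstring. The checkpoint id, the AutoModel/trust_remote_code loading path, the vocab_size attribute, and the way the conditioning cochlear tokens are produced are assumptions for illustration; only the argument names, defaults, and documented return shapes come from the diff above.

# Usage sketch (assumptions marked inline); not an official example from the repo.
import torch
from transformers import AutoModel

# Assumption: the repo's config maps the AuriStream class for AutoModel loading.
model = AutoModel.from_pretrained("user/AuriStream", trust_remote_code=True)  # hypothetical id
model.eval()

# Assumption: seq holds (b, t) integer cochlear tokens from some upstream audio
# tokenizer; random ids stand in for them here.
vocab_size = getattr(model.config, "vocab_size", 1024)  # attribute name is an assumption
seq = torch.randint(0, vocab_size, (1, 200))

# Arguments and defaults follow the new signature in the diff:
# top_k=None / top_p=None now mean "sample from the full vocabulary".
pred_coch, all_logits = model.generate(
    seq,
    n_tokens=100,   # predict 100 future 5 ms time bins
    temp=1.0,
    top_k=None,
    top_p=0.9,
    seed=0,
)
print(pred_coch.shape)   # (b, t) generated cochlear tokens, per the docstring
print(all_logits.shape)  # (b, n_tokens, vocab_size)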
 
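The new docstring documents top_k and top_p as optional filters, with None disabling them. For reference, this is the standard way such filters are typically applied to the logits before multinomial sampling; it is a generic sketch of the technique the parameters refer to, not the model's own sampling helper.

# Generic temperature / top-k / top-p filtering sketch (None disables a filter).
import torch
import torch.nn.functional as F

def sample_next_token(logits, temp=1.0, top_k=None, top_p=None, generator=None):
    # logits: (b, vocab_size) for the last time step
    logits = logits / temp

    if top_k is not None:
        # keep only the k highest-scoring tokens
        kth = torch.topk(logits, top_k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))

    if top_p is not None:
        # keep the smallest set of tokens whose cumulative probability reaches p
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # shift so the boundary token is kept
        remove[..., 0] = False                      # always keep the most probable token
        remove = remove.scatter(-1, sorted_idx, remove)
        logits = logits.masked_fill(remove, float("-inf"))

    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, num_samples=1, generator=generator)  # (b, 1)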
 
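The code path kept by this commit prefills a KV cache from the conditioning sequence and then, inside the loop, embeds only the most recently predicted token. The toy block below illustrates that prefill-then-step pattern with a single cached attention layer; it is a sketch of the general technique, not AuriStream's actual transformer blocks or cache layout.

# Toy prefill-then-step KV cache sketch (illustration only).
import torch
import torch.nn.functional as F
from torch import nn

class CachedSelfAttention(nn.Module):
    def __init__(self, n_embd: int, n_head: int):
        super().__init__()
        self.n_head = n_head
        self.qkv = nn.Linear(n_embd, 3 * n_embd)
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x, cache=None):
        # x: (b, t, n_embd); t is the full prompt during prefill, 1 afterwards
        b, t, c = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        shape = (b, t, self.n_head, c // self.n_head)
        q, k, v = (z.view(shape).transpose(1, 2) for z in (q, k, v))  # (b, h, t, d)

        if cache is not None:
            past_k, past_v = cache
            k = torch.cat([past_k, k], dim=2)  # append new keys/values to the cache
            v = torch.cat([past_v, v], dim=2)

        # a causal mask is only needed during prefill; a single new query may
        # attend to everything already in the cache
        y = F.scaled_dot_product_attention(q, k, v, is_causal=(cache is None))
        y = y.transpose(1, 2).reshape(b, t, c)
        return self.proj(y), (k, v)

# prefill: run the whole conditioning sequence once and keep the cache
attn = CachedSelfAttention(n_embd=64, n_head=4)
prompt = torch.randn(1, 10, 64)
out, kv_cache = attn(prompt, cache=None)

# generation steps: feed only the newest embedding and reuse the cache
for _ in range(5):
    new_tok = out[:, -1:, :]  # stand-in for the last token's embedding
    out, kv_cache = attn(new_tok, cache=kv_cache)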