Will Held
committed on
Commit · b7ca827
1 Parent(s): 259eb63
Add Stream
modeling_diva.py +65 -0
modeling_diva.py CHANGED
@@ -243,3 +243,68 @@ class DiVAModel(PreTrainedModel):
         return self.tokenizer.decode(outs, skip_special_tokens=True).replace(
             "<|eot_id|>", ""
         )
+
+    def generate_stream(
+        self, audio, text_prompt, do_sample=False, logits_processor=None,
+        max_new_tokens=128, temperature=1.0,
+    ):
+        # Encode the raw audio with the Whisper encoder.
+        inputs = self.processor(audio, return_tensors="pt", sampling_rate=16_000)
+        input_features = inputs.input_features.to(self.speech_encoder_device)
+        hidden_states = self.whisper_encoder(input_features=input_features)[
+            "last_hidden_state"
+        ]
+        # Project the speech features into the decoder's embedding space.
+        virt_tokens = self.connector(
+            hidden_states,
+            output_device=self.llama_decoder.model.embed_tokens.weight.device,
+        ).squeeze()
+
+        if text_prompt is not None and text_prompt != "":
+            user_prompt_text = torch.tensor(
+                self.tokenizer(text_prompt, add_special_tokens=False)["input_ids"],
+                device=self.pre_user_suffix.device,
+            )
+            prefix = torch.cat(
+                [self.pre_user_suffix, user_prompt_text, self.prefix], axis=0
+            )
+        else:
+            prefix = self.prefix
+        prefix_embed = self.llama_decoder.model.embed_tokens(prefix)
+        suffix = self.final_header
+        suffix_embed = self.llama_decoder.model.embed_tokens(suffix)
+        inputs_embeds = torch.cat(
+            [prefix_embed, virt_tokens, suffix_embed], axis=0
+        ).unsqueeze(0)
+        outs = []
+        outputs = None
+        # 128009 is the Llama 3 <|eot_id|> token id; generation stops on it.
+        greedy = 1
+        while greedy != 128009 and len(outs) < max_new_tokens:
+            # Reuse the KV cache so each step only feeds the newest embedding.
+            past_key_values = outputs.past_key_values if outputs else None
+            outputs = self.llama_decoder(
+                inputs_embeds=inputs_embeds.to(
+                    self.llama_decoder.model.embed_tokens.weight.device
+                ).half(),
+                return_dict=True,
+                output_hidden_states=True,
+                past_key_values=past_key_values,
+            )
+            next_token_logits = outputs.logits[-1, -1, :]
+
+            if logits_processor:
+                local_outs = torch.tensor(outs) if outs != [] else suffix
+                local_outs = local_outs.reshape(1, -1)
+                next_token_logits = logits_processor(
+                    local_outs,
+                    next_token_logits.reshape(1, -1),
+                )
+                next_token_logits = next_token_logits.flatten()
+            if do_sample:
+                logits = next_token_logits / temperature
+                probs = F.softmax(logits, dim=-1)
+                greedy = torch.multinomial(probs, num_samples=1)[0]
+            else:
+                greedy = next_token_logits.argmax()
+            outs.append(greedy)
+            next_embed = self.llama_decoder.model.embed_tokens(greedy.reshape(1, 1))
+            inputs_embeds = next_embed
+            # Yield the full decoded text so far after every step.
+            yield self.tokenizer.decode(outs).replace("<|eot_id|>", "")
+        return self.tokenizer.decode(outs).replace("<|eot_id|>", "")
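
For reference, a minimal sketch of how the new streaming generator could be consumed. The checkpoint id, audio file, and loading via AutoModel with trust_remote_code are illustrative assumptions, not part of this commit; any transformers logits processor that accepts (input_ids, scores) fits the logits_processor hook.

# Illustrative sketch only: the checkpoint id, audio path, and loading via
# AutoModel(trust_remote_code=True) are assumptions, not part of this commit.
import librosa
from transformers import AutoModel, LogitsProcessorList, RepetitionPenaltyLogitsProcessor

model = AutoModel.from_pretrained(
    "WillHeld/DiVA-llama-3-v0-8b", trust_remote_code=True  # hypothetical checkpoint id
)
speech, _ = librosa.load("question.wav", sr=16_000)  # generate_stream expects 16 kHz audio

# Optional: any processor matching the (input_ids, scores) call convention.
processors = LogitsProcessorList([RepetitionPenaltyLogitsProcessor(1.2)])

# Each yield is the full decoded text so far, so print only the new suffix.
printed = 0
for partial in model.generate_stream(speech, "Answer briefly.", logits_processor=processors):
    print(partial[printed:], end="", flush=True)
    printed = len(partial)
print()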