Spaces:

sparse-generative-ai
/

open-moe-llm-leaderboard

Runtime error

App Files Files Community

AppleSwing committed on May 11, 2024

Commit

a3cdaa8

1 Parent(s): c2dbb45

fix generation issue

Browse files

Files changed (2) hide show

src/backend/hflm_with_measurement.py +37 -52
src/backend/tasks/gsm8k/gsm8k-custom.yaml +10 -7

src/backend/hflm_with_measurement.py CHANGED Viewed

@@ -294,7 +294,7 @@ class HFLMWithMeasurement(HFLM):
         return re_ord.get_original(res)
-    def _model_generate(self, context, max_length, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
@@ -302,7 +302,7 @@ class HFLMWithMeasurement(HFLM):
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
-        is_gsm8k = generation_kwargs.get("is_gsm8k", False)
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -311,12 +311,12 @@ class HFLMWithMeasurement(HFLM):
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
-        if is_gsm8k:
-            generation_kwargs.pop("is_gsm8k")
         context_length = context.shape[1]
         model_config = self.model.config
         if not self.precision:
             if model_config.quantization_config._load_in_4bit:
                 self.precision = "4bit"
@@ -325,38 +325,21 @@ class HFLMWithMeasurement(HFLM):
             else:
                 raise ValueError("Unknown precision")
-        if not is_gsm8k:
-        # build stopping criteria
-            print("Using normal stopping criteria")
-            stopping_criteria = stop_sequences_criteria(
-                self.tokenizer, stop, context.shape[1], context.shape[0]
-            )
-            stop_watch = StopWatch(self.tokenizer)
-            start = time()
-            res = self.model.generate(
-                input_ids=context,
-                max_length=max_length,
-                stopping_criteria=stopping_criteria,
-                pad_token_id=self.tokenizer.pad_token_id,
-                use_cache=True,
-                streamer=stop_watch,
-                **generation_kwargs,
-            )
-            end = time()
-        else:
-            # print("Using GSM8K")
-            stop_watch = StopWatch(self.tokenizer)
-            start = time()
-            res = self.model.generate(
-                input_ids=context,
-                max_length=max_length,
-                eos_token_id=stop,
-                pad_token_id=self.tokenizer.pad_token_id,
-                use_cache=True,
-                streamer=stop_watch,
-                **generation_kwargs,
-            )
-            end = time()
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
@@ -498,15 +481,18 @@ class HFLMWithMeasurement(HFLM):
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
-            eos = self.tok_decode(self.eot_token_id)
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
-            is_gsm8k = kwargs.get("is_gsm8k", False)
-            if is_gsm8k:
-                until = [self.tokenizer.eos_token_id, self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
@@ -532,8 +518,8 @@ class HFLMWithMeasurement(HFLM):
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
-            if "max_length" not in kwargs:
-                kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
             # perform batched generation
             cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
@@ -551,17 +537,16 @@ class HFLMWithMeasurement(HFLM):
                     cont_toks = cont_toks[context_enc.shape[1] :]
                 s = self.tok_decode(cont_toks)
                 # print(s)
-                # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
-                if not is_gsm8k:
-                    for term in until:
-                        if len(term) > 0:
-                            # ignore '' separator,
-                            # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
-                            s = s.split(term)[0]
                 res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)

         return re_ord.get_original(res)
+    def _model_generate(self, context, max_tokens, stop, **generation_kwargs):
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
+        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
+        # if is_gsm8k:
+        #     generation_kwargs.pop("is_gsm8k")
         context_length = context.shape[1]
         model_config = self.model.config
         if not self.precision:
             if model_config.quantization_config._load_in_4bit:
                 self.precision = "4bit"
             else:
                 raise ValueError("Unknown precision")
+        stopping_criteria = stop_sequences_criteria(
+            self.tokenizer, stop, context.shape[1], context.shape[0]
+        )
+        stop_watch = StopWatch(self.tokenizer)
+        start = time()
+        res = self.model.generate(
+            input_ids=context,
+            max_new_tokens=max_tokens,
+            stopping_criteria=stopping_criteria,
+            pad_token_id=self.tokenizer.pad_token_id,
+            use_cache=True,
+            streamer=stop_watch,
+            **generation_kwargs,
+        )
+        end = time()
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
                     f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             # add EOS token to stop sequences
+            eos = "<|eot_id|>"
             if not until:
                 until = [eos]
             else:
                 until.append(eos)
+            # is_gsm8k = kwargs.get("is_gsm8k", False)
+            # if is_gsm8k:
+            #     until = ["Question:", "Question", "</s>"]
+            #     eos_ids = [self.tokenizer.eos_token_id,
+            #              self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
             if "max_gen_toks" in kwargs.keys():
                 max_gen_toks = kwargs.pop("max_gen_toks")
             context_enc = context_enc.to(self.device)
             attn_masks = attn_masks.to(self.device)
+            if "max_tokens" not in kwargs:
+                kwargs["max_tokens"] = max_gen_toks
             # perform batched generation
             cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
                     cont_toks = cont_toks[context_enc.shape[1] :]
                 s = self.tok_decode(cont_toks)
+                # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
+                # if not is_gsm8k:
+                for term in until:
+                    if len(term) > 0:
+                        # ignore '' separator,
+                        # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
+                        s = s.split(term)[0]
                 # print(s)
                 res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)

src/backend/tasks/gsm8k/gsm8k-custom.yaml CHANGED Viewed

@@ -22,18 +22,21 @@ metric_list:
       - "\\.$"
 generation_kwargs:
   until:
-    - "<|eot_id|>"
   do_sample: false
   temperature: 0.0
-  is_gsm8k: true
 repeats: 1
 num_fewshot: 5
 filter_list:
-  # - name: "strict-match"
-  #   filter:
-  #     - function: "regex"
-  #       regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
-  #     - function: "take_first"
   - name: "flexible-extract"
     filter:
       - function: "regex"

       - "\\.$"
 generation_kwargs:
   until:
+    - "Question:"
+    - "Question"
+    - "</s>"
+    - "<|im_end|>"
   do_sample: false
   temperature: 0.0
+  # is_gsm8k: true
 repeats: 1
 num_fewshot: 5
 filter_list:
+  - name: "strict-match"
+    filter:
+      - function: "regex"
+        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
+      - function: "take_first"
   - name: "flexible-extract"
     filter:
       - function: "regex"