Commit 1e57a45 committed by runningSnail
1 Parent(s): 6ad73d9

registration works

Files changed:
- configuration_dolphin.py (+6, -123)
- modeling_dolphin.py (+20, -22)
configuration_dolphin.py
CHANGED

@@ -84,8 +84,8 @@ class DolphinConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size=
-        hidden_size=
+        vocab_size=152064,  # Updated to match the checkpoint
+        hidden_size=3584,  # Updated to match the checkpoint
         intermediate_size=22016,
         num_hidden_layers=32,
         num_attention_heads=32,
@@ -133,7 +133,7 @@ class DolphinConfig(PretrainedConfig):
 )
 
 encoder_config_dict = {
-    "_name_or_path": "
+    "_name_or_path": "Qwen/Qwen2-0.5B",
     "add_cross_attention": False,
     "architectures": ["Qwen2ForCausalLM"],
     "attention_dropout": 0.0,
@@ -208,123 +208,6 @@ encoder_config_dict = {
     "attn_implementation": None,
 }
 
-
-
-
-    This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a
-    Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
-    with the defaults will yield a similar configuration to that of
-    Qwen2-7B-beta [Qwen/Qwen2-7B-beta](https://huggingface.co/Qwen/Qwen2-7B-beta).
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        vocab_size (`int`, *optional*, defaults to 151936):
-            Vocabulary size of the Qwen2 model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`Qwen2Model`]
-        hidden_size (`int`, *optional*, defaults to 4096):
-            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 22016):
-            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
-            Number of hidden layers in the Transformer encoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        num_key_value_heads (`int`, *optional*, defaults to 32):
-            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-            by meanpooling all the original heads within that group. For more details checkout [this
-            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 32768):
-            The maximum sequence length that this model might ever be used with.
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-            The epsilon used by the rms normalization layers.
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if `config.is_decoder=True`.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether the model's input and output word embeddings should be tied.
-        rope_theta (`float`, *optional*, defaults to 10000.0):
-            The base period of the RoPE embeddings.
-        use_sliding_window (`bool`, *optional*, defaults to `False`):
-            Whether to use sliding window attention.
-        sliding_window (`int`, *optional*, defaults to 4096):
-            Sliding window attention (SWA) window size. If not specified, will default to `4096`.
-        max_window_layers (`int`, *optional*, defaults to 28):
-            The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-
-    ```python
-    >>> from transformers import Qwen2Model, Qwen2Config
-
-    >>> # Initializing a Qwen2 style configuration
-    >>> configuration = Qwen2Config()
-
-    >>> # Initializing a model from the Qwen2-7B style configuration
-    >>> model = Qwen2Model(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-
-    model_type = "qwen2"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=151936,
-        hidden_size=4096,
-        intermediate_size=22016,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=32,
-        hidden_act="silu",
-        max_position_embeddings=32768,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        tie_word_embeddings=False,
-        rope_theta=10000.0,
-        use_sliding_window=False,
-        sliding_window=4096,
-        max_window_layers=28,
-        attention_dropout=0.0,
-        encoder_config=None,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.use_sliding_window = use_sliding_window
-        self.sliding_window = sliding_window
-        self.max_window_layers = max_window_layers
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        self.rope_theta = rope_theta
-        self.attention_dropout = attention_dropout
-        self.encoder_config = encoder_config
-
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
+if __name__ == "__main__":
+    config = DolphinConfig(encoder_config=encoder_config_dict)
+    config.save_pretrained("dolphin-config")
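For reference, a minimal round-trip check of the new __main__ block (a sketch, not part of this commit; it assumes configuration_dolphin.py is importable locally as configuration_dolphin and reuses the "dolphin-config" directory name from the diff above):

# Sketch: verify the saved config round-trips with the defaults updated here.
from configuration_dolphin import DolphinConfig, encoder_config_dict

config = DolphinConfig(encoder_config=encoder_config_dict)
config.save_pretrained("dolphin-config")

reloaded = DolphinConfig.from_pretrained("dolphin-config")
assert reloaded.vocab_size == 152064   # new default from this commit
assert reloaded.hidden_size == 3584    # new default from this commit
assert reloaded.encoder_config["_name_or_path"] == "Qwen/Qwen2-0.5B"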
modeling_dolphin.py
CHANGED

@@ -12,7 +12,7 @@ from transformers.models.qwen2.modeling_qwen2 import (
     Qwen2PreTrainedModel, Qwen2Model, Qwen2RMSNorm
 )
 from transformers.modeling_attn_mask_utils import (
-    AttentionMaskConverter
+    AttentionMaskConverter
 )
 from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
 from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
@@ -186,7 +186,7 @@ class DolphinModel(Qwen2PreTrainedModel):
     Args:
         config: DolphinModel
     """
-    config_class = DolphinConfig
+    # config_class = DolphinConfig
 
     def __init__(self, config: DolphinConfig):
         super().__init__(config)
@@ -732,33 +732,30 @@ class DolphinForCausalLM(Qwen2PreTrainedModel):
         )
         return reordered_past
 
-
-def inference_instruct(mycontext, device = "cuda:0"):
+
+def inference_instruct(mycontext, question, device="cuda:0"):
     import time
-
+    MEMORY_SIZE = 32
+    start_time = time.time()
     generated_token_ids = []
-    prompt = " <context>
-    print("input prompt: " + prompt)
-    print("input context: " + mycontext)
+    prompt = f" <context>{question}"
     text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
     input_ids = (
-        torch.tensor(
+        torch.tensor(
+            text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
+        )
         .unsqueeze(0)
         .to(device)
     )
-    # print(input_ids)
     # to process the context
     context_tokenized = tokenizer(
         mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
         return_tensors="pt",
     )
     context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
-
-    context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
-    print("length of context: " + str(context_token_count) + " tokens")
+    context_token_count = (context_tokenized["input_ids"]).shape[1] - MEMORY_SIZE
    # We conduct a inference process
     for i in range(context_token_count):
-        print(f"\rGenerating token {i+1}/{context_token_count}", end="")
         next_token = (
             model(
                 input_ids,
@@ -772,23 +769,24 @@ def inference_instruct(mycontext, device = "cuda:0"):
                 break
         generated_token_ids.append(next_token.item())
         input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
+    result = tokenizer.decode(generated_token_ids)
+    print(f"Time taken: {time.time() - start_time}")
+    return result
 
 
 if __name__ == "__main__":
     # Register your configuration and model
     AutoConfig.register("dolphin", DolphinConfig)
     AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)
+    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
 
     # Load the tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained('NexaAIDev/Dolphin', trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda:0")
 
     # Run inference example
     mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company is founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally"
-
-
-    inference_instruct(mycontext,
-
+    question = "Who founded Nexa AI?"
+    # Pass the context and the correct device string
+    result = inference_instruct(mycontext, question, device=device_name)
+    print("Result:", result)
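A quick check of the Auto* registration the commit message refers to (again a sketch, not part of this commit; it assumes configuration_dolphin and modeling_dolphin are importable locally and that the "dolphin-config" directory from the previous sketch exists):

# Sketch: confirm the registered mapping resolves to the Dolphin classes.
from transformers import AutoConfig, AutoModelForCausalLM
from configuration_dolphin import DolphinConfig
from modeling_dolphin import DolphinForCausalLM

AutoConfig.register("dolphin", DolphinConfig)
AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)

config = AutoConfig.from_pretrained("dolphin-config")
assert isinstance(config, DolphinConfig)

# from_config builds a randomly initialized (large) model; it only exercises the
# class mapping, it does not download the released weights.
model = AutoModelForCausalLM.from_config(config)
assert isinstance(model, DolphinForCausalLM)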