automatedstockminingorg
/

expert-on-investment-valuation-mypricermodel

Text Generation

instruction-tuned

8-bit precision

Model card · Files and versions · Training metrics · Community

automatedstockminingorg committed on Nov 2, 2024

Commit

cdb39c8

·

verified ·

1 Parent(s): 0546162

Update config.json

Files changed (1) hide show

config.json +5 -35

config.json CHANGED Viewed

@@ -1,16 +1,10 @@
 {
   "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
-  "architectures": [
-    "LlamaForCausalLM"
-  ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 128000,
-  "eos_token_id": [
-    128001,
-    128008,
-    128009
-  ],
   "hidden_act": "silu",
   "hidden_size": 4096,
   "initializer_range": 0.02,
@@ -28,20 +22,11 @@
     "bnb_4bit_compute_dtype": "bfloat16",
     "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "nf4",
-    "bnb_4bit_use_double_quant": true,
-    "llm_int8_enable_fp32_cpu_offload": false,
-    "llm_int8_has_fp16_weight": false,
-    "llm_int8_skip_modules": null,
-    "llm_int8_threshold": 6.0,
-    "load_in_4bit": true,
-    "load_in_8bit": false,
-    "quant_method": "bitsandbytes"
   },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
-    "high_freq_factor": 4.0,
-    "low_freq_factor": 1.0,
     "original_max_position_embeddings": 8192,
     "rope_type": "llama3"
   },
@@ -53,31 +38,16 @@
   "vocab_size": 128256,
   "serverless": {
     "enabled": true,
-    "max_batch_size": 128,
     "max_sequence_length": 2048,
     "model_parallelism": {
       "enabled": true,
       "num_gpus": 1,
-      "num_shards": 1
     },
     "quantization": {
       "enabled": true,
       "target_dtype": "int8"
     }
   }
 }
-"serverless": {
-  "enabled": true,
-  "max_batch_size": 128,
-  "max_sequence_length": 2048,
-  "model_parallelism": {
-    "enabled": true,
-    "num_gpus": 1,
-    "num_shards": 1
-  },
-  "quantization": {
-    "enabled": true,
-    "target_dtype": "int8"
-  }
-}

 {
   "_name_or_path": "meta-llama/Llama-3.1-8B-Instruct",
+  "architectures": ["LlamaForCausalLM"],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 128000,
+  "eos_token_id": [128001, 128008, 128009],
   "hidden_act": "silu",
   "hidden_size": 4096,
   "initializer_range": 0.02,
     "bnb_4bit_compute_dtype": "bfloat16",
     "bnb_4bit_quant_storage": "uint8",
     "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true
   },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
     "original_max_position_embeddings": 8192,
     "rope_type": "llama3"
   },
   "vocab_size": 128256,
   "serverless": {
     "enabled": true,
+    "max_batch_size": 64,                // Lowered batch size for better compatibility
     "max_sequence_length": 2048,
     "model_parallelism": {
       "enabled": true,
       "num_gpus": 1,
+      "num_shards": 2                     // Adjusted for distributed inference
     },
     "quantization": {
       "enabled": true,
       "target_dtype": "int8"
     }
   }
 }