jburtoft committed on
Commit
6db13fc
·
verified ·
1 Parent(s): 45a4585

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +38 -1
README.md CHANGED
@@ -2,4 +2,41 @@
2
  license: apache-2.0
3
  ---
4
  This is a copy of Qwen3-8B compiled with the 2.25 SDK for the Neuron workshop.
5
- https://github.com/aws-neuron/neuron-workshops
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  license: apache-2.0
3
  ---
4
  This is a copy of Qwen3-8B compiled with the 2.25 SDK for the Neuron workshop.
5
+ https://github.com/aws-neuron/neuron-workshops
6
+
7
+ This checkpoint was generated with the code:
8
+ ```
9
+ bs=1
10
+ seqlength=1024
11
+
12
+ import os
13
+ from vllm import LLM, SamplingParams
14
+ os.environ['VLLM_NEURON_FRAMEWORK'] = "neuronx-distributed-inference"
15
+
16
+ path = f"/home/ubuntu/qwen3/qwen3-8B-BS{bs}-SEQ{seqlength}"
17
+ # Save the sharded weights and compiler artifacts in the same folder
18
+ os.environ['NEURON_COMPILED_ARTIFACTS'] = path
19
+ os.environ['BASE_COMPILE_WORK_DIR'] =path
20
+
21
+ llm = LLM(
22
+ model="/home/ubuntu/models/Qwen3-8B",
23
+ max_num_seqs=bs,
24
+ max_model_len=seqlength,
25
+ device="neuron",
26
+ tensor_parallel_size=2,
27
+ override_neuron_config={"save_sharded_checkpoint": True})
28
+ prompts = [
29
+ "Hello, my name is",
30
+ "The president of the United States is",
31
+ "The capital of France is",
32
+ "The future of AI is",
33
+ ]
34
+ # Note that top_k must be set lower than the global_top_k defined in
35
+ # the neuronx_distributed_inference.models.config.OnDeviceSamplingConfig
36
+ sampling_params = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)
37
+ outputs = llm.generate(prompts, sampling_params)
38
+ for output in outputs:
39
+     prompt = output.prompt
40
+     generated_text = output.outputs[0].text
41
+     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
42
+ ```