---
license: apache-2.0
---
This is a copy of Qwen3-8B compiled with the Neuron SDK 2.25 for the Neuron workshop:
https://github.com/aws-neuron/neuron-workshops

This checkpoint was generated with the following code:
```python
import os

from vllm import LLM, SamplingParams

# Batch size and sequence length; they feed max_num_seqs, max_model_len
# and the name of the output folder below.
batch_size = 1
seq_length = 1024

os.environ["VLLM_NEURON_FRAMEWORK"] = "neuronx-distributed-inference"

# Point both variables at one folder so the sharded weights and the
# compiler artifacts are saved in the same place.
artifacts_dir = f"/home/ubuntu/qwen3/qwen3-8B-BS{batch_size}-SEQ{seq_length}"
os.environ.update(
    {
        "NEURON_COMPILED_ARTIFACTS": artifacts_dir,
        "BASE_COMPILE_WORK_DIR": artifacts_dir,
    }
)

engine = LLM(
    model="/home/ubuntu/models/Qwen3-8B",
    max_num_seqs=batch_size,
    max_model_len=seq_length,
    device="neuron",
    tensor_parallel_size=2,
    override_neuron_config={"save_sharded_checkpoint": True},
)

test_prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Note: top_k must be lower than the global_top_k defined in
# neuronx_distributed_inference.models.config.OnDeviceSamplingConfig.
sampling = SamplingParams(top_k=10, temperature=0.8, top_p=0.95)

# Run one generation pass over the prompts and echo each prompt with the
# text of its first completion.
for result in engine.generate(test_prompts, sampling):
    print(f"Prompt: {result.prompt!r}, Generated text: {result.outputs[0].text!r}")
```
