vllm deployment failed

#1
by Liangmingxin - opened
(vllm) xx@DESKTOP-xx:~$ python /home/xx/vllm/vllm/entrypoints/openai/api_server.py \
> --model '/home/xx/internlm2-chat-7b-llama' \
> --tokenizer '/home/xx/internlm2-chat-7b-llama' \
> --tokenizer-mode auto \
> --trust-remote-code \
> --dtype float16 \
> --enforce-eager \
> --tensor-parallel-size 2 \
> --max-model-len 20000 \
> --chat-template 'vllm/examples/template_chatml.jinja' \
> --worker-use-ray \
> --engine-use-ray \
> --host 0.0.0.0 \
> --port 6001 \
> --disable-log-stats \
> --disable-log-requests
INFO 01-20 06:51:14 api_server.py:742] args: Namespace(host='0.0.0.0', port=6001, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], served_model_name=None, chat_template='vllm/examples/template_chatml.jinja', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, model='/home/xx/internlm2-chat-7b-llama', tokenizer='/home/xx/internlm2-chat-7b-llama', revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=True, download_dir=None, load_format='auto', dtype='float16', max_model_len=20000, worker_use_ray=True, pipeline_parallel_size=1, tensor_parallel_size=2, max_parallel_loading_workers=None, block_size=16, seed=0, swap_space=4, gpu_memory_utilization=0.9, max_num_batched_tokens=None, max_num_seqs=256, max_paddings=256, disable_log_stats=True, quantization=None, enforce_eager=True, max_context_len_to_capture=8192, engine_use_ray=True, disable_log_requests=True, max_log_len=None)
2024-01-20 06:51:17,483 INFO worker.py:1724 -- Started a local Ray instance.
Traceback (most recent call last):
  File "/home/xx/vllm/vllm/entrypoints/openai/api_server.py", line 753, in <module>
    engine_model_config = asyncio.run(engine.get_model_config())
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/asyncio/runners.py", line 190, in run
    return runner.run(main)
           ^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/asyncio/runners.py", line 118, in run
    return self._loop.run_until_complete(task)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/asyncio/base_events.py", line 653, in run_until_complete
    return future.result()
           ^^^^^^^^^^^^^^^
  File "/home/xx/vllm/vllm/engine/async_llm_engine.py", line 484, in get_model_config
    return await self.engine.get_model_config.remote()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ray.exceptions.RayActorError: The actor died because of an error raised in its creation task, ray::_AsyncLLMEngine.__init__() (pid=1065574, ip=172.20.4.244, actor_id=bb26c7f82bdf3238571ec97601000000, repr=<vllm.engine.async_llm_engine._AsyncLLMEngine object at 0x7fc5f2bc2550>)
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/concurrent/futures/_base.py", line 449, in result
    return self.__get_result()
           ^^^^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
    raise self._exception
           ^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xx/vllm/vllm/engine/llm_engine.py", line 95, in __init__
    self.tokenizer = get_tokenizer(
                     ^^^^^^^^^^^^^^
  File "/home/xx/vllm/vllm/transformers_utils/tokenizer.py", line 28, in get_tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 770, in from_pretrained
    tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 488, in get_class_from_dynamic_module
    final_module = get_cached_module_file(
                   ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 294, in get_cached_module_file
    resolved_module_file = cached_file(
                           ^^^^^^^^^^^^
  File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/utils/hub.py", line 360, in cached_file
    raise EnvironmentError(
OSError: /home/xx/internlm2-chat-7b-llama does not appear to have a file named tokenization_internlm.py. Checkout 'https://huggingface.co//home/xx/internlm2-chat-7b-llama/None' for available files.
(_AsyncLLMEngine pid=1065574) Could not locate the tokenization_internlm.py inside /home/xx/internlm2-chat-7b-llama.
(_AsyncLLMEngine pid=1065574) Exception raised in creation task: The actor died because of an error raised in its creation task, ray::_AsyncLLMEngine.__init__() (pid=1065574, ip=172.20.4.244, actor_id=bb26c7f82bdf3238571ec97601000000, repr=<vllm.engine.async_llm_engine._AsyncLLMEngine object at 0x7fc5f2bc2550>)
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/concurrent/futures/_base.py", line 449, in result
(_AsyncLLMEngine pid=1065574)     return self.__get_result()
(_AsyncLLMEngine pid=1065574)            ^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result
(_AsyncLLMEngine pid=1065574)     raise self._exception
(_AsyncLLMEngine pid=1065574)            ^^^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/vllm/vllm/engine/llm_engine.py", line 95, in __init__
(_AsyncLLMEngine pid=1065574)     self.tokenizer = get_tokenizer(
(_AsyncLLMEngine pid=1065574)                      ^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/vllm/vllm/transformers_utils/tokenizer.py", line 28, in get_tokenizer
(_AsyncLLMEngine pid=1065574)     tokenizer = AutoTokenizer.from_pretrained(
(_AsyncLLMEngine pid=1065574)                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/models/auto/tokenization_auto.py", line 770, in from_pretrained
(_AsyncLLMEngine pid=1065574)     tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
(_AsyncLLMEngine pid=1065574)                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 488, in get_class_from_dynamic_module
(_AsyncLLMEngine pid=1065574)     final_module = get_cached_module_file(
(_AsyncLLMEngine pid=1065574)                    ^^^^^^^^^^^^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/dynamic_module_utils.py", line 294, in get_cached_module_file
(_AsyncLLMEngine pid=1065574)     resolved_module_file = cached_file(
(_AsyncLLMEngine pid=1065574)                            ^^^^^^^^^^^^
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/transformers/utils/hub.py", line 360, in cached_file
(_AsyncLLMEngine pid=1065574)     raise EnvironmentError(
(_AsyncLLMEngine pid=1065574) OSError: /home/xx/internlm2-chat-7b-llama does not appear to have a file named tokenization_internlm.py. Checkout 'https://huggingface.co//home/xx/internlm2-chat-7b-llama/None' for available files.
(_AsyncLLMEngine pid=1065574) INFO 01-20 06:51:19 llm_engine.py:70] Initializing an LLM engine with config: model='/home/xx/internlm2-chat-7b-llama', tokenizer='/home/xx/internlm2-chat-7b-llama', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=20000, download_dir=None, load_format=auto, tensor_parallel_size=2, quantization=None, enforce_eager=True, seed=0)
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,081 E 1065574 1065574] logging.cc:104: Stack trace:
(_AsyncLLMEngine pid=1065574)  /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0xfe925a) [0x7fc848e0a25a] ray::operator<<()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0xfeb998) [0x7fc848e0c998] ray::TerminateHandler()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/bin/../lib/libstdc++.so.6(+0xb135a) [0x7fc847c9b35a] __cxxabiv1::__terminate()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/bin/../lib/libstdc++.so.6(+0xb13c5) [0x7fc847c9b3c5]
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x7c92d0) [0x7fc8485ea2d0] std::thread::_State_impl<>::~_State_impl()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x62e5fa) [0x7fc84844f5fa] std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x7b22d2) [0x7fc8485d32d2] std::_Sp_counted_ptr_inplace<>::_M_dispose()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x62e5fa) [0x7fc84844f5fa] std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x6e9732) [0x7fc84850a732] std::default_delete<>::operator()()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(_ZN3ray4core10CoreWorkerD1Ev+0xeb) [0x7fc84857c6db] ray::core::CoreWorker::~CoreWorker()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x62e5fa) [0x7fc84844f5fa] std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(_ZN3ray4core21CoreWorkerProcessImpl26RunWorkerTaskExecutionLoopEv+0x134) [0x7fc8485ba854] ray::core::CoreWorkerProcessImpl::RunWorkerTaskExecutionLoop()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(_ZN3ray4core17CoreWorkerProcess20RunTaskExecutionLoopEv+0x1d) [0x7fc8485ba95d] ray::core::CoreWorkerProcess::RunTaskExecutionLoop()
(_AsyncLLMEngine pid=1065574) /home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_raylet.so(+0x5ae947) [0x7fc8483cf947] __pyx_pw_3ray_7_raylet_10CoreWorker_7run_task_loop()
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine() [0x535490] method_vectorcall_NOARGS
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(PyObject_Vectorcall+0x31) [0x51bff1] PyObject_Vectorcall
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(_PyEval_EvalFrameDefault+0x755) [0x50f025] _PyEval_EvalFrameDefault
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine() [0x5c82ce] _PyEval_Vector
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(PyEval_EvalCode+0x9f) [0x5c79cf] PyEval_EvalCode
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine() [0x5e8807] run_eval_code_obj
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine() [0x5e4e40] run_mod
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine() [0x5f9132] pyrun_file
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(_PyRun_SimpleFileObject+0x19f) [0x5f871f] _PyRun_SimpleFileObject
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(_PyRun_AnyFileObject+0x43) [0x5f8473] _PyRun_AnyFileObject
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(Py_RunMain+0x2ee) [0x5f2fee] Py_RunMain
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine(Py_BytesMain+0x39) [0x5b6e19] Py_BytesMain
(_AsyncLLMEngine pid=1065574) /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7fc849f87083] __libc_start_main
(_AsyncLLMEngine pid=1065574) ray::_AsyncLLMEngine() [0x5b6c6f]
(_AsyncLLMEngine pid=1065574)
(_AsyncLLMEngine pid=1065574) *** SIGABRT received at time=1705704679 on cpu 13 ***
(_AsyncLLMEngine pid=1065574) PC: @     0x7fc849fa600b  (unknown)  raise
(_AsyncLLMEngine pid=1065574)     @     0x7fc84a2c3420   74038176  (unknown)
(_AsyncLLMEngine pid=1065574)     @     0x7fc847c9b35a         80  __cxxabiv1::__terminate()
(_AsyncLLMEngine pid=1065574)     @     0x7fc84844f5fa         32  std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574)     @     0x7fc8485d32d2         96  std::_Sp_counted_ptr_inplace<>::_M_dispose()
(_AsyncLLMEngine pid=1065574)     @     0x7fc84844f5fa         32  std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574)     @     0x7fc84850a732        144  std::default_delete<>::operator()()
(_AsyncLLMEngine pid=1065574)     @     0x7fc84857c6db        128  ray::core::CoreWorker::~CoreWorker()
(_AsyncLLMEngine pid=1065574)     @     0x7fc84844f5fa         32  std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574)     @     0x7fc8485ba854        112  ray::core::CoreWorkerProcessImpl::RunWorkerTaskExecutionLoop()
(_AsyncLLMEngine pid=1065574)     @     0x7fc8485ba95d         32  ray::core::CoreWorkerProcess::RunTaskExecutionLoop()
(_AsyncLLMEngine pid=1065574)     @     0x7fc8483cf947         32  __pyx_pw_3ray_7_raylet_10CoreWorker_7run_task_loop()
(_AsyncLLMEngine pid=1065574)     @           0x535490  (unknown)  method_vectorcall_NOARGS
(_AsyncLLMEngine pid=1065574)     @           0x875180  (unknown)  (unknown)
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,086 E 1065574 1065574] logging.cc:361: *** SIGABRT received at time=1705704679 on cpu 13 ***
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,086 E 1065574 1065574] logging.cc:361: PC: @     0x7fc849fa600b  (unknown)  raise
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc84a2c3420   74038176  (unknown)
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc847c9b35a         80  __cxxabiv1::__terminate()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc84844f5fa         32  std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc8485d32d2         96  std::_Sp_counted_ptr_inplace<>::_M_dispose()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc84844f5fa         32  std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc84850a732        144  std::default_delete<>::operator()()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc84857c6db        128  ray::core::CoreWorker::~CoreWorker()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc84844f5fa         32  std::_Sp_counted_base<>::_M_release()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc8485ba854        112  ray::core::CoreWorkerProcessImpl::RunWorkerTaskExecutionLoop()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc8485ba95d         32  ray::core::CoreWorkerProcess::RunTaskExecutionLoop()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @     0x7fc8483cf947         32  __pyx_pw_3ray_7_raylet_10CoreWorker_7run_task_loop()
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @           0x535490  (unknown)  method_vectorcall_NOARGS
(_AsyncLLMEngine pid=1065574) [2024-01-20 06:51:19,087 E 1065574 1065574] logging.cc:361:     @           0x875180  (unknown)  (unknown)
(_AsyncLLMEngine pid=1065574) Fatal Python error: Aborted
(_AsyncLLMEngine pid=1065574)
(_AsyncLLMEngine pid=1065574) Stack (most recent call first):
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_private/worker.py", line 847 in main_loop
(_AsyncLLMEngine pid=1065574)   File "/home/xx/anaconda3/envs/vllm/lib/python3.11/site-packages/ray/_private/workers/default_worker.py", line 282 in <module>
(_AsyncLLMEngine pid=1065574)
(_AsyncLLMEngine pid=1065574) Extension modules: psutil._psutil_linux, psutil._psutil_posix, msgpack._cmsgpack, google._upb._message, setproctitle, yaml._yaml, charset_normalizer.md, uvloop.loop, ray._raylet, numpy.core._multiarray_umath, numpy.core._multiarray_tests, numpy.linalg._umath_linalg, numpy.fft._pocketfft_internal, numpy.random._common, numpy.random.bit_generator, numpy.random._bounded_integers, numpy.random._mt19937, numpy.random.mtrand, numpy.random._philox, numpy.random._pcg64, numpy.random._sfc64, numpy.random._generator, torch._C, torch._C._fft, torch._C._linalg, torch._C._nested, torch._C._nn, torch._C._sparse, torch._C._special, pydantic.typing, pydantic.errors, pydantic.version, pydantic.utils, pydantic.class_validators, pydantic.config, pydantic.color, pydantic.datetime_parse, pydantic.validators, pydantic.networks, pydantic.types, pydantic.json, pydantic.error_wrappers, pydantic.fields, pydantic.parse, pydantic.schema, pydantic.main, pydantic.dataclasses, pydantic.annotated_types, pydantic.decorator, pydantic.env_settings, pydantic.tools, pydantic, sentencepiece._sentencepiece, pyarrow.lib, pyarrow._hdfsio, pyarrow._json (total: 56)

It seems there is little official interest in adapting this model to the vllm and llama frameworks. Checking the LLM leaderboards, it scores around 68-70 points. Sadly, it's not possible to continue using the model for now...

Interesting — the error seems to suggest that you need to download the missing file from the original repo: https://huggingface.co/internlm/internlm2-chat-7b/blob/main/tokenization_internlm.py

Could you give that a try?

I didn't expect any of those files to be needed after conversion. I can include them if that fixes your error.

Sign up or log in to comment