RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
I am trying to run the Mixtral 8x7B AWQ model using AutoAWQ and the Hugging Face pipeline, as described in the model card.
I am able to load the model, but during inference it throws a probability tensor error in the model's generate call.
I found some suggested fixes in this GitHub thread, https://github.com/facebookresearch/llama/issues/380 , but none of them worked.
I also tried to load the model through the vLLM library, but it failed to load with an `assert linear_method is None` error.
Any idea why this probability error occurs?
Here is the traceback for the probability error (with the HF pipeline):
RuntimeError Traceback (most recent call last)
Cell In[6], line 23
12 #tokenizer.pad_token = "[PAD]"
13 #tokenizer.padding_side = "left"
16 pipe = pipeline(
17 "text-generation",
18 model=model,
19 tokenizer=tokenizer,
20 **generation_params
21 )
---> 23 pipe_output = pipe(prompt_template)[0]['generated_text']
24 print("pipeline output: ", pipe_output)
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/pipelines/text_generation.py:219, in TextGenerationPipeline.__call__(self, text_inputs, **kwargs)
178 def __call__(self, text_inputs, **kwargs):
179 """
180 Complete the prompt(s) given as inputs.
181
(...)
217 ids of the generated text.
218 """
--> 219 return super().__call__(text_inputs, **kwargs)
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/pipelines/base.py:1162, in Pipeline.__call__(self, inputs, num_workers, batch_size, *args, **kwargs)
1154 return next(
1155 iter(
1156 self.get_iterator(
(...)
1159 )
1160 )
1161 else:
-> 1162 return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/pipelines/base.py:1169, in Pipeline.run_single(self, inputs, preprocess_params, forward_params, postprocess_params)
1167 def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
1168 model_inputs = self.preprocess(inputs, **preprocess_params)
-> 1169 model_outputs = self.forward(model_inputs, **forward_params)
1170 outputs = self.postprocess(model_outputs, **postprocess_params)
1171 return outputs
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/pipelines/base.py:1068, in Pipeline.forward(self, model_inputs, **forward_params)
1066 with inference_context():
1067 model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
-> 1068 model_outputs = self._forward(model_inputs, **forward_params)
1069 model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
1070 else:
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/pipelines/text_generation.py:295, in TextGenerationPipeline._forward(self, model_inputs, **generate_kwargs)
292 generate_kwargs["min_length"] += prefix_length
294 # BS x SL
--> 295 generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
296 out_b = generated_sequence.shape[0]
297 if self.framework == "pt":
File ~/.conda/envs/awq/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/generation/utils.py:1525, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)
1517 input_ids, model_kwargs = self._expand_inputs_for_generation(
1518 input_ids=input_ids,
1519 expand_size=generation_config.num_return_sequences,
1520 is_encoder_decoder=self.config.is_encoder_decoder,
1521 **model_kwargs,
1522 )
1524 # 13. run sample
-> 1525 return self.sample(
1526 input_ids,
1527 logits_processor=prepared_logits_processor,
1528 logits_warper=logits_warper,
1529 stopping_criteria=prepared_stopping_criteria,
1530 pad_token_id=generation_config.pad_token_id,
1531 eos_token_id=generation_config.eos_token_id,
1532 output_scores=generation_config.output_scores,
1533 return_dict_in_generate=generation_config.return_dict_in_generate,
1534 synced_gpus=synced_gpus,
1535 streamer=streamer,
1536 **model_kwargs,
1537 )
1539 elif generation_mode == GenerationMode.BEAM_SEARCH:
1540 # 11. prepare beam search scorer
1541 beam_scorer = BeamSearchScorer(
1542 batch_size=batch_size,
1543 num_beams=generation_config.num_beams,
(...)
1548 max_length=generation_config.max_length,
1549 )
File ~/.conda/envs/awq/lib/python3.11/site-packages/transformers/generation/utils.py:2664, in GenerationMixin.sample(self, input_ids, logits_processor, stopping_criteria, logits_warper, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
2657 probs = nn.functional.softmax(next_token_scores, dim=-1)
2658 #nans = torch.isnan(probs)
2659 #if nans.any():
2660 # idx = torch.argwhere(torch.sum(nans, 1))
2661 # z = torch.zeros_like(probs[idx][0])
2662 # z[0][2] = 1.
2663 #probs[idx] = z
-> 2664 next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
2666 # finished sentences should have their next token be a padding token
2667 if eos_token_id is not None:
RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
Traceback for the vLLM assertion error:
RayTaskError(AssertionError) Traceback (most recent call last)
Cell In[4], line 3
1 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
----> 3 llm = LLM(model="Mixtral-8x7B-Instruct-v0.1-AWQ",tensor_parallel_size=4,quantization="awq" , dtype="auto")
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/entrypoints/llm.py:93, in LLM.__init__(self, model, tokenizer, tokenizer_mode, trust_remote_code, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, **kwargs)
77 kwargs["disable_log_stats"] = True
78 engine_args = EngineArgs(
79 model=model,
80 tokenizer=tokenizer,
(...)
91 **kwargs,
92 )
---> 93 self.llm_engine = LLMEngine.from_engine_args(engine_args)
94 self.request_counter = Counter()
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/engine/llm_engine.py:246, in LLMEngine.from_engine_args(cls, engine_args)
243 distributed_init_method, placement_group = initialize_cluster(
244 parallel_config)
245 # Create the LLM engine.
--> 246 engine = cls(*engine_configs,
247 distributed_init_method,
248 placement_group,
249 log_stats=not engine_args.disable_log_stats)
250 return engine
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/engine/llm_engine.py:107, in LLMEngine.__init__(self, model_config, cache_config, parallel_config, scheduler_config, distributed_init_method, placement_group, log_stats)
105 # Create the parallel GPU workers.
106 if self.parallel_config.worker_use_ray:
--> 107 self._init_workers_ray(placement_group)
108 else:
109 self._init_workers(distributed_init_method)
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/engine/llm_engine.py:194, in LLMEngine._init_workers_ray(self, placement_group, **ray_remote_kwargs)
181 self._run_workers("init_worker",
182 get_all_outputs=True,
183 worker_init_fn=lambda: Worker(
(...)
188 None,
189 ))
190 self._run_workers(
191 "init_model",
192 get_all_outputs=True,
193 )
--> 194 self._run_workers(
195 "load_model",
196 get_all_outputs=True,
197 max_concurrent_workers=self.parallel_config.
198 max_parallel_loading_workers,
199 )
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/engine/llm_engine.py:750, in LLMEngine._run_workers(self, method, get_all_outputs, max_concurrent_workers, *args, **kwargs)
746 work_groups = [self.workers]
748 for workers in work_groups:
749 all_outputs.extend(
--> 750 self._run_workers_in_batch(workers, method, *args, **kwargs))
752 if get_all_outputs:
753 return all_outputs
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/engine/llm_engine.py:727, in LLMEngine._run_workers_in_batch(self, workers, method, *args, **kwargs)
725 all_outputs.append(output)
726 if self.parallel_config.worker_use_ray:
--> 727 all_outputs = ray.get(all_outputs)
728 return all_outputs
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/ray/_private/auto_init_hook.py:22, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
19 @wraps(fn)
20 def auto_init_wrapper(*args, **kwargs):
21 auto_init_ray()
---> 22 return fn(*args, **kwargs)
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
101 if func.__name__ != "init" or is_client_mode_enabled_by_default:
102 return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)
File ~/.conda/envs/mixtral/lib/python3.11/site-packages/ray/_private/worker.py:2624, in get(object_refs, timeout)
2622 worker.core_worker.dump_object_store_memory_usage()
2623 if isinstance(value, RayTaskError):
-> 2624 raise value.as_instanceof_cause()
2625 else:
2626 raise value
RayTaskError(AssertionError): ray::RayWorkerVllm.execute_method() (pid=32521, ip=172.31.9.16, actor_id=05d54b9ede8ed31dcabe3fc401000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7f43606a5450>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/centos/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/engine/ray_utils.py", line 32, in execute_method
return executor(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/centos/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/worker/worker.py", line 72, in load_model
self.model_runner.load_model()
File "/home/centos/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 36, in load_model
self.model = get_model(self.model_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/centos/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/model_executor/model_loader.py", line 117, in get_model
model = model_class(model_config.hf_config, linear_method)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/centos/.conda/envs/mixtral/lib/python3.11/site-packages/vllm/model_executor/models/mixtral.py", line 451, in __init__
assert linear_method is None
^^^^^^^^^^^^^^^^^^^^^
AssertionError
Please use this model instead (TheBloke's upload is corrupted):
https://huggingface.co/casperhansen/mixtral-instruct-awq
Thanks for the update. casperhansen's repo is working!