
Commit 1f4b09b

Add support for modelopt quantization of the Mixtral model (#15961)
Signed-off-by: Yue <yueshen@nvidia.com>
1 parent 86c3369 commit 1f4b09b

1 file changed: +7 −1

vllm/model_executor/models/mixtral_quant.py

@@ -45,7 +45,8 @@
 from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors

@@ -420,6 +421,11 @@ def load_weights(self, weights: Iterable[Tuple[str,
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
                 continue
+            if name.endswith("scale"):
+                # Remapping the name of FP8 kv-scale.
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
             for (param_name, weight_name, shard_id) in stacked_params_mapping:
                 if weight_name not in name:
                     continue
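For context, below is a minimal sketch of the kind of remapping this change relies on. It is a hypothetical simplification written for illustration, not vLLM's actual maybe_remap_kv_scale_name: the helper name remap_kv_scale_name_sketch and the two suffix substitutions are assumptions. What matters is the contract the patched loop depends on: the helper returns either a parameter name that exists in params_dict, or None, signalling the loader to skip a checkpoint scale for which the model has no counterpart.

from typing import Dict, Optional

import torch

def remap_kv_scale_name_sketch(
        name: str,
        params_dict: Dict[str, torch.nn.Parameter]) -> Optional[str]:
    """Hypothetical simplification of maybe_remap_kv_scale_name.

    FP8 checkpoints produced by quantization tools may store kv-cache
    scales under names (assumed here to end in ".k_proj.k_scale" /
    ".v_proj.v_scale") that differ from the model's registered
    parameter names. Return the remapped name, or None if the model
    has no matching parameter, so the caller can skip the weight.
    """
    # Assumed suffix mapping; the real helper covers more checkpoint layouts.
    for ckpt_suffix, model_suffix in ((".k_proj.k_scale", ".attn.k_scale"),
                                      (".v_proj.v_scale", ".attn.v_scale")):
        if name.endswith(ckpt_suffix):
            remapped = name[:-len(ckpt_suffix)] + model_suffix
            return remapped if remapped in params_dict else None
    # Not a kv-scale layout this sketch recognizes: pass through unchanged.
    return name

The None branch is why the diff adds "if name is None: continue" to the loop, rather than letting a missing parameter key raise an error later in the loader.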
