[Misc] Qwen2.5 VL support LoRA #13261

Merged: 8 commits, Feb 20, 2025
2 changes: 1 addition & 1 deletion docs/source/models/supported_models.md
@@ -854,7 +854,7 @@ See [this page](#generative-models) for more information on how to use generativ
* Qwen2.5-VL
* T + I<sup>E+</sup> + V<sup>E+</sup>
* `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.
*
* ✅︎
* ✅︎
* ✅︎
- * `UltravoxModel`
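The table change above marks LoRA support for Qwen2.5-VL. For offline inference, the adapter is then used like any other vLLM LoRA; a minimal sketch mirroring the test configuration later in this PR, where the adapter directory `./qwen25-vl-lora` is a placeholder and not something this PR ships:

```python
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest

# Qwen2.5-VL with LoRA enabled; settings mirror the test configuration below.
llm = vllm.LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    max_num_seqs=2,
    enable_lora=True,
    max_loras=2,
    max_lora_rank=16,
    trust_remote_code=True,
    mm_processor_kwargs={"min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28},
    max_model_len=4096,
)

prompt = (
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
    "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
    "What is in the image?<|im_end|>\n"
    "<|im_start|>assistant\n")

outputs = llm.generate(
    [{
        "prompt": prompt,
        "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
    }],
    vllm.SamplingParams(temperature=0, max_tokens=64),
    # "./qwen25-vl-lora" is a placeholder; point it at a real adapter directory.
    lora_request=LoRARequest("qwen25vl-lora", 1, "./qwen25-vl-lora"),
)
print(outputs[0].outputs[0].text)
```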
5 changes: 5 additions & 0 deletions tests/lora/conftest.py
@@ -237,6 +237,11 @@ def qwen2vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen2-vl-lora-pokemon")


@pytest.fixture(scope="session")
def qwen25vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")


@pytest.fixture(scope="session")
def tinyllama_lora_files():
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
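For reference, the fixture above resolves the adapter repository to a local cache directory; that path is what eventually reaches `LoRARequest` in the tests. A minimal standalone sketch of the same resolution, using the repo id from the fixture:

```python
from huggingface_hub import snapshot_download

from vllm.lora.request import LoRARequest

# Download (or reuse the cached copy of) the Qwen2.5-VL LoRA adapter used by
# the qwen25vl_lora_files fixture; returns a local directory path.
lora_path = snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")

# The path is then wrapped in a LoRARequest when calling llm.generate().
lora_request = LoRARequest("qwen25vl-pokemon", 1, lora_path)
```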
176 changes: 118 additions & 58 deletions tests/lora/test_qwen2vl.py
@@ -1,83 +1,143 @@
# SPDX-License-Identifier: Apache-2.0

from typing import List
from dataclasses import dataclass
from typing import Dict, List, Optional

import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION

import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.platforms import current_platform

MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"

PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")
@dataclass
class TestConfig:
model_path: str
lora_path: str
max_num_seqs: int = 2
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
mm_processor_kwargs: Optional[Dict[str, int]] = None

def __post_init__(self):
if self.mm_processor_kwargs is None:
self.mm_processor_kwargs = {
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
}


class Qwen2VLTester:
"""Test helper for Qwen2 VL models with LoRA"""

PROMPT_TEMPLATE = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n")

def __init__(self, config: TestConfig):
self.config = config
self.llm = self._initialize_llm()

def _initialize_llm(self) -> vllm.LLM:
"""Initialize the LLM with given configuration"""
return vllm.LLM(
model=self.config.model_path,
max_num_seqs=self.config.max_num_seqs,
enable_lora=True,
max_loras=self.config.max_loras,
max_lora_rank=self.config.max_lora_rank,
trust_remote_code=True,
mm_processor_kwargs=self.config.mm_processor_kwargs,
max_model_len=self.config.max_model_len,
)

def run_test(self,
images: List[ImageAsset],
expected_outputs: List[str],
lora_id: Optional[int] = None,
temperature: float = 0,
max_tokens: int = 5) -> List[str]:

sampling_params = vllm.SamplingParams(
temperature=temperature,
max_tokens=max_tokens,
)
inputs = [{
"prompt": self.PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in images]

lora_request = (LoRARequest(str(lora_id), lora_id,
self.config.lora_path)
if lora_id else None)
outputs = self.llm.generate(inputs,
sampling_params,
lora_request=lora_request)
generated_texts = [
output.outputs[0].text.strip() for output in outputs
]

IMAGE_ASSETS = [
# Validate outputs
for generated, expected in zip(generated_texts, expected_outputs):
assert expected.startswith(generated), (
    f"Generated text {generated} doesn't "
    f"match expected pattern {expected}")

return generated_texts


TEST_IMAGES = [
ImageAsset("stop_sign"),
ImageAsset("cherry_blossom"),
]

# After fine-tuning with LoRA, all generated content should begin with `A`.
EXPECTED_OUTPUT = [
EXPECTED_OUTPUTS = [
"A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.", # noqa: E501
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
]


def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
sampling_params = vllm.SamplingParams(
temperature=0,
max_tokens=5,
)

inputs = [{
"prompt": PROMPT_TEMPLATE,
"multi_modal_data": {
"image": asset.pil_image
},
} for asset in IMAGE_ASSETS]

outputs = llm.generate(
inputs,
sampling_params,
lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
if lora_id else None,
)
# Print the outputs.
generated_texts: List[str] = []
for output in outputs:
generated_text = output.outputs[0].text.strip()
generated_texts.append(generated_text)
print(f"Generated text: {generated_text!r}")
return generated_texts
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"


@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2-VL dependency xformers incompatible with ROCm")
def test_qwen2vl_lora(qwen2vl_lora_files):
llm = vllm.LLM(
MODEL_PATH,
max_num_seqs=2,
enable_lora=True,
max_loras=2,
max_lora_rank=16,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
max_model_len=4096,
)
output1 = do_sample(llm, qwen2vl_lora_files, lora_id=1)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output1[i])

output2 = do_sample(llm, qwen2vl_lora_files, lora_id=2)
for i in range(len(EXPECTED_OUTPUT)):
assert EXPECTED_OUTPUT[i].startswith(output2[i])
"""Test Qwen 2.0 VL model with LoRA"""
config = TestConfig(model_path=QWEN2VL_MODEL_PATH,
lora_path=qwen2vl_lora_files)
tester = Qwen2VLTester(config)

# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)


@pytest.mark.xfail(
current_platform.is_rocm(),
reason="Qwen2.5-VL dependency xformers incompatible with ROCm",
)
@pytest.mark.skipif(
Version(TRANSFORMERS_VERSION) < Version("4.49.0"),
reason="Qwen2.5-VL require transformers version no lower than 4.49.0",
)
def test_qwen25vl_lora(qwen25vl_lora_files):
"""Test Qwen 2.5 VL model with LoRA"""
config = TestConfig(model_path=QWEN25VL_MODEL_PATH,
lora_path=qwen25vl_lora_files)
tester = Qwen2VLTester(config)

# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES,
expected_outputs=EXPECTED_OUTPUTS,
lora_id=lora_id)
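To exercise just the new Qwen2.5-VL case, the module can be filtered with pytest's `-k` option; a minimal sketch, assuming it is run from the vLLM repository root on a machine with a GPU and access to the model and adapter downloads:

```python
import pytest

# Run only the Qwen2.5-VL LoRA test from this module.
pytest.main(["tests/lora/test_qwen2vl.py", "-k", "qwen25vl"])
```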
10 changes: 6 additions & 4 deletions vllm/model_executor/models/qwen2_5_vl.py
@@ -757,23 +757,25 @@ class Qwen2_5_VLForConditionalGeneration(nn.Module, SupportsMultiModal,
"up_proj",
],
}
# LoRA specific attributes, TODO: double check
# LoRA specific attributes
supported_lora_modules = [
# language model
"qkv_proj",
"o_proj",
"gate_up_proj",
"down_proj",
"gate_proj"
"up_proj",
"down_proj", # Same name with vision encoder
# vision tower
"qkv",
"gate_proj",
"up_proj",
"attn.proj", # Distinguish patch_embed.proj
"fc1",
"fc2",
# projector
"mlp.0",
"mlp.2"
]

embedding_modules = {}
embedding_padding_modules = []

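The packed names in the list above (`qkv_proj`, `gate_up_proj`) are resolved through the model's `packed_modules_mapping` from the unpacked projection names an adapter is usually trained against. A minimal PEFT config sketch for such an adapter, assuming it only targets the language-model projections; the rank and alpha values are illustrative:

```python
from peft import LoraConfig

# Illustrative adapter config whose target_modules map onto the packed vLLM
# names (q/k/v_proj -> qkv_proj, gate_proj/up_proj -> gate_up_proj).
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type="CAUSAL_LM",
)
```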