feat: add gguf support#1354
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces support for GGUF model weights and quantization formats (such as Q5_0, Q8_0, and BF16) in LightLLM, including Triton-based dequantization kernels, GGUF metadata reading, and tensor name mapping. The review feedback highlights several critical issues: top-level imports of gguf and transformers.integrations.ggml must be wrapped in try-except blocks to prevent startup crashes for users without these dependencies; redundant configuration loading in Gemma3 should be removed; and quant_shape_from_byte_shape calls must be updated to support 3D MoE tensors. Additionally, the reviewer suggests simplifying Triton kernel loading by casting to tl.int8, correcting an invalid assertion on rt.shape.ndim, aligning method signatures for weight_names, and adding safer fallbacks to weight_dir in visual models.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| def load_tokenizer_from_gguf( | ||
| gguf_path: str, | ||
| model_config: Optional[Dict[str, Any]] = None, | ||
| **kwargs, | ||
| ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: | ||
| """ Load the tokenizer from GGUF file. """ | ||
| if model_config is None: | ||
| raise ValueError("model_config is required when loading tokenizer from GGUF") |
There was a problem hiding this comment.
Add a check to ensure _has_gguf_tokenizer is True before proceeding, raising a clear ImportError if the required transformers version is not installed.
| def load_tokenizer_from_gguf( | |
| gguf_path: str, | |
| model_config: Optional[Dict[str, Any]] = None, | |
| **kwargs, | |
| ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: | |
| """ Load the tokenizer from GGUF file. """ | |
| if model_config is None: | |
| raise ValueError("model_config is required when loading tokenizer from GGUF") | |
| def load_tokenizer_from_gguf( | |
| gguf_path: str, | |
| model_config: Optional[Dict[str, Any]] = None, | |
| **kwargs, | |
| ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: | |
| """ Load the tokenizer from GGUF file. """ | |
| if not _has_gguf_tokenizer: | |
| raise ImportError( | |
| "Loading tokenizer from GGUF requires transformers >= 4.41.0 with GGUF support." | |
| ) | |
| if model_config is None: | |
| raise ValueError("model_config is required when loading tokenizer from GGUF") |
| import torch | ||
| import triton | ||
| import triton.language as tl | ||
| from gguf import GGMLQuantizationType |
There was a problem hiding this comment.
Importing gguf at the top level of dequantization.py will cause lightllm to crash on startup for users without the gguf package installed, because this file is imported on startup via lightllm/common/quantization/__init__.py. Wrap the import in a try-except ImportError block to prevent this.
| from gguf import GGMLQuantizationType | |
| try: | |
| from gguf import GGMLQuantizationType | |
| _has_gguf = True | |
| except ImportError: | |
| _has_gguf = False | |
| class GGMLQuantizationType: | |
| pass |
| import gguf | ||
| import numpy as np | ||
| import os | ||
| import torch | ||
| from functools import lru_cache | ||
| from gguf import GGMLQuantizationType, dequantize, quant_shape_to_byte_shape | ||
| from gguf.gguf_reader import GGUFReader, ReaderTensor | ||
| from transformers import AutoModelForCausalLM, AutoModelForImageTextToText | ||
| from transformers.models.auto.configuration_auto import CONFIG_MAPPING | ||
| from typing import Any, Dict, Iterable, List, Optional, Set, Tuple |
There was a problem hiding this comment.
Importing gguf at the top level of gguf_load_utils.py will cause lightllm to crash on startup for any user who does not have the gguf package installed, because gguf_load_utils.py is imported at the top level of fused_moe_weight.py and mm_weight.py. To prevent this, wrap the gguf imports in a try-except ImportError block and define fallback dummy classes/variables.
import numpy as np
import os
import torch
from functools import lru_cache
from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
from transformers.models.auto.configuration_auto import CONFIG_MAPPING
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
try:
import gguf
from gguf import GGMLQuantizationType, dequantize, quant_shape_to_byte_shape
from gguf.gguf_reader import GGUFReader, ReaderTensor
_has_gguf = True
except ImportError:
_has_gguf = False
class DummyGguf:
class GGMLQuantizationType:
pass
gguf = DummyGguf()
GGMLQuantizationType = gguf.GGMLQuantizationType
dequantize = None
quant_shape_to_byte_shape = None
GGUFReader = None
ReaderTensor = None| from gguf import GGMLQuantizationType, quant_shape_from_byte_shape | ||
| from gguf.quants import quant_shape_to_byte_shape |
There was a problem hiding this comment.
Importing gguf at the top level of gguf.py will cause lightllm to crash on startup for users without the gguf package installed. Wrap these imports in a try-except ImportError block.
| from gguf import GGMLQuantizationType, quant_shape_from_byte_shape | |
| from gguf.quants import quant_shape_to_byte_shape | |
| try: | |
| from gguf import GGMLQuantizationType, quant_shape_from_byte_shape | |
| from gguf.quants import quant_shape_to_byte_shape | |
| _has_gguf = True | |
| except ImportError: | |
| _has_gguf = False | |
| class GGMLQuantizationType: | |
| pass | |
| quant_shape_from_byte_shape = None | |
| quant_shape_to_byte_shape = None |
| from transformers.integrations.ggml import ( | ||
| GGUF_TO_FAST_CONVERTERS, | ||
| GGUF_TOKENIZER_MAPPING, | ||
| convert_gguf_tokenizer, | ||
| ) |
There was a problem hiding this comment.
Importing transformers.integrations.ggml at the top level will cause lightllm to crash on startup for any user with transformers < 4.41.0, even if they are not using GGUF models. To maintain backward compatibility, wrap these imports in a try-except ImportError block and only raise an error if a GGUF tokenizer is actually loaded.
| from transformers.integrations.ggml import ( | |
| GGUF_TO_FAST_CONVERTERS, | |
| GGUF_TOKENIZER_MAPPING, | |
| convert_gguf_tokenizer, | |
| ) | |
| try: | |
| from transformers.integrations.ggml import ( | |
| GGUF_TO_FAST_CONVERTERS, | |
| GGUF_TOKENIZER_MAPPING, | |
| convert_gguf_tokenizer, | |
| ) | |
| _has_gguf_tokenizer = True | |
| except ImportError: | |
| _has_gguf_tokenizer = False |
| qb = tl.load(w_ptr + block_base + 2 + pos_in_block, mask=mask, | ||
| other=0).to(tl.int32) | ||
| qs = tl.where(qb > 127, qb - 256, qb) | ||
| y = qs.to(tl.float32) * d |
There was a problem hiding this comment.
Instead of manually converting unsigned 8-bit integers to signed 8-bit integers using tl.where(qb > 127, qb - 256, qb), you can cast the pointer to tl.int8 and load directly. This is cleaner and more efficient.
| qb = tl.load(w_ptr + block_base + 2 + pos_in_block, mask=mask, | |
| other=0).to(tl.int32) | |
| qs = tl.where(qb > 127, qb - 256, qb) | |
| y = qs.to(tl.float32) * d | |
| w_i8 = tl.cast(w_ptr, tl.pointer_type(tl.int8)) | |
| qs = tl.load(w_i8 + block_base + 2 + pos_in_block, mask=mask, other=0).to(tl.float32) | |
| y = qs * d |
| processor_dir = kvargs.get("processor_dir") | ||
| self.processor = load_image_processor(processor_dir, Qwen2VLImageProcessor) |
There was a problem hiding this comment.
To prevent potential TypeError or crashes if processor_dir is not provided in kvargs, it is safer to fall back to weight_dir, consistent with how it is handled in other visual models (e.g., qwen3_omni_visual.py and qwen3_visual.py).
| processor_dir = kvargs.get("processor_dir") | |
| self.processor = load_image_processor(processor_dir, Qwen2VLImageProcessor) | |
| processor_dir = kvargs.get("processor_dir", kvargs.get("weight_dir")) | |
| self.processor = load_image_processor(processor_dir, Qwen2VLImageProcessor) |
| **kwargs, | ||
| ): | ||
| super().__init__() | ||
| self.processor_dir = kvargs.get("processor_dir") |
There was a problem hiding this comment.
| dtype: torch.dtype, | ||
| device_id: int, | ||
| num_experts: int = 1, | ||
| weight_names: Optional[str] = None, |
There was a problem hiding this comment.
| logical_shape: Tuple[int, ...] = None, | ||
| ) -> torch.Tensor: | ||
| """ Read a GGUF tensor to a CPU torch tensor. """ | ||
| assert rt.shape.ndim <= 2, "GGUF tensor must be 2D or less" |
There was a problem hiding this comment.
The assertion rt.shape.ndim <= 2 is incorrect because rt.shape is a 1D numpy array representing the shape of the tensor, so rt.shape.ndim is always 1. To check the number of dimensions of the tensor, you should use len(rt.shape) <= 2 or rt.shape.size <= 2.
| assert rt.shape.ndim <= 2, "GGUF tensor must be 2D or less" | |
| assert len(rt.shape) <= 2, "GGUF tensor must be 2D or less" |
No description provided.