From 94a95196748477a1f9f199f9fca5ac92c0dfe1db Mon Sep 17 00:00:00 2001
From: Limour <93720049+Limour-dev@users.noreply.github.com>
Date: Thu, 28 Mar 2024 19:27:37 +0800
Subject: [PATCH 1/4] add KV cache quantization options

https://github.com/abetlen/llama-cpp-python/discussions/1220
https://github.com/abetlen/llama-cpp-python/issues/1305
---
 llama_cpp/llama.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index fed84d579..331be92f2 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -107,6 +107,9 @@ def __init__(
         tokenizer: Optional[BaseLlamaTokenizer] = None,
         # Misc
         verbose: bool = True,
+        # KV cache quantization
+        type_k: str = 'f16',
+        type_v: str = 'f16',
         # Extra Params
         **kwargs,  # type: ignore
     ):
@@ -172,6 +175,8 @@ def __init__(
             draft_model: Optional draft model to use for speculative decoding.
             tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
             verbose: Print verbose output to stderr.
+            type_k: KV cache data type for K (default: f16)
+            type_v: KV cache data type for V (default: f16)
 
         Raises:
             ValueError: If the model path does not exist.
@@ -298,7 +303,19 @@ def __init__(
         )  # Must be set to True for speculative decoding
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-
+        # KV cache quantization
+        kv_cache_type = {
+            'f32': 0,
+            'f16': 1,
+            'q8_0': 8,
+            'q4_0': 2,
+            'q4_1': 3,
+            'iq4_nl': 20,
+            'q5_0': 6,
+            'q5_1': 7
+        }
+        self.context_params.type_k = kv_cache_type[type_k]
+        self.context_params.type_v = kv_cache_type[type_v]
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size

From 1a6a9a3e235330ebd00cc994004ed89b5596c1fb Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 31 Mar 2024 13:41:32 -0400
Subject: [PATCH 2/4] Add ggml_type

---
 llama_cpp/llama_cpp.py | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 1db47becc..accc02cb8 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
     byref = ctypes.byref  # type: ignore
 
+# from ggml.h
+# // NOTE: always add types at the end of the enum to keep backward compatibility
+# enum ggml_type {
+#     GGML_TYPE_F32 = 0,
+#     GGML_TYPE_F16 = 1,
+#     GGML_TYPE_Q4_0 = 2,
+#     GGML_TYPE_Q4_1 = 3,
+#     // GGML_TYPE_Q4_2 = 4, support has been removed
+#     // GGML_TYPE_Q4_3 = 5, support has been removed
+#     GGML_TYPE_Q5_0 = 6,
+#     GGML_TYPE_Q5_1 = 7,
+#     GGML_TYPE_Q8_0 = 8,
+#     GGML_TYPE_Q8_1 = 9,
+#     GGML_TYPE_Q2_K = 10,
+#     GGML_TYPE_Q3_K = 11,
+#     GGML_TYPE_Q4_K = 12,
+#     GGML_TYPE_Q5_K = 13,
+#     GGML_TYPE_Q6_K = 14,
+#     GGML_TYPE_Q8_K = 15,
+#     GGML_TYPE_IQ2_XXS = 16,
+#     GGML_TYPE_IQ2_XS = 17,
+#     GGML_TYPE_IQ3_XXS = 18,
+#     GGML_TYPE_IQ1_S = 19,
+#     GGML_TYPE_IQ4_NL = 20,
+#     GGML_TYPE_IQ3_S = 21,
+#     GGML_TYPE_IQ2_S = 22,
+#     GGML_TYPE_IQ4_XS = 23,
+#     GGML_TYPE_I8 = 24,
+#     GGML_TYPE_I16 = 25,
+#     GGML_TYPE_I32 = 26,
+#     GGML_TYPE_I64 = 27,
+#     GGML_TYPE_F64 = 28,
+#     GGML_TYPE_IQ1_M = 29,
+#     GGML_TYPE_COUNT,
+# };
+GGML_TYPE_F32 = 0
+GGML_TYPE_F16 = 1
+GGML_TYPE_Q4_0 = 2
+GGML_TYPE_Q4_1 = 3
+GGML_TYPE_Q5_0 = 6
+GGML_TYPE_Q5_1 = 7
+GGML_TYPE_Q8_0 = 8
+GGML_TYPE_Q8_1 = 9
+GGML_TYPE_Q2_K = 10
+GGML_TYPE_Q3_K = 11
+GGML_TYPE_Q4_K = 12
+GGML_TYPE_Q5_K = 13
+GGML_TYPE_Q6_K = 14
+GGML_TYPE_Q8_K = 15
+GGML_TYPE_IQ2_XXS = 16
+GGML_TYPE_IQ2_XS = 17
+GGML_TYPE_IQ3_XXS = 18
+GGML_TYPE_IQ1_S = 19
+GGML_TYPE_IQ4_NL = 20
+GGML_TYPE_IQ3_S = 21
+GGML_TYPE_IQ2_S = 22
+GGML_TYPE_IQ4_XS = 23
+GGML_TYPE_I8 = 24
+GGML_TYPE_I16 = 25
+GGML_TYPE_I32 = 26
+GGML_TYPE_I64 = 27
+GGML_TYPE_F64 = 28
+GGML_TYPE_IQ1_M = 29
+GGML_TYPE_COUNT = 30
 
 
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

From fcb80517c36491291c700ac0443eba61594a9a52 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 31 Mar 2024 13:42:07 -0400
Subject: [PATCH 3/4] Use ggml_type instead of string for quantization

---
 llama_cpp/llama.py | 70 ++++++++++------------------------------------
 1 file changed, 15 insertions(+), 55 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index ee07efe9f..dcc7be758 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -105,11 +105,11 @@ def __init__(
         draft_model: Optional[LlamaDraftModel] = None,
         # Tokenizer Override
         tokenizer: Optional[BaseLlamaTokenizer] = None,
+        # KV cache quantization
+        type_k: Optional[int] = None,
+        type_v: Optional[int] = None,
         # Misc
         verbose: bool = True,
-        # KV cache quantization
-        type_k: str = 'f16',
-        type_v: str = 'f16',
         # Extra Params
         **kwargs,  # type: ignore
     ):
@@ -304,18 +304,10 @@ def __init__(
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         # KV cache quantization
-        kv_cache_type = {
-            'f32': 0,
-            'f16': 1,
-            'q8_0': 8,
-            'q4_0': 2,
-            'q4_1': 3,
-            'iq4_nl': 20,
-            'q5_0': 6,
-            'q5_1': 7
-        }
-        self.context_params.type_k = kv_cache_type[type_k]
-        self.context_params.type_v = kv_cache_type[type_v]
+        if type_k is not None:
+            self.context_params.type_k = type_k
+        if type_v is not None:
+            self.context_params.type_v = type_v
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
@@ -1741,6 +1733,7 @@ def __getstate__(self):
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
+            pooling_type=self.context_params.pooling_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
             yarn_ext_factor=self.context_params.yarn_ext_factor,
@@ -1750,6 +1743,7 @@ def __getstate__(self):
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embeddings,
+            offload_kqv=self.context_params.offload_kqv,
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
@@ -1761,51 +1755,17 @@ def __getstate__(self):
             # Chat Format Params
             chat_format=self.chat_format,
             chat_handler=self.chat_handler,
+            # Speculative Decoding
+            draft_model=self.draft_model,
+            # KV cache quantization
+            type_k=self.context_params.type_k,
+            type_v=self.context_params.type_v,
             # Misc
             verbose=self.verbose,
         )
 
     def __setstate__(self, state):
-        self.__init__(
-            model_path=state["model_path"],
-            # Model Params
-            n_gpu_layers=state["n_gpu_layers"],
-            split_mode=state["split_mode"],
-            main_gpu=state["main_gpu"],
-            tensor_split=state["tensor_split"],
-            vocab_only=state["vocab_only"],
-            use_mmap=state["use_mmap"],
-            use_mlock=state["use_mlock"],
-            kv_overrides=state["kv_overrides"],
-            # Context Params
-            seed=state["seed"],
-            n_ctx=state["n_ctx"],
-            n_batch=state["n_batch"],
-            n_threads=state["n_threads"],
-            n_threads_batch=state["n_threads_batch"],
-            rope_freq_base=state["rope_freq_base"],
-            rope_freq_scale=state["rope_freq_scale"],
-            rope_scaling_type=state["rope_scaling_type"],
-            yarn_ext_factor=state["yarn_ext_factor"],
-            yarn_attn_factor=state["yarn_attn_factor"],
-            yarn_beta_fast=state["yarn_beta_fast"],
-            yarn_beta_slow=state["yarn_beta_slow"],
-            yarn_orig_ctx=state["yarn_orig_ctx"],
-            logits_all=state["logits_all"],
-            embedding=state["embedding"],
-            # Sampling Params
-            last_n_tokens_size=state["last_n_tokens_size"],
-            # LoRA Params
-            lora_base=state["lora_base"],
-            lora_path=state["lora_path"],
-            # Backend Params
-            numa=state["numa"],
-            # Chat Format Params
-            chat_format=state["chat_format"],
-            chat_handler=state["chat_handler"],
-            # Misc
-            verbose=state["verbose"],
-        )
+        self.__init__(**state)
 
     def save_state(self) -> LlamaState:
         assert self._ctx.ctx is not None

From 7828382778ac04cd3eb84c20394d6c20f2cf7b70 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 1 Apr 2024 10:19:01 -0400
Subject: [PATCH 4/4] Add server support

---
 llama_cpp/server/model.py    | 3 +++
 llama_cpp/server/settings.py | 9 +++++++++
 2 files changed, 12 insertions(+)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index dace8d547..c24fca652 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -175,6 +175,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler=chat_handler,
         # Speculative Decoding
         draft_model=draft_model,
+        # KV Cache Quantization
+        type_k=settings.type_k,
+        type_v=settings.type_v,
         # Tokenizer
         tokenizer=tokenizer,
         # Misc
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index daa913fac..9ebdd0d8c 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
         default=10,
         description="Number of tokens to predict using the draft model.",
     )
+    # KV Cache Quantization
+    type_k: Optional[int] = Field(
+        default=None,
+        description="Type of the key cache quantization.",
+    )
+    type_v: Optional[int] = Field(
+        default=None,
+        description="Type of the value cache quantization.",
+    )
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
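
Usage sketch, assuming the series above is applied (the model path, context size, and the choice of q8_0 below are illustrative placeholders, not taken from the patches): the type_k/type_v parameters added in PATCH 3/4 accept the ggml_type integers added in PATCH 2/4, so the KV cache can be quantized from Python like this:

    from llama_cpp import Llama
    from llama_cpp.llama_cpp import GGML_TYPE_Q8_0

    # Quantize both the K and V caches to q8_0; when type_k/type_v are left as
    # None, the context keeps llama.cpp's default KV cache type (f16).
    llm = Llama(
        model_path="./models/example.gguf",  # placeholder path
        n_ctx=4096,                          # illustrative context size
        type_k=GGML_TYPE_Q8_0,               # key cache type
        type_v=GGML_TYPE_Q8_0,               # value cache type
    )

The server settings added in PATCH 4/4 expose the same values as plain integers (type_k / type_v), e.g. 8 for q8_0.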