From 94a95196748477a1f9f199f9fca5ac92c0dfe1db Mon Sep 17 00:00:00 2001
From: Limour <93720049+Limour-dev@users.noreply.github.com>
Date: Thu, 28 Mar 2024 19:27:37 +0800
Subject: [PATCH 1/4] add KV cache quantization options

https://github.com/abetlen/llama-cpp-python/discussions/1220
https://github.com/abetlen/llama-cpp-python/issues/1305
---
 llama_cpp/llama.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index fed84d579..331be92f2 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -107,6 +107,9 @@ def __init__(
         tokenizer: Optional[BaseLlamaTokenizer] = None,
         # Misc
         verbose: bool = True,
+        # KV cache quantization
+        type_k: str = 'f16',
+        type_v: str = 'f16',
         # Extra Params
         **kwargs,  # type: ignore
     ):
@@ -172,6 +175,8 @@ def __init__(
             draft_model: Optional draft model to use for speculative decoding.
             tokenizer: Optional tokenizer to override the default tokenizer from llama.cpp.
             verbose: Print verbose output to stderr.
+            type_k: KV cache data type for K (default: f16)
+            type_v: KV cache data type for V (default: f16)
 
         Raises:
             ValueError: If the model path does not exist.
@@ -298,7 +303,19 @@ def __init__(
         )  # Must be set to True for speculative decoding
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
-
+        # KV cache quantization
+        kv_cache_type = {
+            'f32': 0,
+            'f16': 1,
+            'q8_0': 8,
+            'q4_0': 2,
+            'q4_1': 3,
+            'iq4_nl': 20,
+            'q5_0': 6,
+            'q5_1': 7
+        }
+        self.context_params.type_k = kv_cache_type[type_k]
+        self.context_params.type_v = kv_cache_type[type_v]
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size

From 1a6a9a3e235330ebd00cc994004ed89b5596c1fb Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 31 Mar 2024 13:41:32 -0400
Subject: [PATCH 2/4] Add ggml_type

---
 llama_cpp/llama_cpp.py | 64 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py
index 1db47becc..accc02cb8 100644
--- a/llama_cpp/llama_cpp.py
+++ b/llama_cpp/llama_cpp.py
@@ -141,6 +141,70 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa
     byref = ctypes.byref  # type: ignore
 
+# from ggml.h
+# // NOTE: always add types at the end of the enum to keep backward compatibility
+# enum ggml_type {
+#     GGML_TYPE_F32 = 0,
+#     GGML_TYPE_F16 = 1,
+#     GGML_TYPE_Q4_0 = 2,
+#     GGML_TYPE_Q4_1 = 3,
+#     // GGML_TYPE_Q4_2 = 4, support has been removed
+#     // GGML_TYPE_Q4_3 = 5, support has been removed
+#     GGML_TYPE_Q5_0 = 6,
+#     GGML_TYPE_Q5_1 = 7,
+#     GGML_TYPE_Q8_0 = 8,
+#     GGML_TYPE_Q8_1 = 9,
+#     GGML_TYPE_Q2_K = 10,
+#     GGML_TYPE_Q3_K = 11,
+#     GGML_TYPE_Q4_K = 12,
+#     GGML_TYPE_Q5_K = 13,
+#     GGML_TYPE_Q6_K = 14,
+#     GGML_TYPE_Q8_K = 15,
+#     GGML_TYPE_IQ2_XXS = 16,
+#     GGML_TYPE_IQ2_XS = 17,
+#     GGML_TYPE_IQ3_XXS = 18,
+#     GGML_TYPE_IQ1_S = 19,
+#     GGML_TYPE_IQ4_NL = 20,
+#     GGML_TYPE_IQ3_S = 21,
+#     GGML_TYPE_IQ2_S = 22,
+#     GGML_TYPE_IQ4_XS = 23,
+#     GGML_TYPE_I8 = 24,
+#     GGML_TYPE_I16 = 25,
+#     GGML_TYPE_I32 = 26,
+#     GGML_TYPE_I64 = 27,
+#     GGML_TYPE_F64 = 28,
+#     GGML_TYPE_IQ1_M = 29,
+#     GGML_TYPE_COUNT,
+# };
+GGML_TYPE_F32 = 0
+GGML_TYPE_F16 = 1
+GGML_TYPE_Q4_0 = 2
+GGML_TYPE_Q4_1 = 3
+GGML_TYPE_Q5_0 = 6
+GGML_TYPE_Q5_1 = 7
+GGML_TYPE_Q8_0 = 8
+GGML_TYPE_Q8_1 = 9
+GGML_TYPE_Q2_K = 10
+GGML_TYPE_Q3_K = 11
+GGML_TYPE_Q4_K = 12
+GGML_TYPE_Q5_K = 13
+GGML_TYPE_Q6_K = 14
+GGML_TYPE_Q8_K = 15
+GGML_TYPE_IQ2_XXS = 16
+GGML_TYPE_IQ2_XS = 17
+GGML_TYPE_IQ3_XXS = 18
+GGML_TYPE_IQ1_S = 19
+GGML_TYPE_IQ4_NL = 20
+GGML_TYPE_IQ3_S = 21
+GGML_TYPE_IQ2_S = 22
+GGML_TYPE_IQ4_XS = 23
+GGML_TYPE_I8 = 24
+GGML_TYPE_I16 = 25
+GGML_TYPE_I32 = 26
+GGML_TYPE_I64 = 27
+GGML_TYPE_F64 = 28
+GGML_TYPE_IQ1_M = 29
+GGML_TYPE_COUNT = 30
 
 
 # from ggml-backend.h
 # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

From fcb80517c36491291c700ac0443eba61594a9a52 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Sun, 31 Mar 2024 13:42:07 -0400
Subject: [PATCH 3/4] Use ggml_type instead of string for quantization

---
 llama_cpp/llama.py | 70 ++++++++++------------------------------------
 1 file changed, 15 insertions(+), 55 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index ee07efe9f..dcc7be758 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -105,11 +105,11 @@ def __init__(
         draft_model: Optional[LlamaDraftModel] = None,
         # Tokenizer Override
         tokenizer: Optional[BaseLlamaTokenizer] = None,
+        # KV cache quantization
+        type_k: Optional[int] = None,
+        type_v: Optional[int] = None,
         # Misc
         verbose: bool = True,
-        # KV cache quantization
-        type_k: str = 'f16',
-        type_v: str = 'f16',
         # Extra Params
         **kwargs,  # type: ignore
     ):
@@ -304,18 +304,10 @@ def __init__(
         self.context_params.embeddings = embedding  # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
         # KV cache quantization
-        kv_cache_type = {
-            'f32': 0,
-            'f16': 1,
-            'q8_0': 8,
-            'q4_0': 2,
-            'q4_1': 3,
-            'iq4_nl': 20,
-            'q5_0': 6,
-            'q5_1': 7
-        }
-        self.context_params.type_k = kv_cache_type[type_k]
-        self.context_params.type_v = kv_cache_type[type_v]
+        if type_k is not None:
+            self.context_params.type_k = type_k
+        if type_v is not None:
+            self.context_params.type_v = type_v
         # Sampling Params
         self.last_n_tokens_size = last_n_tokens_size
@@ -1741,6 +1733,7 @@ def __getstate__(self):
             n_threads=self.context_params.n_threads,
             n_threads_batch=self.context_params.n_threads_batch,
             rope_scaling_type=self.context_params.rope_scaling_type,
+            pooling_type=self.context_params.pooling_type,
             rope_freq_base=self.context_params.rope_freq_base,
             rope_freq_scale=self.context_params.rope_freq_scale,
             yarn_ext_factor=self.context_params.yarn_ext_factor,
@@ -1750,6 +1743,7 @@ def __getstate__(self):
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embeddings,
+            offload_kqv=self.context_params.offload_kqv,
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params
@@ -1761,51 +1755,17 @@ def __getstate__(self):
             # Chat Format Params
             chat_format=self.chat_format,
             chat_handler=self.chat_handler,
+            # Speculative Decoding
+            draft_model=self.draft_model,
+            # KV cache quantization
+            type_k=self.context_params.type_k,
+            type_v=self.context_params.type_v,
             # Misc
             verbose=self.verbose,
         )
 
     def __setstate__(self, state):
-        self.__init__(
-            model_path=state["model_path"],
-            # Model Params
-            n_gpu_layers=state["n_gpu_layers"],
-            split_mode=state["split_mode"],
-            main_gpu=state["main_gpu"],
-            tensor_split=state["tensor_split"],
-            vocab_only=state["vocab_only"],
-            use_mmap=state["use_mmap"],
-            use_mlock=state["use_mlock"],
-            kv_overrides=state["kv_overrides"],
-            # Context Params
-            seed=state["seed"],
-            n_ctx=state["n_ctx"],
-            n_batch=state["n_batch"],
-            n_threads=state["n_threads"],
-            n_threads_batch=state["n_threads_batch"],
-            rope_freq_base=state["rope_freq_base"],
-            rope_freq_scale=state["rope_freq_scale"],
-            rope_scaling_type=state["rope_scaling_type"],
-            yarn_ext_factor=state["yarn_ext_factor"],
-            yarn_attn_factor=state["yarn_attn_factor"],
-            yarn_beta_fast=state["yarn_beta_fast"],
-            yarn_beta_slow=state["yarn_beta_slow"],
-            yarn_orig_ctx=state["yarn_orig_ctx"],
-            logits_all=state["logits_all"],
-            embedding=state["embedding"],
-            # Sampling Params
-            last_n_tokens_size=state["last_n_tokens_size"],
-            # LoRA Params
-            lora_base=state["lora_base"],
-            lora_path=state["lora_path"],
-            # Backend Params
-            numa=state["numa"],
-            # Chat Format Params
-            chat_format=state["chat_format"],
-            chat_handler=state["chat_handler"],
-            # Misc
-            verbose=state["verbose"],
-        )
+        self.__init__(**state)
 
     def save_state(self) -> LlamaState:
         assert self._ctx.ctx is not None

From 7828382778ac04cd3eb84c20394d6c20f2cf7b70 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Mon, 1 Apr 2024 10:19:01 -0400
Subject: [PATCH 4/4] Add server support

---
 llama_cpp/server/model.py    | 3 +++
 llama_cpp/server/settings.py | 9 +++++++++
 2 files changed, 12 insertions(+)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index dace8d547..c24fca652 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -175,6 +175,9 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         chat_handler=chat_handler,
         # Speculative Decoding
         draft_model=draft_model,
+        # KV Cache Quantization
+        type_k=settings.type_k,
+        type_v=settings.type_v,
         # Tokenizer
         tokenizer=tokenizer,
         # Misc
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index daa913fac..9ebdd0d8c 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -159,6 +159,15 @@ class ModelSettings(BaseSettings):
         default=10,
         description="Number of tokens to predict using the draft model.",
     )
+    # KV Cache Quantization
+    type_k: Optional[int] = Field(
+        default=None,
+        description="Type of the key cache quantization.",
+    )
+    type_v: Optional[int] = Field(
+        default=None,
+        description="Type of the value cache quantization.",
+    )
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
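
Usage sketch, assuming the series above is applied (the model path, context size, and the choice of q8_0 below are illustrative placeholders, not taken from the patches): the type_k/type_v parameters added in PATCH 3/4 accept the ggml_type integers added in PATCH 2/4, so the KV cache can be quantized from Python like this:

    from llama_cpp import Llama
    from llama_cpp.llama_cpp import GGML_TYPE_Q8_0

    # Quantize both the K and V caches to q8_0; when type_k/type_v are left as
    # None, the context keeps llama.cpp's default KV cache type (f16).
    llm = Llama(
        model_path="./models/example.gguf",  # placeholder path
        n_ctx=4096,                          # illustrative context size
        type_k=GGML_TYPE_Q8_0,               # key cache type
        type_v=GGML_TYPE_Q8_0,               # value cache type
    )

The server settings added in PATCH 4/4 expose the same values as plain integers (type_k / type_v), e.g. 8 for q8_0.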