diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b5175a7f2..43a60a6c2 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -64,7 +64,8 @@ def __init__( self.model = model self.vocab = vocab - self.sampler = None # LlamaModel doesn't use samplers, but some cleanup code expects this attribute + # LlamaModel doesn't use samplers, but some cleanup code expects this attribute + self.sampler = None def free_model(): if self.model is None: @@ -191,7 +192,8 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: buffer = (ctypes.c_char * size)() for token in tokens: n = llama_cpp.llama_token_to_piece( - self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special + self.vocab, llama_cpp.llama_token( + token), buffer, size, 0, special ) assert n <= size output += bytes(buffer[:n]) @@ -264,7 +266,8 @@ def __init__( self.ctx = ctx self.memory = llama_cpp.llama_get_memory(self.ctx) - self.sampler = None # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute + # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute + self.sampler = None def free_ctx(): if self.ctx is None: @@ -297,7 +300,8 @@ def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): assert self.memory is not None, "Memory is not initialized" - llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1) + llama_cpp.llama_memory_seq_cp( + self.memory, seq_id_src, seq_id_dst, p0, p1) def kv_cache_seq_keep(self, seq_id: int): assert self.memory is not None, "Memory is not initialized" @@ -355,7 +359,8 @@ def get_embeddings_seq(self, seq_id: int): # Sampling functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "set_rng_seed is deprecated, use LlamaSampler instead") def sample_repetition_penalties( self, @@ -366,30 +371,38 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_repetition_penalties is deprecated, use LlamaSampler instead") def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_softmax is deprecated, use LlamaSampler instead") def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_k is deprecated, use LlamaSampler instead") def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_p is deprecated, use LlamaSampler instead") def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_min_p is deprecated, use LlamaSampler instead") def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") + raise 
NotImplementedError( + "sample_typical is deprecated, use LlamaSampler instead") def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_temp is deprecated, use LlamaSampler instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_grammar is deprecated, use LlamaSampler instead") def sample_token_mirostat( self, @@ -399,7 +412,8 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat is deprecated, use LlamaSampler instead") def sample_token_mirostat_v2( self, @@ -408,17 +422,21 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_greedy is deprecated, use LlamaSampler instead") def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token is deprecated, use LlamaSampler instead") # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "grammar_accept_token is deprecated, use LlamaSampler instead") def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -443,13 +461,15 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max) + batch = llama_cpp.llama_batch_init( + self._n_tokens, self.embd, self.n_seq_max) if batch is None: raise ValueError("Failed to create llama_batch") self.batch = batch - self.sampler = None # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute + # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute + self.sampler = None def free_batch(): if self.batch is None: @@ -506,13 +526,17 @@ def __init__(self, *, n_vocab: int): ), ) self.candidates = llama_cpp.llama_token_data_array( - data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + data=self.candidates_data.ctypes.data_as( + llama_cpp.llama_token_data_p), size=self.n_vocab, sorted=False, ) - self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore - self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) - self.sampler = None # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute + self.default_candidates_data_id = np.arange( + self.n_vocab, dtype=np.intc) # type: ignore + self.default_candidates_data_p = np.zeros( + self.n_vocab, dtype=np.single) + # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute + self.sampler = None 
def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates_data.id[:] = self.default_candidates_data_id @@ -602,7 +626,8 @@ def sample( logits_array: Optional[npt.NDArray[np.single]] = None, ): # This method is deprecated in favor of using LlamaSampler directly - raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): self.prev.append(id) @@ -673,8 +698,9 @@ def add_dist(self, seed: int): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): - sampler = llama_cpp.llama_sampler_init_softmax() - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + # DEPRECATED: softmax is now automatically applied in newer llama.cpp versions + # This method is kept for backward compatibility but does nothing + pass def add_top_k(self, k: int): sampler = llama_cpp.llama_sampler_init_top_k(k) @@ -709,7 +735,8 @@ def add_top_n_sigma(self, n: float): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): - sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) + sampler = llama_cpp.llama_sampler_init_mirostat( + n_vocab, seed, tau, eta, m) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat_v2(self, seed: int, tau: float, eta: float): @@ -718,13 +745,14 @@ def add_mirostat_v2(self, seed: int, tau: float, eta: float): def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): sampler = llama_cpp.llama_sampler_init_grammar( - model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") + model.vocab, grammar._grammar.encode( + "utf-8"), grammar._root.encode("utf-8") ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar_lazy_patterns( - self, - model: LlamaModel, + self, + model: LlamaModel, grammar: LlamaGrammar, trigger_patterns: List[str], trigger_tokens: List[int] @@ -733,10 +761,11 @@ def add_grammar_lazy_patterns( pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() for i, pattern in enumerate(trigger_patterns): pattern_ptrs[i] = pattern.encode("utf-8") - + # Convert tokens to C array - token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) - + token_array = (llama_cpp.llama_token * + len(trigger_tokens))(*trigger_tokens) + sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns( model.vocab, grammar._grammar.encode("utf-8"), @@ -777,7 +806,7 @@ def add_dry( breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() for i, breaker in enumerate(seq_breakers): breaker_ptrs[i] = breaker.encode("utf-8") - + sampler = llama_cpp.llama_sampler_init_dry( model.vocab, n_ctx_train, @@ -791,8 +820,8 @@ def add_dry( llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_logit_bias( - self, - n_vocab: int, + self, + n_vocab: int, logit_bias: Dict[int, float] ): # Convert logit_bias dict to C array @@ -800,7 +829,7 @@ def add_logit_bias( for i, (token, bias) in enumerate(logit_bias.items()): bias_array[i].token = token bias_array[i].bias = bias - + sampler = llama_cpp.llama_sampler_init_logit_bias( n_vocab, len(logit_bias), @@ -838,15 +867,16 @@ def reset(self): def clone(self): # NOTE: Custom samplers cannot be cloned due to Python callback limitations if self.custom_samplers: - raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") - 
+ raise NotImplementedError( + "Cannot clone LlamaSampler that contains custom samplers") + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) # Create a new wrapper around the cloned sampler new_sampler = LlamaSampler.__new__(LlamaSampler) new_sampler.sampler = cloned_sampler new_sampler.custom_samplers = [] new_sampler._exit_stack = ExitStack() - + def free_sampler(): if new_sampler.sampler is not None: llama_cpp.llama_sampler_free(new_sampler.sampler) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..3882f4356 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -285,7 +285,8 @@ def __init__( ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset, ) - buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char)) + buffer_start = ctypes.cast( + address, ctypes.POINTER(ctypes.c_char)) ctypes.memmove( buffer_start, v_bytes, @@ -439,7 +440,8 @@ def free_lora_adapter(): ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + print(llama_cpp.llama_print_system_info().decode( + "utf-8"), file=sys.stderr) self.chat_format = chat_format self.chat_handler = chat_handler @@ -458,7 +460,8 @@ def free_lora_adapter(): self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 - self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.input_ids: npt.NDArray[np.intc] = np.ndarray( + (n_ctx,), dtype=np.intc) self.scores: npt.NDArray[np.single] = np.ndarray( (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single ) @@ -481,10 +484,12 @@ def free_lora_adapter(): bos_token_id = self.token_bos() eos_token = ( - self._model.token_get_text(eos_token_id) if eos_token_id != -1 else "" + self._model.token_get_text( + eos_token_id) if eos_token_id != -1 else "" ) bos_token = ( - self._model.token_get_text(bos_token_id) if bos_token_id != -1 else "" + self._model.token_get_text( + bos_token_id) if bos_token_id != -1 else "" ) # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates @@ -525,15 +530,18 @@ def free_lora_adapter(): if chat_format is not None: self.chat_format = chat_format if self.verbose: - print(f"Guessed chat format: {chat_format}", file=sys.stderr) + print( + f"Guessed chat format: {chat_format}", file=sys.stderr) else: if self.verbose: print( f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr, ) - print(f"Using chat eos_token: {eos_token}", file=sys.stderr) - print(f"Using chat bos_token: {bos_token}", file=sys.stderr) + print( + f"Using chat eos_token: {eos_token}", file=sys.stderr) + print( + f"Using chat bos_token: {bos_token}", file=sys.stderr) self.chat_format = "chat_template.default" @@ -639,7 +647,7 @@ def eval(self, tokens: Sequence[int]): """ self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + batch = tokens[i: min(len(tokens), i + self.n_batch)] n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( @@ -647,7 +655,7 @@ def eval(self, tokens: Sequence[int]): ) self._ctx.decode(self._batch) # Save tokens - self.input_ids[n_past : n_past + n_tokens] = batch + self.input_ids[n_past: n_past + n_tokens] = batch # Save logits if self._logits_all: rows = n_tokens @@ -655,7 +663,8 @@ def eval(self, tokens: Sequence[int]): logits = np.ctypeslib.as_array( 
self._ctx.get_logits(), shape=(rows * cols,) ) - self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits + self.scores[n_past: n_past + n_tokens, + :].reshape(-1)[::] = logits else: # rows = 1 # cols = self._n_vocab @@ -706,7 +715,8 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): ), ) for logit_processor in logits_processor: - recarray.logit[:] = logit_processor(self._input_ids, recarray.logit) + recarray.logit[:] = logit_processor( + self._input_ids, recarray.logit) sampler.add_custom(apply_func) @@ -726,7 +736,7 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): sampler.add_grammar(self._model, grammar) if temp < 0.0: - sampler.add_softmax() + # Note: softmax is now automatically applied, no need to call add_softmax() sampler.add_dist(self._seed) elif temp == 0.0: sampler.add_greedy() @@ -934,7 +944,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[: sample_idx], self._scores[sample_idx - + self.n_tokens, :] ): return tokens_or_none = yield token @@ -949,7 +960,8 @@ def generate( break if self.draft_model is not None: - self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens + self.input_ids[self.n_tokens: self.n_tokens + + len(tokens)] = tokens draft_tokens = self.draft_model( self.input_ids[: self.n_tokens + len(tokens)] ) @@ -977,7 +989,8 @@ def create_embedding( # get numeric embeddings embeds: Union[List[List[float]], List[List[List[float]]]] total_tokens: int - embeds, total_tokens = self.embed(input, return_count=True) # type: ignore + embeds, total_tokens = self.embed( + input, return_count=True) # type: ignore # convert to CreateEmbeddingResponse data: List[Embedding] = [ @@ -1041,7 +1054,8 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + llama_cpp.llama_memory_clear(mem, True) self._ctx.decode(self._batch) self._batch.reset() @@ -1051,7 +1065,7 @@ def decode_batch(seq_sizes: List[int]): for i, size in enumerate(seq_sizes): ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx) embedding: List[List[float]] = [ - ptr[pos + j * n_embd : pos + (j + 1) * n_embd] + ptr[pos + j * n_embd: pos + (j + 1) * n_embd] for j in range(size) ] if normalize: @@ -1112,7 +1126,8 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + llama_cpp.llama_memory_clear(mem, True) self.reset() if return_count: @@ -1157,13 +1172,15 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix + middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix + suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix add_space_prefix: bool = ( - self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" + self.metadata.get( + "tokenizer.ggml.add_space_prefix", "true") == "true" ) - bos_tokens: List[int] = [cls_token_id if 
cls_token_id != -1 else bos_token_id] + bos_tokens: List[int] = [ + cls_token_id if cls_token_id != -1 else bos_token_id] eos_tokens: List[int] = [ sep_token_id if sep_token_id != -1 else self.token_eos() ] @@ -1188,7 +1205,8 @@ def _create_completion( # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion - completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id] + completion_tokens: List[int] = [] if len(prompt) > 0 else [ + bos_token_id] # Add blank space to start of prompt to match OG llama tokenizer prefix_tokens: List[int] = ( [prefix_token_id] if prefix_token_id >= 0 and suffix is not None else [] @@ -1234,7 +1252,8 @@ def _create_completion( text: bytes = b"" returned_tokens: int = 0 stop = ( - stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + stop if isinstance(stop, list) else [ + stop] if isinstance(stop, str) else [] ) model_name: str = model if model is not None else self.model_path @@ -1264,7 +1283,8 @@ def logit_bias_processor( if logits_processor is None: logits_processor = _logit_bias_processor else: - logits_processor = logits_processor.extend(_logit_bias_processor) + logits_processor = logits_processor.extend( + _logit_bias_processor) if self.verbose: self._ctx.reset_timings() @@ -1307,7 +1327,8 @@ def logit_bias_processor( if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) + print("Llama._create_completion: cache hit", + file=sys.stderr) except KeyError: if self.verbose: print("Llama._create_completion: cache miss", file=sys.stderr) @@ -1338,13 +1359,15 @@ def logit_bias_processor( grammar=grammar, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" break completion_tokens.append(token) - all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + all_text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -1370,7 +1393,8 @@ def logit_bias_processor( remaining_tokens = completion_tokens[returned_tokens:] remaining_text = self.detokenize( remaining_tokens, - prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], ) remaining_length = len(remaining_text) @@ -1419,10 +1443,12 @@ def logit_bias_processor( ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :] - current_logprobs = Llama.logits_to_logprobs(logits).tolist() + current_logprobs = Llama.logits_to_logprobs( + logits).tolist() sorted_logprobs = list( sorted( - zip(current_logprobs, range(len(current_logprobs))), + zip(current_logprobs, range( + len(current_logprobs))), reverse=True, ) ) @@ -1432,7 +1458,8 @@ def logit_bias_processor( ): logprob for logprob, i in sorted_logprobs[:logprobs] } - top_logprob.update({token_str: current_logprobs[int(token)]}) + top_logprob.update( + {token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ self.detokenize( @@ -1508,14 +1535,16 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) 
finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" if self.verbose: @@ -1525,7 +1554,8 @@ def logit_bias_processor( remaining_tokens = completion_tokens[returned_tokens:] remaining_text = self.detokenize( remaining_tokens, - prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], ) any_stop = [s for s in stop_sequences if s in remaining_text] if len(any_stop) > 0: @@ -1538,7 +1568,8 @@ def logit_bias_processor( token_end_position += len( self.detokenize( [token], - prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], ) ) @@ -1558,7 +1589,8 @@ def logit_bias_processor( ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] - current_logprobs = Llama.logits_to_logprobs(logits).tolist() + current_logprobs = Llama.logits_to_logprobs( + logits).tolist() sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -1569,10 +1601,12 @@ def logit_bias_processor( self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } - top_logprob.update({token_str: current_logprobs[int(token)]}) + top_logprob.update( + {token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + self.detokenize([token]).decode( + "utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1635,7 +1669,8 @@ def logit_bias_processor( if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) - self.cache[prompt_tokens + completion_tokens] = self.save_state() + self.cache[prompt_tokens + + completion_tokens] = self.save_state() if self.verbose: print("Llama._create_completion: cache saved", file=sys.stderr) return @@ -1665,7 +1700,8 @@ def logit_bias_processor( if echo: # Remove leading BOS token if exists all_tokens = ( - prompt_tokens[1 if prompt_tokens[0] == self.token_bos() else 0 :] + prompt_tokens[1 if prompt_tokens[0] + == self.token_bos() else 0:] + completion_tokens ) else: @@ -1677,7 +1713,8 @@ def logit_bias_processor( ) for i, token in enumerate(all_tokens) ] - all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] + all_logprobs = Llama.logits_to_logprobs(self._scores)[ + token_offset:] # TODO: may be able to change this loop to use np.take_along_dim for idx, (token, token_str, logprobs_token) in enumerate( zip(all_tokens, all_token_strs, all_logprobs) @@ -2056,9 +2093,11 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + # type: ignore + return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) else: - return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore + # type: ignore + return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) except ImportError: raise ImportError( "To use create_chat_completion_openai_v1, you must install the openai package." 
@@ -2129,17 +2168,20 @@ def save_state(self) -> LlamaState: print("Llama.save_state: saving llama state", file=sys.stderr) state_size = llama_cpp.llama_get_state_size(self._ctx.ctx) if self.verbose: - print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) + print( + f"Llama.save_state: got state size: {state_size}", file=sys.stderr) llama_state = (ctypes.c_uint8 * int(state_size))() if self.verbose: print("Llama.save_state: allocated state", file=sys.stderr) n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state) if self.verbose: - print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + print( + f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): raise RuntimeError("Failed to copy llama state data") llama_state_compact = (ctypes.c_uint8 * int(n_bytes))() - llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + llama_cpp.ctypes.memmove( + llama_state_compact, llama_state, int(n_bytes)) if self.verbose: print( f"Llama.save_state: saving {n_bytes} bytes of llama state", @@ -2157,7 +2199,7 @@ def save_state(self) -> LlamaState: def load_state(self, state: LlamaState) -> None: # Only filling in up to `n_tokens` and then zero-ing out the rest self.scores[: state.n_tokens, :] = state.scores.copy() - rest = self.scores[state.n_tokens :, :] + rest = self.scores[state.n_tokens:, :] rest[rest > 0] = 0.0 self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens @@ -2286,7 +2328,8 @@ def from_pretrained( file_list.append(str(rel_path)) # find the only/first shard file: - matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore + matching_files = [file for file in file_list if fnmatch.fnmatch( + file, filename)] # type: ignore if len(matching_files) == 0: raise ValueError( @@ -2318,7 +2361,8 @@ def from_pretrained( if additional_files: for additonal_file_name in additional_files: # find the additional shard file: - matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] if len(matching_additional_files) == 0: raise ValueError( @@ -2396,7 +2440,8 @@ def __call__( return scores -StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool] +StoppingCriteria = Callable[[ + npt.NDArray[np.intc], npt.NDArray[np.single]], bool] class StoppingCriteriaList(List[StoppingCriteria]): diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6a..cf22a84ce 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -33,7 +33,8 @@ # Specify the base name of the shared library to load _lib_base_name = "llama" _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH") -_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) +_base_path = pathlib.Path(os.path.abspath(os.path.dirname( + __file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) # Load the library _lib = load_shared_library(_lib_base_name, _base_path) @@ -716,7 +717,8 @@ class llama_model_params(ctypes.Structure): if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused - tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused + # NOTE: unused + tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] 
n_gpu_layers: int split_mode: int main_gpu: int @@ -731,8 +733,8 @@ class llama_model_params(ctypes.Structure): use_extra_bufts: bool _fields_ = [ - ("devices", ctypes.c_void_p), # NOTE: unnused - ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused + ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -1405,14 +1407,15 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_get_kv_self", +# [llama_context_p_ctypes], +# llama_kv_cache_p_ctypes, +# ) +# def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: +# """Get the KV cache for self-attention (DEPRECATED)""" +# ... # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @@ -2000,7 +2003,8 @@ def llama_memory_seq_div( # llama_memory_t mem, # llama_seq_id seq_id); @ctypes_function( - "llama_memory_seq_pos_min", [llama_memory_t_ctypes, llama_seq_id], llama_pos + "llama_memory_seq_pos_min", [ + llama_memory_t_ctypes, llama_seq_id], llama_pos ) def llama_memory_seq_pos_min( mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / @@ -2018,7 +2022,8 @@ def llama_memory_seq_pos_min( # llama_memory_t mem, # llama_seq_id seq_id); @ctypes_function( - "llama_memory_seq_pos_max", [llama_memory_t_ctypes, llama_seq_id], llama_pos + "llama_memory_seq_pos_max", [ + llama_memory_t_ctypes, llama_seq_id], llama_pos ) def llama_memory_seq_pos_max( mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / @@ -2044,35 +2049,38 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 +# ) +# def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: +# """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" +# ... # // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 +# ) +# def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: +# """Returns the number of used KV cells (DEPRECATED)""" +# ... # // Clear the KV cache - both cell info is erased and KV data is zeroed # DEPRECATED(LLAMA_API void llama_kv_self_clear( # struct llama_context * ctx), # "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_clear", [llama_context_p_ctypes], None +# ) +# def llama_kv_self_clear(ctx: llama_context_p, /): +# """Clear the KV cache (DEPRECATED)""" +# ... # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -2086,25 +2094,26 @@ def llama_kv_self_clear(ctx: llama_context_p, /): # llama_pos p0, # llama_pos p1), # "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_rm", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_pos, +# llama_pos, +# ], +# ctypes.c_bool, +# ) +# def llama_kv_self_seq_rm( +# ctx: llama_context_p, +# seq_id: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# /, +# ) -> bool: +# """Remove tokens from KV cache (DEPRECATED)""" +# ... # // Copy all tokens that belong to the specified sequence to another sequence @@ -2118,27 +2127,28 @@ def llama_kv_self_seq_rm( # llama_pos p0, # llama_pos p1), # "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_cp", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_seq_id, +# llama_pos, +# llama_pos, +# ], +# None, +# ) +# def llama_kv_self_seq_cp( +# ctx: llama_context_p, +# seq_id_src: Union[llama_seq_id, int], +# seq_id_dst: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# /, +# ): +# """Copy tokens in KV cache (DEPRECATED)""" +# ... 
# // Removes all tokens that do not belong to the specified sequence @@ -2146,12 +2156,13 @@ def llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None +# ) +# def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +# """Keep only specified sequence in KV cache (DEPRECATED)""" +# ... # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -2166,27 +2177,28 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int # llama_pos p1, # llama_pos delta), # "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_add", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_pos, +# llama_pos, +# llama_pos, +# ], +# None, +# ) +# def llama_kv_self_seq_add( +# ctx: llama_context_p, +# seq_id: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# delta: Union[llama_pos, int], +# /, +# ): +# """Add delta to sequence positions in KV cache (DEPRECATED)""" +# ... # // Integer division of the positions by factor of `d > 1` @@ -2201,27 +2213,28 @@ def llama_kv_self_seq_add( # llama_pos p1, # int d), # "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_div", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_pos, +# llama_pos, +# ctypes.c_int, +# ], +# None, +# ) +# def llama_kv_self_seq_div( +# ctx: llama_context_p, +# seq_id: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# d: Union[ctypes.c_int, int], +# /, +# ): +# """Divide sequence positions in KV cache (DEPRECATED)""" +# ... 
# // Returns the smallest position present in the KV cache for the specified sequence @@ -2232,14 +2245,16 @@ def llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_pos_min", [ +# llama_context_p_ctypes, llama_seq_id], llama_pos +# ) +# def llama_kv_self_seq_pos_min( +# ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +# ) -> int: +# """Returns the smallest position in KV cache for sequence (DEPRECATED)""" +# ... # // Returns the largest position present in the KV cache for the specified sequence @@ -2249,14 +2264,16 @@ def llama_kv_self_seq_pos_min( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_pos_max", [ +# llama_context_p_ctypes, llama_seq_id], llama_pos +# ) +# def llama_kv_self_seq_pos_max( +# ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +# ) -> int: +# """Returns the largest position in KV cache for sequence (DEPRECATED)""" +# ... # // Defragment the KV cache @@ -2264,28 +2281,31 @@ def llama_kv_self_seq_pos_max( # // - lazily on next llama_decode() # DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), # "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +# def llama_kv_self_defrag(ctx: llama_context_p, /): +# """Defragment the KV cache (DEPRECATED)""" +# ... # // Check if the context supports KV cache shifting # DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), # "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +# def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: +# """Check if the context supports KV cache shifting (DEPRECATED)""" +# ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), # "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +# def llama_kv_self_update(ctx: llama_context_p, /): +# """Apply the KV cache updates (DEPRECATED)""" +# ... # // @@ -2679,7 +2699,8 @@ def llama_batch_get_one( # int32_t embd, # int32_t n_seq_max); @ctypes_function( - "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch + "llama_batch_init", [ctypes.c_int32, + ctypes.c_int32, ctypes.c_int32], llama_batch ) def llama_batch_init( n_tokens: Union[ctypes.c_int32, int], @@ -2856,7 +2877,8 @@ def llama_synchronize(ctx: llama_context_p, /): # // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522) # LLAMA_API float * llama_get_logits(struct llama_context * ctx); @ctypes_function( - "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) + "llama_get_logits", [ + llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) ) def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Token logits obtained from the last call to llama_decode() @@ -2897,7 +2919,8 @@ def llama_get_logits_ith( # // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522) # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( - "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) + "llama_get_embeddings", [ + llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) ) def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the input @@ -2949,7 +2972,8 @@ def llama_get_embeddings_seq( # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p + "llama_vocab_get_text", [ + llama_vocab_p_ctypes, llama_token], ctypes.c_char_p ) def llama_vocab_get_text( vocab: llama_vocab_p, token: Union[llama_token, int], / @@ -2959,7 +2983,8 @@ def llama_vocab_get_text( # LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_vocab_get_score", [llama_vocab_p_ctypes, llama_token], ctypes.c_float + "llama_vocab_get_score", [ + llama_vocab_p_ctypes, llama_token], ctypes.c_float ) def llama_vocab_get_score( vocab: llama_vocab_p, token: Union[llama_token, int], / @@ -2990,7 +3015,8 @@ def llama_vocab_is_eog(vocab: llama_vocab_p, token: Union[llama_token, int], /) # // Identify if Token Id is a control token or a render-able token # LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_vocab_is_control", [llama_vocab_p_ctypes, llama_token], ctypes.c_bool + "llama_vocab_is_control", [ + llama_vocab_p_ctypes, llama_token], ctypes.c_bool ) def llama_vocab_is_control( vocab: llama_vocab_p, token: Union[llama_token, int], / @@ -3164,6 +3190,8 @@ def llama_token_get_score( ... 
# DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead"); + + @ctypes_function( "llama_token_get_attr", [llama_vocab_p_ctypes, llama_token], @@ -3175,6 +3203,8 @@ def llama_token_get_attr( ... # DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead"); + + @ctypes_function( "llama_token_is_eog", [llama_vocab_p_ctypes, llama_token], @@ -3186,6 +3216,8 @@ def llama_token_is_eog( ... # DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead"); + + @ctypes_function( "llama_token_is_control", [llama_vocab_p_ctypes, llama_token], @@ -3197,6 +3229,8 @@ def llama_token_is_control( ... # DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead"); + + @ctypes_function( "llama_token_bos", [llama_vocab_p_ctypes], @@ -3206,6 +3240,8 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead"); + + @ctypes_function( "llama_token_eos", [llama_vocab_p_ctypes], @@ -3215,6 +3251,8 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead"); + + @ctypes_function( "llama_token_eot", [llama_vocab_p_ctypes], @@ -3224,6 +3262,8 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead"); + + @ctypes_function( "llama_token_cls", [llama_vocab_p_ctypes], @@ -3233,6 +3273,8 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead"); + + @ctypes_function( "llama_token_sep", [llama_vocab_p_ctypes], @@ -3272,6 +3314,8 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ... # DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead"); + + @ctypes_function( "llama_add_eos_token", [llama_vocab_p_ctypes], @@ -3291,6 +3335,8 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead"); + + @ctypes_function( "llama_token_fim_suf", [llama_vocab_p_ctypes], @@ -3300,6 +3346,8 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead"); + + @ctypes_function( "llama_token_fim_mid", [llama_vocab_p_ctypes], @@ -3309,6 +3357,8 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead"); + + @ctypes_function( "llama_token_fim_pad", [llama_vocab_p_ctypes], @@ -3318,6 +3368,8 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... 
# DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead"); + + @ctypes_function( "llama_token_fim_rep", [llama_vocab_p_ctypes], @@ -3327,6 +3379,8 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead"); + + @ctypes_function( "llama_token_fim_sep", [llama_vocab_p_ctypes], @@ -3338,6 +3392,8 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: # // CLS is equivalent to BOS # DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification # "use llama_vocab_bos instead"); + + @ctypes_function( "llama_vocab_cls", [llama_vocab_p_ctypes], @@ -3634,13 +3690,16 @@ class llama_sampler(ctypes.Structure): llama_sampler_p_ctypes = ctypes.POINTER(llama_sampler) -llama_sampler_i_name = ctypes.CFUNCTYPE(ctypes.c_char_p, llama_sampler_p_ctypes) -llama_sampler_i_accept = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes, llama_token) +llama_sampler_i_name = ctypes.CFUNCTYPE( + ctypes.c_char_p, llama_sampler_p_ctypes) +llama_sampler_i_accept = ctypes.CFUNCTYPE( + None, llama_sampler_p_ctypes, llama_token) llama_sampler_i_apply = ctypes.CFUNCTYPE( None, llama_sampler_p_ctypes, llama_token_data_array_p ) llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) -llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes) +llama_sampler_i_clone = ctypes.CFUNCTYPE( + llama_sampler_p_ctypes, llama_sampler_p_ctypes) llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) llama_sampler_i._fields_ = [ @@ -3806,9 +3865,10 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) +# def llama_sampler_init_softmax() -> llama_sampler_p: +# ... 
# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -4298,10 +4358,13 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): # // function that returns whether or not a given tensor contains trainable parameters # typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); -llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) +llama_opt_param_filter = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) # // always returns true # LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); + + @ctypes_function( "llama_opt_param_filter_all", [ctypes.c_void_p, ctypes.c_void_p], @@ -4325,7 +4388,8 @@ class llama_opt_params(ctypes.Structure): ("n_ctx_train", ctypes.c_uint32), ("param_filter", llama_opt_param_filter), ("param_filter_ud", ctypes.c_void_p), - ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + # ggml_opt_get_optimizer_params - not implemented here + ("get_opt_pars", ctypes.c_void_p), ("get_opt_pars_ud", ctypes.c_void_p), ] @@ -4353,7 +4417,7 @@ def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: lla [ llama_context_p_ctypes, ctypes.c_void_p, # ggml_opt_dataset_t - ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t ctypes.c_void_p, # ggml_opt_result_t ctypes.c_int64, ctypes.c_void_p, # ggml_opt_epoch_callback diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..5215b91e9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 5215b91e9377ce23e4ccc92ec3156bf5c7f892a3
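Reviewer note: the substantive behavioral change in this patch is the migration from the removed llama_kv_self_* bindings to the llama_memory API (plus LlamaSampler.add_softmax becoming a backward-compatible no-op, since newer llama.cpp applies softmax automatically). Below is a minimal sketch of the replacement pattern used in Llama.embed()'s decode_batch(); the helper name clear_context_memory is illustrative only and not part of this patch, and it assumes an already-created llama context handle.

import llama_cpp

def clear_context_memory(ctx, clear_data: bool = True) -> None:
    # Fetch the memory handle for this context and clear it.
    # llama_memory_clear(mem, True) erases both the cell bookkeeping and the
    # KV data itself, which is what the old llama_kv_self_clear(ctx) call did.
    mem = llama_cpp.llama_get_memory(ctx)
    llama_cpp.llama_memory_clear(mem, clear_data)

In the patch this pattern appears inline as `mem = llama_cpp.llama_get_memory(self._ctx.ctx)` followed by `llama_cpp.llama_memory_clear(mem, True)`; callers that previously used the removed llama_kv_self_* wrappers should switch to the corresponding llama_memory_* functions, which LlamaContext already exposes via its `self.memory` handle.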