diff --git a/llama_cpp/_internals.py b/llama_cpp/_internals.py index b5175a7f2..43a60a6c2 100644 --- a/llama_cpp/_internals.py +++ b/llama_cpp/_internals.py @@ -64,7 +64,8 @@ def __init__( self.model = model self.vocab = vocab - self.sampler = None # LlamaModel doesn't use samplers, but some cleanup code expects this attribute + # LlamaModel doesn't use samplers, but some cleanup code expects this attribute + self.sampler = None def free_model(): if self.model is None: @@ -191,7 +192,8 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: buffer = (ctypes.c_char * size)() for token in tokens: n = llama_cpp.llama_token_to_piece( - self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special + self.vocab, llama_cpp.llama_token( + token), buffer, size, 0, special ) assert n <= size output += bytes(buffer[:n]) @@ -264,7 +266,8 @@ def __init__( self.ctx = ctx self.memory = llama_cpp.llama_get_memory(self.ctx) - self.sampler = None # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute + # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute + self.sampler = None def free_ctx(): if self.ctx is None: @@ -297,7 +300,8 @@ def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): assert self.memory is not None, "Memory is not initialized" - llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1) + llama_cpp.llama_memory_seq_cp( + self.memory, seq_id_src, seq_id_dst, p0, p1) def kv_cache_seq_keep(self, seq_id: int): assert self.memory is not None, "Memory is not initialized" @@ -355,7 +359,8 @@ def get_embeddings_seq(self, seq_id: int): # Sampling functions - deprecated, use LlamaSampler instead def set_rng_seed(self, seed: int): - raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "set_rng_seed is deprecated, use LlamaSampler instead") def sample_repetition_penalties( self, @@ -366,30 +371,38 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_repetition_penalties is deprecated, use LlamaSampler instead") def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_softmax is deprecated, use LlamaSampler instead") def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_k is deprecated, use LlamaSampler instead") def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_top_p is deprecated, use LlamaSampler instead") def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_min_p is deprecated, use LlamaSampler instead") def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead") + raise 
NotImplementedError( + "sample_typical is deprecated, use LlamaSampler instead") def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_temp is deprecated, use LlamaSampler instead") def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_grammar is deprecated, use LlamaSampler instead") def sample_token_mirostat( self, @@ -399,7 +412,8 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat is deprecated, use LlamaSampler instead") def sample_token_mirostat_v2( self, @@ -408,17 +422,21 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_mirostat_v2 is deprecated, use LlamaSampler instead") def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token_greedy is deprecated, use LlamaSampler instead") def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "sample_token is deprecated, use LlamaSampler instead") # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "grammar_accept_token is deprecated, use LlamaSampler instead") def reset_timings(self): llama_cpp.llama_perf_context_reset(self.ctx) @@ -443,13 +461,15 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max) + batch = llama_cpp.llama_batch_init( + self._n_tokens, self.embd, self.n_seq_max) if batch is None: raise ValueError("Failed to create llama_batch") self.batch = batch - self.sampler = None # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute + # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute + self.sampler = None def free_batch(): if self.batch is None: @@ -506,13 +526,17 @@ def __init__(self, *, n_vocab: int): ), ) self.candidates = llama_cpp.llama_token_data_array( - data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + data=self.candidates_data.ctypes.data_as( + llama_cpp.llama_token_data_p), size=self.n_vocab, sorted=False, ) - self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore - self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) - self.sampler = None # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute + self.default_candidates_data_id = np.arange( + self.n_vocab, dtype=np.intc) # type: ignore + self.default_candidates_data_p = np.zeros( + self.n_vocab, dtype=np.single) + # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute + self.sampler = None 
def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates_data.id[:] = self.default_candidates_data_id @@ -602,7 +626,8 @@ def sample( logits_array: Optional[npt.NDArray[np.single]] = None, ): # This method is deprecated in favor of using LlamaSampler directly - raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") + raise NotImplementedError( + "LlamaSamplingContext.sample is deprecated, use LlamaSampler instead") def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): self.prev.append(id) @@ -673,8 +698,9 @@ def add_dist(self, seed: int): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_softmax(self): - sampler = llama_cpp.llama_sampler_init_softmax() - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + # DEPRECATED: softmax is now automatically applied in newer llama.cpp versions + # This method is kept for backward compatibility but does nothing + pass def add_top_k(self, k: int): sampler = llama_cpp.llama_sampler_init_top_k(k) @@ -709,7 +735,8 @@ def add_top_n_sigma(self, n: float): llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): - sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) + sampler = llama_cpp.llama_sampler_init_mirostat( + n_vocab, seed, tau, eta, m) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_mirostat_v2(self, seed: int, tau: float, eta: float): @@ -718,13 +745,14 @@ def add_mirostat_v2(self, seed: int, tau: float, eta: float): def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): sampler = llama_cpp.llama_sampler_init_grammar( - model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") + model.vocab, grammar._grammar.encode( + "utf-8"), grammar._root.encode("utf-8") ) llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_grammar_lazy_patterns( - self, - model: LlamaModel, + self, + model: LlamaModel, grammar: LlamaGrammar, trigger_patterns: List[str], trigger_tokens: List[int] @@ -733,10 +761,11 @@ def add_grammar_lazy_patterns( pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))() for i, pattern in enumerate(trigger_patterns): pattern_ptrs[i] = pattern.encode("utf-8") - + # Convert tokens to C array - token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens) - + token_array = (llama_cpp.llama_token * + len(trigger_tokens))(*trigger_tokens) + sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns( model.vocab, grammar._grammar.encode("utf-8"), @@ -777,7 +806,7 @@ def add_dry( breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))() for i, breaker in enumerate(seq_breakers): breaker_ptrs[i] = breaker.encode("utf-8") - + sampler = llama_cpp.llama_sampler_init_dry( model.vocab, n_ctx_train, @@ -791,8 +820,8 @@ def add_dry( llama_cpp.llama_sampler_chain_add(self.sampler, sampler) def add_logit_bias( - self, - n_vocab: int, + self, + n_vocab: int, logit_bias: Dict[int, float] ): # Convert logit_bias dict to C array @@ -800,7 +829,7 @@ def add_logit_bias( for i, (token, bias) in enumerate(logit_bias.items()): bias_array[i].token = token bias_array[i].bias = bias - + sampler = llama_cpp.llama_sampler_init_logit_bias( n_vocab, len(logit_bias), @@ -838,15 +867,16 @@ def reset(self): def clone(self): # NOTE: Custom samplers cannot be cloned due to Python callback limitations if self.custom_samplers: - raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers") - 
+ raise NotImplementedError( + "Cannot clone LlamaSampler that contains custom samplers") + cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler) # Create a new wrapper around the cloned sampler new_sampler = LlamaSampler.__new__(LlamaSampler) new_sampler.sampler = cloned_sampler new_sampler.custom_samplers = [] new_sampler._exit_stack = ExitStack() - + def free_sampler(): if new_sampler.sampler is not None: llama_cpp.llama_sampler_free(new_sampler.sampler) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 71d94ebd8..3882f4356 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -285,7 +285,8 @@ def __init__( ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset, ) - buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char)) + buffer_start = ctypes.cast( + address, ctypes.POINTER(ctypes.c_char)) ctypes.memmove( buffer_start, v_bytes, @@ -439,7 +440,8 @@ def free_lora_adapter(): ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + print(llama_cpp.llama_print_system_info().decode( + "utf-8"), file=sys.stderr) self.chat_format = chat_format self.chat_handler = chat_handler @@ -458,7 +460,8 @@ def free_lora_adapter(): self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 - self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.input_ids: npt.NDArray[np.intc] = np.ndarray( + (n_ctx,), dtype=np.intc) self.scores: npt.NDArray[np.single] = np.ndarray( (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single ) @@ -481,10 +484,12 @@ def free_lora_adapter(): bos_token_id = self.token_bos() eos_token = ( - self._model.token_get_text(eos_token_id) if eos_token_id != -1 else "" + self._model.token_get_text( + eos_token_id) if eos_token_id != -1 else "" ) bos_token = ( - self._model.token_get_text(bos_token_id) if bos_token_id != -1 else "" + self._model.token_get_text( + bos_token_id) if bos_token_id != -1 else "" ) # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates @@ -525,15 +530,18 @@ def free_lora_adapter(): if chat_format is not None: self.chat_format = chat_format if self.verbose: - print(f"Guessed chat format: {chat_format}", file=sys.stderr) + print( + f"Guessed chat format: {chat_format}", file=sys.stderr) else: if self.verbose: print( f"Using gguf chat template: {template_choices['chat_template.default']}", file=sys.stderr, ) - print(f"Using chat eos_token: {eos_token}", file=sys.stderr) - print(f"Using chat bos_token: {bos_token}", file=sys.stderr) + print( + f"Using chat eos_token: {eos_token}", file=sys.stderr) + print( + f"Using chat bos_token: {bos_token}", file=sys.stderr) self.chat_format = "chat_template.default" @@ -639,7 +647,7 @@ def eval(self, tokens: Sequence[int]): """ self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + batch = tokens[i: min(len(tokens), i + self.n_batch)] n_past = self.n_tokens n_tokens = len(batch) self._batch.set_batch( @@ -647,7 +655,7 @@ def eval(self, tokens: Sequence[int]): ) self._ctx.decode(self._batch) # Save tokens - self.input_ids[n_past : n_past + n_tokens] = batch + self.input_ids[n_past: n_past + n_tokens] = batch # Save logits if self._logits_all: rows = n_tokens @@ -655,7 +663,8 @@ def eval(self, tokens: Sequence[int]): logits = np.ctypeslib.as_array( 
self._ctx.get_logits(), shape=(rows * cols,) ) - self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits + self.scores[n_past: n_past + n_tokens, + :].reshape(-1)[::] = logits else: # rows = 1 # cols = self._n_vocab @@ -706,7 +715,8 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): ), ) for logit_processor in logits_processor: - recarray.logit[:] = logit_processor(self._input_ids, recarray.logit) + recarray.logit[:] = logit_processor( + self._input_ids, recarray.logit) sampler.add_custom(apply_func) @@ -726,7 +736,7 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): sampler.add_grammar(self._model, grammar) if temp < 0.0: - sampler.add_softmax() + # Note: softmax is now automatically applied, no need to call add_softmax() sampler.add_dist(self._seed) elif temp == 0.0: sampler.add_greedy() @@ -934,7 +944,8 @@ def generate( sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] + self._input_ids[: sample_idx], self._scores[sample_idx - + self.n_tokens, :] ): return tokens_or_none = yield token @@ -949,7 +960,8 @@ def generate( break if self.draft_model is not None: - self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens + self.input_ids[self.n_tokens: self.n_tokens + + len(tokens)] = tokens draft_tokens = self.draft_model( self.input_ids[: self.n_tokens + len(tokens)] ) @@ -977,7 +989,8 @@ def create_embedding( # get numeric embeddings embeds: Union[List[List[float]], List[List[List[float]]]] total_tokens: int - embeds, total_tokens = self.embed(input, return_count=True) # type: ignore + embeds, total_tokens = self.embed( + input, return_count=True) # type: ignore # convert to CreateEmbeddingResponse data: List[Embedding] = [ @@ -1041,7 +1054,8 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + llama_cpp.llama_memory_clear(mem, True) self._ctx.decode(self._batch) self._batch.reset() @@ -1051,7 +1065,7 @@ def decode_batch(seq_sizes: List[int]): for i, size in enumerate(seq_sizes): ptr = llama_cpp.llama_get_embeddings(self._ctx.ctx) embedding: List[List[float]] = [ - ptr[pos + j * n_embd : pos + (j + 1) * n_embd] + ptr[pos + j * n_embd: pos + (j + 1) * n_embd] for j in range(size) ] if normalize: @@ -1112,7 +1126,8 @@ def decode_batch(seq_sizes: List[int]): output = data[0] if isinstance(input, str) else data - llama_cpp.llama_kv_self_clear(self._ctx.ctx) + mem = llama_cpp.llama_get_memory(self._ctx.ctx) + llama_cpp.llama_memory_clear(mem, True) self.reset() if return_count: @@ -1157,13 +1172,15 @@ def _create_completion( bos_token_id: int = self.token_bos() cls_token_id: int = self._model.token_cls() sep_token_id: int = self._model.token_sep() - prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix - middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix - suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix + prefix_token_id: int = 0 # self._model.token_prefix() # TODO: Fix + middle_token_id: int = 0 # self._model.token_middle() # TODO: Fix + suffix_token_id: int = 0 # self._model.token_suffix() # TODO: Fix add_space_prefix: bool = ( - self.metadata.get("tokenizer.ggml.add_space_prefix", "true") == "true" + self.metadata.get( + "tokenizer.ggml.add_space_prefix", "true") == "true" ) - bos_tokens: List[int] = [cls_token_id if 
cls_token_id != -1 else bos_token_id] + bos_tokens: List[int] = [ + cls_token_id if cls_token_id != -1 else bos_token_id] eos_tokens: List[int] = [ sep_token_id if sep_token_id != -1 else self.token_eos() ] @@ -1188,7 +1205,8 @@ def _create_completion( # If prompt is empty, initialize completion with BOS token to avoid # detokenization including a space at the beginning of the completion - completion_tokens: List[int] = [] if len(prompt) > 0 else [bos_token_id] + completion_tokens: List[int] = [] if len(prompt) > 0 else [ + bos_token_id] # Add blank space to start of prompt to match OG llama tokenizer prefix_tokens: List[int] = ( [prefix_token_id] if prefix_token_id >= 0 and suffix is not None else [] @@ -1234,7 +1252,8 @@ def _create_completion( text: bytes = b"" returned_tokens: int = 0 stop = ( - stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + stop if isinstance(stop, list) else [ + stop] if isinstance(stop, str) else [] ) model_name: str = model if model is not None else self.model_path @@ -1264,7 +1283,8 @@ def logit_bias_processor( if logits_processor is None: logits_processor = _logit_bias_processor else: - logits_processor = logits_processor.extend(_logit_bias_processor) + logits_processor = logits_processor.extend( + _logit_bias_processor) if self.verbose: self._ctx.reset_timings() @@ -1307,7 +1327,8 @@ def logit_bias_processor( if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) + print("Llama._create_completion: cache hit", + file=sys.stderr) except KeyError: if self.verbose: print("Llama._create_completion: cache miss", file=sys.stderr) @@ -1338,13 +1359,15 @@ def logit_bias_processor( grammar=grammar, ): if llama_cpp.llama_token_is_eog(self._model.vocab, token): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" break completion_tokens.append(token) - all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + all_text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) # Contains multi-byte UTF8 for k, char in enumerate(all_text[-3:]): @@ -1370,7 +1393,8 @@ def logit_bias_processor( remaining_tokens = completion_tokens[returned_tokens:] remaining_text = self.detokenize( remaining_tokens, - prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], ) remaining_length = len(remaining_text) @@ -1419,10 +1443,12 @@ def logit_bias_processor( ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :] - current_logprobs = Llama.logits_to_logprobs(logits).tolist() + current_logprobs = Llama.logits_to_logprobs( + logits).tolist() sorted_logprobs = list( sorted( - zip(current_logprobs, range(len(current_logprobs))), + zip(current_logprobs, range( + len(current_logprobs))), reverse=True, ) ) @@ -1432,7 +1458,8 @@ def logit_bias_processor( ): logprob for logprob, i in sorted_logprobs[:logprobs] } - top_logprob.update({token_str: current_logprobs[int(token)]}) + top_logprob.update( + {token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ self.detokenize( @@ -1508,14 +1535,16 @@ def logit_bias_processor( } if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) 
finish_reason = "length" break if stopping_criteria is not None and stopping_criteria( self._input_ids, self._scores[-1, :] ): - text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) + text = self.detokenize( + completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" if self.verbose: @@ -1525,7 +1554,8 @@ def logit_bias_processor( remaining_tokens = completion_tokens[returned_tokens:] remaining_text = self.detokenize( remaining_tokens, - prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], ) any_stop = [s for s in stop_sequences if s in remaining_text] if len(any_stop) > 0: @@ -1538,7 +1568,8 @@ def logit_bias_processor( token_end_position += len( self.detokenize( [token], - prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], + prev_tokens=prompt_tokens + + completion_tokens[:returned_tokens], ) ) @@ -1558,7 +1589,8 @@ def logit_bias_processor( ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :] - current_logprobs = Llama.logits_to_logprobs(logits).tolist() + current_logprobs = Llama.logits_to_logprobs( + logits).tolist() sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -1569,10 +1601,12 @@ def logit_bias_processor( self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] } - top_logprob.update({token_str: current_logprobs[int(token)]}) + top_logprob.update( + {token_str: current_logprobs[int(token)]}) logprobs_or_none = { "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") + self.detokenize([token]).decode( + "utf-8", errors="ignore") ], "text_offset": [text_offset], "token_logprobs": [current_logprobs[int(token)]], @@ -1635,7 +1669,8 @@ def logit_bias_processor( if self.cache: if self.verbose: print("Llama._create_completion: cache save", file=sys.stderr) - self.cache[prompt_tokens + completion_tokens] = self.save_state() + self.cache[prompt_tokens + + completion_tokens] = self.save_state() if self.verbose: print("Llama._create_completion: cache saved", file=sys.stderr) return @@ -1665,7 +1700,8 @@ def logit_bias_processor( if echo: # Remove leading BOS token if exists all_tokens = ( - prompt_tokens[1 if prompt_tokens[0] == self.token_bos() else 0 :] + prompt_tokens[1 if prompt_tokens[0] + == self.token_bos() else 0:] + completion_tokens ) else: @@ -1677,7 +1713,8 @@ def logit_bias_processor( ) for i, token in enumerate(all_tokens) ] - all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:] + all_logprobs = Llama.logits_to_logprobs(self._scores)[ + token_offset:] # TODO: may be able to change this loop to use np.take_along_dim for idx, (token, token_str, logprobs_token) in enumerate( zip(all_tokens, all_token_strs, all_logprobs) @@ -2056,9 +2093,11 @@ def create_chat_completion_openai_v1( stream = kwargs.get("stream", False) # type: ignore assert isinstance(stream, bool) if stream: - return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) # type: ignore + # type: ignore + return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs)) else: - return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) # type: ignore + # type: ignore + return ChatCompletion(**self.create_chat_completion(*args, **kwargs)) except ImportError: raise ImportError( "To use create_chat_completion_openai_v1, you must install the openai package." 
@@ -2129,17 +2168,20 @@ def save_state(self) -> LlamaState: print("Llama.save_state: saving llama state", file=sys.stderr) state_size = llama_cpp.llama_get_state_size(self._ctx.ctx) if self.verbose: - print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) + print( + f"Llama.save_state: got state size: {state_size}", file=sys.stderr) llama_state = (ctypes.c_uint8 * int(state_size))() if self.verbose: print("Llama.save_state: allocated state", file=sys.stderr) n_bytes = llama_cpp.llama_copy_state_data(self._ctx.ctx, llama_state) if self.verbose: - print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + print( + f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): raise RuntimeError("Failed to copy llama state data") llama_state_compact = (ctypes.c_uint8 * int(n_bytes))() - llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + llama_cpp.ctypes.memmove( + llama_state_compact, llama_state, int(n_bytes)) if self.verbose: print( f"Llama.save_state: saving {n_bytes} bytes of llama state", @@ -2157,7 +2199,7 @@ def save_state(self) -> LlamaState: def load_state(self, state: LlamaState) -> None: # Only filling in up to `n_tokens` and then zero-ing out the rest self.scores[: state.n_tokens, :] = state.scores.copy() - rest = self.scores[state.n_tokens :, :] + rest = self.scores[state.n_tokens:, :] rest[rest > 0] = 0.0 self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens @@ -2286,7 +2328,8 @@ def from_pretrained( file_list.append(str(rel_path)) # find the only/first shard file: - matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore + matching_files = [file for file in file_list if fnmatch.fnmatch( + file, filename)] # type: ignore if len(matching_files) == 0: raise ValueError( @@ -2318,7 +2361,8 @@ def from_pretrained( if additional_files: for additonal_file_name in additional_files: # find the additional shard file: - matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + matching_additional_files = [ + file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] if len(matching_additional_files) == 0: raise ValueError( @@ -2396,7 +2440,8 @@ def __call__( return scores -StoppingCriteria = Callable[[npt.NDArray[np.intc], npt.NDArray[np.single]], bool] +StoppingCriteria = Callable[[ + npt.NDArray[np.intc], npt.NDArray[np.single]], bool] class StoppingCriteriaList(List[StoppingCriteria]): diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 711d42a6a..cf22a84ce 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -33,7 +33,8 @@ # Specify the base name of the shared library to load _lib_base_name = "llama" _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH") -_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) +_base_path = pathlib.Path(os.path.abspath(os.path.dirname( + __file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) # Load the library _lib = load_shared_library(_lib_base_name, _base_path) @@ -716,7 +717,8 @@ class llama_model_params(ctypes.Structure): if TYPE_CHECKING: devices: CtypesArray[ctypes.c_void_p] # NOTE: unused - tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] # NOTE: unused + # NOTE: unused + tensor_buft_overrides: CtypesArray[llama_model_tensor_buft_override] 
n_gpu_layers: int split_mode: int main_gpu: int @@ -731,8 +733,8 @@ class llama_model_params(ctypes.Structure): use_extra_bufts: bool _fields_ = [ - ("devices", ctypes.c_void_p), # NOTE: unnused - ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused + ("devices", ctypes.c_void_p), # NOTE: unnused + ("tensor_buft_overrides", ctypes.c_void_p), # NOTE: unused ("n_gpu_layers", ctypes.c_int32), ("split_mode", ctypes.c_int), ("main_gpu", ctypes.c_int32), @@ -1405,14 +1407,15 @@ def llama_pooling_type(ctx: llama_context_p, /) -> int: # DEPRECATED(LLAMA_API struct llama_kv_cache * llama_get_kv_self(struct llama_context * ctx), "use llama_get_memory instead"); -@ctypes_function( - "llama_get_kv_self", - [llama_context_p_ctypes], - llama_kv_cache_p_ctypes, -) -def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: - """Get the KV cache for self-attention (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_get_kv_self", +# [llama_context_p_ctypes], +# llama_kv_cache_p_ctypes, +# ) +# def llama_get_kv_self(ctx: llama_context_p, /) -> Optional[llama_kv_cache_p]: +# """Get the KV cache for self-attention (DEPRECATED)""" +# ... # LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); @@ -2000,7 +2003,8 @@ def llama_memory_seq_div( # llama_memory_t mem, # llama_seq_id seq_id); @ctypes_function( - "llama_memory_seq_pos_min", [llama_memory_t_ctypes, llama_seq_id], llama_pos + "llama_memory_seq_pos_min", [ + llama_memory_t_ctypes, llama_seq_id], llama_pos ) def llama_memory_seq_pos_min( mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / @@ -2018,7 +2022,8 @@ def llama_memory_seq_pos_min( # llama_memory_t mem, # llama_seq_id seq_id); @ctypes_function( - "llama_memory_seq_pos_max", [llama_memory_t_ctypes, llama_seq_id], llama_pos + "llama_memory_seq_pos_max", [ + llama_memory_t_ctypes, llama_seq_id], llama_pos ) def llama_memory_seq_pos_max( mem: llama_memory_t, seq_id: Union[llama_seq_id, int], / @@ -2044,35 +2049,38 @@ def llama_memory_can_shift(mem: llama_memory_t, /) -> bool: # // If a KV cell has multiple sequences assigned to it, it will be counted multiple times # DEPRECATED(LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: - """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_n_tokens", [llama_context_p_ctypes], ctypes.c_int32 +# ) +# def llama_kv_self_n_tokens(ctx: llama_context_p, /) -> int: +# """Returns the number of tokens in the KV cache (slow, use only for debug) (DEPRECATED)""" +# ... # // Returns the number of used KV cells (i.e. 
have at least one sequence assigned to them) # DEPRECATED(LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx), # "Use llama_kv_self_seq_pos_max() and llama_kv_self_seq_pos_min() instead (https://github.com/ggml-org/llama.cpp/issues/13793)"); -@ctypes_function( - "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 -) -def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: - """Returns the number of used KV cells (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_used_cells", [llama_context_p_ctypes], ctypes.c_int32 +# ) +# def llama_kv_self_used_cells(ctx: llama_context_p, /) -> int: +# """Returns the number of used KV cells (DEPRECATED)""" +# ... # // Clear the KV cache - both cell info is erased and KV data is zeroed # DEPRECATED(LLAMA_API void llama_kv_self_clear( # struct llama_context * ctx), # "Use llama_memory_clear() instead"); -@ctypes_function( - "llama_kv_self_clear", [llama_context_p_ctypes], None -) -def llama_kv_self_clear(ctx: llama_context_p, /): - """Clear the KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_clear", [llama_context_p_ctypes], None +# ) +# def llama_kv_self_clear(ctx: llama_context_p, /): +# """Clear the KV cache (DEPRECATED)""" +# ... # // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -2086,25 +2094,26 @@ def llama_kv_self_clear(ctx: llama_context_p, /): # llama_pos p0, # llama_pos p1), # "Use llama_memory_seq_rm() instead"); -@ctypes_function( - "llama_kv_self_seq_rm", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ], - ctypes.c_bool, -) -def llama_kv_self_seq_rm( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -) -> bool: - """Remove tokens from KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_rm", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_pos, +# llama_pos, +# ], +# ctypes.c_bool, +# ) +# def llama_kv_self_seq_rm( +# ctx: llama_context_p, +# seq_id: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# /, +# ) -> bool: +# """Remove tokens from KV cache (DEPRECATED)""" +# ... # // Copy all tokens that belong to the specified sequence to another sequence @@ -2118,27 +2127,28 @@ def llama_kv_self_seq_rm( # llama_pos p0, # llama_pos p1), # "Use llama_memory_seq_cp() instead"); -@ctypes_function( - "llama_kv_self_seq_cp", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_seq_id, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_cp( - ctx: llama_context_p, - seq_id_src: Union[llama_seq_id, int], - seq_id_dst: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - /, -): - """Copy tokens in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_cp", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_seq_id, +# llama_pos, +# llama_pos, +# ], +# None, +# ) +# def llama_kv_self_seq_cp( +# ctx: llama_context_p, +# seq_id_src: Union[llama_seq_id, int], +# seq_id_dst: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# /, +# ): +# """Copy tokens in KV cache (DEPRECATED)""" +# ... 
# // Removes all tokens that do not belong to the specified sequence @@ -2146,12 +2156,13 @@ def llama_kv_self_seq_cp( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_keep() instead"); -@ctypes_function( - "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None -) -def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): - """Keep only specified sequence in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_keep", [llama_context_p_ctypes, llama_seq_id], None +# ) +# def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int], /): +# """Keep only specified sequence in KV cache (DEPRECATED)""" +# ... # // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) @@ -2166,27 +2177,28 @@ def llama_kv_self_seq_keep(ctx: llama_context_p, seq_id: Union[llama_seq_id, int # llama_pos p1, # llama_pos delta), # "Use llama_memory_seq_add() instead"); -@ctypes_function( - "llama_kv_self_seq_add", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - llama_pos, - ], - None, -) -def llama_kv_self_seq_add( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - delta: Union[llama_pos, int], - /, -): - """Add delta to sequence positions in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_add", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_pos, +# llama_pos, +# llama_pos, +# ], +# None, +# ) +# def llama_kv_self_seq_add( +# ctx: llama_context_p, +# seq_id: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# delta: Union[llama_pos, int], +# /, +# ): +# """Add delta to sequence positions in KV cache (DEPRECATED)""" +# ... # // Integer division of the positions by factor of `d > 1` @@ -2201,27 +2213,28 @@ def llama_kv_self_seq_add( # llama_pos p1, # int d), # "Use llama_memory_seq_div() instead"); -@ctypes_function( - "llama_kv_self_seq_div", - [ - llama_context_p_ctypes, - llama_seq_id, - llama_pos, - llama_pos, - ctypes.c_int, - ], - None, -) -def llama_kv_self_seq_div( - ctx: llama_context_p, - seq_id: Union[llama_seq_id, int], - p0: Union[llama_pos, int], - p1: Union[llama_pos, int], - d: Union[ctypes.c_int, int], - /, -): - """Divide sequence positions in KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_div", +# [ +# llama_context_p_ctypes, +# llama_seq_id, +# llama_pos, +# llama_pos, +# ctypes.c_int, +# ], +# None, +# ) +# def llama_kv_self_seq_div( +# ctx: llama_context_p, +# seq_id: Union[llama_seq_id, int], +# p0: Union[llama_pos, int], +# p1: Union[llama_pos, int], +# d: Union[ctypes.c_int, int], +# /, +# ): +# """Divide sequence positions in KV cache (DEPRECATED)""" +# ... 
# // Returns the smallest position present in the KV cache for the specified sequence @@ -2232,14 +2245,16 @@ def llama_kv_self_seq_div( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_pos_min() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_min", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_min( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the smallest position in KV cache for sequence (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_pos_min", [ +# llama_context_p_ctypes, llama_seq_id], llama_pos +# ) +# def llama_kv_self_seq_pos_min( +# ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +# ) -> int: +# """Returns the smallest position in KV cache for sequence (DEPRECATED)""" +# ... # // Returns the largest position present in the KV cache for the specified sequence @@ -2249,14 +2264,16 @@ def llama_kv_self_seq_pos_min( # struct llama_context * ctx, # llama_seq_id seq_id), # "Use llama_memory_seq_pos_max() instead"); -@ctypes_function( - "llama_kv_self_seq_pos_max", [llama_context_p_ctypes, llama_seq_id], llama_pos -) -def llama_kv_self_seq_pos_max( - ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / -) -> int: - """Returns the largest position in KV cache for sequence (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function( +# "llama_kv_self_seq_pos_max", [ +# llama_context_p_ctypes, llama_seq_id], llama_pos +# ) +# def llama_kv_self_seq_pos_max( +# ctx: llama_context_p, seq_id: Union[llama_seq_id, int], / +# ) -> int: +# """Returns the largest position in KV cache for sequence (DEPRECATED)""" +# ... # // Defragment the KV cache @@ -2264,28 +2281,31 @@ def llama_kv_self_seq_pos_max( # // - lazily on next llama_decode() # DEPRECATED(LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx), # "simply remove this call, the context will automatically decide when to do a defragmentation based on 'defrag_thold'"); -@ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) -def llama_kv_self_defrag(ctx: llama_context_p, /): - """Defragment the KV cache (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_kv_self_defrag", [llama_context_p_ctypes], None) +# def llama_kv_self_defrag(ctx: llama_context_p, /): +# """Defragment the KV cache (DEPRECATED)""" +# ... # // Check if the context supports KV cache shifting # DEPRECATED(LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx), # "use llama_memory_can_shift() instead"); -@ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) -def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: - """Check if the context supports KV cache shifting (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_kv_self_can_shift", [llama_context_p_ctypes], ctypes.c_bool) +# def llama_kv_self_can_shift(ctx: llama_context_p, /) -> bool: +# """Check if the context supports KV cache shifting (DEPRECATED)""" +# ... # // Apply the KV cache updates (such as K-shifts, defragmentation, etc.) 
# DEPRECATED(LLAMA_API void llama_kv_self_update(struct llama_context * ctx), # "simply remove this call, updates are applied lazily on the next llama_decode()"); -@ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) -def llama_kv_self_update(ctx: llama_context_p, /): - """Apply the KV cache updates (DEPRECATED)""" - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_kv_self_update", [llama_context_p_ctypes], None) +# def llama_kv_self_update(ctx: llama_context_p, /): +# """Apply the KV cache updates (DEPRECATED)""" +# ... # // @@ -2679,7 +2699,8 @@ def llama_batch_get_one( # int32_t embd, # int32_t n_seq_max); @ctypes_function( - "llama_batch_init", [ctypes.c_int32, ctypes.c_int32, ctypes.c_int32], llama_batch + "llama_batch_init", [ctypes.c_int32, + ctypes.c_int32, ctypes.c_int32], llama_batch ) def llama_batch_init( n_tokens: Union[ctypes.c_int32, int], @@ -2856,7 +2877,8 @@ def llama_synchronize(ctx: llama_context_p, /): # // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522) # LLAMA_API float * llama_get_logits(struct llama_context * ctx); @ctypes_function( - "llama_get_logits", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) + "llama_get_logits", [ + llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) ) def llama_get_logits(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Token logits obtained from the last call to llama_decode() @@ -2897,7 +2919,8 @@ def llama_get_logits_ith( # // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522) # LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); @ctypes_function( - "llama_get_embeddings", [llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) + "llama_get_embeddings", [ + llama_context_p_ctypes], ctypes.POINTER(ctypes.c_float) ) def llama_get_embeddings(ctx: llama_context_p, /) -> CtypesArray[ctypes.c_float]: """Get the embeddings for the input @@ -2949,7 +2972,8 @@ def llama_get_embeddings_seq( # LLAMA_API const char * llama_vocab_get_text(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_vocab_get_text", [llama_vocab_p_ctypes, llama_token], ctypes.c_char_p + "llama_vocab_get_text", [ + llama_vocab_p_ctypes, llama_token], ctypes.c_char_p ) def llama_vocab_get_text( vocab: llama_vocab_p, token: Union[llama_token, int], / @@ -2959,7 +2983,8 @@ def llama_vocab_get_text( # LLAMA_API float llama_vocab_get_score(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_vocab_get_score", [llama_vocab_p_ctypes, llama_token], ctypes.c_float + "llama_vocab_get_score", [ + llama_vocab_p_ctypes, llama_token], ctypes.c_float ) def llama_vocab_get_score( vocab: llama_vocab_p, token: Union[llama_token, int], / @@ -2990,7 +3015,8 @@ def llama_vocab_is_eog(vocab: llama_vocab_p, token: Union[llama_token, int], /) # // Identify if Token Id is a control token or a render-able token # LLAMA_API bool llama_vocab_is_control(const struct llama_vocab * vocab, llama_token token); @ctypes_function( - "llama_vocab_is_control", [llama_vocab_p_ctypes, llama_token], ctypes.c_bool + "llama_vocab_is_control", [ + llama_vocab_p_ctypes, llama_token], ctypes.c_bool ) def llama_vocab_is_control( vocab: llama_vocab_p, token: Union[llama_token, int], / @@ -3164,6 +3190,8 @@ def llama_token_get_score( ... 
# DEPRECATED(LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_get_attr instead"); + + @ctypes_function( "llama_token_get_attr", [llama_vocab_p_ctypes, llama_token], @@ -3175,6 +3203,8 @@ def llama_token_get_attr( ... # DEPRECATED(LLAMA_API bool llama_token_is_eog(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_eog instead"); + + @ctypes_function( "llama_token_is_eog", [llama_vocab_p_ctypes, llama_token], @@ -3186,6 +3216,8 @@ def llama_token_is_eog( ... # DEPRECATED(LLAMA_API bool llama_token_is_control(const struct llama_vocab * vocab, llama_token token), "use llama_vocab_is_control instead"); + + @ctypes_function( "llama_token_is_control", [llama_vocab_p_ctypes, llama_token], @@ -3197,6 +3229,8 @@ def llama_token_is_control( ... # DEPRECATED(LLAMA_API llama_token llama_token_bos(const struct llama_vocab * vocab), "use llama_vocab_bos instead"); + + @ctypes_function( "llama_token_bos", [llama_vocab_p_ctypes], @@ -3206,6 +3240,8 @@ def llama_token_bos(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_eos(const struct llama_vocab * vocab), "use llama_vocab_eos instead"); + + @ctypes_function( "llama_token_eos", [llama_vocab_p_ctypes], @@ -3215,6 +3251,8 @@ def llama_token_eos(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_eot(const struct llama_vocab * vocab), "use llama_vocab_eot instead"); + + @ctypes_function( "llama_token_eot", [llama_vocab_p_ctypes], @@ -3224,6 +3262,8 @@ def llama_token_eot(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_cls(const struct llama_vocab * vocab), "use llama_vocab_cls instead"); + + @ctypes_function( "llama_token_cls", [llama_vocab_p_ctypes], @@ -3233,6 +3273,8 @@ def llama_token_cls(vocab: llama_vocab_p, /) -> int: ... # DEPRECATED(LLAMA_API llama_token llama_token_sep(const struct llama_vocab * vocab), "use llama_vocab_sep instead"); + + @ctypes_function( "llama_token_sep", [llama_vocab_p_ctypes], @@ -3272,6 +3314,8 @@ def llama_add_bos_token(vocab: llama_vocab_p, /) -> bool: ... # DEPRECATED(LLAMA_API bool llama_add_eos_token(const struct llama_vocab * vocab), "use llama_vocab_get_add_eos instead"); + + @ctypes_function( "llama_add_eos_token", [llama_vocab_p_ctypes], @@ -3291,6 +3335,8 @@ def llama_token_fim_pre(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_suf(const struct llama_vocab * vocab), "use llama_vocab_fim_suf instead"); + + @ctypes_function( "llama_token_fim_suf", [llama_vocab_p_ctypes], @@ -3300,6 +3346,8 @@ def llama_token_fim_suf(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_mid(const struct llama_vocab * vocab), "use llama_vocab_fim_mid instead"); + + @ctypes_function( "llama_token_fim_mid", [llama_vocab_p_ctypes], @@ -3309,6 +3357,8 @@ def llama_token_fim_mid(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_pad(const struct llama_vocab * vocab), "use llama_vocab_fim_pad instead"); + + @ctypes_function( "llama_token_fim_pad", [llama_vocab_p_ctypes], @@ -3318,6 +3368,8 @@ def llama_token_fim_pad(vocab: llama_vocab_p, /) -> llama_token: ... 
# DEPRECATED(LLAMA_API llama_token llama_token_fim_rep(const struct llama_vocab * vocab), "use llama_vocab_fim_rep instead"); + + @ctypes_function( "llama_token_fim_rep", [llama_vocab_p_ctypes], @@ -3327,6 +3379,8 @@ def llama_token_fim_rep(vocab: llama_vocab_p, /) -> llama_token: ... # DEPRECATED(LLAMA_API llama_token llama_token_fim_sep(const struct llama_vocab * vocab), "use llama_vocab_fim_sep instead"); + + @ctypes_function( "llama_token_fim_sep", [llama_vocab_p_ctypes], @@ -3338,6 +3392,8 @@ def llama_token_fim_sep(vocab: llama_vocab_p, /) -> llama_token: # // CLS is equivalent to BOS # DEPRECATED(LLAMA_API llama_token llama_vocab_cls(const struct llama_vocab * vocab), // classification # "use llama_vocab_bos instead"); + + @ctypes_function( "llama_vocab_cls", [llama_vocab_p_ctypes], @@ -3634,13 +3690,16 @@ class llama_sampler(ctypes.Structure): llama_sampler_p_ctypes = ctypes.POINTER(llama_sampler) -llama_sampler_i_name = ctypes.CFUNCTYPE(ctypes.c_char_p, llama_sampler_p_ctypes) -llama_sampler_i_accept = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes, llama_token) +llama_sampler_i_name = ctypes.CFUNCTYPE( + ctypes.c_char_p, llama_sampler_p_ctypes) +llama_sampler_i_accept = ctypes.CFUNCTYPE( + None, llama_sampler_p_ctypes, llama_token) llama_sampler_i_apply = ctypes.CFUNCTYPE( None, llama_sampler_p_ctypes, llama_token_data_array_p ) llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) -llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes) +llama_sampler_i_clone = ctypes.CFUNCTYPE( + llama_sampler_p_ctypes, llama_sampler_p_ctypes) llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) llama_sampler_i._fields_ = [ @@ -3806,9 +3865,10 @@ def llama_sampler_init_dist(seed: int) -> llama_sampler_p: # /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. # DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), # "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)"); -@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) -def llama_sampler_init_softmax() -> llama_sampler_p: - ... +# NOTE: This function has been removed in newer versions of llama.cpp +# @ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) +# def llama_sampler_init_softmax() -> llama_sampler_p: +# ... 
# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 @@ -4298,10 +4358,13 @@ def llama_perf_sampler_reset(chain: llama_sampler_p, /): # // function that returns whether or not a given tensor contains trainable parameters # typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata); -llama_opt_param_filter = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) +llama_opt_param_filter = ctypes.CFUNCTYPE( + ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p) # // always returns true # LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata); + + @ctypes_function( "llama_opt_param_filter_all", [ctypes.c_void_p, ctypes.c_void_p], @@ -4325,7 +4388,8 @@ class llama_opt_params(ctypes.Structure): ("n_ctx_train", ctypes.c_uint32), ("param_filter", llama_opt_param_filter), ("param_filter_ud", ctypes.c_void_p), - ("get_opt_pars", ctypes.c_void_p), # ggml_opt_get_optimizer_params - not implemented here + # ggml_opt_get_optimizer_params - not implemented here + ("get_opt_pars", ctypes.c_void_p), ("get_opt_pars_ud", ctypes.c_void_p), ] @@ -4353,7 +4417,7 @@ def llama_opt_init(lctx: llama_context_p, model: llama_model_p, lopt_params: lla [ llama_context_p_ctypes, ctypes.c_void_p, # ggml_opt_dataset_t - ctypes.c_void_p, # ggml_opt_result_t + ctypes.c_void_p, # ggml_opt_result_t ctypes.c_void_p, # ggml_opt_result_t ctypes.c_int64, ctypes.c_void_p, # ggml_opt_epoch_callback diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 4227c9be4..5215b91e9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 4227c9be4268ac844921b90f31595f81236bd317 +Subproject commit 5215b91e9377ce23e4ccc92ec3156bf5c7f892a3
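Reviewer note: the substantive behavioral change in this patch is the migration from the removed llama_kv_self_* bindings to the llama_memory API (plus LlamaSampler.add_softmax becoming a backward-compatible no-op, since newer llama.cpp applies softmax automatically). Below is a minimal sketch of the replacement pattern used in Llama.embed()'s decode_batch(); the helper name clear_context_memory is illustrative only and not part of this patch, and it assumes an already-created llama context handle.

import llama_cpp

def clear_context_memory(ctx, clear_data: bool = True) -> None:
    # Fetch the memory handle for this context and clear it.
    # llama_memory_clear(mem, True) erases both the cell bookkeeping and the
    # KV data itself, which is what the old llama_kv_self_clear(ctx) call did.
    mem = llama_cpp.llama_get_memory(ctx)
    llama_cpp.llama_memory_clear(mem, clear_data)

In the patch this pattern appears inline as `mem = llama_cpp.llama_get_memory(self._ctx.ctx)` followed by `llama_cpp.llama_memory_clear(mem, True)`; callers that previously used the removed llama_kv_self_* wrappers should switch to the corresponding llama_memory_* functions, which LlamaContext already exposes via its `self.memory` handle.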