From 3ec00d01fd41cd87f33fb86effc4c1147f15d134 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:41:17 +0100 Subject: [PATCH 01/44] Update Llama class to handle chat_format & caching --- llama_cpp/llama.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2e18b47a0..982b7410c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -758,7 +758,11 @@ def __init__( numa: bool = False, # Chat Format Params chat_format: str = "llama-2", - chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, + clip_model_path: Optional[str] = None, # only for multimodal, when chat_format=llava-1-5 + # Cache + cache: bool = False, + cache_type: str = "ram", + cache_size: int = 2 << 30, # Misc verbose: bool = True, # Extra Params @@ -791,7 +795,10 @@ def __init__( lora_path: Path to a LoRA file to apply to the model. numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. - chat_handler: Optional chat handler to use when calling create_chat_completion. + clip_model_path: Optional clip model path to use when using multimodal mode, expected when chat_format=llava-1-5. + cache: Optional if true enables caching. + cache_type: String can be "ram" or "disk". + cache_size: Number of bytes to cache, defaults to 2GB verbose: Print verbose output to stderr. Raises: @@ -917,6 +924,14 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + chat_handler = None + if chat_format == "llava-1-5": + assert clip_model_path is not None, "clip model not found" + chat_handler = llama_chat_format.Llava15ChatHandler( + clip_model_path=clip_model_path, + verbose=verbose + ) + self.chat_format = chat_format self.chat_handler = chat_handler @@ -934,6 +949,17 @@ def __init__( (n_ctx, self._n_vocab), dtype=np.single ) + if cache: + if cache_type == "disk": + if verbose: + print(f"Using disk cache with size {cache_size}") + cache = LlamaDiskCache(capacity_bytes=cache_size) + else: + if verbose: + print(f"Using ram cache with size {cache_size}") + cache = LlamaRAMCache(capacity_bytes=cache_size) + self.set_cache(cache) + @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None From 6e68a4bd62fe91803880e4da219450589ac70aab Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:41:59 +0100 Subject: [PATCH 02/44] Add settings.py --- llama_cpp/server/settings.py | 142 +++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 llama_cpp/server/settings.py diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py new file mode 100644 index 000000000..46f4c9922 --- /dev/null +++ b/llama_cpp/server/settings.py @@ -0,0 +1,142 @@ +import multiprocessing +from typing import Optional, List, Literal +from pydantic import Field +from pydantic_settings import BaseSettings +import llama_cpp + +# Disable warning for model and model_alias settings +BaseSettings.model_config['protected_namespaces'] = () + +class Settings(BaseSettings): + model: str = Field( + description="The path to the model to use for generating completions." 
+ ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + # Model Params + n_gpu_layers: int = Field( + default=0, + ge=-1, + description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", + ) + main_gpu: int = Field( + default=0, + ge=0, + description="Main GPU to use.", + ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) + use_mmap: bool = Field( + default=llama_cpp.llama_mmap_supported(), + description="Use mmap.", + ) + use_mlock: bool = Field( + default=llama_cpp.llama_mlock_supported(), + description="Use mlock.", + ) + # Context Params + seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.") + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=1, + description="The number of threads to use.", + ) + n_threads_batch: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=0, + description="The number of threads to use when batch processing.", + ) + rope_scaling_type: int = Field( + default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED + ) + rope_freq_base: float = Field( + default=0.0, description="RoPE base frequency" + ) + rope_freq_scale: float = Field( + default=0.0, description="RoPE frequency scaling factor" + ) + yarn_ext_factor: float = Field( + default=-1.0 + ) + yarn_attn_factor: float = Field( + default=1.0 + ) + yarn_beta_fast: float = Field( + default=32.0 + ) + yarn_beta_slow: float = Field( + default=1.0 + ) + yarn_orig_ctx: int = Field( + default=0 + ) + mul_mat_q: bool = Field( + default=True, description="if true, use experimental mul_mat_q kernels" + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + logits_all: bool = Field(default=True, description="Whether to return logits.") + embedding: bool = Field(default=True, description="Whether to use embeddings.") + # Sampling Params + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + # LoRA Params + lora_base: Optional[str] = Field( + default=None, + description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." + ) + lora_path: Optional[str] = Field( + default=None, + description="Path to a LoRA file to apply to the model.", + ) + # Backend Params + numa: bool = Field( + default=False, + description="Enable NUMA support.", + ) + # Chat Format Params + chat_format: str = Field( + default="llama-2", + description="Chat format to use.", + ) + clip_model_path: Optional[str] = Field( + default=None, + description="Path to a CLIP model to use for multi-modal chat completion.", + ) + # Cache Params + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. 
Only used if cache is True.", + ) + # Misc + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) + # Server Params + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) From e63cffb7bfc8f4d803856b2b9a4ee691dbc18b3e Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:42:49 +0100 Subject: [PATCH 03/44] Add util.py & update __main__.py --- llama_cpp/server/__main__.py | 3 ++- llama_cpp/server/util.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 llama_cpp/server/util.py diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a294ebf8a..e2da83478 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -27,7 +27,8 @@ import uvicorn -from llama_cpp.server.app import create_app, Settings +from llama_cpp.server.app import create_app +from llama_cpp.server.settings import Settings def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py new file mode 100644 index 000000000..e3702794f --- /dev/null +++ b/llama_cpp/server/util.py @@ -0,0 +1,14 @@ +import os +import shutil + +def remove_file(path: str) -> None: + if path and os.path.exists(path): + if os.path.isdir(path): + shutil.rmtree(path) + else: + os.unlink(path) + +def models_root_dir(path = None): + path = os.path.abspath(path or os.environ.get('MODEL', '/models')) + if os.path.isdir(path): return path + return os.path.dirname(path) From 55e33abe56ef2e8ad5ba13186a6678e007293778 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:43:05 +0100 Subject: [PATCH 04/44] multimodel --- llama_cpp/server/app.py | 266 +++----------------------------------- llama_cpp/server/model.py | 159 +++++++++++++++++++++++ 2 files changed, 174 insertions(+), 251 deletions(-) create mode 100644 llama_cpp/server/model.py diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2a6aed81c..2046239cb 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,10 +1,8 @@ import sys import json import traceback -import multiprocessing import time from re import compile, Match, Pattern -from threading import Lock from functools import partial from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict from typing_extensions import TypedDict, Literal @@ -20,7 +18,6 @@ from fastapi.responses import JSONResponse from fastapi.routing import APIRoute from pydantic import BaseModel, Field -from pydantic_settings import BaseSettings from sse_starlette.sse import EventSourceResponse from starlette_context import plugins from starlette_context.middleware import RawContextMiddleware @@ -28,145 +25,10 @@ import numpy as np import numpy.typing as npt - -# Disable warning for model and model_alias settings -BaseSettings.model_config['protected_namespaces'] = () - - -class Settings(BaseSettings): - model: str = Field( - description="The path to the model to use for generating completions." 
- ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", - ) - # Model Params - n_gpu_layers: int = Field( - default=0, - ge=-1, - description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", - ) - main_gpu: int = Field( - default=0, - ge=0, - description="Main GPU to use.", - ) - tensor_split: Optional[List[float]] = Field( - default=None, - description="Split layers across multiple GPUs in proportion.", - ) - vocab_only: bool = Field( - default=False, description="Whether to only return the vocabulary." - ) - use_mmap: bool = Field( - default=llama_cpp.llama_mmap_supported(), - description="Use mmap.", - ) - use_mlock: bool = Field( - default=llama_cpp.llama_mlock_supported(), - description="Use mlock.", - ) - # Context Params - seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.") - n_ctx: int = Field(default=2048, ge=1, description="The context size.") - n_batch: int = Field( - default=512, ge=1, description="The batch size to use per eval." - ) - n_threads: int = Field( - default=max(multiprocessing.cpu_count() // 2, 1), - ge=1, - description="The number of threads to use.", - ) - n_threads_batch: int = Field( - default=max(multiprocessing.cpu_count() // 2, 1), - ge=0, - description="The number of threads to use when batch processing.", - ) - rope_scaling_type: int = Field( - default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED - ) - rope_freq_base: float = Field( - default=0.0, description="RoPE base frequency" - ) - rope_freq_scale: float = Field( - default=0.0, description="RoPE frequency scaling factor" - ) - yarn_ext_factor: float = Field( - default=-1.0 - ) - yarn_attn_factor: float = Field( - default=1.0 - ) - yarn_beta_fast: float = Field( - default=32.0 - ) - yarn_beta_slow: float = Field( - default=1.0 - ) - yarn_orig_ctx: int = Field( - default=0 - ) - mul_mat_q: bool = Field( - default=True, description="if true, use experimental mul_mat_q kernels" - ) - f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") - logits_all: bool = Field(default=True, description="Whether to return logits.") - embedding: bool = Field(default=True, description="Whether to use embeddings.") - # Sampling Params - last_n_tokens_size: int = Field( - default=64, - ge=0, - description="Last n tokens to keep for repeat penalty calculation.", - ) - # LoRA Params - lora_base: Optional[str] = Field( - default=None, - description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." - ) - lora_path: Optional[str] = Field( - default=None, - description="Path to a LoRA file to apply to the model.", - ) - # Backend Params - numa: bool = Field( - default=False, - description="Enable NUMA support.", - ) - # Chat Format Params - chat_format: str = Field( - default="llama-2", - description="Chat format to use.", - ) - clip_model_path: Optional[str] = Field( - default=None, - description="Path to a CLIP model to use for multi-modal chat completion.", - ) - # Cache Params - cache: bool = Field( - default=False, - description="Use a cache to reduce processing times for evaluated prompts.", - ) - cache_type: Literal["ram", "disk"] = Field( - default="ram", - description="The type of cache to use. Only used if cache is True.", - ) - cache_size: int = Field( - default=2 << 30, - description="The size of the cache in bytes. 
Only used if cache is True.", - ) - # Misc - verbose: bool = Field( - default=True, description="Whether to print debug information." - ) - # Server Params - host: str = Field(default="localhost", description="Listen address") - port: int = Field(default=8000, description="Listen port") - interrupt_requests: bool = Field( - default=True, - description="Whether to interrupt requests when a new request is received.", - ) - +from llama_cpp.server.model import get_llama, llama_outer_lock, set_settings, get_settings +from llama_cpp.server.model import router as models_router +from llama_cpp.server.model import MultiLlama as Llama +from llama_cpp.server.settings import Settings class ErrorResponse(TypedDict): """OpenAI style error response""" @@ -176,7 +38,6 @@ class ErrorResponse(TypedDict): param: Optional[str] code: Optional[str] - class ErrorResponseFormatters: """Collection of formatters for error responses. @@ -243,7 +104,6 @@ def model_not_found( code="model_not_found", ) - class RouteErrorHandler(APIRoute): """Custom APIRoute that handles application errors and exceptions""" @@ -351,13 +211,8 @@ async def custom_route_handler(request: Request) -> Response: return custom_route_handler - router = APIRouter(route_class=RouteErrorHandler) -settings: Optional[Settings] = None -llama: Optional[llama_cpp.Llama] = None - - def create_app(settings: Optional[Settings] = None): if settings is None: settings = Settings() @@ -378,103 +233,11 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - global llama - - ## - chat_handler = None - if settings.chat_format == "llava-1-5": - assert settings.clip_model_path is not None - chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose) - ## - - llama = llama_cpp.Llama( - model_path=settings.model, - # Model Params - n_gpu_layers=settings.n_gpu_layers, - main_gpu=settings.main_gpu, - tensor_split=settings.tensor_split, - vocab_only=settings.vocab_only, - use_mmap=settings.use_mmap, - use_mlock=settings.use_mlock, - # Context Params - seed=settings.seed, - n_ctx=settings.n_ctx, - n_batch=settings.n_batch, - n_threads=settings.n_threads, - n_threads_batch=settings.n_threads_batch, - rope_scaling_type=settings.rope_scaling_type, - rope_freq_base=settings.rope_freq_base, - rope_freq_scale=settings.rope_freq_scale, - yarn_ext_factor=settings.yarn_ext_factor, - yarn_attn_factor=settings.yarn_attn_factor, - yarn_beta_fast=settings.yarn_beta_fast, - yarn_beta_slow=settings.yarn_beta_slow, - yarn_orig_ctx=settings.yarn_orig_ctx, - mul_mat_q=settings.mul_mat_q, - f16_kv=settings.f16_kv, - logits_all=settings.logits_all, - embedding=settings.embedding, - # Sampling Params - last_n_tokens_size=settings.last_n_tokens_size, - # LoRA Params - lora_base=settings.lora_base, - lora_path=settings.lora_path, - # Backend Params - numa=settings.numa, - # Chat Format Params - chat_format=settings.chat_format, - chat_handler=chat_handler, - # Misc - verbose=settings.verbose, - ) - if settings.cache: - if settings.cache_type == "disk": - if settings.verbose: - print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) - else: - if settings.verbose: - print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) - - cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) - llama.set_cache(cache) - - def 
set_settings(_settings: Settings): - global settings - settings = _settings + app.include_router(models_router) set_settings(settings) return app - -llama_outer_lock = Lock() -llama_inner_lock = Lock() - - -def get_llama(): - # NOTE: This double lock allows the currently streaming llama model to - # check if any other requests are pending in the same thread and cancel - # the stream if so. - llama_outer_lock.acquire() - release_outer_lock = True - try: - llama_inner_lock.acquire() - try: - llama_outer_lock.release() - release_outer_lock = False - yield llama - finally: - llama_inner_lock.release() - finally: - if release_outer_lock: - llama_outer_lock.release() - - -def get_settings(): - yield settings - - async def get_event_publisher( request: Request, inner_send_chan: MemoryObjectSendStream, @@ -676,11 +439,13 @@ def logit_bias_processor( async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + llama: Llama = Depends(get_llama), ) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" + + llama = llama(body.model) exclude = { "n", @@ -728,9 +493,8 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: else: return iterator_or_completion - class CreateEmbeddingRequest(BaseModel): - model: Optional[str] = model_field + model: str = model_field input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] = Field(default=None) @@ -744,15 +508,14 @@ class CreateEmbeddingRequest(BaseModel): } } - @router.post( "/v1/embeddings", ) async def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) + request: CreateEmbeddingRequest, llama: Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.model_dump(exclude={"user"}) + llama(request.model).create_embedding, **request.model_dump(exclude={"user"}) ) @@ -799,7 +562,7 @@ class CreateChatCompletionRequest(BaseModel): ) # ignored or currently unsupported - model: Optional[str] = model_field + model: str = model_field n: Optional[int] = 1 user: Optional[str] = Field(None) @@ -836,7 +599,7 @@ class CreateChatCompletionRequest(BaseModel): async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + llama: Llama = Depends(get_llama), settings: Settings = Depends(get_settings), ) -> llama_cpp.ChatCompletion: exclude = { @@ -846,7 +609,7 @@ async def create_chat_completion( "user", } kwargs = body.model_dump(exclude=exclude) - + llama = llama(body.model) if body.logit_bias is not None: kwargs["logits_processor"] = llama_cpp.LogitsProcessorList( [ @@ -900,6 +663,7 @@ class ModelList(TypedDict): @router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), + llama: Llama = Depends(get_llama), ) -> ModelList: assert llama is not None return { diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py new file mode 100644 index 000000000..237806e37 --- /dev/null +++ b/llama_cpp/server/model.py @@ -0,0 +1,159 @@ +import os +import shutil +import tempfile +from pathlib import Path +from typing import Any, Optional +from threading import Lock +from fastapi import APIRouter, UploadFile, Depends, HTTPException +from fastapi.background import BackgroundTasks +import llama_cpp + +from llama_cpp.server.util import remove_file, models_root_dir +from 
llama_cpp.server.settings import Settings + +class MultiLlama: + _model: Optional[llama_cpp.Llama] = None + _models = {} + + def __init__(self, settings: Settings) -> None: + self._settings = settings + model_root = models_root_dir(settings.model) + for filename in os.listdir(model_root): + if filename.endswith('.gguf'): + self._models[filename.split('.gguf')[0]] = os.path.join(model_root, filename) + + def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: + try: + model_path = self._models[model] + except KeyError: + raise HTTPException(404, f"Model file for {model} NOT found") + + if self._model: + if self._model.model_path == model_path: + return self._model + del self._model + + settings = self._settings + self._model = llama_cpp.Llama( + model_path=model_path, + # Model Params + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, + rope_freq_base=settings.rope_freq_base, + rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, + mul_mat_q=settings.mul_mat_q, + f16_kv=settings.f16_kv, + logits_all=settings.logits_all, + embedding=settings.embedding, + # Sampling Params + last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params + lora_base=settings.lora_base, + lora_path=settings.lora_path, + # Backend Params + numa=settings.numa, + # Chat Format Params + chat_format=settings.chat_format, + clip_model_path=settings.clip_model_path, + # Cache + cache=settings.cache, + cache_type=settings.cache_type, + cache_size=settings.cache_size, + # Misc + verbose=settings.verbose, + **kwargs + ) + return self._model + + def __getitem__(self, model): + return self._models[model] + +LLAMA: Optional[MultiLlama] = None +SETTINGS: Optional[Settings] = None + +def set_settings(settings: Settings): + global SETTINGS + SETTINGS = settings + +def get_settings(): + yield SETTINGS + +def init_llama(): + global LLAMA + LLAMA = MultiLlama(SETTINGS) + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + +def get_llama(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. 
+ llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + if not LLAMA: + init_llama() + llama_outer_lock.release() + release_outer_lock = False + yield LLAMA + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +router = APIRouter( + prefix="/models", + tags=["Model"], + responses={404: {"description": "Not found"}}, +) + +@router.put("/") +async def api_update_model( + file: UploadFile, + background_tasks: BackgroundTasks + # user: User = Depends(RBAC(settings.auth_role)), +): + ext = "".join(Path(file.filename).suffixes) if file.filename else ".gguf" + model_file = tempfile.NamedTemporaryFile(suffix=ext).name + with open(model_file, "wb") as buffer: + shutil.copyfileobj(file.file, buffer) + background_tasks.add_task(remove_file, model_file) + models_dir = os.path.dirname(os.path.abspath(os.environ.get('MODEL', '/'))) + target_path = os.path.join(models_dir, file.filename) + shutil.copy(model_file, target_path) + LLAMA[file.filename] = target_path + return {"model": target_path} + +@router.delete("/{model}") +async def api_delete_model( + model: str, + background_tasks: BackgroundTasks + # user: User = Depends(RBAC(settings.auth_role)), +): + models_dir = models_root_dir() + target_path = os.path.join(models_dir, LLAMA[model]) + if not os.path.exists(target_path): + raise HTTPException(status_code=404, detail=f"Model File NOT Found for {model}") + background_tasks.add_task(remove_file, target_path) + return 'success' From 5ab0010443a65dc3ce84b653a273f15a7b31945e Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:15:08 +0100 Subject: [PATCH 05/44] update settings.py --- llama_cpp/server/app.py | 8 ++++---- llama_cpp/server/model.py | 14 +++++--------- llama_cpp/server/settings.py | 9 +++++++++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0d0c0696d..c769d9f56 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -25,10 +25,10 @@ import numpy as np import numpy.typing as npt -from llama_cpp.server.model import get_llama, llama_outer_lock, set_settings, get_settings +from llama_cpp.server.model import get_llama, llama_outer_lock, MultiLlama as Llama from llama_cpp.server.model import router as models_router -from llama_cpp.server.model import MultiLlama as Llama -from llama_cpp.server.settings import Settings +#from llama_cpp.server.model import MultiLlama as Llama +from llama_cpp.server.settings import Settings, SETTINGS, set_settings, get_settings class ErrorResponse(TypedDict): """OpenAI style error response""" @@ -249,7 +249,7 @@ async def get_event_publisher( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if settings.interrupt_requests and llama_outer_lock.locked(): + if SETTINGS.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 237806e37..d375811e3 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,7 +9,7 @@ import llama_cpp from llama_cpp.server.util import remove_file, models_root_dir -from llama_cpp.server.settings import Settings +from llama_cpp.server.settings import Settings, SETTINGS class MultiLlama: _model: 
Optional[llama_cpp.Llama] = None @@ -26,6 +26,7 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: model_path = self._models[model] except KeyError: + # TODO server raises 500 ? raise HTTPException(404, f"Model file for {model} NOT found") if self._model: @@ -84,15 +85,10 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: def __getitem__(self, model): return self._models[model] + def __setitem__(self, model, path): + self._models[model] = path + LLAMA: Optional[MultiLlama] = None -SETTINGS: Optional[Settings] = None - -def set_settings(settings: Settings): - global SETTINGS - SETTINGS = settings - -def get_settings(): - yield SETTINGS def init_llama(): global LLAMA diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 46f4c9922..c7f696976 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -140,3 +140,12 @@ class Settings(BaseSettings): default=True, description="Whether to interrupt requests when a new request is received.", ) + +SETTINGS: Optional[Settings] = None + +def set_settings(settings: Settings): + global SETTINGS + SETTINGS = settings + +def get_settings(): + yield SETTINGS From 45bfa0750d3fc8c8e70f35366cf53d8cb16fb137 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:38:06 +0100 Subject: [PATCH 06/44] cleanup --- llama_cpp/server/app.py | 7 ++--- llama_cpp/server/model.py | 54 +++++---------------------------------- 2 files changed, 8 insertions(+), 53 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c769d9f56..41b5508e0 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -26,9 +26,7 @@ import numpy.typing as npt from llama_cpp.server.model import get_llama, llama_outer_lock, MultiLlama as Llama -from llama_cpp.server.model import router as models_router -#from llama_cpp.server.model import MultiLlama as Llama -from llama_cpp.server.settings import Settings, SETTINGS, set_settings, get_settings +from llama_cpp.server.settings import Settings, set_settings, get_settings class ErrorResponse(TypedDict): """OpenAI style error response""" @@ -233,7 +231,6 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - app.include_router(models_router) set_settings(settings) return app @@ -249,7 +246,7 @@ async def get_event_publisher( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if SETTINGS.interrupt_requests and llama_outer_lock.locked(): + if next(get_settings()).interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index d375811e3..268776218 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,15 +1,10 @@ import os -import shutil -import tempfile -from pathlib import Path from typing import Any, Optional from threading import Lock -from fastapi import APIRouter, UploadFile, Depends, HTTPException -from fastapi.background import BackgroundTasks import llama_cpp -from llama_cpp.server.util import remove_file, models_root_dir -from llama_cpp.server.settings import Settings, SETTINGS +from llama_cpp.server.util import models_root_dir +from llama_cpp.server.settings import Settings, get_settings class MultiLlama: _model: 
Optional[llama_cpp.Llama] = None @@ -27,7 +22,7 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: model_path = self._models[model] except KeyError: # TODO server raises 500 ? - raise HTTPException(404, f"Model file for {model} NOT found") + raise Exception(404, f"Model file for {model} NOT found") if self._model: if self._model.model_path == model_path: @@ -90,9 +85,9 @@ def __setitem__(self, model, path): LLAMA: Optional[MultiLlama] = None -def init_llama(): +def _set_llama(settings: Optional[Settings] = None): global LLAMA - LLAMA = MultiLlama(SETTINGS) + LLAMA = MultiLlama(settings or next(get_settings())) llama_outer_lock = Lock() llama_inner_lock = Lock() @@ -107,7 +102,7 @@ def get_llama(): llama_inner_lock.acquire() try: if not LLAMA: - init_llama() + _set_llama() llama_outer_lock.release() release_outer_lock = False yield LLAMA @@ -116,40 +111,3 @@ def get_llama(): finally: if release_outer_lock: llama_outer_lock.release() - - -router = APIRouter( - prefix="/models", - tags=["Model"], - responses={404: {"description": "Not found"}}, -) - -@router.put("/") -async def api_update_model( - file: UploadFile, - background_tasks: BackgroundTasks - # user: User = Depends(RBAC(settings.auth_role)), -): - ext = "".join(Path(file.filename).suffixes) if file.filename else ".gguf" - model_file = tempfile.NamedTemporaryFile(suffix=ext).name - with open(model_file, "wb") as buffer: - shutil.copyfileobj(file.file, buffer) - background_tasks.add_task(remove_file, model_file) - models_dir = os.path.dirname(os.path.abspath(os.environ.get('MODEL', '/'))) - target_path = os.path.join(models_dir, file.filename) - shutil.copy(model_file, target_path) - LLAMA[file.filename] = target_path - return {"model": target_path} - -@router.delete("/{model}") -async def api_delete_model( - model: str, - background_tasks: BackgroundTasks - # user: User = Depends(RBAC(settings.auth_role)), -): - models_dir = models_root_dir() - target_path = os.path.join(models_dir, LLAMA[model]) - if not os.path.exists(target_path): - raise HTTPException(status_code=404, detail=f"Model File NOT Found for {model}") - background_tasks.add_task(remove_file, target_path) - return 'success' From 76c0168a454825c5bf124e94247482a74bbc76e4 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:43:37 +0100 Subject: [PATCH 07/44] delete util.py --- llama_cpp/server/model.py | 7 +++++-- llama_cpp/server/util.py | 14 -------------- 2 files changed, 5 insertions(+), 16 deletions(-) delete mode 100644 llama_cpp/server/util.py diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 268776218..ff715e124 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -2,10 +2,13 @@ from typing import Any, Optional from threading import Lock import llama_cpp - -from llama_cpp.server.util import models_root_dir from llama_cpp.server.settings import Settings, get_settings +def models_root_dir(path = None): + path = os.path.abspath(path or os.environ.get('MODEL', '/models')) + if os.path.isdir(path): return path + return os.path.dirname(path) + class MultiLlama: _model: Optional[llama_cpp.Llama] = None _models = {} diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py deleted file mode 100644 index e3702794f..000000000 --- a/llama_cpp/server/util.py +++ /dev/null @@ -1,14 +0,0 @@ -import os -import shutil - -def remove_file(path: str) -> None: - if path and os.path.exists(path): - if os.path.isdir(path): - shutil.rmtree(path) - else: - 
os.unlink(path) - -def models_root_dir(path = None): - path = os.path.abspath(path or os.environ.get('MODEL', '/models')) - if os.path.isdir(path): return path - return os.path.dirname(path) From 97a6a218b320e010d2ac3b780a6ad2eba3d55472 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:07:33 +0100 Subject: [PATCH 08/44] Fix /v1/models endpoint --- llama_cpp/server/app.py | 7 ++----- llama_cpp/server/model.py | 10 +++++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 41b5508e0..c6632c370 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -655,17 +655,14 @@ async def get_models( settings: Settings = Depends(get_settings), llama: Llama = Depends(get_llama), ) -> ModelList: - assert llama is not None return { "object": "list", "data": [ { - "id": settings.model_alias - if settings.model_alias is not None - else llama.model_path, + "id": model, "object": "model", "owned_by": "me", "permissions": [], - } + } for model in llama._models ], } diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ff715e124..b3ecf6968 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -4,8 +4,12 @@ import llama_cpp from llama_cpp.server.settings import Settings, get_settings +FILE_EXT = ".gguf" +MODEL_ENV_ARG = "MODEL" +DEFAULT_MODEL_DIR = "/models" + def models_root_dir(path = None): - path = os.path.abspath(path or os.environ.get('MODEL', '/models')) + path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) if os.path.isdir(path): return path return os.path.dirname(path) @@ -17,8 +21,8 @@ def __init__(self, settings: Settings) -> None: self._settings = settings model_root = models_root_dir(settings.model) for filename in os.listdir(model_root): - if filename.endswith('.gguf'): - self._models[filename.split('.gguf')[0]] = os.path.join(model_root, filename) + if filename.endswith(FILE_EXT): + self._models[filename.split(FILE_EXT)[0]] = os.path.join(model_root, filename) def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: From fb2a1e782bf1ec20ce56b38882256051c67e3642 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:21:50 +0100 Subject: [PATCH 09/44] MultiLlama now iterable, app check-alive on "/" --- llama_cpp/server/app.py | 7 +++++-- llama_cpp/server/model.py | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c6632c370..66235ec6b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -232,6 +232,10 @@ def create_app(settings: Optional[Settings] = None): ) app.include_router(router) + @app.get('/') + async def root(): + return "pong" + set_settings(settings) return app @@ -652,7 +656,6 @@ class ModelList(TypedDict): @router.get("/v1/models") async def get_models( - settings: Settings = Depends(get_settings), llama: Llama = Depends(get_llama), ) -> ModelList: return { @@ -663,6 +666,6 @@ async def get_models( "object": "model", "owned_by": "me", "permissions": [], - } for model in llama._models + } for model in llama ], } diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index b3ecf6968..a957a7159 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -90,6 +90,10 @@ def __getitem__(self, model): def __setitem__(self, model, path): self._models[model] = path + def __iter__(self): + for model in self._models: 
+ yield model + LLAMA: Optional[MultiLlama] = None def _set_llama(settings: Optional[Settings] = None): From 3f150ac93f08cfa9fa2882d7f3da319dba436046 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 17:54:26 +0100 Subject: [PATCH 10/44] instant model init if file is given --- llama_cpp/server/app.py | 1 + llama_cpp/server/model.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 66235ec6b..95865822b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -237,6 +237,7 @@ async def root(): return "pong" set_settings(settings) + next(get_llama()) return app async def get_event_publisher( diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index a957a7159..46c2c1e1b 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -23,6 +23,8 @@ def __init__(self, settings: Settings) -> None: for filename in os.listdir(model_root): if filename.endswith(FILE_EXT): self._models[filename.split(FILE_EXT)[0]] = os.path.join(model_root, filename) + if os.path.isfile(settings.model): + self(settings.model.split(os.path.sep)[-1].split(FILE_EXT)[0]) def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: From e71946ce26d55c62940c40327965e0f28549d74e Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:16:14 +0100 Subject: [PATCH 11/44] backward compability --- llama_cpp/server/model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 46c2c1e1b..065c4e97d 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,6 +1,7 @@ import os from typing import Any, Optional from threading import Lock +import logging import llama_cpp from llama_cpp.server.settings import Settings, get_settings @@ -8,6 +9,8 @@ MODEL_ENV_ARG = "MODEL" DEFAULT_MODEL_DIR = "/models" +logger = logging.getLogger("uvicorn") + def models_root_dir(path = None): path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) if os.path.isdir(path): return path @@ -30,8 +33,11 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: model_path = self._models[model] except KeyError: - # TODO server raises 500 ? - raise Exception(404, f"Model file for {model} NOT found") + if self._model: + if self._settings.verbose: logger.info(f"Model file for {model} NOT found! 
Using preloaded") + return self._model + else: raise Exception(404, f"Model file for {model} NOT found") + if self._model: if self._model.model_path == model_path: From 55a9767067029c5219aa8681ca4a871ba8850b53 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:30:59 +0100 Subject: [PATCH 12/44] revert model param mandatory --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 95865822b..e7238e180 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -486,7 +486,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion class CreateEmbeddingRequest(BaseModel): - model: str = model_field + model: [str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] = Field(default=None) @@ -558,7 +558,7 @@ class CreateChatCompletionRequest(BaseModel): ) # ignored or currently unsupported - model: str = model_field + model: [str] = model_field n: Optional[int] = 1 user: Optional[str] = Field(None) From 3c4b526041958e25e1dcc095491d9693aa285fdf Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 18:19:11 +0100 Subject: [PATCH 13/44] fix error --- llama_cpp/llama.py | 2 ++ llama_cpp/server/app.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 097eacfc9..0209cacb0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -924,6 +924,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + # TODO move out of class chat_handler = None if chat_format == "llava-1-5": assert clip_model_path is not None, "clip model not found" @@ -931,6 +932,7 @@ def __init__( clip_model_path=clip_model_path, verbose=verbose ) + ## self.chat_format = chat_format self.chat_handler = chat_handler diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e7238e180..b21c28354 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -486,7 +486,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion class CreateEmbeddingRequest(BaseModel): - model: [str] = model_field + model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] = Field(default=None) @@ -558,7 +558,7 @@ class CreateChatCompletionRequest(BaseModel): ) # ignored or currently unsupported - model: [str] = model_field + model: Optional[str] = model_field n: Optional[int] = 1 user: Optional[str] = Field(None) From 10a2d32655722b2a2baafd71cde5d07ea2fea240 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 19:21:18 +0100 Subject: [PATCH 14/44] handle individual model config json --- llama_cpp/server/model.py | 23 +++++++++++++++-------- llama_cpp/server/settings.py | 23 ++++++++++++++--------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 065c4e97d..7efa35d18 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -3,7 +3,7 @@ from threading import Lock import logging import llama_cpp -from llama_cpp.server.settings import Settings, get_settings +from llama_cpp.server.settings import Settings, ModelSettings, get_settings FILE_EXT = ".gguf" MODEL_ENV_ARG = 
"MODEL" @@ -29,22 +29,30 @@ def __init__(self, settings: Settings) -> None: if os.path.isfile(settings.model): self(settings.model.split(os.path.sep)[-1].split(FILE_EXT)[0]) - def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: + def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: + # handle backward compatibility, model param optional try: model_path = self._models[model] except KeyError: if self._model: - if self._settings.verbose: logger.info(f"Model file for {model} NOT found! Using preloaded") + if self._settings.verbose: logger.warn(f"Model file for {model} NOT found! Using preloaded") return self._model else: raise Exception(404, f"Model file for {model} NOT found") - if self._model: if self._model.model_path == model_path: return self._model del self._model - settings = self._settings + settings_path = os.path.join(os.path.dirname(model_path), + model_path.split(os.path.sep)[-1].split(FILE_EXT)[0] + ".json") + try: + with open(settings_path, 'rb') as f: + settings = ModelSettings.model_validate_json(f.read()) + except Exception as e: + if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! Using default") + settings = self._settings + self._model = llama_cpp.Llama( model_path=model_path, # Model Params @@ -88,14 +96,13 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: cache_size=settings.cache_size, # Misc verbose=settings.verbose, - **kwargs ) return self._model - def __getitem__(self, model): + def __getitem__(self, model: str) -> str: return self._models[model] - def __setitem__(self, model, path): + def __setitem__(self, model: str, path: str): self._models[model] = path def __iter__(self): diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index c7f696976..ad5c7ed77 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,20 +1,13 @@ import multiprocessing from typing import Optional, List, Literal from pydantic import Field -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, SettingsConfigDict import llama_cpp # Disable warning for model and model_alias settings BaseSettings.model_config['protected_namespaces'] = () -class Settings(BaseSettings): - model: str = Field( - description="The path to the model to use for generating completions." - ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", - ) +class ModelSettings(BaseSettings): # Model Params n_gpu_layers: int = Field( default=0, @@ -133,6 +126,9 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) + +class ServerSettings(BaseSettings): + model_config = SettingsConfigDict(env_file='.env') # Server Params host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") @@ -141,6 +137,15 @@ class Settings(BaseSettings): description="Whether to interrupt requests when a new request is received.", ) +class Settings(ModelSettings, ServerSettings): + model: str = Field( + description="The path to the model to use for generating completions." 
+ ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + SETTINGS: Optional[Settings] = None def set_settings(settings: Settings): From ee71f2088c29fb812f5b9972c701806aa53920a9 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 20:18:09 +0100 Subject: [PATCH 15/44] refactor --- llama_cpp/server/model.py | 46 +++------------------------------------ 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 7efa35d18..f76fac3c2 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -18,7 +18,7 @@ def models_root_dir(path = None): class MultiLlama: _model: Optional[llama_cpp.Llama] = None - _models = {} + _models: dict[str,str] = {} def __init__(self, settings: Settings) -> None: self._settings = settings @@ -49,53 +49,13 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: try: with open(settings_path, 'rb') as f: settings = ModelSettings.model_validate_json(f.read()) - except Exception as e: + except: if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! Using default") settings = self._settings self._model = llama_cpp.Llama( model_path=model_path, - # Model Params - n_gpu_layers=settings.n_gpu_layers, - main_gpu=settings.main_gpu, - tensor_split=settings.tensor_split, - vocab_only=settings.vocab_only, - use_mmap=settings.use_mmap, - use_mlock=settings.use_mlock, - # Context Params - seed=settings.seed, - n_ctx=settings.n_ctx, - n_batch=settings.n_batch, - n_threads=settings.n_threads, - n_threads_batch=settings.n_threads_batch, - rope_scaling_type=settings.rope_scaling_type, - rope_freq_base=settings.rope_freq_base, - rope_freq_scale=settings.rope_freq_scale, - yarn_ext_factor=settings.yarn_ext_factor, - yarn_attn_factor=settings.yarn_attn_factor, - yarn_beta_fast=settings.yarn_beta_fast, - yarn_beta_slow=settings.yarn_beta_slow, - yarn_orig_ctx=settings.yarn_orig_ctx, - mul_mat_q=settings.mul_mat_q, - f16_kv=settings.f16_kv, - logits_all=settings.logits_all, - embedding=settings.embedding, - # Sampling Params - last_n_tokens_size=settings.last_n_tokens_size, - # LoRA Params - lora_base=settings.lora_base, - lora_path=settings.lora_path, - # Backend Params - numa=settings.numa, - # Chat Format Params - chat_format=settings.chat_format, - clip_model_path=settings.clip_model_path, - # Cache - cache=settings.cache, - cache_type=settings.cache_type, - cache_size=settings.cache_size, - # Misc - verbose=settings.verbose, + **(settings.model_dump(exclude={"model",})) ) return self._model From ea0fcca8b0a939169a33a4ebf63819a6e441a9e3 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 20:47:57 +0100 Subject: [PATCH 16/44] revert chathandler/clip_model changes --- llama_cpp/llama.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0209cacb0..ce7053ab0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -758,7 +758,7 @@ def __init__( numa: bool = False, # Chat Format Params chat_format: str = "llama-2", - clip_model_path: Optional[str] = None, # only for multimodal, when chat_format=llava-1-5 + chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Cache cache: bool = False, cache_type: str = "ram", @@ -795,7 +795,7 @@ def __init__( lora_path: Path to a LoRA file to apply to the model. 
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. - clip_model_path: Optional clip model path to use when using multimodal mode, expected when chat_format=llava-1-5. + chat_handler: Optional chat handler to use when calling create_chat_completion. cache: Optional if true enables caching. cache_type: String can be "ram" or "disk". cache_size: Number of bytes to cache, defaults to 2GB @@ -923,16 +923,6 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - - # TODO move out of class - chat_handler = None - if chat_format == "llava-1-5": - assert clip_model_path is not None, "clip model not found" - chat_handler = llama_chat_format.Llava15ChatHandler( - clip_model_path=clip_model_path, - verbose=verbose - ) - ## self.chat_format = chat_format self.chat_handler = chat_handler From 6f5e60a896198a885f707266dbe7048170441fbd Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 21:10:26 +0100 Subject: [PATCH 17/44] handle chat_handler in MulitLlama() --- llama_cpp/server/model.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index f76fac3c2..cb24e1ef2 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -53,9 +53,18 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! Using default") settings = self._settings + chat_handler = None + if settings.chat_format == "llava-1-5": + assert settings.clip_model_path is not None, "clip model not found" + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( + clip_model_path=settings.clip_model_path, + verbose=settings.verbose + ) + self._model = llama_cpp.Llama( model_path=model_path, - **(settings.model_dump(exclude={"model",})) + chat_handler=chat_handler, + **(settings.model_dump(exclude={"model","clip_model_path",})) ) return self._model From d9d696d0001a27b307cc8d06698a6ed8e7bb26d2 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:31:30 +0100 Subject: [PATCH 18/44] split settings into server/llama --- llama_cpp/server/__main__.py | 32 +++++++++++++++++++++++++------- llama_cpp/server/settings.py | 25 +++++++++++++++---------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index e2da83478..f109b4682 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -22,13 +22,14 @@ """ import os +import sys import argparse from typing import List, Literal, Union import uvicorn from llama_cpp.server.app import create_app -from llama_cpp.server.settings import Settings +from llama_cpp.server.settings import Settings, ServerSettings, set_settings def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: @@ -68,9 +69,9 @@ def parse_bool_arg(arg): else: raise ValueError(f'Invalid boolean argument: {arg}') -if __name__ == "__main__": +def create_parser(settings_dict): parser = argparse.ArgumentParser() - for name, field in Settings.model_fields.items(): + for name, field in settings_dict.items(): description = field.description if field.default is not None and description is not None: 
description += f" (default: {field.default})" @@ -91,11 +92,28 @@ def parse_bool_arg(arg): type=parse_bool_arg, help=f"{description}", ) + return parser - args = parser.parse_args() - settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) - app = create_app(settings=settings) +if __name__ == "__main__": + server_arg_parser = create_parser(ServerSettings.model_fields) + parser = create_parser(Settings.model_fields) + + try: + server_args, _ = server_arg_parser.parse_known_args() + server_settings = ServerSettings(**{k: v for k, v in vars(server_args).items() if v is not None}) + set_settings(server_settings) + if server_settings.config and os.path.exists(server_settings.config): + with open(server_settings.config, 'rb') as f: + llama_settings = Settings.model_validate_json(f.read()) + else: + args, _ = parser.parse_known_args() + llama_settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) + app = create_app(settings=llama_settings) + except Exception as e: + print(e, file=sys.stderr) + parser.print_help() + sys.exit(1) uvicorn.run( - app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) + app, host=server_settings.host, port=server_settings.port ) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ad5c7ed77..98215a891 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,3 +1,4 @@ +import os import multiprocessing from typing import Optional, List, Literal from pydantic import Field @@ -8,6 +9,13 @@ BaseSettings.model_config['protected_namespaces'] = () class ModelSettings(BaseSettings): + model: str = Field( + description="The path to the model to use for generating completions." + ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) # Model Params n_gpu_layers: int = Field( default=0, @@ -129,26 +137,23 @@ class ModelSettings(BaseSettings): class ServerSettings(BaseSettings): model_config = SettingsConfigDict(env_file='.env') - # Server Params host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", ) + config: Optional[str] = Field(default=None, description="Path to config file") -class Settings(ModelSettings, ServerSettings): - model: str = Field( - description="The path to the model to use for generating completions." 
- ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", +class Settings(ModelSettings): + models: Optional[List[ModelSettings]] = Field( + default = [], + description="Model configs, overwrites default config" ) -SETTINGS: Optional[Settings] = None +SETTINGS: Optional[ServerSettings] = None -def set_settings(settings: Settings): +def set_settings(settings: ServerSettings): global SETTINGS SETTINGS = settings From e71fc92e962c89127139774f82489b2a5904bf0a Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:31:54 +0100 Subject: [PATCH 19/44] reduce global vars --- llama_cpp/server/app.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b21c28354..abf1cb043 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -25,7 +25,7 @@ import numpy as np import numpy.typing as npt -from llama_cpp.server.model import get_llama, llama_outer_lock, MultiLlama as Llama +from llama_cpp.server.model import get_llama, set_llama, llama_outer_lock, LlamaProxy as Llama from llama_cpp.server.settings import Settings, set_settings, get_settings class ErrorResponse(TypedDict): @@ -211,10 +211,7 @@ async def custom_route_handler(request: Request) -> Response: router = APIRouter(route_class=RouteErrorHandler) -def create_app(settings: Optional[Settings] = None): - if settings is None: - settings = Settings() - +def create_app(settings: Settings): middleware = [ Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) ] @@ -236,8 +233,7 @@ def create_app(settings: Optional[Settings] = None): async def root(): return "pong" - set_settings(settings) - next(get_llama()) + set_llama(settings) return app async def get_event_publisher( @@ -596,7 +592,7 @@ async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: Llama = Depends(get_llama), - settings: Settings = Depends(get_settings), + #settings: Settings = Depends(get_settings), ) -> llama_cpp.ChatCompletion: exclude = { "n", From 522f0bd5a7426a058dbd643892a04b6a5500b087 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:32:53 +0100 Subject: [PATCH 20/44] Update LlamaProxy to handle config files --- llama_cpp/server/model.py | 84 ++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index cb24e1ef2..2ef63e119 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,57 +1,68 @@ import os -from typing import Any, Optional -from threading import Lock import logging +from typing import Optional, Union +from threading import Lock import llama_cpp -from llama_cpp.server.settings import Settings, ModelSettings, get_settings +from llama_cpp.server.settings import Settings, ModelSettings FILE_EXT = ".gguf" MODEL_ENV_ARG = "MODEL" -DEFAULT_MODEL_DIR = "/models" +DEFAULT_MODEL_DIR = os.path.join(os.getcwd(), "/models") logger = logging.getLogger("uvicorn") -def models_root_dir(path = None): +def models_root_dir(path: Optional[str] = None): path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) if os.path.isdir(path): return path return os.path.dirname(path) -class MultiLlama: +def model_alias(path: str) -> str: + return path.split(os.path.sep)[-1].split(FILE_EXT)[0] + +class LlamaProxy: _model: 
Optional[llama_cpp.Llama] = None - _models: dict[str,str] = {} + _models: dict[str,ModelSettings] = {} def __init__(self, settings: Settings) -> None: self._settings = settings + for model in settings.models: + if not model.model_alias: + model.model_alias = model_alias(model.model) + self._models[model.model_alias] = model + model_root = models_root_dir(settings.model) for filename in os.listdir(model_root): if filename.endswith(FILE_EXT): - self._models[filename.split(FILE_EXT)[0]] = os.path.join(model_root, filename) + alias = model_alias(filename) + if alias in self._models: continue + exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config'} + default_settings = settings.model_dump(exclude=exclude) + self._models[alias] = ModelSettings(model=os.path.join(model_root, filename), + model_alias=alias, **default_settings) + if os.path.isfile(settings.model): - self(settings.model.split(os.path.sep)[-1].split(FILE_EXT)[0]) + alias = settings.model_alias + if alias is None: alias = model_alias(settings.model) + if alias not in self._models: + self._models[alias] = settings + self(alias) def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: # handle backward compatibility, model param optional try: - model_path = self._models[model] + model_alias = self._models[model].model_alias except KeyError: if self._model: - if self._settings.verbose: logger.warn(f"Model file for {model} NOT found! Using preloaded") + if self._settings.verbose: logger.warn(f"Model {model} NOT found! Using {self._model.alias}") return self._model - else: raise Exception(404, f"Model file for {model} NOT found") + else: raise Exception(404, f"Model {model} NOT found!") if self._model: - if self._model.model_path == model_path: + if self._model.alias == model_alias: return self._model del self._model - settings_path = os.path.join(os.path.dirname(model_path), - model_path.split(os.path.sep)[-1].split(FILE_EXT)[0] + ".json") - try: - with open(settings_path, 'rb') as f: - settings = ModelSettings.model_validate_json(f.read()) - except: - if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! 
Using default") - settings = self._settings + settings = self._models[model] chat_handler = None if settings.chat_format == "llava-1-5": @@ -62,31 +73,34 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: ) self._model = llama_cpp.Llama( - model_path=model_path, - chat_handler=chat_handler, - **(settings.model_dump(exclude={"model","clip_model_path",})) + model_path=settings.model, + **(settings.model_dump(exclude={'model', 'models'})), + chat_handler=chat_handler ) + self._model.alias = model return self._model - def __getitem__(self, model: str) -> str: - return self._models[model] - - def __setitem__(self, model: str, path: str): - self._models[model] = path + def __getitem__(self, model: str): + return self._models[model].model_dump() + + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + if isinstance(settings, bytes) or isinstance(settings, str): + settings = ModelSettings.model_validate_json(settings) + self._models[model] = settings def __iter__(self): for model in self._models: yield model -LLAMA: Optional[MultiLlama] = None - -def _set_llama(settings: Optional[Settings] = None): - global LLAMA - LLAMA = MultiLlama(settings or next(get_settings())) +LLAMA: Optional[LlamaProxy] = None llama_outer_lock = Lock() llama_inner_lock = Lock() +def set_llama(settings: Settings): + global LLAMA + LLAMA = LlamaProxy(settings) + def get_llama(): # NOTE: This double lock allows the currently streaming llama model to # check if any other requests are pending in the same thread and cancel @@ -96,8 +110,6 @@ def get_llama(): try: llama_inner_lock.acquire() try: - if not LLAMA: - _set_llama() llama_outer_lock.release() release_outer_lock = False yield LLAMA From 6e0ab3e18bbcaba8056a91d96281335a3d0745cc Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Fri, 24 Nov 2023 03:00:55 +0100 Subject: [PATCH 21/44] Add free method to LlamaProxy --- llama_cpp/server/model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 2ef63e119..12c84c94b 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -84,13 +84,16 @@ def __getitem__(self, model: str): return self._models[model].model_dump() def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): - if isinstance(settings, bytes) or isinstance(settings, str): + if isinstance(settings, (bytes, str)): settings = ModelSettings.model_validate_json(settings) self._models[model] = settings def __iter__(self): for model in self._models: yield model + + def free(self): + if self._model: del self._model LLAMA: Optional[LlamaProxy] = None From ec9a9db6a21bea1605239acd3e33ec2cc6e7c6f7 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:41:15 +0100 Subject: [PATCH 22/44] update arg parsers & install server alias --- llama_cpp/server/__main__.py | 24 ++++++++++++------------ llama_cpp/server/settings.py | 2 +- pyproject.toml | 3 +++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index f109b4682..22c2260f3 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,6 +31,8 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import Settings, ServerSettings, set_settings +EXE_NAME = 'llama_server' + def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: return 
type(annotation.__args__[0]) @@ -69,11 +71,12 @@ def parse_bool_arg(arg): else: raise ValueError(f'Invalid boolean argument: {arg}') -def create_parser(settings_dict): - parser = argparse.ArgumentParser() - for name, field in settings_dict.items(): +def main(): + description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" + parser = argparse.ArgumentParser(EXE_NAME, description=description) + for name, field in (ServerSettings.model_fields|Settings.model_fields).items(): description = field.description - if field.default is not None and description is not None: + if field.default and description and not field.is_required(): description += f" (default: {field.default})" base_type = get_base_type(field.annotation) if field.annotation is not None else str list_type = contains_list_type(field.annotation) @@ -92,21 +95,15 @@ def create_parser(settings_dict): type=parse_bool_arg, help=f"{description}", ) - return parser - -if __name__ == "__main__": - server_arg_parser = create_parser(ServerSettings.model_fields) - parser = create_parser(Settings.model_fields) try: - server_args, _ = server_arg_parser.parse_known_args() - server_settings = ServerSettings(**{k: v for k, v in vars(server_args).items() if v is not None}) + args = parser.parse_args() + server_settings = ServerSettings(**{k: v for k, v in vars(args).items() if v is not None}) set_settings(server_settings) if server_settings.config and os.path.exists(server_settings.config): with open(server_settings.config, 'rb') as f: llama_settings = Settings.model_validate_json(f.read()) else: - args, _ = parser.parse_known_args() llama_settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) app = create_app(settings=llama_settings) except Exception as e: @@ -117,3 +114,6 @@ def create_parser(settings_dict): uvicorn.run( app, host=server_settings.host, port=server_settings.port ) + +if __name__ == "__main__": + main() diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 98215a891..ecd034d2e 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -136,7 +136,7 @@ class ModelSettings(BaseSettings): ) class ServerSettings(BaseSettings): - model_config = SettingsConfigDict(env_file='.env') + model_config = SettingsConfigDict(env_file='.env', extra='ignore') host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( diff --git a/pyproject.toml b/pyproject.toml index 6c1022581..ffb9f2637 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,9 @@ sdist.include = [".git", "vendor/llama.cpp/.git"] provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" +[project.scripts] +llama_server = "llama_cpp.server.__main__:main" + [project.urls] Homepage = "https://github.com/abetlen/llama-cpp-python" Issues = "https://github.com/abetlen/llama-cpp-python/issues" From fd731d7512e4191a137e7b1aa0ae669f5f28a044 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:35:46 +0100 Subject: [PATCH 23/44] refactor cache settings --- llama_cpp/llama.py | 18 ------------------ llama_cpp/server/model.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c1bb9d95e..cc7f562b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -759,10 +759,6 @@ def __init__( # Chat Format Params chat_format: str = "llama-2", chat_handler: 
Optional[llama_chat_format.LlamaChatCompletionHandler] = None, - # Cache - cache: bool = False, - cache_type: str = "ram", - cache_size: int = 2 << 30, # Misc verbose: bool = True, # Extra Params @@ -820,9 +816,6 @@ def __init__( numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. - cache: Optional if true enables caching. - cache_type: String can be "ram" or "disk". - cache_size: Number of bytes to cache, defaults to 2GB verbose: Print verbose output to stderr. Raises: @@ -965,17 +958,6 @@ def __init__( (n_ctx, self._n_vocab), dtype=np.single ) - if cache: - if cache_type == "disk": - if verbose: - print(f"Using disk cache with size {cache_size}") - cache = LlamaDiskCache(capacity_bytes=cache_size) - else: - if verbose: - print(f"Using ram cache with size {cache_size}") - cache = LlamaRAMCache(capacity_bytes=cache_size) - self.set_cache(cache) - @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 12c84c94b..ec9bd9416 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -78,6 +78,16 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: chat_handler=chat_handler ) self._model.alias = model + if settings.cache: + if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") + cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") + cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + self._model.set_cache(cache) return self._model def __getitem__(self, model: str): From 288fa85d66b74123ca5543ed0801cbddde031ba2 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:36:43 +0100 Subject: [PATCH 24/44] change server executable name --- llama_cpp/server/__main__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 22c2260f3..566112689 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,7 +31,7 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import Settings, ServerSettings, set_settings -EXE_NAME = 'llama_server' +EXE_NAME = 'llama_cpp.server' def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: diff --git a/pyproject.toml b/pyproject.toml index ffb9f2637..4662af173 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" [project.scripts] -llama_server = "llama_cpp.server.__main__:main" +llama_cpp.server = "llama_cpp.server.__main__:main" [project.urls] Homepage = "https://github.com/abetlen/llama-cpp-python" From b64742b0f3ffd5f8463eaf624cdf1395c573e66b Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:46:07 +0100 Subject: [PATCH 25/44] better var name --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 
abf1cb043..ddc23101c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -659,10 +659,10 @@ async def get_models( "object": "list", "data": [ { - "id": model, + "id": model_alias, "object": "model", "owned_by": "me", "permissions": [], - } for model in llama + } for model_alias in llama ], } From bc5cf51c64a95bfc9926e1bc58166059711a1cd8 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:48:25 +0100 Subject: [PATCH 26/44] whitespace --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cc7f562b5..df08c162b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -940,7 +940,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - + self.chat_format = chat_format self.chat_handler = chat_handler From 5fd9892ae4470d4464cc216896ccf952f2922896 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:34:04 +0100 Subject: [PATCH 27/44] Revert "whitespace" This reverts commit bc5cf51c64a95bfc9926e1bc58166059711a1cd8. --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 807654889..1230b6100 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -950,7 +950,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - + self.chat_format = chat_format self.chat_handler = chat_handler From 7b1c17b4fdedb20759fcbbb9aa27e1047cea5b4b Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Fri, 1 Dec 2023 14:03:01 +0100 Subject: [PATCH 28/44] remove exe_name --- llama_cpp/server/__main__.py | 4 +--- pyproject.toml | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 566112689..55ffc5f48 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,8 +31,6 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import Settings, ServerSettings, set_settings -EXE_NAME = 'llama_cpp.server' - def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: return type(annotation.__args__[0]) @@ -73,7 +71,7 @@ def parse_bool_arg(arg): def main(): description = "🦙 Llama.cpp python server. 
Host your own LLMs!🚀" - parser = argparse.ArgumentParser(EXE_NAME, description=description) + parser = argparse.ArgumentParser(description=description) for name, field in (ServerSettings.model_fields|Settings.model_fields).items(): description = field.description if field.default and description and not field.is_required(): diff --git a/pyproject.toml b/pyproject.toml index 4662af173..6c1022581 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,9 +63,6 @@ sdist.include = [".git", "vendor/llama.cpp/.git"] provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" -[project.scripts] -llama_cpp.server = "llama_cpp.server.__main__:main" - [project.urls] Homepage = "https://github.com/abetlen/llama-cpp-python" Issues = "https://github.com/abetlen/llama-cpp-python/issues" From ba36629b5a52302a2439c3b2e0248b8118601370 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:12:53 -0500 Subject: [PATCH 29/44] Fix merge bugs --- llama_cpp/server/model.py | 40 +++++++++++++++++++++-- llama_cpp/server/settings.py | 62 +++++++++++++++++++----------------- 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ec9bd9416..18816281a 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -74,10 +74,44 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: self._model = llama_cpp.Llama( model_path=settings.model, - **(settings.model_dump(exclude={'model', 'models'})), - chat_handler=chat_handler + # Model Params + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, + rope_freq_base=settings.rope_freq_base, + rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, + mul_mat_q=settings.mul_mat_q, + logits_all=settings.logits_all, + embedding=settings.embedding, + offload_kqv=settings.offload_kqv, + # Sampling Params + last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params + lora_base=settings.lora_base, + lora_path=settings.lora_path, + # Backend Params + numa=settings.numa, + # Chat Format Params + chat_format=settings.chat_format, + chat_handler=chat_handler, + # Misc + verbose=settings.verbose, ) - self._model.alias = model if settings.cache: if settings.cache_type == "disk": if settings.verbose: diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ecd034d2e..b604515f2 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,4 +1,3 @@ -import os import multiprocessing from typing import Optional, List, Literal from pydantic import Field @@ -6,7 +5,8 @@ import llama_cpp # Disable warning for model and model_alias settings -BaseSettings.model_config['protected_namespaces'] = () +BaseSettings.model_config["protected_namespaces"] = () + class ModelSettings(BaseSettings): model: str = Field( @@ -43,7 +43,9 @@ class ModelSettings(BaseSettings): description="Use mlock.", ) # Context Params - seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, 
description="Random seed. -1 for random.") + seed: int = Field( + default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." @@ -58,36 +60,24 @@ class ModelSettings(BaseSettings): ge=0, description="The number of threads to use when batch processing.", ) - rope_scaling_type: int = Field( - default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED - ) - rope_freq_base: float = Field( - default=0.0, description="RoPE base frequency" - ) + rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED) + rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") rope_freq_scale: float = Field( default=0.0, description="RoPE frequency scaling factor" ) - yarn_ext_factor: float = Field( - default=-1.0 - ) - yarn_attn_factor: float = Field( - default=1.0 - ) - yarn_beta_fast: float = Field( - default=32.0 - ) - yarn_beta_slow: float = Field( - default=1.0 - ) - yarn_orig_ctx: int = Field( - default=0 - ) + yarn_ext_factor: float = Field(default=-1.0) + yarn_attn_factor: float = Field(default=1.0) + yarn_beta_fast: float = Field(default=32.0) + yarn_beta_slow: float = Field(default=1.0) + yarn_orig_ctx: int = Field(default=0) mul_mat_q: bool = Field( default=True, description="if true, use experimental mul_mat_q kernels" ) - f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") logits_all: bool = Field(default=True, description="Whether to return logits.") embedding: bool = Field(default=True, description="Whether to use embeddings.") + offload_kqv: bool = Field( + default=False, description="Whether to offload kqv to the GPU." + ) # Sampling Params last_n_tokens_size: int = Field( default=64, @@ -97,7 +87,7 @@ class ModelSettings(BaseSettings): # LoRA Params lora_base: Optional[str] = Field( default=None, - description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." + description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", ) lora_path: Optional[str] = Field( default=None, @@ -135,8 +125,9 @@ class ModelSettings(BaseSettings): default=True, description="Whether to print debug information." ) + class ServerSettings(BaseSettings): - model_config = SettingsConfigDict(env_file='.env', extra='ignore') + model_config = SettingsConfigDict(env_file=".env", extra="ignore") host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( @@ -144,18 +135,31 @@ class ServerSettings(BaseSettings): description="Whether to interrupt requests when a new request is received.", ) config: Optional[str] = Field(default=None, description="Path to config file") + ssl_keyfile: Optional[str] = Field( + default=None, description="SSL key file for HTTPS" + ) + ssl_certfile: Optional[str] = Field( + default=None, description="SSL certificate file for HTTPS" + ) + api_key: Optional[str] = Field( + default=None, + description="API key for authentication. 
If set all requests need to be authenticated.", + ) + class Settings(ModelSettings): models: Optional[List[ModelSettings]] = Field( - default = [], - description="Model configs, overwrites default config" + default=[], description="Model configs, overwrites default config" ) + SETTINGS: Optional[ServerSettings] = None + def set_settings(settings: ServerSettings): global SETTINGS SETTINGS = settings + def get_settings(): yield SETTINGS From 315a82fdc163ac04ad34eb9772142dc34ac190cb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:13:36 -0500 Subject: [PATCH 30/44] Fix type annotations --- llama_cpp/server/__main__.py | 2 ++ llama_cpp/server/app.py | 2 ++ llama_cpp/server/settings.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 23c4a3b59..ba48855a0 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -21,6 +21,8 @@ Then visit http://localhost:8000/docs to see the interactive API docs. """ +from __future__ import annotations + import os import sys import argparse diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c924822d4..3e73a5199 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import sys import json import traceback diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index b604515f2..fca73e50d 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import multiprocessing from typing import Optional, List, Literal from pydantic import Field From c5051becfd93c87bcb550fe68db82a5a3a6d574d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:15:18 -0500 Subject: [PATCH 31/44] Fix type annotations --- llama_cpp/server/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 18816281a..09b4bfedf 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import logging from typing import Optional, Union From 7a3e11a216f984cb8a5e8a8cad51bf34a23bc3ca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:18:36 -0500 Subject: [PATCH 32/44] Fix uvicorn app factory --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3e73a5199..20f7127ba 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -217,7 +217,9 @@ async def custom_route_handler(request: Request) -> Response: router = APIRouter(route_class=RouteErrorHandler) -def create_app(settings: Settings): +def create_app(settings: Settings | None = None): + if settings is None: + settings = Settings() middleware = [ Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) ] From 4f99ac667dbbd8ffe51c4f8131755fbf5171d105 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 17:26:12 -0500 Subject: [PATCH 33/44] Fix settings --- llama_cpp/server/app.py | 1 + llama_cpp/server/model.py | 3 ++- llama_cpp/server/settings.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 20f7127ba..05b38dc09 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -220,6 +220,7 @@ async def custom_route_handler(request: Request) -> Response: def create_app(settings: 
Settings | None = None): if settings is None: settings = Settings() + set_settings(settings) middleware = [ Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) ] diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 09b4bfedf..5ae9a045c 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -37,7 +37,7 @@ def __init__(self, settings: Settings) -> None: if filename.endswith(FILE_EXT): alias = model_alias(filename) if alias in self._models: continue - exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config'} + exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config', 'ssl_keyfile', 'ssl_certfile', 'api_key'} default_settings = settings.model_dump(exclude=exclude) self._models[alias] = ModelSettings(model=os.path.join(model_root, filename), model_alias=alias, **default_settings) @@ -114,6 +114,7 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: # Misc verbose=settings.verbose, ) + self._model.alias = model_alias if settings.cache: if settings.cache_type == "disk": if settings.verbose: diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index fca73e50d..ea3b65504 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -149,7 +149,7 @@ class ServerSettings(BaseSettings): ) -class Settings(ModelSettings): +class Settings(ServerSettings, ModelSettings): models: Optional[List[ModelSettings]] = Field( default=[], description="Model configs, overwrites default config" ) From 3f2e6c1874b97cb7ede2418359514f3b02289549 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 22:28:02 -0500 Subject: [PATCH 34/44] Refactor server --- llama_cpp/server/__main__.py | 148 ++++----- llama_cpp/server/app.py | 594 +++++++---------------------------- llama_cpp/server/cli.py | 99 ++++++ llama_cpp/server/errors.py | 210 +++++++++++++ llama_cpp/server/model.py | 156 ++++----- llama_cpp/server/settings.py | 26 +- llama_cpp/server/types.py | 264 ++++++++++++++++ 7 files changed, 814 insertions(+), 683 deletions(-) create mode 100644 llama_cpp/server/cli.py create mode 100644 llama_cpp/server/errors.py create mode 100644 llama_cpp/server/types.py diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index ba48855a0..7a3587721 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -26,114 +26,84 @@ import os import sys import argparse -from typing import List, Literal, Union import uvicorn from llama_cpp.server.app import create_app -from llama_cpp.server.settings import Settings, ServerSettings, set_settings - - -def get_base_type(annotation): - if getattr(annotation, "__origin__", None) is Literal: - return type(annotation.__args__[0]) - elif getattr(annotation, "__origin__", None) is Union: - non_optional_args = [ - arg for arg in annotation.__args__ if arg is not type(None) - ] - if non_optional_args: - return get_base_type(non_optional_args[0]) - elif ( - getattr(annotation, "__origin__", None) is list - or getattr(annotation, "__origin__", None) is List - ): - return get_base_type(annotation.__args__[0]) - else: - return annotation - - -def contains_list_type(annotation) -> bool: - origin = getattr(annotation, "__origin__", None) - - if origin is list or origin is List: - return True - elif origin in (Literal, Union): - return any(contains_list_type(arg) for arg in annotation.__args__) - else: - return False - - -def parse_bool_arg(arg): - if isinstance(arg, bytes): - arg = 
arg.decode("utf-8") - - true_values = {"1", "on", "t", "true", "y", "yes"} - false_values = {"0", "off", "f", "false", "n", "no"} - - arg_str = str(arg).lower().strip() - - if arg_str in true_values: - return True - elif arg_str in false_values: - return False - else: - raise ValueError(f"Invalid boolean argument: {arg}") +from llama_cpp.server.settings import ( + Server, + ServerSettings, + ModelSettings, + ConfigFileSettings, + set_server_settings, +) +from llama_cpp.server.cli import add_args_from_model, parse_model_from_args def main(): description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" parser = argparse.ArgumentParser(description=description) - for name, field in (ServerSettings.model_fields | Settings.model_fields).items(): - description = field.description - if field.default and description and not field.is_required(): - description += f" (default: {field.default})" - base_type = ( - get_base_type(field.annotation) if field.annotation is not None else str - ) - list_type = contains_list_type(field.annotation) - if base_type is not bool: - parser.add_argument( - f"--{name}", - dest=name, - nargs="*" if list_type else None, - type=base_type, - help=description, - ) - if base_type is bool: - parser.add_argument( - f"--{name}", - dest=name, - type=parse_bool_arg, - help=f"{description}", - ) + add_args_from_model(parser, ModelSettings) + add_args_from_model(parser, ServerSettings) + parser.add_argument( + "--config-file", + type=str, + help="Path to a config file to load.", + ) try: args = parser.parse_args() - server_settings = ServerSettings( - **{k: v for k, v in vars(args).items() if v is not None} - ) - set_settings(server_settings) - if server_settings.config and os.path.exists(server_settings.config): - with open(server_settings.config, "rb") as f: - llama_settings = Settings.model_validate_json(f.read()) + server_settings: ServerSettings | None = None + model_settings: list[ModelSettings] = [] + # Load server settings from config_file if provided + config_file = os.environ.get("CONFIG_FILE", args.config_file) + if config_file: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings( + **{ + k: v + for k, v in config_file_settings.model_dump().items() + if k in ServerSettings.model_fields + } + ) + model_settings = config_file_settings.models else: - llama_settings = Settings( - **{k: v for k, v in vars(args).items() if v is not None} + server_settings = ServerSettings( + **{ + k: v + for k, v in vars(args).items() + if k in ServerSettings.model_fields + } ) - app = create_app(settings=llama_settings) + model_settings = [ + ModelSettings( + **{ + k: v + for k, v in vars(args).items() + if k in ModelSettings.model_fields + } + ) + ] + app = create_app( + settings=Server( + **server_settings.model_dump(), **model_settings[0].model_dump() + ) + ) + uvicorn.run( + app, + host=os.getenv("HOST", server_settings.host), + port=int(os.getenv("PORT", server_settings.port)), + ssl_keyfile=server_settings.ssl_keyfile, + ssl_certfile=server_settings.ssl_certfile, + ) except Exception as e: print(e, file=sys.stderr) parser.print_help() sys.exit(1) - uvicorn.run( - app, - host=os.getenv("HOST", server_settings.host), - port=int(os.getenv("PORT", server_settings.port)), - ssl_keyfile=server_settings.ssl_keyfile, - ssl_certfile=server_settings.ssl_certfile, - ) - if __name__ == "__main__": main() 
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 05b38dc09..60fc8c0e9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,229 +1,123 @@ from __future__ import annotations -import sys import json -import traceback -import time -from re import compile, Match, Pattern + +from threading import Lock from functools import partial -from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict -from typing_extensions import TypedDict, Literal +from typing import Iterator, List, Optional, Union, Dict import llama_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from fastapi import Depends, FastAPI, APIRouter, Request, Response, HTTPException, status +from fastapi import ( + Depends, + FastAPI, + APIRouter, + Request, + HTTPException, + status, +) from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from fastapi.routing import APIRoute from fastapi.security import HTTPBearer -from pydantic import BaseModel, Field from sse_starlette.sse import EventSourceResponse -from starlette_context import plugins +from starlette_context.plugins import RequestIdPlugin # type: ignore from starlette_context.middleware import RawContextMiddleware -import numpy as np -import numpy.typing as npt - -from llama_cpp.server.model import get_llama, set_llama, llama_outer_lock, LlamaProxy as Llama -from llama_cpp.server.settings import Settings, set_settings, get_settings - -class ErrorResponse(TypedDict): - """OpenAI style error response""" - - message: str - type: str - param: Optional[str] - code: Optional[str] - -class ErrorResponseFormatters: - """Collection of formatters for error responses. - - Args: - request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): - Request body - match (Match[str]): Match object from regex pattern - - Returns: - Tuple[int, ErrorResponse]: Status code and error response - """ - - @staticmethod - def context_length_exceeded( - request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - match, # type: Match[str] # type: ignore - ) -> Tuple[int, ErrorResponse]: - """Formatter for context length exceeded error""" - - context_window = int(match.group(2)) - prompt_tokens = int(match.group(1)) - completion_tokens = request.max_tokens - if hasattr(request, "messages"): - # Chat completion - message = ( - "This model's maximum context length is {} tokens. " - "However, you requested {} tokens " - "({} in the messages, {} in the completion). " - "Please reduce the length of the messages or completion." - ) - else: - # Text completion - message = ( - "This model's maximum context length is {} tokens, " - "however you requested {} tokens " - "({} in your prompt; {} for the completion). " - "Please reduce your prompt; or completion length." 
- ) - return 400, ErrorResponse( - message=message.format( - context_window, - completion_tokens + prompt_tokens, - prompt_tokens, - completion_tokens, - ), - type="invalid_request_error", - param="messages", - code="context_length_exceeded", - ) +from llama_cpp.server.model import ( + LlamaProxy, +) +from llama_cpp.server.settings import ( + Settings, + ModelSettings, + ServerSettings, +) +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, + ModelList, +) +from llama_cpp.server.errors import RouteErrorHandler - @staticmethod - def model_not_found( - request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - match, # type: Match[str] # type: ignore - ) -> Tuple[int, ErrorResponse]: - """Formatter for model_not_found error""" - - model_path = str(match.group(1)) - message = f"The model `{model_path}` does not exist" - return 400, ErrorResponse( - message=message, - type="invalid_request_error", - param=None, - code="model_not_found", - ) -class RouteErrorHandler(APIRoute): - """Custom APIRoute that handles application errors and exceptions""" - - # key: regex pattern for original error message from llama_cpp - # value: formatter function - pattern_and_formatters: Dict[ - "Pattern", - Callable[ - [ - Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - "Match[str]", - ], - Tuple[int, ErrorResponse], - ], - ] = { - compile( - r"Requested tokens \((\d+)\) exceed context window of (\d+)" - ): ErrorResponseFormatters.context_length_exceeded, - compile( - r"Model path does not exist: (.+)" - ): ErrorResponseFormatters.model_not_found, - } +router = APIRouter(route_class=RouteErrorHandler) - def error_message_wrapper( - self, - error: Exception, - body: Optional[ - Union[ - "CreateChatCompletionRequest", - "CreateCompletionRequest", - "CreateEmbeddingRequest", - ] - ] = None, - ) -> Tuple[int, ErrorResponse]: - """Wraps error message in OpenAI style error response""" - print(f"Exception: {str(error)}", file=sys.stderr) - traceback.print_exc(file=sys.stderr) - if body is not None and isinstance( - body, - ( - CreateCompletionRequest, - CreateChatCompletionRequest, - ), - ): - # When text completion or chat completion - for pattern, callback in self.pattern_and_formatters.items(): - match = pattern.search(str(error)) - if match is not None: - return callback(body, match) - - # Wrap other errors as internal server error - return 500, ErrorResponse( - message=str(error), - type="internal_server_error", - param=None, - code=None, - ) +_settings: Optional[ServerSettings] = None - def get_route_handler( - self, - ) -> Callable[[Request], Coroutine[None, None, Response]]: - """Defines custom route handler that catches exceptions and formats - in OpenAI style error response""" - - original_route_handler = super().get_route_handler() - - async def custom_route_handler(request: Request) -> Response: - try: - start_sec = time.perf_counter() - response = await original_route_handler(request) - elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) - response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" - return response - except HTTPException as unauthorized: - # api key check failed - raise unauthorized - except Exception as exc: - json_body = await request.json() - try: - if "messages" in json_body: - # Chat completion - body: Optional[ - Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - CreateEmbeddingRequest, - ] - ] = CreateChatCompletionRequest(**json_body) - elif 
"prompt" in json_body: - # Text completion - body = CreateCompletionRequest(**json_body) - else: - # Embedding - body = CreateEmbeddingRequest(**json_body) - except Exception: - # Invalid request body - body = None - - # Get proper error message from the exception - ( - status_code, - error_message, - ) = self.error_message_wrapper(error=exc, body=body) - return JSONResponse( - {"error": error_message}, - status_code=status_code, - ) - - return custom_route_handler -router = APIRouter(route_class=RouteErrorHandler) +def set_settings(settings: ServerSettings): + global _settings + _settings = settings + + +def get_settings(): + yield _settings + + +LLAMA: Optional[LlamaProxy] = None + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + + +def set_llama(models: List[ModelSettings]): + global LLAMA + LLAMA = LlamaProxy(models=models) + + +def get_llama(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield LLAMA + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +def create_app( + settings: Settings | None = None, + server_settings: ServerSettings | None = None, + model_settings: List[ModelSettings] | None = None, +): + if server_settings is None and model_settings is None: + if settings is None: + settings = Settings() + server_settings = ServerSettings( + **{ + k: v + for k, v in settings.model_dump().items() + if k in ServerSettings.model_fields + } + ) + model_settings = [ + ModelSettings( + **{ + k: v + for k, v in settings.model_dump().items() + if k in ModelSettings.model_fields + } + ) + ] + + assert ( + server_settings is not None and model_settings is not None + ), "server_settings and model_settings must be provided together" -def create_app(settings: Settings | None = None): - if settings is None: - settings = Settings() - set_settings(settings) - middleware = [ - Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) - ] + set_settings(server_settings) + middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] app = FastAPI( middleware=middleware, title="🦙 llama.cpp Python API", @@ -238,13 +132,12 @@ def create_app(settings: Settings | None = None): ) app.include_router(router) - @app.get('/') - async def root(): - return "pong" + assert model_settings is not None + set_llama(models=model_settings) - set_llama(settings) return app + async def get_event_publisher( request: Request, inner_send_chan: MemoryObjectSendStream, @@ -256,7 +149,10 @@ async def get_event_publisher( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if next(get_settings()).interrupt_requests and llama_outer_lock.locked(): + if ( + next(get_settings()).interrupt_requests + and llama_outer_lock.locked() + ): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) @@ -267,156 +163,6 @@ async def get_event_publisher( raise e -model_field = Field( - description="The model to use for generating completions.", default=None -) - -max_tokens_field = Field( - default=16, ge=1, description="The maximum number of tokens to generate." 
-) - -temperature_field = Field( - default=0.8, - ge=0.0, - le=2.0, - description="Adjust the randomness of the generated text.\n\n" - + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", -) - -top_p_field = Field( - default=0.95, - ge=0.0, - le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" - + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", -) - -min_p_field = Field( - default=0.05, - ge=0.0, - le=1.0, - description="Sets a minimum base probability threshold for token selection.\n\n" - + "The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter min_p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with min_p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.", -) - -stop_field = Field( - default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used.", -) - -stream_field = Field( - default=False, - description="Whether to stream the results as they are generated. Useful for chatbots.", -) - -top_k_field = Field( - default=40, - ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" - + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", -) - -repeat_penalty_field = Field( - default=1.1, - ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" - + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. 
A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", -) - -presence_penalty_field = Field( - default=0.0, - ge=-2.0, - le=2.0, - description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", -) - -frequency_penalty_field = Field( - default=0.0, - ge=-2.0, - le=2.0, - description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", -) - -mirostat_mode_field = Field( - default=0, - ge=0, - le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", -) - -mirostat_tau_field = Field( - default=5.0, - ge=0.0, - le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", -) - -mirostat_eta_field = Field( - default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" -) - -grammar = Field( - default=None, - description="A CBNF grammar (as string) to be used for formatting the model's output.", -) - - -class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( - default="", description="The prompt to generate completions for." - ) - suffix: Optional[str] = Field( - default=None, - description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", - ) - max_tokens: int = max_tokens_field - temperature: float = temperature_field - top_p: float = top_p_field - min_p: float = min_p_field - echo: bool = Field( - default=False, - description="Whether to echo the prompt in the generated text. Useful for chatbots.", - ) - stop: Optional[Union[str, List[str]]] = stop_field - stream: bool = stream_field - logprobs: Optional[int] = Field( - default=None, - ge=0, - description="The number of logprobs to generate. 
If None, no logprobs are generated.", - ) - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - logprobs: Optional[int] = Field(None) - seed: Optional[int] = Field(None) - - # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - best_of: Optional[int] = 1 - user: Optional[str] = Field(default=None) - - # llama.cpp specific parameters - top_k: int = top_k_field - repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None - - model_config = { - "json_schema_extra": { - "examples": [ - { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - ] - } - } - - def _logit_bias_tokens_to_input_ids( llama: llama_cpp.Llama, logit_bias: Dict[str, float], @@ -433,7 +179,10 @@ def _logit_bias_tokens_to_input_ids( bearer_scheme = HTTPBearer(auto_error=False) -async def authenticate(settings: Settings = Depends(get_settings), authorization: Optional[str] = Depends(bearer_scheme)): +async def authenticate( + settings: Settings = Depends(get_settings), + authorization: Optional[str] = Depends(bearer_scheme), +): # Skip API key check if it's not set in settings if settings.api_key is None: return True @@ -450,10 +199,7 @@ async def authenticate(settings: Settings = Depends(get_settings), authorization ) -@router.post( - "/v1/completions", - summary="Completion" -) +@router.post("/v1/completions", summary="Completion") @router.post("/v1/engines/copilot-codex/completions", include_in_schema=False) async def create_completion( request: Request, @@ -464,7 +210,7 @@ async def create_completion( if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - + llama = llama(body.model) exclude = { @@ -513,25 +259,8 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: else: return iterator_or_completion -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] = model_field - input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] = Field(default=None) - model_config = { - "json_schema_extra": { - "examples": [ - { - "input": "The food was delicious and the waiter...", - } - ] - } - } - -@router.post( - "/v1/embeddings", - summary="Embedding" -) +@router.post("/v1/embeddings", summary="Embedding") async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama), @@ -542,88 +271,7 @@ async def create_embedding( ) -class ChatCompletionRequestMessage(BaseModel): - role: Literal["system", "user", "assistant", "function"] = Field( - default="user", description="The role of the message." - ) - content: Optional[str] = Field( - default="", description="The content of the message." - ) - - -class CreateChatCompletionRequest(BaseModel): - messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( - default=[], description="A list of messages to generate completions for." 
- ) - functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( - default=None, - description="A list of functions to apply to the generated completions.", - ) - function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( - default=None, - description="A function to apply to the generated completions.", - ) - tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( - default=None, - description="A list of tools to apply to the generated completions.", - ) - tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( - default=None, - description="A tool to apply to the generated completions.", - ) # TODO: verify - max_tokens: Optional[int] = Field( - default=None, - description="The maximum number of tokens to generate. Defaults to inf", - ) - temperature: float = temperature_field - top_p: float = top_p_field - min_p: float = min_p_field - stop: Optional[Union[str, List[str]]] = stop_field - stream: bool = stream_field - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - seed: Optional[int] = Field(None) - response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( - default=None, - ) - - # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = top_k_field - repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None - - model_config = { - "json_schema_extra": { - "examples": [ - { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ).model_dump(), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" 
- ).model_dump(), - ] - } - ] - } - } - - -@router.post( - "/v1/chat/completions", - summary="Chat" -) +@router.post("/v1/chat/completions", summary="Chat") async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, @@ -676,22 +324,11 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - @router.get("/v1/models", summary="Models") async def get_models( settings: Settings = Depends(get_settings), authenticated: str = Depends(authenticate), + llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", @@ -701,6 +338,7 @@ async def get_models( "object": "model", "owned_by": "me", "permissions": [], - } for model_alias in llama + } + for model_alias in llama ], } diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py new file mode 100644 index 000000000..750b396cd --- /dev/null +++ b/llama_cpp/server/cli.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import argparse + +from typing import List, Literal, Union, Any, Type, TypeVar + +from pydantic import BaseModel + +from llama_cpp.server.settings import CommandLineSettings + + +def _get_base_type(annotation: Type[Any]) -> Type[Any]: + if getattr(annotation, "__origin__", None) is Literal: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return type(annotation.__args__[0]) # type: ignore + elif getattr(annotation, "__origin__", None) is Union: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + non_optional_args: List[Type[Any]] = [ + arg for arg in annotation.__args__ if arg is not type(None) # type: ignore + ] + if non_optional_args: + return _get_base_type(non_optional_args[0]) + elif ( + getattr(annotation, "__origin__", None) is list + or getattr(annotation, "__origin__", None) is List + ): + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return _get_base_type(annotation.__args__[0]) # type: ignore + return annotation + + +def _contains_list_type(annotation: Type[Any] | None) -> bool: + origin = getattr(annotation, "__origin__", None) + + if origin is list or origin is List: + return True + elif origin in (Literal, Union): + return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore + else: + return False + + +def _parse_bool_arg(arg: str | bytes | bool) -> bool: + if isinstance(arg, bytes): + arg = arg.decode("utf-8") + + true_values = {"1", "on", "t", "true", "y", "yes"} + false_values = {"0", "off", "f", "false", "n", "no"} + + arg_str = str(arg).lower().strip() + + if arg_str in true_values: + return True + elif arg_str in false_values: + return False + else: + raise ValueError(f"Invalid boolean argument: {arg}") + + +def add_args_from_model(parser: argparse.ArgumentParser, model: type[BaseModel]): + """Add arguments from a pydantic model to an argparse parser.""" + + for name, field in model.model_fields.items(): + description = field.description + if field.default and description and not field.is_required(): + description += f" (default: {field.default})" + base_type = ( + _get_base_type(field.annotation) if field.annotation is not None else str + ) + list_type = _contains_list_type(field.annotation) + if base_type is not bool: + parser.add_argument( + f"--{name}", + dest=name, + 
nargs="*" if list_type else None, + type=base_type, + help=description, + ) + if base_type is bool: + parser.add_argument( + f"--{name}", + dest=name, + type=_parse_bool_arg, + help=f"{description}", + ) + + +T = TypeVar("T", bound=type[BaseModel]) + + +def parse_model_from_args(model: T, args: argparse.Namespace) -> T: + """Parse a pydantic model from an argparse namespace.""" + return model( + **{ + k: v + for k, v in vars(args).items() + if v is not None and k in model.model_fields + } + ) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py new file mode 100644 index 000000000..febe3e39d --- /dev/null +++ b/llama_cpp/server/errors.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import sys +import traceback +import time +from re import compile, Match, Pattern +from typing import Callable, Coroutine, Optional, Tuple, Union, Dict +from typing_extensions import TypedDict + + +from fastapi import ( + Request, + Response, + HTTPException, +) +from fastapi.responses import JSONResponse +from fastapi.routing import APIRoute + +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, +) + +class ErrorResponse(TypedDict): + """OpenAI style error response""" + + message: str + type: str + param: Optional[str] + code: Optional[str] + + +class ErrorResponseFormatters: + """Collection of formatters for error responses. + + Args: + request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): + Request body + match (Match[str]): Match object from regex pattern + + Returns: + Tuple[int, ErrorResponse]: Status code and error response + """ + + @staticmethod + def context_length_exceeded( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for context length exceeded error""" + + context_window = int(match.group(2)) + prompt_tokens = int(match.group(1)) + completion_tokens = request.max_tokens + if hasattr(request, "messages"): + # Chat completion + message = ( + "This model's maximum context length is {} tokens. " + "However, you requested {} tokens " + "({} in the messages, {} in the completion). " + "Please reduce the length of the messages or completion." + ) + else: + # Text completion + message = ( + "This model's maximum context length is {} tokens, " + "however you requested {} tokens " + "({} in your prompt; {} for the completion). " + "Please reduce your prompt; or completion length." 
+ ) + return 400, ErrorResponse( + message=message.format( + context_window, + completion_tokens + prompt_tokens, + prompt_tokens, + completion_tokens, + ), # type: ignore + type="invalid_request_error", + param="messages", + code="context_length_exceeded", + ) + + @staticmethod + def model_not_found( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for model_not_found error""" + + model_path = str(match.group(1)) + message = f"The model `{model_path}` does not exist" + return 400, ErrorResponse( + message=message, + type="invalid_request_error", + param=None, + code="model_not_found", + ) + + +class RouteErrorHandler(APIRoute): + """Custom APIRoute that handles application errors and exceptions""" + + # key: regex pattern for original error message from llama_cpp + # value: formatter function + pattern_and_formatters: Dict[ + "Pattern[str]", + Callable[ + [ + Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + "Match[str]", + ], + Tuple[int, ErrorResponse], + ], + ] = { + compile( + r"Requested tokens \((\d+)\) exceed context window of (\d+)" + ): ErrorResponseFormatters.context_length_exceeded, + compile( + r"Model path does not exist: (.+)" + ): ErrorResponseFormatters.model_not_found, + } + + def error_message_wrapper( + self, + error: Exception, + body: Optional[ + Union[ + "CreateChatCompletionRequest", + "CreateCompletionRequest", + "CreateEmbeddingRequest", + ] + ] = None, + ) -> Tuple[int, ErrorResponse]: + """Wraps error message in OpenAI style error response""" + print(f"Exception: {str(error)}", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + if body is not None and isinstance( + body, + ( + CreateCompletionRequest, + CreateChatCompletionRequest, + ), + ): + # When text completion or chat completion + for pattern, callback in self.pattern_and_formatters.items(): + match = pattern.search(str(error)) + if match is not None: + return callback(body, match) + + # Wrap other errors as internal server error + return 500, ErrorResponse( + message=str(error), + type="internal_server_error", + param=None, + code=None, + ) + + def get_route_handler( + self, + ) -> Callable[[Request], Coroutine[None, None, Response]]: + """Defines custom route handler that catches exceptions and formats + in OpenAI style error response""" + + original_route_handler = super().get_route_handler() + + async def custom_route_handler(request: Request) -> Response: + try: + start_sec = time.perf_counter() + response = await original_route_handler(request) + elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) + response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" + return response + except HTTPException as unauthorized: + # api key check failed + raise unauthorized + except Exception as exc: + json_body = await request.json() + try: + if "messages" in json_body: + # Chat completion + body: Optional[ + Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ] + ] = CreateChatCompletionRequest(**json_body) + elif "prompt" in json_body: + # Text completion + body = CreateCompletionRequest(**json_body) + else: + # Embedding + body = CreateEmbeddingRequest(**json_body) + except Exception: + # Invalid request body + body = None + + # Get proper error message from the exception + ( + status_code, + error_message, + ) = self.error_message_wrapper(error=exc, body=body) + return JSONResponse( + {"error": error_message}, + 
status_code=status_code, + ) + + return custom_route_handler + diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 5ae9a045c..f11c1540a 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,80 +1,74 @@ from __future__ import annotations -import os -import logging -from typing import Optional, Union +from typing import Optional, Union, List from threading import Lock -import llama_cpp -from llama_cpp.server.settings import Settings, ModelSettings - -FILE_EXT = ".gguf" -MODEL_ENV_ARG = "MODEL" -DEFAULT_MODEL_DIR = os.path.join(os.getcwd(), "/models") -logger = logging.getLogger("uvicorn") +import llama_cpp -def models_root_dir(path: Optional[str] = None): - path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) - if os.path.isdir(path): return path - return os.path.dirname(path) +from llama_cpp.server.settings import ModelSettings -def model_alias(path: str) -> str: - return path.split(os.path.sep)[-1].split(FILE_EXT)[0] class LlamaProxy: - _model: Optional[llama_cpp.Llama] = None - _models: dict[str,ModelSettings] = {} + def __init__(self, models: List[ModelSettings]) -> None: + assert len(models) > 0, "No models provided!" - def __init__(self, settings: Settings) -> None: - self._settings = settings - for model in settings.models: + self._model_settings_dict: dict[str, ModelSettings] = {} + for model in models: if not model.model_alias: - model.model_alias = model_alias(model.model) - self._models[model.model_alias] = model - - model_root = models_root_dir(settings.model) - for filename in os.listdir(model_root): - if filename.endswith(FILE_EXT): - alias = model_alias(filename) - if alias in self._models: continue - exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config', 'ssl_keyfile', 'ssl_certfile', 'api_key'} - default_settings = settings.model_dump(exclude=exclude) - self._models[alias] = ModelSettings(model=os.path.join(model_root, filename), - model_alias=alias, **default_settings) - - if os.path.isfile(settings.model): - alias = settings.model_alias - if alias is None: alias = model_alias(settings.model) - if alias not in self._models: - self._models[alias] = settings - self(alias) + model.model_alias = model.model + self._model_settings_dict[model.model_alias] = model + + self._current_model: Optional[llama_cpp.Llama] = None + self._current_model_alias: Optional[str] = None + + self._default_model_settings: ModelSettings = models[0] + self._default_model_alias: str = self._default_model_settings.model_alias # type: ignore + + # Load default model + self._current_model = self.load_llama_from_model_settings( + self._default_model_settings + ) + self._current_model_alias = self._default_model_alias def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: - # handle backward compatibility, model param optional - try: - model_alias = self._models[model].model_alias - except KeyError: - if self._model: - if self._settings.verbose: logger.warn(f"Model {model} NOT found! 
Using {self._model.alias}") - return self._model - else: raise Exception(404, f"Model {model} NOT found!") - - if self._model: - if self._model.alias == model_alias: - return self._model - del self._model - - settings = self._models[model] + if model is None: + model = self._default_model_alias + + if model == self._current_model_alias: + if self._current_model is not None: + return self._current_model + + settings = self._model_settings_dict[model] + self._current_model = self.load_llama_from_model_settings(settings) + self._current_model_alias = model + return self._current_model + + def __getitem__(self, model: str): + return self._model_settings_dict[model].model_dump() + + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + if isinstance(settings, (bytes, str)): + settings = ModelSettings.model_validate_json(settings) + self._model_settings_dict[model] = settings + def __iter__(self): + for model in self._model_settings_dict: + yield model + + def free(self): + if self._current_model: + del self._current_model + + @staticmethod + def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = None if settings.chat_format == "llava-1-5": assert settings.clip_model_path is not None, "clip model not found" chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( - clip_model_path=settings.clip_model_path, - verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) - self._model = llama_cpp.Llama( + _model = llama_cpp.Llama( model_path=settings.model, # Model Params n_gpu_layers=settings.n_gpu_layers, @@ -114,7 +108,6 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: # Misc verbose=settings.verbose, ) - self._model.alias = model_alias if settings.cache: if settings.cache_type == "disk": if settings.verbose: @@ -124,47 +117,6 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) - self._model.set_cache(cache) - return self._model - - def __getitem__(self, model: str): - return self._models[model].model_dump() + _model.set_cache(cache) + return _model - def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): - if isinstance(settings, (bytes, str)): - settings = ModelSettings.model_validate_json(settings) - self._models[model] = settings - - def __iter__(self): - for model in self._models: - yield model - - def free(self): - if self._model: del self._model - -LLAMA: Optional[LlamaProxy] = None - -llama_outer_lock = Lock() -llama_inner_lock = Lock() - -def set_llama(settings: Settings): - global LLAMA - LLAMA = LlamaProxy(settings) - -def get_llama(): - # NOTE: This double lock allows the currently streaming llama model to - # check if any other requests are pending in the same thread and cancel - # the stream if so. 
- llama_outer_lock.acquire() - release_outer_lock = True - try: - llama_inner_lock.acquire() - try: - llama_outer_lock.release() - release_outer_lock = False - yield LLAMA - finally: - llama_inner_lock.release() - finally: - if release_outer_lock: - llama_outer_lock.release() diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ea3b65504..ea9cf26ec 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,9 +1,11 @@ from __future__ import annotations import multiprocessing + from typing import Optional, List, Literal from pydantic import Field -from pydantic_settings import BaseSettings, SettingsConfigDict +from pydantic_settings import BaseSettings + import llama_cpp # Disable warning for model and model_alias settings @@ -129,14 +131,12 @@ class ModelSettings(BaseSettings): class ServerSettings(BaseSettings): - model_config = SettingsConfigDict(env_file=".env", extra="ignore") host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", ) - config: Optional[str] = Field(default=None, description="Path to config file") ssl_keyfile: Optional[str] = Field( default=None, description="SSL key file for HTTPS" ) @@ -150,18 +150,16 @@ class ServerSettings(BaseSettings): class Settings(ServerSettings, ModelSettings): - models: Optional[List[ModelSettings]] = Field( - default=[], description="Model configs, overwrites default config" - ) - + pass -SETTINGS: Optional[ServerSettings] = None - -def set_settings(settings: ServerSettings): - global SETTINGS - SETTINGS = settings +class CommandLineSettings(Settings): + config_file: Optional[str] = Field( + default=None, description="Path to a config file to load." + ) -def get_settings(): - yield SETTINGS +class ConfigFileSettings(ServerSettings): + models: List[ModelSettings] = Field( + default=[], description="Model configs, overwrites default config" + ) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py new file mode 100644 index 000000000..f0867bc4e --- /dev/null +++ b/llama_cpp/server/types.py @@ -0,0 +1,264 @@ +from __future__ import annotations + +from typing import List, Optional, Union, Dict +from typing_extensions import TypedDict, Literal + +from pydantic import BaseModel, Field + +import llama_cpp + + +model_field = Field( + description="The model to use for generating completions.", default=None +) + +max_tokens_field = Field( + default=16, ge=1, description="The maximum number of tokens to generate." +) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. 
At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", +) + +min_p_field = Field( + default=0.05, + ge=0.0, + le=1.0, + description="Sets a minimum base probability threshold for token selection.\n\n" + + "The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter min_p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with min_p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.", +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots.", +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", +) + +repeat_penalty_field = Field( + default=1.1, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", +) + +presence_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", +) + +frequency_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", +) + +mirostat_mode_field = Field( + default=0, + ge=0, + le=2, + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", +) + +mirostat_tau_field = Field( + default=5.0, + ge=0.0, + le=10.0, + description="Mirostat target entropy, i.e. 
the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", +) + +mirostat_eta_field = Field( + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" +) + +grammar = Field( + default=None, + description="A CBNF grammar (as string) to be used for formatting the model's output.", +) + + +class CreateCompletionRequest(BaseModel): + prompt: Union[str, List[str]] = Field( + default="", description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + min_p: float = min_p_field + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots.", + ) + stop: Optional[Union[str, List[str]]] = stop_field + stream: bool = stream_field + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated.", + ) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logprobs: Optional[int] = Field(None) + seed: Optional[int] = Field(None) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + best_of: Optional[int] = 1 + user: Optional[str] = Field(default=None) + + # llama.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None + + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] + } + } + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] = model_field + input: Union[str, List[str]] = Field(description="The input to embed.") + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The food was delicious and the waiter...", + } + ] + } + } + + +class ChatCompletionRequestMessage(BaseModel): + role: Literal["system", "user", "assistant", "function"] = Field( + default="user", description="The role of the message." + ) + content: Optional[str] = Field( + default="", description="The content of the message." + ) + + +class CreateChatCompletionRequest(BaseModel): + messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( + default=[], description="A list of messages to generate completions for." 
+ ) + functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( + default=None, + description="A list of functions to apply to the generated completions.", + ) + function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( + default=None, + description="A function to apply to the generated completions.", + ) + tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( + default=None, + description="A list of tools to apply to the generated completions.", + ) + tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( + default=None, + description="A tool to apply to the generated completions.", + ) # TODO: verify + max_tokens: Optional[int] = Field( + default=None, + description="The maximum number of tokens to generate. Defaults to inf", + ) + temperature: float = temperature_field + top_p: float = top_p_field + min_p: float = min_p_field + stop: Optional[Union[str, List[str]]] = stop_field + stream: bool = stream_field + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + seed: Optional[int] = Field(None) + response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( + default=None, + ) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None + + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" 
+ ).model_dump(), + ] + } + ] + } + } + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] From 3472b6f90cd978d4f98322d40316cb15c122a191 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 22:30:54 -0500 Subject: [PATCH 35/44] Remove formatting fix --- llama_cpp/llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fb13e07c3..5477df733 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -955,7 +955,6 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - self.chat_format = chat_format self.chat_handler = chat_handler From 310e2e6ca14fd21eb274902c9ed40dc1f06f6e2a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 22:31:27 -0500 Subject: [PATCH 36/44] Format --- llama_cpp/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5477df733..c2c045549 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -955,6 +955,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + self.chat_format = chat_format self.chat_handler = chat_handler From 5c9c35e805037d346557471bc7c30f3919bce190 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 23:25:39 -0500 Subject: [PATCH 37/44] Use default model if not found in model settings --- llama_cpp/server/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index f11c1540a..ce52171ca 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -37,6 +37,9 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if model == self._current_model_alias: if self._current_model is not None: return self._current_model + + if model not in self._model_settings_dict: + model = self._default_model_alias settings = self._model_settings_dict[model] self._current_model = self.load_llama_from_model_settings(settings) From 950f721a01d3aedd4633489751d439df24499a40 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 00:02:12 -0500 Subject: [PATCH 38/44] Fix --- llama_cpp/server/__main__.py | 8 +++----- llama_cpp/server/app.py | 17 +++++++++++++++++ llama_cpp/server/model.py | 8 +++++--- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 7a3587721..3cd38d5a6 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,11 +31,10 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import ( - Server, + Settings, ServerSettings, ModelSettings, ConfigFileSettings, - set_server_settings, ) from llama_cpp.server.cli import add_args_from_model, parse_model_from_args @@ -44,8 +43,7 @@ def main(): description = "🦙 Llama.cpp python server. 
Host your own LLMs!🚀" parser = argparse.ArgumentParser(description=description) - add_args_from_model(parser, ModelSettings) - add_args_from_model(parser, ServerSettings) + add_args_from_model(parser, Settings) parser.add_argument( "--config-file", type=str, @@ -88,7 +86,7 @@ def main(): ) ] app = create_app( - settings=Server( + settings=Settings( **server_settings.model_dump(), **model_settings[0].model_dump() ) ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 60fc8c0e9..5759b1e42 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os import json from threading import Lock @@ -30,6 +31,7 @@ LlamaProxy, ) from llama_cpp.server.settings import ( + ConfigFileSettings, Settings, ModelSettings, ServerSettings, @@ -92,6 +94,21 @@ def create_app( server_settings: ServerSettings | None = None, model_settings: List[ModelSettings] | None = None, ): + config_file = os.environ.get("CONFIG_FILE", None) + if config_file is not None: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings( + **{ + k: v + for k, v in config_file_settings.model_dump().items() + if k in ServerSettings.model_fields + } + ) + model_settings = config_file_settings.models + if server_settings is None and model_settings is None: if settings is None: settings = Settings() diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ce52171ca..19d308366 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -34,12 +34,14 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if model is None: model = self._default_model_alias + if model not in self._model_settings_dict: + model = self._default_model_alias + if model == self._current_model_alias: if self._current_model is not None: return self._current_model - - if model not in self._model_settings_dict: - model = self._default_model_alias + + self._current_model = None settings = self._model_settings_dict[model] self._current_model = self.load_llama_from_model_settings(settings) From 8347a78ad4ba9dbcff169f814144e354e5401da6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 03:46:06 -0500 Subject: [PATCH 39/44] Cleanup --- llama_cpp/server/__main__.py | 37 ++++----------- llama_cpp/server/app.py | 90 +++++++++++++----------------------- llama_cpp/server/cli.py | 2 - llama_cpp/server/model.py | 1 - llama_cpp/server/settings.py | 10 ++-- 5 files changed, 47 insertions(+), 93 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 3cd38d5a6..6c5e82712 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -9,7 +9,7 @@ Then run: ``` -uvicorn llama_cpp.server.app:app --reload +uvicorn llama_cpp.server.app:create_app --reload ``` or @@ -45,7 +45,7 @@ def main(): add_args_from_model(parser, Settings) parser.add_argument( - "--config-file", + "--config_file", type=str, help="Path to a config file to load.", ) @@ -60,35 +60,16 @@ def main(): raise ValueError(f"Config file {config_file} not found!") with open(config_file, "rb") as f: config_file_settings = ConfigFileSettings.model_validate_json(f.read()) - server_settings = ServerSettings( - **{ - k: v - for k, v in config_file_settings.model_dump().items() - if k in ServerSettings.model_fields - } - ) + server_settings = 
ServerSettings.model_validate(config_file_settings) model_settings = config_file_settings.models else: - server_settings = ServerSettings( - **{ - k: v - for k, v in vars(args).items() - if k in ServerSettings.model_fields - } - ) - model_settings = [ - ModelSettings( - **{ - k: v - for k, v in vars(args).items() - if k in ModelSettings.model_fields - } - ) - ] + server_settings = parse_model_from_args(ServerSettings, args) + model_settings = [parse_model_from_args(ModelSettings, args)] + assert server_settings is not None + assert model_settings is not None app = create_app( - settings=Settings( - **server_settings.model_dump(), **model_settings[0].model_dump() - ) + server_settings_or_none=server_settings, + model_settings=model_settings, ) uvicorn.run( app, diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5759b1e42..fbb756f6e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -47,30 +47,30 @@ router = APIRouter(route_class=RouteErrorHandler) -_settings: Optional[ServerSettings] = None +_server_settings: Optional[ServerSettings] = None -def set_settings(settings: ServerSettings): - global _settings - _settings = settings +def set_server_settings(server_settings: ServerSettings): + global _server_settings + _server_settings = server_settings -def get_settings(): - yield _settings +def get_server_settings(): + yield _server_settings -LLAMA: Optional[LlamaProxy] = None +_llama_proxy: Optional[LlamaProxy] = None llama_outer_lock = Lock() llama_inner_lock = Lock() -def set_llama(models: List[ModelSettings]): - global LLAMA - LLAMA = LlamaProxy(models=models) +def set_llama_proxy(model_settings: List[ModelSettings]): + global _llama_proxy + _llama_proxy = LlamaProxy(models=model_settings) -def get_llama(): +def get_llama_proxy(): # NOTE: This double lock allows the currently streaming llama model to # check if any other requests are pending in the same thread and cancel # the stream if so. 
@@ -81,7 +81,7 @@ def get_llama(): try: llama_outer_lock.release() release_outer_lock = False - yield LLAMA + yield _llama_proxy finally: llama_inner_lock.release() finally: @@ -100,40 +100,20 @@ def create_app( raise ValueError(f"Config file {config_file} not found!") with open(config_file, "rb") as f: config_file_settings = ConfigFileSettings.model_validate_json(f.read()) - server_settings = ServerSettings( - **{ - k: v - for k, v in config_file_settings.model_dump().items() - if k in ServerSettings.model_fields - } - ) + server_settings = ServerSettings.model_validate(config_file_settings) model_settings = config_file_settings.models if server_settings is None and model_settings is None: if settings is None: settings = Settings() - server_settings = ServerSettings( - **{ - k: v - for k, v in settings.model_dump().items() - if k in ServerSettings.model_fields - } - ) - model_settings = [ - ModelSettings( - **{ - k: v - for k, v in settings.model_dump().items() - if k in ModelSettings.model_fields - } - ) - ] + server_settings = ServerSettings.model_validate(settings) + model_settings = [ModelSettings.model_validate(settings)] assert ( server_settings is not None and model_settings is not None ), "server_settings and model_settings must be provided together" - set_settings(server_settings) + set_server_settings(server_settings) middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] app = FastAPI( middleware=middleware, @@ -150,7 +130,7 @@ def create_app( app.include_router(router) assert model_settings is not None - set_llama(models=model_settings) + set_llama_proxy(model_settings=model_settings) return app @@ -167,7 +147,7 @@ async def get_event_publisher( if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() if ( - next(get_settings()).interrupt_requests + next(get_server_settings()).interrupt_requests and llama_outer_lock.locked() ): await inner_send_chan.send(dict(data="[DONE]")) @@ -197,7 +177,7 @@ def _logit_bias_tokens_to_input_ids( async def authenticate( - settings: Settings = Depends(get_settings), + settings: Settings = Depends(get_server_settings), authorization: Optional[str] = Depends(bearer_scheme), ): # Skip API key check if it's not set in settings @@ -216,19 +196,18 @@ async def authenticate( ) -@router.post("/v1/completions", summary="Completion") -@router.post("/v1/engines/copilot-codex/completions", include_in_schema=False) +@router.post("/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]) +@router.post("/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)]) async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), - authenticated: str = Depends(authenticate), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - llama = llama(body.model) + llama = llama_proxy(body.model) exclude = { "n", @@ -277,24 +256,21 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion -@router.post("/v1/embeddings", summary="Embedding") +@router.post("/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]) async def create_embedding( request: CreateEmbeddingRequest, - llama: llama_cpp.Llama = Depends(get_llama), - authenticated: str = Depends(authenticate), + llama_proxy: LlamaProxy = 
Depends(get_llama_proxy), ): return await run_in_threadpool( - llama(request.model).create_embedding, **request.model_dump(exclude={"user"}) + llama_proxy(request.model).create_embedding, **request.model_dump(exclude={"user"}) ) -@router.post("/v1/chat/completions", summary="Chat") +@router.post("/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)]) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), - settings: Settings = Depends(get_settings), - authenticated: str = Depends(authenticate), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.ChatCompletion: exclude = { "n", @@ -302,7 +278,7 @@ async def create_chat_completion( "user", } kwargs = body.model_dump(exclude=exclude) - llama = llama(body.model) + llama = llama_proxy(body.model) if body.logit_bias is not None: kwargs["logit_bias"] = ( _logit_bias_tokens_to_input_ids(llama, body.logit_bias) @@ -341,11 +317,9 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -@router.get("/v1/models", summary="Models") +@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]]) async def get_models( - settings: Settings = Depends(get_settings), - authenticated: str = Depends(authenticate), - llama: llama_cpp.Llama = Depends(get_llama), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: return { "object": "list", @@ -356,6 +330,6 @@ async def get_models( "owned_by": "me", "permissions": [], } - for model_alias in llama + for model_alias in llama_proxy ], } diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index 750b396cd..8e32d2c0e 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -6,8 +6,6 @@ from pydantic import BaseModel -from llama_cpp.server.settings import CommandLineSettings - def _get_base_type(annotation: Type[Any]) -> Type[Any]: if getattr(annotation, "__origin__", None) is Literal: diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 19d308366..b9373b7ac 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import Optional, Union, List -from threading import Lock import llama_cpp diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ea9cf26ec..752b68032 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -131,22 +131,24 @@ class ModelSettings(BaseSettings): class ServerSettings(BaseSettings): + # Uvicorn Settings host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") - interrupt_requests: bool = Field( - default=True, - description="Whether to interrupt requests when a new request is received.", - ) ssl_keyfile: Optional[str] = Field( default=None, description="SSL key file for HTTPS" ) ssl_certfile: Optional[str] = Field( default=None, description="SSL certificate file for HTTPS" ) + # FastAPI Settings api_key: Optional[str] = Field( default=None, description="API key for authentication. 
If set all requests need to be authenticated.", ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) class Settings(ServerSettings, ModelSettings): From 02ab0e2153b5a29cfddd1dead255f931a129e80d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 03:47:10 -0500 Subject: [PATCH 40/44] Fix --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index fbb756f6e..4b08cc6ce 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -317,7 +317,7 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]]) +@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]) async def get_models( llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: From fd1bf6480efadb5dde58314014d1d817b9de9b94 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 03:48:26 -0500 Subject: [PATCH 41/44] Fix --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 6c5e82712..a13d4a111 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -68,7 +68,7 @@ def main(): assert server_settings is not None assert model_settings is not None app = create_app( - server_settings_or_none=server_settings, + server_settings=server_settings, model_settings=model_settings, ) uvicorn.run( From ecd84344b2e8ec517413d6c95400658c0d9e8ef7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 04:11:39 -0500 Subject: [PATCH 42/44] Remove unnused CommandLineSettings --- llama_cpp/server/settings.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 752b68032..53ead7487 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -155,12 +155,6 @@ class Settings(ServerSettings, ModelSettings): pass -class CommandLineSettings(Settings): - config_file: Optional[str] = Field( - default=None, description="Path to a config file to load." 
- ) - - class ConfigFileSettings(ServerSettings): models: List[ModelSettings] = Field( default=[], description="Model configs, overwrites default config" From 52861468b2f51a314ece494732d2084bfaf7aeb6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 04:27:49 -0500 Subject: [PATCH 43/44] Cleanup --- llama_cpp/server/__main__.py | 32 ++++++++++++++++---------------- llama_cpp/server/app.py | 21 ++++++++++++++++----- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a13d4a111..fadfc5fb4 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -49,10 +49,10 @@ def main(): type=str, help="Path to a config file to load.", ) + server_settings: ServerSettings | None = None + model_settings: list[ModelSettings] = [] + args = parser.parse_args() try: - args = parser.parse_args() - server_settings: ServerSettings | None = None - model_settings: list[ModelSettings] = [] # Load server settings from config_file if provided config_file = os.environ.get("CONFIG_FILE", args.config_file) if config_file: @@ -65,23 +65,23 @@ def main(): else: server_settings = parse_model_from_args(ServerSettings, args) model_settings = [parse_model_from_args(ModelSettings, args)] - assert server_settings is not None - assert model_settings is not None - app = create_app( - server_settings=server_settings, - model_settings=model_settings, - ) - uvicorn.run( - app, - host=os.getenv("HOST", server_settings.host), - port=int(os.getenv("PORT", server_settings.port)), - ssl_keyfile=server_settings.ssl_keyfile, - ssl_certfile=server_settings.ssl_certfile, - ) except Exception as e: print(e, file=sys.stderr) parser.print_help() sys.exit(1) + assert server_settings is not None + assert model_settings is not None + app = create_app( + server_settings=server_settings, + model_settings=model_settings, + ) + uvicorn.run( + app, + host=os.getenv("HOST", server_settings.host), + port=int(os.getenv("PORT", server_settings.port)), + ssl_keyfile=server_settings.ssl_keyfile, + ssl_certfile=server_settings.ssl_certfile, + ) if __name__ == "__main__": diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4b08cc6ce..a7ce63f76 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -196,8 +196,14 @@ async def authenticate( ) -@router.post("/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]) -@router.post("/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)]) +@router.post( + "/v1/completions", summary="Completion", dependencies=[Depends(authenticate)] +) +@router.post( + "/v1/engines/copilot-codex/completions", + include_in_schema=False, + dependencies=[Depends(authenticate)], +) async def create_completion( request: Request, body: CreateCompletionRequest, @@ -256,17 +262,22 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion -@router.post("/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]) +@router.post( + "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)] +) async def create_embedding( request: CreateEmbeddingRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), ): return await run_in_threadpool( - llama_proxy(request.model).create_embedding, **request.model_dump(exclude={"user"}) + llama_proxy(request.model).create_embedding, + **request.model_dump(exclude={"user"}), ) -@router.post("/v1/chat/completions", 
summary="Chat", dependencies=[Depends(authenticate)]) +@router.post( + "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)] +) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, From 1b322b4fdcf62748ebd6189e6952053880fdc2a1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 05:15:41 -0500 Subject: [PATCH 44/44] Support default name for copilot-codex models --- llama_cpp/server/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index a7ce63f76..c54e4eb5c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -213,7 +213,11 @@ async def create_completion( assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - llama = llama_proxy(body.model) + llama = llama_proxy( + body.model + if request.url.path != "/v1/engines/copilot-codex/completions" + else "copilot-codex" + ) exclude = { "n",