From 3ec00d01fd41cd87f33fb86effc4c1147f15d134 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:41:17 +0100 Subject: [PATCH 01/44] Update Llama class to handle chat_format & caching --- llama_cpp/llama.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2e18b47a0..982b7410c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -758,7 +758,11 @@ def __init__( numa: bool = False, # Chat Format Params chat_format: str = "llama-2", - chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, + clip_model_path: Optional[str] = None, # only for multimodal, when chat_format=llava-1-5 + # Cache + cache: bool = False, + cache_type: str = "ram", + cache_size: int = 2 << 30, # Misc verbose: bool = True, # Extra Params @@ -791,7 +795,10 @@ def __init__( lora_path: Path to a LoRA file to apply to the model. numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. - chat_handler: Optional chat handler to use when calling create_chat_completion. + clip_model_path: Optional clip model path to use when using multimodal mode, expected when chat_format=llava-1-5. + cache: Optional if true enables caching. + cache_type: String can be "ram" or "disk". + cache_size: Number of bytes to cache, defaults to 2GB verbose: Print verbose output to stderr. Raises: @@ -917,6 +924,14 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + chat_handler = None + if chat_format == "llava-1-5": + assert clip_model_path is not None, "clip model not found" + chat_handler = llama_chat_format.Llava15ChatHandler( + clip_model_path=clip_model_path, + verbose=verbose + ) + self.chat_format = chat_format self.chat_handler = chat_handler @@ -934,6 +949,17 @@ def __init__( (n_ctx, self._n_vocab), dtype=np.single ) + if cache: + if cache_type == "disk": + if verbose: + print(f"Using disk cache with size {cache_size}") + cache = LlamaDiskCache(capacity_bytes=cache_size) + else: + if verbose: + print(f"Using ram cache with size {cache_size}") + cache = LlamaRAMCache(capacity_bytes=cache_size) + self.set_cache(cache) + @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None From 6e68a4bd62fe91803880e4da219450589ac70aab Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:41:59 +0100 Subject: [PATCH 02/44] Add settings.py --- llama_cpp/server/settings.py | 142 +++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 llama_cpp/server/settings.py diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py new file mode 100644 index 000000000..46f4c9922 --- /dev/null +++ b/llama_cpp/server/settings.py @@ -0,0 +1,142 @@ +import multiprocessing +from typing import Optional, List, Literal +from pydantic import Field +from pydantic_settings import BaseSettings +import llama_cpp + +# Disable warning for model and model_alias settings +BaseSettings.model_config['protected_namespaces'] = () + +class Settings(BaseSettings): + model: str = Field( + description="The path to the model to use for generating completions." 
+ ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + # Model Params + n_gpu_layers: int = Field( + default=0, + ge=-1, + description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", + ) + main_gpu: int = Field( + default=0, + ge=0, + description="Main GPU to use.", + ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) + use_mmap: bool = Field( + default=llama_cpp.llama_mmap_supported(), + description="Use mmap.", + ) + use_mlock: bool = Field( + default=llama_cpp.llama_mlock_supported(), + description="Use mlock.", + ) + # Context Params + seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.") + n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=1, + description="The number of threads to use.", + ) + n_threads_batch: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=0, + description="The number of threads to use when batch processing.", + ) + rope_scaling_type: int = Field( + default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED + ) + rope_freq_base: float = Field( + default=0.0, description="RoPE base frequency" + ) + rope_freq_scale: float = Field( + default=0.0, description="RoPE frequency scaling factor" + ) + yarn_ext_factor: float = Field( + default=-1.0 + ) + yarn_attn_factor: float = Field( + default=1.0 + ) + yarn_beta_fast: float = Field( + default=32.0 + ) + yarn_beta_slow: float = Field( + default=1.0 + ) + yarn_orig_ctx: int = Field( + default=0 + ) + mul_mat_q: bool = Field( + default=True, description="if true, use experimental mul_mat_q kernels" + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + logits_all: bool = Field(default=True, description="Whether to return logits.") + embedding: bool = Field(default=True, description="Whether to use embeddings.") + # Sampling Params + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + # LoRA Params + lora_base: Optional[str] = Field( + default=None, + description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." + ) + lora_path: Optional[str] = Field( + default=None, + description="Path to a LoRA file to apply to the model.", + ) + # Backend Params + numa: bool = Field( + default=False, + description="Enable NUMA support.", + ) + # Chat Format Params + chat_format: str = Field( + default="llama-2", + description="Chat format to use.", + ) + clip_model_path: Optional[str] = Field( + default=None, + description="Path to a CLIP model to use for multi-modal chat completion.", + ) + # Cache Params + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. 
Only used if cache is True.", + ) + # Misc + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) + # Server Params + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) From e63cffb7bfc8f4d803856b2b9a4ee691dbc18b3e Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:42:49 +0100 Subject: [PATCH 03/44] Add util.py & update __main__.py --- llama_cpp/server/__main__.py | 3 ++- llama_cpp/server/util.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 llama_cpp/server/util.py diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a294ebf8a..e2da83478 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -27,7 +27,8 @@ import uvicorn -from llama_cpp.server.app import create_app, Settings +from llama_cpp.server.app import create_app +from llama_cpp.server.settings import Settings def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py new file mode 100644 index 000000000..e3702794f --- /dev/null +++ b/llama_cpp/server/util.py @@ -0,0 +1,14 @@ +import os +import shutil + +def remove_file(path: str) -> None: + if path and os.path.exists(path): + if os.path.isdir(path): + shutil.rmtree(path) + else: + os.unlink(path) + +def models_root_dir(path = None): + path = os.path.abspath(path or os.environ.get('MODEL', '/models')) + if os.path.isdir(path): return path + return os.path.dirname(path) From 55e33abe56ef2e8ad5ba13186a6678e007293778 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 14:43:05 +0100 Subject: [PATCH 04/44] multimodel --- llama_cpp/server/app.py | 266 +++----------------------------------- llama_cpp/server/model.py | 159 +++++++++++++++++++++++ 2 files changed, 174 insertions(+), 251 deletions(-) create mode 100644 llama_cpp/server/model.py diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 2a6aed81c..2046239cb 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,10 +1,8 @@ import sys import json import traceback -import multiprocessing import time from re import compile, Match, Pattern -from threading import Lock from functools import partial from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict from typing_extensions import TypedDict, Literal @@ -20,7 +18,6 @@ from fastapi.responses import JSONResponse from fastapi.routing import APIRoute from pydantic import BaseModel, Field -from pydantic_settings import BaseSettings from sse_starlette.sse import EventSourceResponse from starlette_context import plugins from starlette_context.middleware import RawContextMiddleware @@ -28,145 +25,10 @@ import numpy as np import numpy.typing as npt - -# Disable warning for model and model_alias settings -BaseSettings.model_config['protected_namespaces'] = () - - -class Settings(BaseSettings): - model: str = Field( - description="The path to the model to use for generating completions." 
- ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", - ) - # Model Params - n_gpu_layers: int = Field( - default=0, - ge=-1, - description="The number of layers to put on the GPU. The rest will be on the CPU. Set -1 to move all to GPU.", - ) - main_gpu: int = Field( - default=0, - ge=0, - description="Main GPU to use.", - ) - tensor_split: Optional[List[float]] = Field( - default=None, - description="Split layers across multiple GPUs in proportion.", - ) - vocab_only: bool = Field( - default=False, description="Whether to only return the vocabulary." - ) - use_mmap: bool = Field( - default=llama_cpp.llama_mmap_supported(), - description="Use mmap.", - ) - use_mlock: bool = Field( - default=llama_cpp.llama_mlock_supported(), - description="Use mlock.", - ) - # Context Params - seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.") - n_ctx: int = Field(default=2048, ge=1, description="The context size.") - n_batch: int = Field( - default=512, ge=1, description="The batch size to use per eval." - ) - n_threads: int = Field( - default=max(multiprocessing.cpu_count() // 2, 1), - ge=1, - description="The number of threads to use.", - ) - n_threads_batch: int = Field( - default=max(multiprocessing.cpu_count() // 2, 1), - ge=0, - description="The number of threads to use when batch processing.", - ) - rope_scaling_type: int = Field( - default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED - ) - rope_freq_base: float = Field( - default=0.0, description="RoPE base frequency" - ) - rope_freq_scale: float = Field( - default=0.0, description="RoPE frequency scaling factor" - ) - yarn_ext_factor: float = Field( - default=-1.0 - ) - yarn_attn_factor: float = Field( - default=1.0 - ) - yarn_beta_fast: float = Field( - default=32.0 - ) - yarn_beta_slow: float = Field( - default=1.0 - ) - yarn_orig_ctx: int = Field( - default=0 - ) - mul_mat_q: bool = Field( - default=True, description="if true, use experimental mul_mat_q kernels" - ) - f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") - logits_all: bool = Field(default=True, description="Whether to return logits.") - embedding: bool = Field(default=True, description="Whether to use embeddings.") - # Sampling Params - last_n_tokens_size: int = Field( - default=64, - ge=0, - description="Last n tokens to keep for repeat penalty calculation.", - ) - # LoRA Params - lora_base: Optional[str] = Field( - default=None, - description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." - ) - lora_path: Optional[str] = Field( - default=None, - description="Path to a LoRA file to apply to the model.", - ) - # Backend Params - numa: bool = Field( - default=False, - description="Enable NUMA support.", - ) - # Chat Format Params - chat_format: str = Field( - default="llama-2", - description="Chat format to use.", - ) - clip_model_path: Optional[str] = Field( - default=None, - description="Path to a CLIP model to use for multi-modal chat completion.", - ) - # Cache Params - cache: bool = Field( - default=False, - description="Use a cache to reduce processing times for evaluated prompts.", - ) - cache_type: Literal["ram", "disk"] = Field( - default="ram", - description="The type of cache to use. Only used if cache is True.", - ) - cache_size: int = Field( - default=2 << 30, - description="The size of the cache in bytes. 
Only used if cache is True.", - ) - # Misc - verbose: bool = Field( - default=True, description="Whether to print debug information." - ) - # Server Params - host: str = Field(default="localhost", description="Listen address") - port: int = Field(default=8000, description="Listen port") - interrupt_requests: bool = Field( - default=True, - description="Whether to interrupt requests when a new request is received.", - ) - +from llama_cpp.server.model import get_llama, llama_outer_lock, set_settings, get_settings +from llama_cpp.server.model import router as models_router +from llama_cpp.server.model import MultiLlama as Llama +from llama_cpp.server.settings import Settings class ErrorResponse(TypedDict): """OpenAI style error response""" @@ -176,7 +38,6 @@ class ErrorResponse(TypedDict): param: Optional[str] code: Optional[str] - class ErrorResponseFormatters: """Collection of formatters for error responses. @@ -243,7 +104,6 @@ def model_not_found( code="model_not_found", ) - class RouteErrorHandler(APIRoute): """Custom APIRoute that handles application errors and exceptions""" @@ -351,13 +211,8 @@ async def custom_route_handler(request: Request) -> Response: return custom_route_handler - router = APIRouter(route_class=RouteErrorHandler) -settings: Optional[Settings] = None -llama: Optional[llama_cpp.Llama] = None - - def create_app(settings: Optional[Settings] = None): if settings is None: settings = Settings() @@ -378,103 +233,11 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - global llama - - ## - chat_handler = None - if settings.chat_format == "llava-1-5": - assert settings.clip_model_path is not None - chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler(clip_model_path=settings.clip_model_path, verbose=settings.verbose) - ## - - llama = llama_cpp.Llama( - model_path=settings.model, - # Model Params - n_gpu_layers=settings.n_gpu_layers, - main_gpu=settings.main_gpu, - tensor_split=settings.tensor_split, - vocab_only=settings.vocab_only, - use_mmap=settings.use_mmap, - use_mlock=settings.use_mlock, - # Context Params - seed=settings.seed, - n_ctx=settings.n_ctx, - n_batch=settings.n_batch, - n_threads=settings.n_threads, - n_threads_batch=settings.n_threads_batch, - rope_scaling_type=settings.rope_scaling_type, - rope_freq_base=settings.rope_freq_base, - rope_freq_scale=settings.rope_freq_scale, - yarn_ext_factor=settings.yarn_ext_factor, - yarn_attn_factor=settings.yarn_attn_factor, - yarn_beta_fast=settings.yarn_beta_fast, - yarn_beta_slow=settings.yarn_beta_slow, - yarn_orig_ctx=settings.yarn_orig_ctx, - mul_mat_q=settings.mul_mat_q, - f16_kv=settings.f16_kv, - logits_all=settings.logits_all, - embedding=settings.embedding, - # Sampling Params - last_n_tokens_size=settings.last_n_tokens_size, - # LoRA Params - lora_base=settings.lora_base, - lora_path=settings.lora_path, - # Backend Params - numa=settings.numa, - # Chat Format Params - chat_format=settings.chat_format, - chat_handler=chat_handler, - # Misc - verbose=settings.verbose, - ) - if settings.cache: - if settings.cache_type == "disk": - if settings.verbose: - print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) - else: - if settings.verbose: - print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) - - cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) - llama.set_cache(cache) - - def 
set_settings(_settings: Settings): - global settings - settings = _settings + app.include_router(models_router) set_settings(settings) return app - -llama_outer_lock = Lock() -llama_inner_lock = Lock() - - -def get_llama(): - # NOTE: This double lock allows the currently streaming llama model to - # check if any other requests are pending in the same thread and cancel - # the stream if so. - llama_outer_lock.acquire() - release_outer_lock = True - try: - llama_inner_lock.acquire() - try: - llama_outer_lock.release() - release_outer_lock = False - yield llama - finally: - llama_inner_lock.release() - finally: - if release_outer_lock: - llama_outer_lock.release() - - -def get_settings(): - yield settings - - async def get_event_publisher( request: Request, inner_send_chan: MemoryObjectSendStream, @@ -676,11 +439,13 @@ def logit_bias_processor( async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + llama: Llama = Depends(get_llama), ) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" + + llama = llama(body.model) exclude = { "n", @@ -728,9 +493,8 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: else: return iterator_or_completion - class CreateEmbeddingRequest(BaseModel): - model: Optional[str] = model_field + model: str = model_field input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] = Field(default=None) @@ -744,15 +508,14 @@ class CreateEmbeddingRequest(BaseModel): } } - @router.post( "/v1/embeddings", ) async def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) + request: CreateEmbeddingRequest, llama: Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.model_dump(exclude={"user"}) + llama(request.model).create_embedding, **request.model_dump(exclude={"user"}) ) @@ -799,7 +562,7 @@ class CreateChatCompletionRequest(BaseModel): ) # ignored or currently unsupported - model: Optional[str] = model_field + model: str = model_field n: Optional[int] = 1 user: Optional[str] = Field(None) @@ -836,7 +599,7 @@ class CreateChatCompletionRequest(BaseModel): async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + llama: Llama = Depends(get_llama), settings: Settings = Depends(get_settings), ) -> llama_cpp.ChatCompletion: exclude = { @@ -846,7 +609,7 @@ async def create_chat_completion( "user", } kwargs = body.model_dump(exclude=exclude) - + llama = llama(body.model) if body.logit_bias is not None: kwargs["logits_processor"] = llama_cpp.LogitsProcessorList( [ @@ -900,6 +663,7 @@ class ModelList(TypedDict): @router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), + llama: Llama = Depends(get_llama), ) -> ModelList: assert llama is not None return { diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py new file mode 100644 index 000000000..237806e37 --- /dev/null +++ b/llama_cpp/server/model.py @@ -0,0 +1,159 @@ +import os +import shutil +import tempfile +from pathlib import Path +from typing import Any, Optional +from threading import Lock +from fastapi import APIRouter, UploadFile, Depends, HTTPException +from fastapi.background import BackgroundTasks +import llama_cpp + +from llama_cpp.server.util import remove_file, models_root_dir +from 
llama_cpp.server.settings import Settings + +class MultiLlama: + _model: Optional[llama_cpp.Llama] = None + _models = {} + + def __init__(self, settings: Settings) -> None: + self._settings = settings + model_root = models_root_dir(settings.model) + for filename in os.listdir(model_root): + if filename.endswith('.gguf'): + self._models[filename.split('.gguf')[0]] = os.path.join(model_root, filename) + + def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: + try: + model_path = self._models[model] + except KeyError: + raise HTTPException(404, f"Model file for {model} NOT found") + + if self._model: + if self._model.model_path == model_path: + return self._model + del self._model + + settings = self._settings + self._model = llama_cpp.Llama( + model_path=model_path, + # Model Params + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, + rope_freq_base=settings.rope_freq_base, + rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, + mul_mat_q=settings.mul_mat_q, + f16_kv=settings.f16_kv, + logits_all=settings.logits_all, + embedding=settings.embedding, + # Sampling Params + last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params + lora_base=settings.lora_base, + lora_path=settings.lora_path, + # Backend Params + numa=settings.numa, + # Chat Format Params + chat_format=settings.chat_format, + clip_model_path=settings.clip_model_path, + # Cache + cache=settings.cache, + cache_type=settings.cache_type, + cache_size=settings.cache_size, + # Misc + verbose=settings.verbose, + **kwargs + ) + return self._model + + def __getitem__(self, model): + return self._models[model] + +LLAMA: Optional[MultiLlama] = None +SETTINGS: Optional[Settings] = None + +def set_settings(settings: Settings): + global SETTINGS + SETTINGS = settings + +def get_settings(): + yield SETTINGS + +def init_llama(): + global LLAMA + LLAMA = MultiLlama(SETTINGS) + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + +def get_llama(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. 
+ llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + if not LLAMA: + init_llama() + llama_outer_lock.release() + release_outer_lock = False + yield LLAMA + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +router = APIRouter( + prefix="/models", + tags=["Model"], + responses={404: {"description": "Not found"}}, +) + +@router.put("/") +async def api_update_model( + file: UploadFile, + background_tasks: BackgroundTasks + # user: User = Depends(RBAC(settings.auth_role)), +): + ext = "".join(Path(file.filename).suffixes) if file.filename else ".gguf" + model_file = tempfile.NamedTemporaryFile(suffix=ext).name + with open(model_file, "wb") as buffer: + shutil.copyfileobj(file.file, buffer) + background_tasks.add_task(remove_file, model_file) + models_dir = os.path.dirname(os.path.abspath(os.environ.get('MODEL', '/'))) + target_path = os.path.join(models_dir, file.filename) + shutil.copy(model_file, target_path) + LLAMA[file.filename] = target_path + return {"model": target_path} + +@router.delete("/{model}") +async def api_delete_model( + model: str, + background_tasks: BackgroundTasks + # user: User = Depends(RBAC(settings.auth_role)), +): + models_dir = models_root_dir() + target_path = os.path.join(models_dir, LLAMA[model]) + if not os.path.exists(target_path): + raise HTTPException(status_code=404, detail=f"Model File NOT Found for {model}") + background_tasks.add_task(remove_file, target_path) + return 'success' From 5ab0010443a65dc3ce84b653a273f15a7b31945e Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:15:08 +0100 Subject: [PATCH 05/44] update settings.py --- llama_cpp/server/app.py | 8 ++++---- llama_cpp/server/model.py | 14 +++++--------- llama_cpp/server/settings.py | 9 +++++++++ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 0d0c0696d..c769d9f56 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -25,10 +25,10 @@ import numpy as np import numpy.typing as npt -from llama_cpp.server.model import get_llama, llama_outer_lock, set_settings, get_settings +from llama_cpp.server.model import get_llama, llama_outer_lock, MultiLlama as Llama from llama_cpp.server.model import router as models_router -from llama_cpp.server.model import MultiLlama as Llama -from llama_cpp.server.settings import Settings +#from llama_cpp.server.model import MultiLlama as Llama +from llama_cpp.server.settings import Settings, SETTINGS, set_settings, get_settings class ErrorResponse(TypedDict): """OpenAI style error response""" @@ -249,7 +249,7 @@ async def get_event_publisher( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if settings.interrupt_requests and llama_outer_lock.locked(): + if SETTINGS.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 237806e37..d375811e3 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -9,7 +9,7 @@ import llama_cpp from llama_cpp.server.util import remove_file, models_root_dir -from llama_cpp.server.settings import Settings +from llama_cpp.server.settings import Settings, SETTINGS class MultiLlama: _model: 
Optional[llama_cpp.Llama] = None @@ -26,6 +26,7 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: model_path = self._models[model] except KeyError: + # TODO server raises 500 ? raise HTTPException(404, f"Model file for {model} NOT found") if self._model: @@ -84,15 +85,10 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: def __getitem__(self, model): return self._models[model] + def __setitem__(self, model, path): + self._models[model] = path + LLAMA: Optional[MultiLlama] = None -SETTINGS: Optional[Settings] = None - -def set_settings(settings: Settings): - global SETTINGS - SETTINGS = settings - -def get_settings(): - yield SETTINGS def init_llama(): global LLAMA diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 46f4c9922..c7f696976 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -140,3 +140,12 @@ class Settings(BaseSettings): default=True, description="Whether to interrupt requests when a new request is received.", ) + +SETTINGS: Optional[Settings] = None + +def set_settings(settings: Settings): + global SETTINGS + SETTINGS = settings + +def get_settings(): + yield SETTINGS From 45bfa0750d3fc8c8e70f35366cf53d8cb16fb137 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:38:06 +0100 Subject: [PATCH 06/44] cleanup --- llama_cpp/server/app.py | 7 ++--- llama_cpp/server/model.py | 54 +++++---------------------------------- 2 files changed, 8 insertions(+), 53 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c769d9f56..41b5508e0 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -26,9 +26,7 @@ import numpy.typing as npt from llama_cpp.server.model import get_llama, llama_outer_lock, MultiLlama as Llama -from llama_cpp.server.model import router as models_router -#from llama_cpp.server.model import MultiLlama as Llama -from llama_cpp.server.settings import Settings, SETTINGS, set_settings, get_settings +from llama_cpp.server.settings import Settings, set_settings, get_settings class ErrorResponse(TypedDict): """OpenAI style error response""" @@ -233,7 +231,6 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - app.include_router(models_router) set_settings(settings) return app @@ -249,7 +246,7 @@ async def get_event_publisher( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if SETTINGS.interrupt_requests and llama_outer_lock.locked(): + if next(get_settings()).interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index d375811e3..268776218 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,15 +1,10 @@ import os -import shutil -import tempfile -from pathlib import Path from typing import Any, Optional from threading import Lock -from fastapi import APIRouter, UploadFile, Depends, HTTPException -from fastapi.background import BackgroundTasks import llama_cpp -from llama_cpp.server.util import remove_file, models_root_dir -from llama_cpp.server.settings import Settings, SETTINGS +from llama_cpp.server.util import models_root_dir +from llama_cpp.server.settings import Settings, get_settings class MultiLlama: _model: 
Optional[llama_cpp.Llama] = None @@ -27,7 +22,7 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: model_path = self._models[model] except KeyError: # TODO server raises 500 ? - raise HTTPException(404, f"Model file for {model} NOT found") + raise Exception(404, f"Model file for {model} NOT found") if self._model: if self._model.model_path == model_path: @@ -90,9 +85,9 @@ def __setitem__(self, model, path): LLAMA: Optional[MultiLlama] = None -def init_llama(): +def _set_llama(settings: Optional[Settings] = None): global LLAMA - LLAMA = MultiLlama(SETTINGS) + LLAMA = MultiLlama(settings or next(get_settings())) llama_outer_lock = Lock() llama_inner_lock = Lock() @@ -107,7 +102,7 @@ def get_llama(): llama_inner_lock.acquire() try: if not LLAMA: - init_llama() + _set_llama() llama_outer_lock.release() release_outer_lock = False yield LLAMA @@ -116,40 +111,3 @@ def get_llama(): finally: if release_outer_lock: llama_outer_lock.release() - - -router = APIRouter( - prefix="/models", - tags=["Model"], - responses={404: {"description": "Not found"}}, -) - -@router.put("/") -async def api_update_model( - file: UploadFile, - background_tasks: BackgroundTasks - # user: User = Depends(RBAC(settings.auth_role)), -): - ext = "".join(Path(file.filename).suffixes) if file.filename else ".gguf" - model_file = tempfile.NamedTemporaryFile(suffix=ext).name - with open(model_file, "wb") as buffer: - shutil.copyfileobj(file.file, buffer) - background_tasks.add_task(remove_file, model_file) - models_dir = os.path.dirname(os.path.abspath(os.environ.get('MODEL', '/'))) - target_path = os.path.join(models_dir, file.filename) - shutil.copy(model_file, target_path) - LLAMA[file.filename] = target_path - return {"model": target_path} - -@router.delete("/{model}") -async def api_delete_model( - model: str, - background_tasks: BackgroundTasks - # user: User = Depends(RBAC(settings.auth_role)), -): - models_dir = models_root_dir() - target_path = os.path.join(models_dir, LLAMA[model]) - if not os.path.exists(target_path): - raise HTTPException(status_code=404, detail=f"Model File NOT Found for {model}") - background_tasks.add_task(remove_file, target_path) - return 'success' From 76c0168a454825c5bf124e94247482a74bbc76e4 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:43:37 +0100 Subject: [PATCH 07/44] delete util.py --- llama_cpp/server/model.py | 7 +++++-- llama_cpp/server/util.py | 14 -------------- 2 files changed, 5 insertions(+), 16 deletions(-) delete mode 100644 llama_cpp/server/util.py diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 268776218..ff715e124 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -2,10 +2,13 @@ from typing import Any, Optional from threading import Lock import llama_cpp - -from llama_cpp.server.util import models_root_dir from llama_cpp.server.settings import Settings, get_settings +def models_root_dir(path = None): + path = os.path.abspath(path or os.environ.get('MODEL', '/models')) + if os.path.isdir(path): return path + return os.path.dirname(path) + class MultiLlama: _model: Optional[llama_cpp.Llama] = None _models = {} diff --git a/llama_cpp/server/util.py b/llama_cpp/server/util.py deleted file mode 100644 index e3702794f..000000000 --- a/llama_cpp/server/util.py +++ /dev/null @@ -1,14 +0,0 @@ -import os -import shutil - -def remove_file(path: str) -> None: - if path and os.path.exists(path): - if os.path.isdir(path): - shutil.rmtree(path) - else: - 
os.unlink(path) - -def models_root_dir(path = None): - path = os.path.abspath(path or os.environ.get('MODEL', '/models')) - if os.path.isdir(path): return path - return os.path.dirname(path) From 97a6a218b320e010d2ac3b780a6ad2eba3d55472 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:07:33 +0100 Subject: [PATCH 08/44] Fix /v1/models endpoint --- llama_cpp/server/app.py | 7 ++----- llama_cpp/server/model.py | 10 +++++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 41b5508e0..c6632c370 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -655,17 +655,14 @@ async def get_models( settings: Settings = Depends(get_settings), llama: Llama = Depends(get_llama), ) -> ModelList: - assert llama is not None return { "object": "list", "data": [ { - "id": settings.model_alias - if settings.model_alias is not None - else llama.model_path, + "id": model, "object": "model", "owned_by": "me", "permissions": [], - } + } for model in llama._models ], } diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ff715e124..b3ecf6968 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -4,8 +4,12 @@ import llama_cpp from llama_cpp.server.settings import Settings, get_settings +FILE_EXT = ".gguf" +MODEL_ENV_ARG = "MODEL" +DEFAULT_MODEL_DIR = "/models" + def models_root_dir(path = None): - path = os.path.abspath(path or os.environ.get('MODEL', '/models')) + path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) if os.path.isdir(path): return path return os.path.dirname(path) @@ -17,8 +21,8 @@ def __init__(self, settings: Settings) -> None: self._settings = settings model_root = models_root_dir(settings.model) for filename in os.listdir(model_root): - if filename.endswith('.gguf'): - self._models[filename.split('.gguf')[0]] = os.path.join(model_root, filename) + if filename.endswith(FILE_EXT): + self._models[filename.split(FILE_EXT)[0]] = os.path.join(model_root, filename) def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: From fb2a1e782bf1ec20ce56b38882256051c67e3642 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 16:21:50 +0100 Subject: [PATCH 09/44] MultiLlama now iterable, app check-alive on "/" --- llama_cpp/server/app.py | 7 +++++-- llama_cpp/server/model.py | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c6632c370..66235ec6b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -232,6 +232,10 @@ def create_app(settings: Optional[Settings] = None): ) app.include_router(router) + @app.get('/') + async def root(): + return "pong" + set_settings(settings) return app @@ -652,7 +656,6 @@ class ModelList(TypedDict): @router.get("/v1/models") async def get_models( - settings: Settings = Depends(get_settings), llama: Llama = Depends(get_llama), ) -> ModelList: return { @@ -663,6 +666,6 @@ async def get_models( "object": "model", "owned_by": "me", "permissions": [], - } for model in llama._models + } for model in llama ], } diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index b3ecf6968..a957a7159 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -90,6 +90,10 @@ def __getitem__(self, model): def __setitem__(self, model, path): self._models[model] = path + def __iter__(self): + for model in self._models: 
+ yield model + LLAMA: Optional[MultiLlama] = None def _set_llama(settings: Optional[Settings] = None): From 3f150ac93f08cfa9fa2882d7f3da319dba436046 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 17:54:26 +0100 Subject: [PATCH 10/44] instant model init if file is given --- llama_cpp/server/app.py | 1 + llama_cpp/server/model.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 66235ec6b..95865822b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -237,6 +237,7 @@ async def root(): return "pong" set_settings(settings) + next(get_llama()) return app async def get_event_publisher( diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index a957a7159..46c2c1e1b 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -23,6 +23,8 @@ def __init__(self, settings: Settings) -> None: for filename in os.listdir(model_root): if filename.endswith(FILE_EXT): self._models[filename.split(FILE_EXT)[0]] = os.path.join(model_root, filename) + if os.path.isfile(settings.model): + self(settings.model.split(os.path.sep)[-1].split(FILE_EXT)[0]) def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: From e71946ce26d55c62940c40327965e0f28549d74e Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Tue, 21 Nov 2023 18:16:14 +0100 Subject: [PATCH 11/44] backward compability --- llama_cpp/server/model.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 46c2c1e1b..065c4e97d 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,6 +1,7 @@ import os from typing import Any, Optional from threading import Lock +import logging import llama_cpp from llama_cpp.server.settings import Settings, get_settings @@ -8,6 +9,8 @@ MODEL_ENV_ARG = "MODEL" DEFAULT_MODEL_DIR = "/models" +logger = logging.getLogger("uvicorn") + def models_root_dir(path = None): path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) if os.path.isdir(path): return path @@ -30,8 +33,11 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: try: model_path = self._models[model] except KeyError: - # TODO server raises 500 ? - raise Exception(404, f"Model file for {model} NOT found") + if self._model: + if self._settings.verbose: logger.info(f"Model file for {model} NOT found! 
Using preloaded") + return self._model + else: raise Exception(404, f"Model file for {model} NOT found") + if self._model: if self._model.model_path == model_path: From 55a9767067029c5219aa8681ca4a871ba8850b53 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:30:59 +0100 Subject: [PATCH 12/44] revert model param mandatory --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 95865822b..e7238e180 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -486,7 +486,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion class CreateEmbeddingRequest(BaseModel): - model: str = model_field + model: [str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] = Field(default=None) @@ -558,7 +558,7 @@ class CreateChatCompletionRequest(BaseModel): ) # ignored or currently unsupported - model: str = model_field + model: [str] = model_field n: Optional[int] = 1 user: Optional[str] = Field(None) From 3c4b526041958e25e1dcc095491d9693aa285fdf Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 18:19:11 +0100 Subject: [PATCH 13/44] fix error --- llama_cpp/llama.py | 2 ++ llama_cpp/server/app.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 097eacfc9..0209cacb0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -924,6 +924,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + # TODO move out of class chat_handler = None if chat_format == "llava-1-5": assert clip_model_path is not None, "clip model not found" @@ -931,6 +932,7 @@ def __init__( clip_model_path=clip_model_path, verbose=verbose ) + ## self.chat_format = chat_format self.chat_handler = chat_handler diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index e7238e180..b21c28354 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -486,7 +486,7 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion class CreateEmbeddingRequest(BaseModel): - model: [str] = model_field + model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") user: Optional[str] = Field(default=None) @@ -558,7 +558,7 @@ class CreateChatCompletionRequest(BaseModel): ) # ignored or currently unsupported - model: [str] = model_field + model: Optional[str] = model_field n: Optional[int] = 1 user: Optional[str] = Field(None) From 10a2d32655722b2a2baafd71cde5d07ea2fea240 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 19:21:18 +0100 Subject: [PATCH 14/44] handle individual model config json --- llama_cpp/server/model.py | 23 +++++++++++++++-------- llama_cpp/server/settings.py | 23 ++++++++++++++--------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 065c4e97d..7efa35d18 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -3,7 +3,7 @@ from threading import Lock import logging import llama_cpp -from llama_cpp.server.settings import Settings, get_settings +from llama_cpp.server.settings import Settings, ModelSettings, get_settings FILE_EXT = ".gguf" MODEL_ENV_ARG = 
"MODEL" @@ -29,22 +29,30 @@ def __init__(self, settings: Settings) -> None: if os.path.isfile(settings.model): self(settings.model.split(os.path.sep)[-1].split(FILE_EXT)[0]) - def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: + def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: + # handle backward compatibility, model param optional try: model_path = self._models[model] except KeyError: if self._model: - if self._settings.verbose: logger.info(f"Model file for {model} NOT found! Using preloaded") + if self._settings.verbose: logger.warn(f"Model file for {model} NOT found! Using preloaded") return self._model else: raise Exception(404, f"Model file for {model} NOT found") - if self._model: if self._model.model_path == model_path: return self._model del self._model - settings = self._settings + settings_path = os.path.join(os.path.dirname(model_path), + model_path.split(os.path.sep)[-1].split(FILE_EXT)[0] + ".json") + try: + with open(settings_path, 'rb') as f: + settings = ModelSettings.model_validate_json(f.read()) + except Exception as e: + if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! Using default") + settings = self._settings + self._model = llama_cpp.Llama( model_path=model_path, # Model Params @@ -88,14 +96,13 @@ def __call__(self, model: str, **kwargs: Any) -> llama_cpp.Llama: cache_size=settings.cache_size, # Misc verbose=settings.verbose, - **kwargs ) return self._model - def __getitem__(self, model): + def __getitem__(self, model: str) -> str: return self._models[model] - def __setitem__(self, model, path): + def __setitem__(self, model: str, path: str): self._models[model] = path def __iter__(self): diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index c7f696976..ad5c7ed77 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,20 +1,13 @@ import multiprocessing from typing import Optional, List, Literal from pydantic import Field -from pydantic_settings import BaseSettings +from pydantic_settings import BaseSettings, SettingsConfigDict import llama_cpp # Disable warning for model and model_alias settings BaseSettings.model_config['protected_namespaces'] = () -class Settings(BaseSettings): - model: str = Field( - description="The path to the model to use for generating completions." - ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", - ) +class ModelSettings(BaseSettings): # Model Params n_gpu_layers: int = Field( default=0, @@ -133,6 +126,9 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) + +class ServerSettings(BaseSettings): + model_config = SettingsConfigDict(env_file='.env') # Server Params host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") @@ -141,6 +137,15 @@ class Settings(BaseSettings): description="Whether to interrupt requests when a new request is received.", ) +class Settings(ModelSettings, ServerSettings): + model: str = Field( + description="The path to the model to use for generating completions." 
+ ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + SETTINGS: Optional[Settings] = None def set_settings(settings: Settings): From ee71f2088c29fb812f5b9972c701806aa53920a9 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 20:18:09 +0100 Subject: [PATCH 15/44] refactor --- llama_cpp/server/model.py | 46 +++------------------------------------ 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 7efa35d18..f76fac3c2 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -18,7 +18,7 @@ def models_root_dir(path = None): class MultiLlama: _model: Optional[llama_cpp.Llama] = None - _models = {} + _models: dict[str,str] = {} def __init__(self, settings: Settings) -> None: self._settings = settings @@ -49,53 +49,13 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: try: with open(settings_path, 'rb') as f: settings = ModelSettings.model_validate_json(f.read()) - except Exception as e: + except: if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! Using default") settings = self._settings self._model = llama_cpp.Llama( model_path=model_path, - # Model Params - n_gpu_layers=settings.n_gpu_layers, - main_gpu=settings.main_gpu, - tensor_split=settings.tensor_split, - vocab_only=settings.vocab_only, - use_mmap=settings.use_mmap, - use_mlock=settings.use_mlock, - # Context Params - seed=settings.seed, - n_ctx=settings.n_ctx, - n_batch=settings.n_batch, - n_threads=settings.n_threads, - n_threads_batch=settings.n_threads_batch, - rope_scaling_type=settings.rope_scaling_type, - rope_freq_base=settings.rope_freq_base, - rope_freq_scale=settings.rope_freq_scale, - yarn_ext_factor=settings.yarn_ext_factor, - yarn_attn_factor=settings.yarn_attn_factor, - yarn_beta_fast=settings.yarn_beta_fast, - yarn_beta_slow=settings.yarn_beta_slow, - yarn_orig_ctx=settings.yarn_orig_ctx, - mul_mat_q=settings.mul_mat_q, - f16_kv=settings.f16_kv, - logits_all=settings.logits_all, - embedding=settings.embedding, - # Sampling Params - last_n_tokens_size=settings.last_n_tokens_size, - # LoRA Params - lora_base=settings.lora_base, - lora_path=settings.lora_path, - # Backend Params - numa=settings.numa, - # Chat Format Params - chat_format=settings.chat_format, - clip_model_path=settings.clip_model_path, - # Cache - cache=settings.cache, - cache_type=settings.cache_type, - cache_size=settings.cache_size, - # Misc - verbose=settings.verbose, + **(settings.model_dump(exclude={"model",})) ) return self._model From ea0fcca8b0a939169a33a4ebf63819a6e441a9e3 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 20:47:57 +0100 Subject: [PATCH 16/44] revert chathandler/clip_model changes --- llama_cpp/llama.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 0209cacb0..ce7053ab0 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -758,7 +758,7 @@ def __init__( numa: bool = False, # Chat Format Params chat_format: str = "llama-2", - clip_model_path: Optional[str] = None, # only for multimodal, when chat_format=llava-1-5 + chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None, # Cache cache: bool = False, cache_type: str = "ram", @@ -795,7 +795,7 @@ def __init__( lora_path: Path to a LoRA file to apply to the model. 
numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. - clip_model_path: Optional clip model path to use when using multimodal mode, expected when chat_format=llava-1-5. + chat_handler: Optional chat handler to use when calling create_chat_completion. cache: Optional if true enables caching. cache_type: String can be "ram" or "disk". cache_size: Number of bytes to cache, defaults to 2GB @@ -923,16 +923,6 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - - # TODO move out of class - chat_handler = None - if chat_format == "llava-1-5": - assert clip_model_path is not None, "clip model not found" - chat_handler = llama_chat_format.Llava15ChatHandler( - clip_model_path=clip_model_path, - verbose=verbose - ) - ## self.chat_format = chat_format self.chat_handler = chat_handler From 6f5e60a896198a885f707266dbe7048170441fbd Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 22 Nov 2023 21:10:26 +0100 Subject: [PATCH 17/44] handle chat_handler in MulitLlama() --- llama_cpp/server/model.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index f76fac3c2..cb24e1ef2 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -53,9 +53,18 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! Using default") settings = self._settings + chat_handler = None + if settings.chat_format == "llava-1-5": + assert settings.clip_model_path is not None, "clip model not found" + chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( + clip_model_path=settings.clip_model_path, + verbose=settings.verbose + ) + self._model = llama_cpp.Llama( model_path=model_path, - **(settings.model_dump(exclude={"model",})) + chat_handler=chat_handler, + **(settings.model_dump(exclude={"model","clip_model_path",})) ) return self._model From d9d696d0001a27b307cc8d06698a6ed8e7bb26d2 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:31:30 +0100 Subject: [PATCH 18/44] split settings into server/llama --- llama_cpp/server/__main__.py | 32 +++++++++++++++++++++++++------- llama_cpp/server/settings.py | 25 +++++++++++++++---------- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index e2da83478..f109b4682 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -22,13 +22,14 @@ """ import os +import sys import argparse from typing import List, Literal, Union import uvicorn from llama_cpp.server.app import create_app -from llama_cpp.server.settings import Settings +from llama_cpp.server.settings import Settings, ServerSettings, set_settings def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: @@ -68,9 +69,9 @@ def parse_bool_arg(arg): else: raise ValueError(f'Invalid boolean argument: {arg}') -if __name__ == "__main__": +def create_parser(settings_dict): parser = argparse.ArgumentParser() - for name, field in Settings.model_fields.items(): + for name, field in settings_dict.items(): description = field.description if field.default is not None and description is not None: 
description += f" (default: {field.default})" @@ -91,11 +92,28 @@ def parse_bool_arg(arg): type=parse_bool_arg, help=f"{description}", ) + return parser - args = parser.parse_args() - settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) - app = create_app(settings=settings) +if __name__ == "__main__": + server_arg_parser = create_parser(ServerSettings.model_fields) + parser = create_parser(Settings.model_fields) + + try: + server_args, _ = server_arg_parser.parse_known_args() + server_settings = ServerSettings(**{k: v for k, v in vars(server_args).items() if v is not None}) + set_settings(server_settings) + if server_settings.config and os.path.exists(server_settings.config): + with open(server_settings.config, 'rb') as f: + llama_settings = Settings.model_validate_json(f.read()) + else: + args, _ = parser.parse_known_args() + llama_settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) + app = create_app(settings=llama_settings) + except Exception as e: + print(e, file=sys.stderr) + parser.print_help() + sys.exit(1) uvicorn.run( - app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) + app, host=server_settings.host, port=server_settings.port ) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ad5c7ed77..98215a891 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,3 +1,4 @@ +import os import multiprocessing from typing import Optional, List, Literal from pydantic import Field @@ -8,6 +9,13 @@ BaseSettings.model_config['protected_namespaces'] = () class ModelSettings(BaseSettings): + model: str = Field( + description="The path to the model to use for generating completions." + ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) # Model Params n_gpu_layers: int = Field( default=0, @@ -129,26 +137,23 @@ class ModelSettings(BaseSettings): class ServerSettings(BaseSettings): model_config = SettingsConfigDict(env_file='.env') - # Server Params host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", ) + config: Optional[str] = Field(default=None, description="Path to config file") -class Settings(ModelSettings, ServerSettings): - model: str = Field( - description="The path to the model to use for generating completions." 
- ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", +class Settings(ModelSettings): + models: Optional[List[ModelSettings]] = Field( + default = [], + description="Model configs, overwrites default config" ) -SETTINGS: Optional[Settings] = None +SETTINGS: Optional[ServerSettings] = None -def set_settings(settings: Settings): +def set_settings(settings: ServerSettings): global SETTINGS SETTINGS = settings From e71fc92e962c89127139774f82489b2a5904bf0a Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:31:54 +0100 Subject: [PATCH 19/44] reduce global vars --- llama_cpp/server/app.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b21c28354..abf1cb043 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -25,7 +25,7 @@ import numpy as np import numpy.typing as npt -from llama_cpp.server.model import get_llama, llama_outer_lock, MultiLlama as Llama +from llama_cpp.server.model import get_llama, set_llama, llama_outer_lock, LlamaProxy as Llama from llama_cpp.server.settings import Settings, set_settings, get_settings class ErrorResponse(TypedDict): @@ -211,10 +211,7 @@ async def custom_route_handler(request: Request) -> Response: router = APIRouter(route_class=RouteErrorHandler) -def create_app(settings: Optional[Settings] = None): - if settings is None: - settings = Settings() - +def create_app(settings: Settings): middleware = [ Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) ] @@ -236,8 +233,7 @@ def create_app(settings: Optional[Settings] = None): async def root(): return "pong" - set_settings(settings) - next(get_llama()) + set_llama(settings) return app async def get_event_publisher( @@ -596,7 +592,7 @@ async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: Llama = Depends(get_llama), - settings: Settings = Depends(get_settings), + #settings: Settings = Depends(get_settings), ) -> llama_cpp.ChatCompletion: exclude = { "n", From 522f0bd5a7426a058dbd643892a04b6a5500b087 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Thu, 23 Nov 2023 20:32:53 +0100 Subject: [PATCH 20/44] Update LlamaProxy to handle config files --- llama_cpp/server/model.py | 84 ++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index cb24e1ef2..2ef63e119 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,57 +1,68 @@ import os -from typing import Any, Optional -from threading import Lock import logging +from typing import Optional, Union +from threading import Lock import llama_cpp -from llama_cpp.server.settings import Settings, ModelSettings, get_settings +from llama_cpp.server.settings import Settings, ModelSettings FILE_EXT = ".gguf" MODEL_ENV_ARG = "MODEL" -DEFAULT_MODEL_DIR = "/models" +DEFAULT_MODEL_DIR = os.path.join(os.getcwd(), "/models") logger = logging.getLogger("uvicorn") -def models_root_dir(path = None): +def models_root_dir(path: Optional[str] = None): path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) if os.path.isdir(path): return path return os.path.dirname(path) -class MultiLlama: +def model_alias(path: str) -> str: + return path.split(os.path.sep)[-1].split(FILE_EXT)[0] + +class LlamaProxy: _model: 
Optional[llama_cpp.Llama] = None - _models: dict[str,str] = {} + _models: dict[str,ModelSettings] = {} def __init__(self, settings: Settings) -> None: self._settings = settings + for model in settings.models: + if not model.model_alias: + model.model_alias = model_alias(model.model) + self._models[model.model_alias] = model + model_root = models_root_dir(settings.model) for filename in os.listdir(model_root): if filename.endswith(FILE_EXT): - self._models[filename.split(FILE_EXT)[0]] = os.path.join(model_root, filename) + alias = model_alias(filename) + if alias in self._models: continue + exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config'} + default_settings = settings.model_dump(exclude=exclude) + self._models[alias] = ModelSettings(model=os.path.join(model_root, filename), + model_alias=alias, **default_settings) + if os.path.isfile(settings.model): - self(settings.model.split(os.path.sep)[-1].split(FILE_EXT)[0]) + alias = settings.model_alias + if alias is None: alias = model_alias(settings.model) + if alias not in self._models: + self._models[alias] = settings + self(alias) def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: # handle backward compatibility, model param optional try: - model_path = self._models[model] + model_alias = self._models[model].model_alias except KeyError: if self._model: - if self._settings.verbose: logger.warn(f"Model file for {model} NOT found! Using preloaded") + if self._settings.verbose: logger.warn(f"Model {model} NOT found! Using {self._model.alias}") return self._model - else: raise Exception(404, f"Model file for {model} NOT found") + else: raise Exception(404, f"Model {model} NOT found!") if self._model: - if self._model.model_path == model_path: + if self._model.alias == model_alias: return self._model del self._model - settings_path = os.path.join(os.path.dirname(model_path), - model_path.split(os.path.sep)[-1].split(FILE_EXT)[0] + ".json") - try: - with open(settings_path, 'rb') as f: - settings = ModelSettings.model_validate_json(f.read()) - except: - if self._settings.verbose: logger.warn(f"Loading settings for {model} FAILED! 
Using default") - settings = self._settings + settings = self._models[model] chat_handler = None if settings.chat_format == "llava-1-5": @@ -62,31 +73,34 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: ) self._model = llama_cpp.Llama( - model_path=model_path, - chat_handler=chat_handler, - **(settings.model_dump(exclude={"model","clip_model_path",})) + model_path=settings.model, + **(settings.model_dump(exclude={'model', 'models'})), + chat_handler=chat_handler ) + self._model.alias = model return self._model - def __getitem__(self, model: str) -> str: - return self._models[model] - - def __setitem__(self, model: str, path: str): - self._models[model] = path + def __getitem__(self, model: str): + return self._models[model].model_dump() + + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + if isinstance(settings, bytes) or isinstance(settings, str): + settings = ModelSettings.model_validate_json(settings) + self._models[model] = settings def __iter__(self): for model in self._models: yield model -LLAMA: Optional[MultiLlama] = None - -def _set_llama(settings: Optional[Settings] = None): - global LLAMA - LLAMA = MultiLlama(settings or next(get_settings())) +LLAMA: Optional[LlamaProxy] = None llama_outer_lock = Lock() llama_inner_lock = Lock() +def set_llama(settings: Settings): + global LLAMA + LLAMA = LlamaProxy(settings) + def get_llama(): # NOTE: This double lock allows the currently streaming llama model to # check if any other requests are pending in the same thread and cancel @@ -96,8 +110,6 @@ def get_llama(): try: llama_inner_lock.acquire() try: - if not LLAMA: - _set_llama() llama_outer_lock.release() release_outer_lock = False yield LLAMA From 6e0ab3e18bbcaba8056a91d96281335a3d0745cc Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Fri, 24 Nov 2023 03:00:55 +0100 Subject: [PATCH 21/44] Add free method to LlamaProxy --- llama_cpp/server/model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 2ef63e119..12c84c94b 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -84,13 +84,16 @@ def __getitem__(self, model: str): return self._models[model].model_dump() def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): - if isinstance(settings, bytes) or isinstance(settings, str): + if isinstance(settings, (bytes, str)): settings = ModelSettings.model_validate_json(settings) self._models[model] = settings def __iter__(self): for model in self._models: yield model + + def free(self): + if self._model: del self._model LLAMA: Optional[LlamaProxy] = None From ec9a9db6a21bea1605239acd3e33ec2cc6e7c6f7 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Sat, 25 Nov 2023 22:41:15 +0100 Subject: [PATCH 22/44] update arg parsers & install server alias --- llama_cpp/server/__main__.py | 24 ++++++++++++------------ llama_cpp/server/settings.py | 2 +- pyproject.toml | 3 +++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index f109b4682..22c2260f3 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,6 +31,8 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import Settings, ServerSettings, set_settings +EXE_NAME = 'llama_server' + def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: return 
type(annotation.__args__[0]) @@ -69,11 +71,12 @@ def parse_bool_arg(arg): else: raise ValueError(f'Invalid boolean argument: {arg}') -def create_parser(settings_dict): - parser = argparse.ArgumentParser() - for name, field in settings_dict.items(): +def main(): + description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" + parser = argparse.ArgumentParser(EXE_NAME, description=description) + for name, field in (ServerSettings.model_fields|Settings.model_fields).items(): description = field.description - if field.default is not None and description is not None: + if field.default and description and not field.is_required(): description += f" (default: {field.default})" base_type = get_base_type(field.annotation) if field.annotation is not None else str list_type = contains_list_type(field.annotation) @@ -92,21 +95,15 @@ def create_parser(settings_dict): type=parse_bool_arg, help=f"{description}", ) - return parser - -if __name__ == "__main__": - server_arg_parser = create_parser(ServerSettings.model_fields) - parser = create_parser(Settings.model_fields) try: - server_args, _ = server_arg_parser.parse_known_args() - server_settings = ServerSettings(**{k: v for k, v in vars(server_args).items() if v is not None}) + args = parser.parse_args() + server_settings = ServerSettings(**{k: v for k, v in vars(args).items() if v is not None}) set_settings(server_settings) if server_settings.config and os.path.exists(server_settings.config): with open(server_settings.config, 'rb') as f: llama_settings = Settings.model_validate_json(f.read()) else: - args, _ = parser.parse_known_args() llama_settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) app = create_app(settings=llama_settings) except Exception as e: @@ -117,3 +114,6 @@ def create_parser(settings_dict): uvicorn.run( app, host=server_settings.host, port=server_settings.port ) + +if __name__ == "__main__": + main() diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 98215a891..ecd034d2e 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -136,7 +136,7 @@ class ModelSettings(BaseSettings): ) class ServerSettings(BaseSettings): - model_config = SettingsConfigDict(env_file='.env') + model_config = SettingsConfigDict(env_file='.env', extra='ignore') host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( diff --git a/pyproject.toml b/pyproject.toml index 6c1022581..ffb9f2637 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,6 +63,9 @@ sdist.include = [".git", "vendor/llama.cpp/.git"] provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" +[project.scripts] +llama_server = "llama_cpp.server.__main__:main" + [project.urls] Homepage = "https://github.com/abetlen/llama-cpp-python" Issues = "https://github.com/abetlen/llama-cpp-python/issues" From fd731d7512e4191a137e7b1aa0ae669f5f28a044 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:35:46 +0100 Subject: [PATCH 23/44] refactor cache settings --- llama_cpp/llama.py | 18 ------------------ llama_cpp/server/model.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c1bb9d95e..cc7f562b5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -759,10 +759,6 @@ def __init__( # Chat Format Params chat_format: str = "llama-2", chat_handler: 
Optional[llama_chat_format.LlamaChatCompletionHandler] = None, - # Cache - cache: bool = False, - cache_type: str = "ram", - cache_size: int = 2 << 30, # Misc verbose: bool = True, # Extra Params @@ -820,9 +816,6 @@ def __init__( numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init) chat_format: String specifying the chat format to use when calling create_chat_completion. chat_handler: Optional chat handler to use when calling create_chat_completion. - cache: Optional if true enables caching. - cache_type: String can be "ram" or "disk". - cache_size: Number of bytes to cache, defaults to 2GB verbose: Print verbose output to stderr. Raises: @@ -965,17 +958,6 @@ def __init__( (n_ctx, self._n_vocab), dtype=np.single ) - if cache: - if cache_type == "disk": - if verbose: - print(f"Using disk cache with size {cache_size}") - cache = LlamaDiskCache(capacity_bytes=cache_size) - else: - if verbose: - print(f"Using ram cache with size {cache_size}") - cache = LlamaRAMCache(capacity_bytes=cache_size) - self.set_cache(cache) - @property def ctx(self) -> llama_cpp.llama_context_p: assert self._ctx.ctx is not None diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 12c84c94b..ec9bd9416 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -78,6 +78,16 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: chat_handler=chat_handler ) self._model.alias = model + if settings.cache: + if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") + cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") + cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + self._model.set_cache(cache) return self._model def __getitem__(self, model: str): From 288fa85d66b74123ca5543ed0801cbddde031ba2 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:36:43 +0100 Subject: [PATCH 24/44] change server executable name --- llama_cpp/server/__main__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 22c2260f3..566112689 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,7 +31,7 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import Settings, ServerSettings, set_settings -EXE_NAME = 'llama_server' +EXE_NAME = 'llama_cpp.server' def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: diff --git a/pyproject.toml b/pyproject.toml index ffb9f2637..4662af173 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,7 +64,7 @@ provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" [project.scripts] -llama_server = "llama_cpp.server.__main__:main" +llama_cpp.server = "llama_cpp.server.__main__:main" [project.urls] Homepage = "https://github.com/abetlen/llama-cpp-python" From b64742b0f3ffd5f8463eaf624cdf1395c573e66b Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:46:07 +0100 Subject: [PATCH 25/44] better var name --- llama_cpp/server/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 
abf1cb043..ddc23101c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -659,10 +659,10 @@ async def get_models( "object": "list", "data": [ { - "id": model, + "id": model_alias, "object": "model", "owned_by": "me", "permissions": [], - } for model in llama + } for model_alias in llama ], } From bc5cf51c64a95bfc9926e1bc58166059711a1cd8 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 18:48:25 +0100 Subject: [PATCH 26/44] whitespace --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index cc7f562b5..df08c162b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -940,7 +940,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - + self.chat_format = chat_format self.chat_handler = chat_handler From 5fd9892ae4470d4464cc216896ccf952f2922896 Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:34:04 +0100 Subject: [PATCH 27/44] Revert "whitespace" This reverts commit bc5cf51c64a95bfc9926e1bc58166059711a1cd8. --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 807654889..1230b6100 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -950,7 +950,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - + self.chat_format = chat_format self.chat_handler = chat_handler From 7b1c17b4fdedb20759fcbbb9aa27e1047cea5b4b Mon Sep 17 00:00:00 2001 From: Dave <69651599+D4ve-R@users.noreply.github.com> Date: Fri, 1 Dec 2023 14:03:01 +0100 Subject: [PATCH 28/44] remove exe_name --- llama_cpp/server/__main__.py | 4 +--- pyproject.toml | 3 --- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 566112689..55ffc5f48 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,8 +31,6 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import Settings, ServerSettings, set_settings -EXE_NAME = 'llama_cpp.server' - def get_base_type(annotation): if getattr(annotation, '__origin__', None) is Literal: return type(annotation.__args__[0]) @@ -73,7 +71,7 @@ def parse_bool_arg(arg): def main(): description = "🦙 Llama.cpp python server. 
Host your own LLMs!🚀" - parser = argparse.ArgumentParser(EXE_NAME, description=description) + parser = argparse.ArgumentParser(description=description) for name, field in (ServerSettings.model_fields|Settings.model_fields).items(): description = field.description if field.default and description and not field.is_required(): diff --git a/pyproject.toml b/pyproject.toml index 4662af173..6c1022581 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,9 +63,6 @@ sdist.include = [".git", "vendor/llama.cpp/.git"] provider = "scikit_build_core.metadata.regex" input = "llama_cpp/__init__.py" -[project.scripts] -llama_cpp.server = "llama_cpp.server.__main__:main" - [project.urls] Homepage = "https://github.com/abetlen/llama-cpp-python" Issues = "https://github.com/abetlen/llama-cpp-python/issues" From ba36629b5a52302a2439c3b2e0248b8118601370 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:12:53 -0500 Subject: [PATCH 29/44] Fix merge bugs --- llama_cpp/server/model.py | 40 +++++++++++++++++++++-- llama_cpp/server/settings.py | 62 +++++++++++++++++++----------------- 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ec9bd9416..18816281a 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -74,10 +74,44 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: self._model = llama_cpp.Llama( model_path=settings.model, - **(settings.model_dump(exclude={'model', 'models'})), - chat_handler=chat_handler + # Model Params + n_gpu_layers=settings.n_gpu_layers, + main_gpu=settings.main_gpu, + tensor_split=settings.tensor_split, + vocab_only=settings.vocab_only, + use_mmap=settings.use_mmap, + use_mlock=settings.use_mlock, + # Context Params + seed=settings.seed, + n_ctx=settings.n_ctx, + n_batch=settings.n_batch, + n_threads=settings.n_threads, + n_threads_batch=settings.n_threads_batch, + rope_scaling_type=settings.rope_scaling_type, + rope_freq_base=settings.rope_freq_base, + rope_freq_scale=settings.rope_freq_scale, + yarn_ext_factor=settings.yarn_ext_factor, + yarn_attn_factor=settings.yarn_attn_factor, + yarn_beta_fast=settings.yarn_beta_fast, + yarn_beta_slow=settings.yarn_beta_slow, + yarn_orig_ctx=settings.yarn_orig_ctx, + mul_mat_q=settings.mul_mat_q, + logits_all=settings.logits_all, + embedding=settings.embedding, + offload_kqv=settings.offload_kqv, + # Sampling Params + last_n_tokens_size=settings.last_n_tokens_size, + # LoRA Params + lora_base=settings.lora_base, + lora_path=settings.lora_path, + # Backend Params + numa=settings.numa, + # Chat Format Params + chat_format=settings.chat_format, + chat_handler=chat_handler, + # Misc + verbose=settings.verbose, ) - self._model.alias = model if settings.cache: if settings.cache_type == "disk": if settings.verbose: diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ecd034d2e..b604515f2 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,4 +1,3 @@ -import os import multiprocessing from typing import Optional, List, Literal from pydantic import Field @@ -6,7 +5,8 @@ import llama_cpp # Disable warning for model and model_alias settings -BaseSettings.model_config['protected_namespaces'] = () +BaseSettings.model_config["protected_namespaces"] = () + class ModelSettings(BaseSettings): model: str = Field( @@ -43,7 +43,9 @@ class ModelSettings(BaseSettings): description="Use mlock.", ) # Context Params - seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, 
description="Random seed. -1 for random.") + seed: int = Field( + default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random." + ) n_ctx: int = Field(default=2048, ge=1, description="The context size.") n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." @@ -58,36 +60,24 @@ class ModelSettings(BaseSettings): ge=0, description="The number of threads to use when batch processing.", ) - rope_scaling_type: int = Field( - default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED - ) - rope_freq_base: float = Field( - default=0.0, description="RoPE base frequency" - ) + rope_scaling_type: int = Field(default=llama_cpp.LLAMA_ROPE_SCALING_UNSPECIFIED) + rope_freq_base: float = Field(default=0.0, description="RoPE base frequency") rope_freq_scale: float = Field( default=0.0, description="RoPE frequency scaling factor" ) - yarn_ext_factor: float = Field( - default=-1.0 - ) - yarn_attn_factor: float = Field( - default=1.0 - ) - yarn_beta_fast: float = Field( - default=32.0 - ) - yarn_beta_slow: float = Field( - default=1.0 - ) - yarn_orig_ctx: int = Field( - default=0 - ) + yarn_ext_factor: float = Field(default=-1.0) + yarn_attn_factor: float = Field(default=1.0) + yarn_beta_fast: float = Field(default=32.0) + yarn_beta_slow: float = Field(default=1.0) + yarn_orig_ctx: int = Field(default=0) mul_mat_q: bool = Field( default=True, description="if true, use experimental mul_mat_q kernels" ) - f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") logits_all: bool = Field(default=True, description="Whether to return logits.") embedding: bool = Field(default=True, description="Whether to use embeddings.") + offload_kqv: bool = Field( + default=False, description="Whether to offload kqv to the GPU." + ) # Sampling Params last_n_tokens_size: int = Field( default=64, @@ -97,7 +87,7 @@ class ModelSettings(BaseSettings): # LoRA Params lora_base: Optional[str] = Field( default=None, - description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model." + description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.", ) lora_path: Optional[str] = Field( default=None, @@ -135,8 +125,9 @@ class ModelSettings(BaseSettings): default=True, description="Whether to print debug information." ) + class ServerSettings(BaseSettings): - model_config = SettingsConfigDict(env_file='.env', extra='ignore') + model_config = SettingsConfigDict(env_file=".env", extra="ignore") host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( @@ -144,18 +135,31 @@ class ServerSettings(BaseSettings): description="Whether to interrupt requests when a new request is received.", ) config: Optional[str] = Field(default=None, description="Path to config file") + ssl_keyfile: Optional[str] = Field( + default=None, description="SSL key file for HTTPS" + ) + ssl_certfile: Optional[str] = Field( + default=None, description="SSL certificate file for HTTPS" + ) + api_key: Optional[str] = Field( + default=None, + description="API key for authentication. 
If set all requests need to be authenticated.", + ) + class Settings(ModelSettings): models: Optional[List[ModelSettings]] = Field( - default = [], - description="Model configs, overwrites default config" + default=[], description="Model configs, overwrites default config" ) + SETTINGS: Optional[ServerSettings] = None + def set_settings(settings: ServerSettings): global SETTINGS SETTINGS = settings + def get_settings(): yield SETTINGS From 315a82fdc163ac04ad34eb9772142dc34ac190cb Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:13:36 -0500 Subject: [PATCH 30/44] Fix type annotations --- llama_cpp/server/__main__.py | 2 ++ llama_cpp/server/app.py | 2 ++ llama_cpp/server/settings.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 23c4a3b59..ba48855a0 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -21,6 +21,8 @@ Then visit http://localhost:8000/docs to see the interactive API docs. """ +from __future__ import annotations + import os import sys import argparse diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index c924822d4..3e73a5199 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import sys import json import traceback diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index b604515f2..fca73e50d 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import multiprocessing from typing import Optional, List, Literal from pydantic import Field From c5051becfd93c87bcb550fe68db82a5a3a6d574d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:15:18 -0500 Subject: [PATCH 31/44] Fix type annotations --- llama_cpp/server/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 18816281a..09b4bfedf 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os import logging from typing import Optional, Union From 7a3e11a216f984cb8a5e8a8cad51bf34a23bc3ca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 16:18:36 -0500 Subject: [PATCH 32/44] Fix uvicorn app factory --- llama_cpp/server/app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 3e73a5199..20f7127ba 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -217,7 +217,9 @@ async def custom_route_handler(request: Request) -> Response: router = APIRouter(route_class=RouteErrorHandler) -def create_app(settings: Settings): +def create_app(settings: Settings | None = None): + if settings is None: + settings = Settings() middleware = [ Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) ] From 4f99ac667dbbd8ffe51c4f8131755fbf5171d105 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 17:26:12 -0500 Subject: [PATCH 33/44] Fix settings --- llama_cpp/server/app.py | 1 + llama_cpp/server/model.py | 3 ++- llama_cpp/server/settings.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 20f7127ba..05b38dc09 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -220,6 +220,7 @@ async def custom_route_handler(request: Request) -> Response: def create_app(settings: 
Settings | None = None): if settings is None: settings = Settings() + set_settings(settings) middleware = [ Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) ] diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 09b4bfedf..5ae9a045c 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -37,7 +37,7 @@ def __init__(self, settings: Settings) -> None: if filename.endswith(FILE_EXT): alias = model_alias(filename) if alias in self._models: continue - exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config'} + exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config', 'ssl_keyfile', 'ssl_certfile', 'api_key'} default_settings = settings.model_dump(exclude=exclude) self._models[alias] = ModelSettings(model=os.path.join(model_root, filename), model_alias=alias, **default_settings) @@ -114,6 +114,7 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: # Misc verbose=settings.verbose, ) + self._model.alias = model_alias if settings.cache: if settings.cache_type == "disk": if settings.verbose: diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index fca73e50d..ea3b65504 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -149,7 +149,7 @@ class ServerSettings(BaseSettings): ) -class Settings(ModelSettings): +class Settings(ServerSettings, ModelSettings): models: Optional[List[ModelSettings]] = Field( default=[], description="Model configs, overwrites default config" ) From 3f2e6c1874b97cb7ede2418359514f3b02289549 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 22:28:02 -0500 Subject: [PATCH 34/44] Refactor server --- llama_cpp/server/__main__.py | 148 ++++----- llama_cpp/server/app.py | 594 +++++++---------------------------- llama_cpp/server/cli.py | 99 ++++++ llama_cpp/server/errors.py | 210 +++++++++++++ llama_cpp/server/model.py | 156 ++++----- llama_cpp/server/settings.py | 26 +- llama_cpp/server/types.py | 264 ++++++++++++++++ 7 files changed, 814 insertions(+), 683 deletions(-) create mode 100644 llama_cpp/server/cli.py create mode 100644 llama_cpp/server/errors.py create mode 100644 llama_cpp/server/types.py diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index ba48855a0..7a3587721 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -26,114 +26,84 @@ import os import sys import argparse -from typing import List, Literal, Union import uvicorn from llama_cpp.server.app import create_app -from llama_cpp.server.settings import Settings, ServerSettings, set_settings - - -def get_base_type(annotation): - if getattr(annotation, "__origin__", None) is Literal: - return type(annotation.__args__[0]) - elif getattr(annotation, "__origin__", None) is Union: - non_optional_args = [ - arg for arg in annotation.__args__ if arg is not type(None) - ] - if non_optional_args: - return get_base_type(non_optional_args[0]) - elif ( - getattr(annotation, "__origin__", None) is list - or getattr(annotation, "__origin__", None) is List - ): - return get_base_type(annotation.__args__[0]) - else: - return annotation - - -def contains_list_type(annotation) -> bool: - origin = getattr(annotation, "__origin__", None) - - if origin is list or origin is List: - return True - elif origin in (Literal, Union): - return any(contains_list_type(arg) for arg in annotation.__args__) - else: - return False - - -def parse_bool_arg(arg): - if isinstance(arg, bytes): - arg = 
arg.decode("utf-8") - - true_values = {"1", "on", "t", "true", "y", "yes"} - false_values = {"0", "off", "f", "false", "n", "no"} - - arg_str = str(arg).lower().strip() - - if arg_str in true_values: - return True - elif arg_str in false_values: - return False - else: - raise ValueError(f"Invalid boolean argument: {arg}") +from llama_cpp.server.settings import ( + Server, + ServerSettings, + ModelSettings, + ConfigFileSettings, + set_server_settings, +) +from llama_cpp.server.cli import add_args_from_model, parse_model_from_args def main(): description = "🦙 Llama.cpp python server. Host your own LLMs!🚀" parser = argparse.ArgumentParser(description=description) - for name, field in (ServerSettings.model_fields | Settings.model_fields).items(): - description = field.description - if field.default and description and not field.is_required(): - description += f" (default: {field.default})" - base_type = ( - get_base_type(field.annotation) if field.annotation is not None else str - ) - list_type = contains_list_type(field.annotation) - if base_type is not bool: - parser.add_argument( - f"--{name}", - dest=name, - nargs="*" if list_type else None, - type=base_type, - help=description, - ) - if base_type is bool: - parser.add_argument( - f"--{name}", - dest=name, - type=parse_bool_arg, - help=f"{description}", - ) + add_args_from_model(parser, ModelSettings) + add_args_from_model(parser, ServerSettings) + parser.add_argument( + "--config-file", + type=str, + help="Path to a config file to load.", + ) try: args = parser.parse_args() - server_settings = ServerSettings( - **{k: v for k, v in vars(args).items() if v is not None} - ) - set_settings(server_settings) - if server_settings.config and os.path.exists(server_settings.config): - with open(server_settings.config, "rb") as f: - llama_settings = Settings.model_validate_json(f.read()) + server_settings: ServerSettings | None = None + model_settings: list[ModelSettings] = [] + # Load server settings from config_file if provided + config_file = os.environ.get("CONFIG_FILE", args.config_file) + if config_file: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings( + **{ + k: v + for k, v in config_file_settings.model_dump().items() + if k in ServerSettings.model_fields + } + ) + model_settings = config_file_settings.models else: - llama_settings = Settings( - **{k: v for k, v in vars(args).items() if v is not None} + server_settings = ServerSettings( + **{ + k: v + for k, v in vars(args).items() + if k in ServerSettings.model_fields + } ) - app = create_app(settings=llama_settings) + model_settings = [ + ModelSettings( + **{ + k: v + for k, v in vars(args).items() + if k in ModelSettings.model_fields + } + ) + ] + app = create_app( + settings=Server( + **server_settings.model_dump(), **model_settings[0].model_dump() + ) + ) + uvicorn.run( + app, + host=os.getenv("HOST", server_settings.host), + port=int(os.getenv("PORT", server_settings.port)), + ssl_keyfile=server_settings.ssl_keyfile, + ssl_certfile=server_settings.ssl_certfile, + ) except Exception as e: print(e, file=sys.stderr) parser.print_help() sys.exit(1) - uvicorn.run( - app, - host=os.getenv("HOST", server_settings.host), - port=int(os.getenv("PORT", server_settings.port)), - ssl_keyfile=server_settings.ssl_keyfile, - ssl_certfile=server_settings.ssl_certfile, - ) - if __name__ == "__main__": main() 
diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 05b38dc09..60fc8c0e9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,229 +1,123 @@ from __future__ import annotations -import sys import json -import traceback -import time -from re import compile, Match, Pattern + +from threading import Lock from functools import partial -from typing import Callable, Coroutine, Iterator, List, Optional, Tuple, Union, Dict -from typing_extensions import TypedDict, Literal +from typing import Iterator, List, Optional, Union, Dict import llama_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from fastapi import Depends, FastAPI, APIRouter, Request, Response, HTTPException, status +from fastapi import ( + Depends, + FastAPI, + APIRouter, + Request, + HTTPException, + status, +) from fastapi.middleware import Middleware from fastapi.middleware.cors import CORSMiddleware -from fastapi.responses import JSONResponse -from fastapi.routing import APIRoute from fastapi.security import HTTPBearer -from pydantic import BaseModel, Field from sse_starlette.sse import EventSourceResponse -from starlette_context import plugins +from starlette_context.plugins import RequestIdPlugin # type: ignore from starlette_context.middleware import RawContextMiddleware -import numpy as np -import numpy.typing as npt - -from llama_cpp.server.model import get_llama, set_llama, llama_outer_lock, LlamaProxy as Llama -from llama_cpp.server.settings import Settings, set_settings, get_settings - -class ErrorResponse(TypedDict): - """OpenAI style error response""" - - message: str - type: str - param: Optional[str] - code: Optional[str] - -class ErrorResponseFormatters: - """Collection of formatters for error responses. - - Args: - request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): - Request body - match (Match[str]): Match object from regex pattern - - Returns: - Tuple[int, ErrorResponse]: Status code and error response - """ - - @staticmethod - def context_length_exceeded( - request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - match, # type: Match[str] # type: ignore - ) -> Tuple[int, ErrorResponse]: - """Formatter for context length exceeded error""" - - context_window = int(match.group(2)) - prompt_tokens = int(match.group(1)) - completion_tokens = request.max_tokens - if hasattr(request, "messages"): - # Chat completion - message = ( - "This model's maximum context length is {} tokens. " - "However, you requested {} tokens " - "({} in the messages, {} in the completion). " - "Please reduce the length of the messages or completion." - ) - else: - # Text completion - message = ( - "This model's maximum context length is {} tokens, " - "however you requested {} tokens " - "({} in your prompt; {} for the completion). " - "Please reduce your prompt; or completion length." 
- ) - return 400, ErrorResponse( - message=message.format( - context_window, - completion_tokens + prompt_tokens, - prompt_tokens, - completion_tokens, - ), - type="invalid_request_error", - param="messages", - code="context_length_exceeded", - ) +from llama_cpp.server.model import ( + LlamaProxy, +) +from llama_cpp.server.settings import ( + Settings, + ModelSettings, + ServerSettings, +) +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, + ModelList, +) +from llama_cpp.server.errors import RouteErrorHandler - @staticmethod - def model_not_found( - request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - match, # type: Match[str] # type: ignore - ) -> Tuple[int, ErrorResponse]: - """Formatter for model_not_found error""" - - model_path = str(match.group(1)) - message = f"The model `{model_path}` does not exist" - return 400, ErrorResponse( - message=message, - type="invalid_request_error", - param=None, - code="model_not_found", - ) -class RouteErrorHandler(APIRoute): - """Custom APIRoute that handles application errors and exceptions""" - - # key: regex pattern for original error message from llama_cpp - # value: formatter function - pattern_and_formatters: Dict[ - "Pattern", - Callable[ - [ - Union["CreateCompletionRequest", "CreateChatCompletionRequest"], - "Match[str]", - ], - Tuple[int, ErrorResponse], - ], - ] = { - compile( - r"Requested tokens \((\d+)\) exceed context window of (\d+)" - ): ErrorResponseFormatters.context_length_exceeded, - compile( - r"Model path does not exist: (.+)" - ): ErrorResponseFormatters.model_not_found, - } +router = APIRouter(route_class=RouteErrorHandler) - def error_message_wrapper( - self, - error: Exception, - body: Optional[ - Union[ - "CreateChatCompletionRequest", - "CreateCompletionRequest", - "CreateEmbeddingRequest", - ] - ] = None, - ) -> Tuple[int, ErrorResponse]: - """Wraps error message in OpenAI style error response""" - print(f"Exception: {str(error)}", file=sys.stderr) - traceback.print_exc(file=sys.stderr) - if body is not None and isinstance( - body, - ( - CreateCompletionRequest, - CreateChatCompletionRequest, - ), - ): - # When text completion or chat completion - for pattern, callback in self.pattern_and_formatters.items(): - match = pattern.search(str(error)) - if match is not None: - return callback(body, match) - - # Wrap other errors as internal server error - return 500, ErrorResponse( - message=str(error), - type="internal_server_error", - param=None, - code=None, - ) +_settings: Optional[ServerSettings] = None - def get_route_handler( - self, - ) -> Callable[[Request], Coroutine[None, None, Response]]: - """Defines custom route handler that catches exceptions and formats - in OpenAI style error response""" - - original_route_handler = super().get_route_handler() - - async def custom_route_handler(request: Request) -> Response: - try: - start_sec = time.perf_counter() - response = await original_route_handler(request) - elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) - response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" - return response - except HTTPException as unauthorized: - # api key check failed - raise unauthorized - except Exception as exc: - json_body = await request.json() - try: - if "messages" in json_body: - # Chat completion - body: Optional[ - Union[ - CreateChatCompletionRequest, - CreateCompletionRequest, - CreateEmbeddingRequest, - ] - ] = CreateChatCompletionRequest(**json_body) - elif 
"prompt" in json_body: - # Text completion - body = CreateCompletionRequest(**json_body) - else: - # Embedding - body = CreateEmbeddingRequest(**json_body) - except Exception: - # Invalid request body - body = None - - # Get proper error message from the exception - ( - status_code, - error_message, - ) = self.error_message_wrapper(error=exc, body=body) - return JSONResponse( - {"error": error_message}, - status_code=status_code, - ) - - return custom_route_handler -router = APIRouter(route_class=RouteErrorHandler) +def set_settings(settings: ServerSettings): + global _settings + _settings = settings + + +def get_settings(): + yield _settings + + +LLAMA: Optional[LlamaProxy] = None + +llama_outer_lock = Lock() +llama_inner_lock = Lock() + + +def set_llama(models: List[ModelSettings]): + global LLAMA + LLAMA = LlamaProxy(models=models) + + +def get_llama(): + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield LLAMA + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() + + +def create_app( + settings: Settings | None = None, + server_settings: ServerSettings | None = None, + model_settings: List[ModelSettings] | None = None, +): + if server_settings is None and model_settings is None: + if settings is None: + settings = Settings() + server_settings = ServerSettings( + **{ + k: v + for k, v in settings.model_dump().items() + if k in ServerSettings.model_fields + } + ) + model_settings = [ + ModelSettings( + **{ + k: v + for k, v in settings.model_dump().items() + if k in ModelSettings.model_fields + } + ) + ] + + assert ( + server_settings is not None and model_settings is not None + ), "server_settings and model_settings must be provided together" -def create_app(settings: Settings | None = None): - if settings is None: - settings = Settings() - set_settings(settings) - middleware = [ - Middleware(RawContextMiddleware, plugins=(plugins.RequestIdPlugin(),)) - ] + set_settings(server_settings) + middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] app = FastAPI( middleware=middleware, title="🦙 llama.cpp Python API", @@ -238,13 +132,12 @@ def create_app(settings: Settings | None = None): ) app.include_router(router) - @app.get('/') - async def root(): - return "pong" + assert model_settings is not None + set_llama(models=model_settings) - set_llama(settings) return app + async def get_event_publisher( request: Request, inner_send_chan: MemoryObjectSendStream, @@ -256,7 +149,10 @@ async def get_event_publisher( await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if next(get_settings()).interrupt_requests and llama_outer_lock.locked(): + if ( + next(get_settings()).interrupt_requests + and llama_outer_lock.locked() + ): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) @@ -267,156 +163,6 @@ async def get_event_publisher( raise e -model_field = Field( - description="The model to use for generating completions.", default=None -) - -max_tokens_field = Field( - default=16, ge=1, description="The maximum number of tokens to generate." 
-) - -temperature_field = Field( - default=0.8, - ge=0.0, - le=2.0, - description="Adjust the randomness of the generated text.\n\n" - + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", -) - -top_p_field = Field( - default=0.95, - ge=0.0, - le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" - + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", -) - -min_p_field = Field( - default=0.05, - ge=0.0, - le=1.0, - description="Sets a minimum base probability threshold for token selection.\n\n" - + "The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter min_p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with min_p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.", -) - -stop_field = Field( - default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used.", -) - -stream_field = Field( - default=False, - description="Whether to stream the results as they are generated. Useful for chatbots.", -) - -top_k_field = Field( - default=40, - ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" - + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", -) - -repeat_penalty_field = Field( - default=1.1, - ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" - + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. 
A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", -) - -presence_penalty_field = Field( - default=0.0, - ge=-2.0, - le=2.0, - description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", -) - -frequency_penalty_field = Field( - default=0.0, - ge=-2.0, - le=2.0, - description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", -) - -mirostat_mode_field = Field( - default=0, - ge=0, - le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", -) - -mirostat_tau_field = Field( - default=5.0, - ge=0.0, - le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", -) - -mirostat_eta_field = Field( - default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" -) - -grammar = Field( - default=None, - description="A CBNF grammar (as string) to be used for formatting the model's output.", -) - - -class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( - default="", description="The prompt to generate completions for." - ) - suffix: Optional[str] = Field( - default=None, - description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", - ) - max_tokens: int = max_tokens_field - temperature: float = temperature_field - top_p: float = top_p_field - min_p: float = min_p_field - echo: bool = Field( - default=False, - description="Whether to echo the prompt in the generated text. Useful for chatbots.", - ) - stop: Optional[Union[str, List[str]]] = stop_field - stream: bool = stream_field - logprobs: Optional[int] = Field( - default=None, - ge=0, - description="The number of logprobs to generate. 
If None, no logprobs are generated.", - ) - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - logprobs: Optional[int] = Field(None) - seed: Optional[int] = Field(None) - - # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - best_of: Optional[int] = 1 - user: Optional[str] = Field(default=None) - - # llama.cpp specific parameters - top_k: int = top_k_field - repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None - - model_config = { - "json_schema_extra": { - "examples": [ - { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - ] - } - } - - def _logit_bias_tokens_to_input_ids( llama: llama_cpp.Llama, logit_bias: Dict[str, float], @@ -433,7 +179,10 @@ def _logit_bias_tokens_to_input_ids( bearer_scheme = HTTPBearer(auto_error=False) -async def authenticate(settings: Settings = Depends(get_settings), authorization: Optional[str] = Depends(bearer_scheme)): +async def authenticate( + settings: Settings = Depends(get_settings), + authorization: Optional[str] = Depends(bearer_scheme), +): # Skip API key check if it's not set in settings if settings.api_key is None: return True @@ -450,10 +199,7 @@ async def authenticate(settings: Settings = Depends(get_settings), authorization ) -@router.post( - "/v1/completions", - summary="Completion" -) +@router.post("/v1/completions", summary="Completion") @router.post("/v1/engines/copilot-codex/completions", include_in_schema=False) async def create_completion( request: Request, @@ -464,7 +210,7 @@ async def create_completion( if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - + llama = llama(body.model) exclude = { @@ -513,25 +259,8 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: else: return iterator_or_completion -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] = model_field - input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] = Field(default=None) - model_config = { - "json_schema_extra": { - "examples": [ - { - "input": "The food was delicious and the waiter...", - } - ] - } - } - -@router.post( - "/v1/embeddings", - summary="Embedding" -) +@router.post("/v1/embeddings", summary="Embedding") async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama), @@ -542,88 +271,7 @@ async def create_embedding( ) -class ChatCompletionRequestMessage(BaseModel): - role: Literal["system", "user", "assistant", "function"] = Field( - default="user", description="The role of the message." - ) - content: Optional[str] = Field( - default="", description="The content of the message." - ) - - -class CreateChatCompletionRequest(BaseModel): - messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( - default=[], description="A list of messages to generate completions for." 
- ) - functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( - default=None, - description="A list of functions to apply to the generated completions.", - ) - function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( - default=None, - description="A function to apply to the generated completions.", - ) - tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( - default=None, - description="A list of tools to apply to the generated completions.", - ) - tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( - default=None, - description="A tool to apply to the generated completions.", - ) # TODO: verify - max_tokens: Optional[int] = Field( - default=None, - description="The maximum number of tokens to generate. Defaults to inf", - ) - temperature: float = temperature_field - top_p: float = top_p_field - min_p: float = min_p_field - stop: Optional[Union[str, List[str]]] = stop_field - stream: bool = stream_field - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - seed: Optional[int] = Field(None) - response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( - default=None, - ) - - # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - user: Optional[str] = Field(None) - - # llama.cpp specific parameters - top_k: int = top_k_field - repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - grammar: Optional[str] = None - - model_config = { - "json_schema_extra": { - "examples": [ - { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ).model_dump(), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" 
- ).model_dump(), - ] - } - ] - } - } - - -@router.post( - "/v1/chat/completions", - summary="Chat" -) +@router.post("/v1/chat/completions", summary="Chat") async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, @@ -676,22 +324,11 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - @router.get("/v1/models", summary="Models") async def get_models( settings: Settings = Depends(get_settings), authenticated: str = Depends(authenticate), + llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: return { "object": "list", @@ -701,6 +338,7 @@ async def get_models( "object": "model", "owned_by": "me", "permissions": [], - } for model_alias in llama + } + for model_alias in llama ], } diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py new file mode 100644 index 000000000..750b396cd --- /dev/null +++ b/llama_cpp/server/cli.py @@ -0,0 +1,99 @@ +from __future__ import annotations + +import argparse + +from typing import List, Literal, Union, Any, Type, TypeVar + +from pydantic import BaseModel + +from llama_cpp.server.settings import CommandLineSettings + + +def _get_base_type(annotation: Type[Any]) -> Type[Any]: + if getattr(annotation, "__origin__", None) is Literal: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return type(annotation.__args__[0]) # type: ignore + elif getattr(annotation, "__origin__", None) is Union: + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + non_optional_args: List[Type[Any]] = [ + arg for arg in annotation.__args__ if arg is not type(None) # type: ignore + ] + if non_optional_args: + return _get_base_type(non_optional_args[0]) + elif ( + getattr(annotation, "__origin__", None) is list + or getattr(annotation, "__origin__", None) is List + ): + assert hasattr(annotation, "__args__") and len(annotation.__args__) >= 1 # type: ignore + return _get_base_type(annotation.__args__[0]) # type: ignore + return annotation + + +def _contains_list_type(annotation: Type[Any] | None) -> bool: + origin = getattr(annotation, "__origin__", None) + + if origin is list or origin is List: + return True + elif origin in (Literal, Union): + return any(_contains_list_type(arg) for arg in annotation.__args__) # type: ignore + else: + return False + + +def _parse_bool_arg(arg: str | bytes | bool) -> bool: + if isinstance(arg, bytes): + arg = arg.decode("utf-8") + + true_values = {"1", "on", "t", "true", "y", "yes"} + false_values = {"0", "off", "f", "false", "n", "no"} + + arg_str = str(arg).lower().strip() + + if arg_str in true_values: + return True + elif arg_str in false_values: + return False + else: + raise ValueError(f"Invalid boolean argument: {arg}") + + +def add_args_from_model(parser: argparse.ArgumentParser, model: type[BaseModel]): + """Add arguments from a pydantic model to an argparse parser.""" + + for name, field in model.model_fields.items(): + description = field.description + if field.default and description and not field.is_required(): + description += f" (default: {field.default})" + base_type = ( + _get_base_type(field.annotation) if field.annotation is not None else str + ) + list_type = _contains_list_type(field.annotation) + if base_type is not bool: + parser.add_argument( + f"--{name}", + dest=name, + 
nargs="*" if list_type else None, + type=base_type, + help=description, + ) + if base_type is bool: + parser.add_argument( + f"--{name}", + dest=name, + type=_parse_bool_arg, + help=f"{description}", + ) + + +T = TypeVar("T", bound=type[BaseModel]) + + +def parse_model_from_args(model: T, args: argparse.Namespace) -> T: + """Parse a pydantic model from an argparse namespace.""" + return model( + **{ + k: v + for k, v in vars(args).items() + if v is not None and k in model.model_fields + } + ) diff --git a/llama_cpp/server/errors.py b/llama_cpp/server/errors.py new file mode 100644 index 000000000..febe3e39d --- /dev/null +++ b/llama_cpp/server/errors.py @@ -0,0 +1,210 @@ +from __future__ import annotations + +import sys +import traceback +import time +from re import compile, Match, Pattern +from typing import Callable, Coroutine, Optional, Tuple, Union, Dict +from typing_extensions import TypedDict + + +from fastapi import ( + Request, + Response, + HTTPException, +) +from fastapi.responses import JSONResponse +from fastapi.routing import APIRoute + +from llama_cpp.server.types import ( + CreateCompletionRequest, + CreateEmbeddingRequest, + CreateChatCompletionRequest, +) + +class ErrorResponse(TypedDict): + """OpenAI style error response""" + + message: str + type: str + param: Optional[str] + code: Optional[str] + + +class ErrorResponseFormatters: + """Collection of formatters for error responses. + + Args: + request (Union[CreateCompletionRequest, CreateChatCompletionRequest]): + Request body + match (Match[str]): Match object from regex pattern + + Returns: + Tuple[int, ErrorResponse]: Status code and error response + """ + + @staticmethod + def context_length_exceeded( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for context length exceeded error""" + + context_window = int(match.group(2)) + prompt_tokens = int(match.group(1)) + completion_tokens = request.max_tokens + if hasattr(request, "messages"): + # Chat completion + message = ( + "This model's maximum context length is {} tokens. " + "However, you requested {} tokens " + "({} in the messages, {} in the completion). " + "Please reduce the length of the messages or completion." + ) + else: + # Text completion + message = ( + "This model's maximum context length is {} tokens, " + "however you requested {} tokens " + "({} in your prompt; {} for the completion). " + "Please reduce your prompt; or completion length." 
+ ) + return 400, ErrorResponse( + message=message.format( + context_window, + completion_tokens + prompt_tokens, + prompt_tokens, + completion_tokens, + ), # type: ignore + type="invalid_request_error", + param="messages", + code="context_length_exceeded", + ) + + @staticmethod + def model_not_found( + request: Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + match, # type: Match[str] # type: ignore + ) -> Tuple[int, ErrorResponse]: + """Formatter for model_not_found error""" + + model_path = str(match.group(1)) + message = f"The model `{model_path}` does not exist" + return 400, ErrorResponse( + message=message, + type="invalid_request_error", + param=None, + code="model_not_found", + ) + + +class RouteErrorHandler(APIRoute): + """Custom APIRoute that handles application errors and exceptions""" + + # key: regex pattern for original error message from llama_cpp + # value: formatter function + pattern_and_formatters: Dict[ + "Pattern[str]", + Callable[ + [ + Union["CreateCompletionRequest", "CreateChatCompletionRequest"], + "Match[str]", + ], + Tuple[int, ErrorResponse], + ], + ] = { + compile( + r"Requested tokens \((\d+)\) exceed context window of (\d+)" + ): ErrorResponseFormatters.context_length_exceeded, + compile( + r"Model path does not exist: (.+)" + ): ErrorResponseFormatters.model_not_found, + } + + def error_message_wrapper( + self, + error: Exception, + body: Optional[ + Union[ + "CreateChatCompletionRequest", + "CreateCompletionRequest", + "CreateEmbeddingRequest", + ] + ] = None, + ) -> Tuple[int, ErrorResponse]: + """Wraps error message in OpenAI style error response""" + print(f"Exception: {str(error)}", file=sys.stderr) + traceback.print_exc(file=sys.stderr) + if body is not None and isinstance( + body, + ( + CreateCompletionRequest, + CreateChatCompletionRequest, + ), + ): + # When text completion or chat completion + for pattern, callback in self.pattern_and_formatters.items(): + match = pattern.search(str(error)) + if match is not None: + return callback(body, match) + + # Wrap other errors as internal server error + return 500, ErrorResponse( + message=str(error), + type="internal_server_error", + param=None, + code=None, + ) + + def get_route_handler( + self, + ) -> Callable[[Request], Coroutine[None, None, Response]]: + """Defines custom route handler that catches exceptions and formats + in OpenAI style error response""" + + original_route_handler = super().get_route_handler() + + async def custom_route_handler(request: Request) -> Response: + try: + start_sec = time.perf_counter() + response = await original_route_handler(request) + elapsed_time_ms = int((time.perf_counter() - start_sec) * 1000) + response.headers["openai-processing-ms"] = f"{elapsed_time_ms}" + return response + except HTTPException as unauthorized: + # api key check failed + raise unauthorized + except Exception as exc: + json_body = await request.json() + try: + if "messages" in json_body: + # Chat completion + body: Optional[ + Union[ + CreateChatCompletionRequest, + CreateCompletionRequest, + CreateEmbeddingRequest, + ] + ] = CreateChatCompletionRequest(**json_body) + elif "prompt" in json_body: + # Text completion + body = CreateCompletionRequest(**json_body) + else: + # Embedding + body = CreateEmbeddingRequest(**json_body) + except Exception: + # Invalid request body + body = None + + # Get proper error message from the exception + ( + status_code, + error_message, + ) = self.error_message_wrapper(error=exc, body=body) + return JSONResponse( + {"error": error_message}, + 
status_code=status_code, + ) + + return custom_route_handler + diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 5ae9a045c..f11c1540a 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,80 +1,74 @@ from __future__ import annotations -import os -import logging -from typing import Optional, Union +from typing import Optional, Union, List from threading import Lock -import llama_cpp -from llama_cpp.server.settings import Settings, ModelSettings - -FILE_EXT = ".gguf" -MODEL_ENV_ARG = "MODEL" -DEFAULT_MODEL_DIR = os.path.join(os.getcwd(), "/models") -logger = logging.getLogger("uvicorn") +import llama_cpp -def models_root_dir(path: Optional[str] = None): - path = os.path.abspath(path or os.environ.get(MODEL_ENV_ARG, DEFAULT_MODEL_DIR)) - if os.path.isdir(path): return path - return os.path.dirname(path) +from llama_cpp.server.settings import ModelSettings -def model_alias(path: str) -> str: - return path.split(os.path.sep)[-1].split(FILE_EXT)[0] class LlamaProxy: - _model: Optional[llama_cpp.Llama] = None - _models: dict[str,ModelSettings] = {} + def __init__(self, models: List[ModelSettings]) -> None: + assert len(models) > 0, "No models provided!" - def __init__(self, settings: Settings) -> None: - self._settings = settings - for model in settings.models: + self._model_settings_dict: dict[str, ModelSettings] = {} + for model in models: if not model.model_alias: - model.model_alias = model_alias(model.model) - self._models[model.model_alias] = model - - model_root = models_root_dir(settings.model) - for filename in os.listdir(model_root): - if filename.endswith(FILE_EXT): - alias = model_alias(filename) - if alias in self._models: continue - exclude={'model', 'model_alias', 'models', 'host', 'port', 'interrupt_requests', 'config', 'ssl_keyfile', 'ssl_certfile', 'api_key'} - default_settings = settings.model_dump(exclude=exclude) - self._models[alias] = ModelSettings(model=os.path.join(model_root, filename), - model_alias=alias, **default_settings) - - if os.path.isfile(settings.model): - alias = settings.model_alias - if alias is None: alias = model_alias(settings.model) - if alias not in self._models: - self._models[alias] = settings - self(alias) + model.model_alias = model.model + self._model_settings_dict[model.model_alias] = model + + self._current_model: Optional[llama_cpp.Llama] = None + self._current_model_alias: Optional[str] = None + + self._default_model_settings: ModelSettings = models[0] + self._default_model_alias: str = self._default_model_settings.model_alias # type: ignore + + # Load default model + self._current_model = self.load_llama_from_model_settings( + self._default_model_settings + ) + self._current_model_alias = self._default_model_alias def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: - # handle backward compatibility, model param optional - try: - model_alias = self._models[model].model_alias - except KeyError: - if self._model: - if self._settings.verbose: logger.warn(f"Model {model} NOT found! 
Using {self._model.alias}") - return self._model - else: raise Exception(404, f"Model {model} NOT found!") - - if self._model: - if self._model.alias == model_alias: - return self._model - del self._model - - settings = self._models[model] + if model is None: + model = self._default_model_alias + + if model == self._current_model_alias: + if self._current_model is not None: + return self._current_model + + settings = self._model_settings_dict[model] + self._current_model = self.load_llama_from_model_settings(settings) + self._current_model_alias = model + return self._current_model + + def __getitem__(self, model: str): + return self._model_settings_dict[model].model_dump() + + def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): + if isinstance(settings, (bytes, str)): + settings = ModelSettings.model_validate_json(settings) + self._model_settings_dict[model] = settings + def __iter__(self): + for model in self._model_settings_dict: + yield model + + def free(self): + if self._current_model: + del self._current_model + + @staticmethod + def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: chat_handler = None if settings.chat_format == "llava-1-5": assert settings.clip_model_path is not None, "clip model not found" chat_handler = llama_cpp.llama_chat_format.Llava15ChatHandler( - clip_model_path=settings.clip_model_path, - verbose=settings.verbose + clip_model_path=settings.clip_model_path, verbose=settings.verbose ) - self._model = llama_cpp.Llama( + _model = llama_cpp.Llama( model_path=settings.model, # Model Params n_gpu_layers=settings.n_gpu_layers, @@ -114,7 +108,6 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: # Misc verbose=settings.verbose, ) - self._model.alias = model_alias if settings.cache: if settings.cache_type == "disk": if settings.verbose: @@ -124,47 +117,6 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) - self._model.set_cache(cache) - return self._model - - def __getitem__(self, model: str): - return self._models[model].model_dump() + _model.set_cache(cache) + return _model - def __setitem__(self, model: str, settings: Union[ModelSettings, str, bytes]): - if isinstance(settings, (bytes, str)): - settings = ModelSettings.model_validate_json(settings) - self._models[model] = settings - - def __iter__(self): - for model in self._models: - yield model - - def free(self): - if self._model: del self._model - -LLAMA: Optional[LlamaProxy] = None - -llama_outer_lock = Lock() -llama_inner_lock = Lock() - -def set_llama(settings: Settings): - global LLAMA - LLAMA = LlamaProxy(settings) - -def get_llama(): - # NOTE: This double lock allows the currently streaming llama model to - # check if any other requests are pending in the same thread and cancel - # the stream if so. 
- llama_outer_lock.acquire() - release_outer_lock = True - try: - llama_inner_lock.acquire() - try: - llama_outer_lock.release() - release_outer_lock = False - yield LLAMA - finally: - llama_inner_lock.release() - finally: - if release_outer_lock: - llama_outer_lock.release() diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ea3b65504..ea9cf26ec 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -1,9 +1,11 @@ from __future__ import annotations import multiprocessing + from typing import Optional, List, Literal from pydantic import Field -from pydantic_settings import BaseSettings, SettingsConfigDict +from pydantic_settings import BaseSettings + import llama_cpp # Disable warning for model and model_alias settings @@ -129,14 +131,12 @@ class ModelSettings(BaseSettings): class ServerSettings(BaseSettings): - model_config = SettingsConfigDict(env_file=".env", extra="ignore") host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", ) - config: Optional[str] = Field(default=None, description="Path to config file") ssl_keyfile: Optional[str] = Field( default=None, description="SSL key file for HTTPS" ) @@ -150,18 +150,16 @@ class ServerSettings(BaseSettings): class Settings(ServerSettings, ModelSettings): - models: Optional[List[ModelSettings]] = Field( - default=[], description="Model configs, overwrites default config" - ) - + pass -SETTINGS: Optional[ServerSettings] = None - -def set_settings(settings: ServerSettings): - global SETTINGS - SETTINGS = settings +class CommandLineSettings(Settings): + config_file: Optional[str] = Field( + default=None, description="Path to a config file to load." + ) -def get_settings(): - yield SETTINGS +class ConfigFileSettings(ServerSettings): + models: List[ModelSettings] = Field( + default=[], description="Model configs, overwrites default config" + ) diff --git a/llama_cpp/server/types.py b/llama_cpp/server/types.py new file mode 100644 index 000000000..f0867bc4e --- /dev/null +++ b/llama_cpp/server/types.py @@ -0,0 +1,264 @@ +from __future__ import annotations + +from typing import List, Optional, Union, Dict +from typing_extensions import TypedDict, Literal + +from pydantic import BaseModel, Field + +import llama_cpp + + +model_field = Field( + description="The model to use for generating completions.", default=None +) + +max_tokens_field = Field( + default=16, ge=1, description="The maximum number of tokens to generate." +) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. 
At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", +) + +min_p_field = Field( + default=0.05, + ge=0.0, + le=1.0, + description="Sets a minimum base probability threshold for token selection.\n\n" + + "The Min-P sampling method was designed as an alternative to Top-P, and aims to ensure a balance of quality and variety. The parameter min_p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with min_p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out.", +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots.", +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", +) + +repeat_penalty_field = Field( + default=1.1, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", +) + +presence_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", +) + +frequency_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", +) + +mirostat_mode_field = Field( + default=0, + ge=0, + le=2, + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", +) + +mirostat_tau_field = Field( + default=5.0, + ge=0.0, + le=10.0, + description="Mirostat target entropy, i.e. 
the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", +) + +mirostat_eta_field = Field( + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" +) + +grammar = Field( + default=None, + description="A CBNF grammar (as string) to be used for formatting the model's output.", +) + + +class CreateCompletionRequest(BaseModel): + prompt: Union[str, List[str]] = Field( + default="", description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + min_p: float = min_p_field + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots.", + ) + stop: Optional[Union[str, List[str]]] = stop_field + stream: bool = stream_field + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated.", + ) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logprobs: Optional[int] = Field(None) + seed: Optional[int] = Field(None) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + best_of: Optional[int] = 1 + user: Optional[str] = Field(default=None) + + # llama.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None + + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] + } + } + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] = model_field + input: Union[str, List[str]] = Field(description="The input to embed.") + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The food was delicious and the waiter...", + } + ] + } + } + + +class ChatCompletionRequestMessage(BaseModel): + role: Literal["system", "user", "assistant", "function"] = Field( + default="user", description="The role of the message." + ) + content: Optional[str] = Field( + default="", description="The content of the message." + ) + + +class CreateChatCompletionRequest(BaseModel): + messages: List[llama_cpp.ChatCompletionRequestMessage] = Field( + default=[], description="A list of messages to generate completions for." 
+ ) + functions: Optional[List[llama_cpp.ChatCompletionFunction]] = Field( + default=None, + description="A list of functions to apply to the generated completions.", + ) + function_call: Optional[llama_cpp.ChatCompletionRequestFunctionCall] = Field( + default=None, + description="A function to apply to the generated completions.", + ) + tools: Optional[List[llama_cpp.ChatCompletionTool]] = Field( + default=None, + description="A list of tools to apply to the generated completions.", + ) + tool_choice: Optional[llama_cpp.ChatCompletionToolChoiceOption] = Field( + default=None, + description="A tool to apply to the generated completions.", + ) # TODO: verify + max_tokens: Optional[int] = Field( + default=None, + description="The maximum number of tokens to generate. Defaults to inf", + ) + temperature: float = temperature_field + top_p: float = top_p_field + min_p: float = min_p_field + stop: Optional[Union[str, List[str]]] = stop_field + stream: bool = stream_field + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + seed: Optional[int] = Field(None) + response_format: Optional[llama_cpp.ChatCompletionRequestResponseFormat] = Field( + default=None, + ) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + user: Optional[str] = Field(None) + + # llama.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + grammar: Optional[str] = None + + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" 
+ ).model_dump(), + ] + } + ] + } + } + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] From 3472b6f90cd978d4f98322d40316cb15c122a191 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 22:30:54 -0500 Subject: [PATCH 35/44] Remove formatting fix --- llama_cpp/llama.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index fb13e07c3..5477df733 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -955,7 +955,6 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) - self.chat_format = chat_format self.chat_handler = chat_handler From 310e2e6ca14fd21eb274902c9ed40dc1f06f6e2a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 22:31:27 -0500 Subject: [PATCH 36/44] Format --- llama_cpp/llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 5477df733..c2c045549 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -955,6 +955,7 @@ def __init__( if self.verbose: print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + self.chat_format = chat_format self.chat_handler = chat_handler From 5c9c35e805037d346557471bc7c30f3919bce190 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 21 Dec 2023 23:25:39 -0500 Subject: [PATCH 37/44] Use default model if not found in model settings --- llama_cpp/server/model.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index f11c1540a..ce52171ca 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -37,6 +37,9 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if model == self._current_model_alias: if self._current_model is not None: return self._current_model + + if model not in self._model_settings_dict: + model = self._default_model_alias settings = self._model_settings_dict[model] self._current_model = self.load_llama_from_model_settings(settings) From 950f721a01d3aedd4633489751d439df24499a40 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 00:02:12 -0500 Subject: [PATCH 38/44] Fix --- llama_cpp/server/__main__.py | 8 +++----- llama_cpp/server/app.py | 17 +++++++++++++++++ llama_cpp/server/model.py | 8 +++++--- 3 files changed, 25 insertions(+), 8 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 7a3587721..3cd38d5a6 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -31,11 +31,10 @@ from llama_cpp.server.app import create_app from llama_cpp.server.settings import ( - Server, + Settings, ServerSettings, ModelSettings, ConfigFileSettings, - set_server_settings, ) from llama_cpp.server.cli import add_args_from_model, parse_model_from_args @@ -44,8 +43,7 @@ def main(): description = "🦙 Llama.cpp python server. 
Host your own LLMs!🚀" parser = argparse.ArgumentParser(description=description) - add_args_from_model(parser, ModelSettings) - add_args_from_model(parser, ServerSettings) + add_args_from_model(parser, Settings) parser.add_argument( "--config-file", type=str, @@ -88,7 +86,7 @@ def main(): ) ] app = create_app( - settings=Server( + settings=Settings( **server_settings.model_dump(), **model_settings[0].model_dump() ) ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 60fc8c0e9..5759b1e42 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os import json from threading import Lock @@ -30,6 +31,7 @@ LlamaProxy, ) from llama_cpp.server.settings import ( + ConfigFileSettings, Settings, ModelSettings, ServerSettings, @@ -92,6 +94,21 @@ def create_app( server_settings: ServerSettings | None = None, model_settings: List[ModelSettings] | None = None, ): + config_file = os.environ.get("CONFIG_FILE", None) + if config_file is not None: + if not os.path.exists(config_file): + raise ValueError(f"Config file {config_file} not found!") + with open(config_file, "rb") as f: + config_file_settings = ConfigFileSettings.model_validate_json(f.read()) + server_settings = ServerSettings( + **{ + k: v + for k, v in config_file_settings.model_dump().items() + if k in ServerSettings.model_fields + } + ) + model_settings = config_file_settings.models + if server_settings is None and model_settings is None: if settings is None: settings = Settings() diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index ce52171ca..19d308366 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -34,12 +34,14 @@ def __call__(self, model: Optional[str] = None) -> llama_cpp.Llama: if model is None: model = self._default_model_alias + if model not in self._model_settings_dict: + model = self._default_model_alias + if model == self._current_model_alias: if self._current_model is not None: return self._current_model - - if model not in self._model_settings_dict: - model = self._default_model_alias + + self._current_model = None settings = self._model_settings_dict[model] self._current_model = self.load_llama_from_model_settings(settings) From 8347a78ad4ba9dbcff169f814144e354e5401da6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 03:46:06 -0500 Subject: [PATCH 39/44] Cleanup --- llama_cpp/server/__main__.py | 37 ++++----------- llama_cpp/server/app.py | 90 +++++++++++++----------------------- llama_cpp/server/cli.py | 2 - llama_cpp/server/model.py | 1 - llama_cpp/server/settings.py | 10 ++-- 5 files changed, 47 insertions(+), 93 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 3cd38d5a6..6c5e82712 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -9,7 +9,7 @@ Then run: ``` -uvicorn llama_cpp.server.app:app --reload +uvicorn llama_cpp.server.app:create_app --reload ``` or @@ -45,7 +45,7 @@ def main(): add_args_from_model(parser, Settings) parser.add_argument( - "--config-file", + "--config_file", type=str, help="Path to a config file to load.", ) @@ -60,35 +60,16 @@ def main(): raise ValueError(f"Config file {config_file} not found!") with open(config_file, "rb") as f: config_file_settings = ConfigFileSettings.model_validate_json(f.read()) - server_settings = ServerSettings( - **{ - k: v - for k, v in config_file_settings.model_dump().items() - if k in ServerSettings.model_fields - } - ) + server_settings = 
ServerSettings.model_validate(config_file_settings) model_settings = config_file_settings.models else: - server_settings = ServerSettings( - **{ - k: v - for k, v in vars(args).items() - if k in ServerSettings.model_fields - } - ) - model_settings = [ - ModelSettings( - **{ - k: v - for k, v in vars(args).items() - if k in ModelSettings.model_fields - } - ) - ] + server_settings = parse_model_from_args(ServerSettings, args) + model_settings = [parse_model_from_args(ModelSettings, args)] + assert server_settings is not None + assert model_settings is not None app = create_app( - settings=Settings( - **server_settings.model_dump(), **model_settings[0].model_dump() - ) + server_settings_or_none=server_settings, + model_settings=model_settings, ) uvicorn.run( app, diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5759b1e42..fbb756f6e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -47,30 +47,30 @@ router = APIRouter(route_class=RouteErrorHandler) -_settings: Optional[ServerSettings] = None +_server_settings: Optional[ServerSettings] = None -def set_settings(settings: ServerSettings): - global _settings - _settings = settings +def set_server_settings(server_settings: ServerSettings): + global _server_settings + _server_settings = server_settings -def get_settings(): - yield _settings +def get_server_settings(): + yield _server_settings -LLAMA: Optional[LlamaProxy] = None +_llama_proxy: Optional[LlamaProxy] = None llama_outer_lock = Lock() llama_inner_lock = Lock() -def set_llama(models: List[ModelSettings]): - global LLAMA - LLAMA = LlamaProxy(models=models) +def set_llama_proxy(model_settings: List[ModelSettings]): + global _llama_proxy + _llama_proxy = LlamaProxy(models=model_settings) -def get_llama(): +def get_llama_proxy(): # NOTE: This double lock allows the currently streaming llama model to # check if any other requests are pending in the same thread and cancel # the stream if so. 
@@ -81,7 +81,7 @@ def get_llama(): try: llama_outer_lock.release() release_outer_lock = False - yield LLAMA + yield _llama_proxy finally: llama_inner_lock.release() finally: @@ -100,40 +100,20 @@ def create_app( raise ValueError(f"Config file {config_file} not found!") with open(config_file, "rb") as f: config_file_settings = ConfigFileSettings.model_validate_json(f.read()) - server_settings = ServerSettings( - **{ - k: v - for k, v in config_file_settings.model_dump().items() - if k in ServerSettings.model_fields - } - ) + server_settings = ServerSettings.model_validate(config_file_settings) model_settings = config_file_settings.models if server_settings is None and model_settings is None: if settings is None: settings = Settings() - server_settings = ServerSettings( - **{ - k: v - for k, v in settings.model_dump().items() - if k in ServerSettings.model_fields - } - ) - model_settings = [ - ModelSettings( - **{ - k: v - for k, v in settings.model_dump().items() - if k in ModelSettings.model_fields - } - ) - ] + server_settings = ServerSettings.model_validate(settings) + model_settings = [ModelSettings.model_validate(settings)] assert ( server_settings is not None and model_settings is not None ), "server_settings and model_settings must be provided together" - set_settings(server_settings) + set_server_settings(server_settings) middleware = [Middleware(RawContextMiddleware, plugins=(RequestIdPlugin(),))] app = FastAPI( middleware=middleware, @@ -150,7 +130,7 @@ def create_app( app.include_router(router) assert model_settings is not None - set_llama(models=model_settings) + set_llama_proxy(model_settings=model_settings) return app @@ -167,7 +147,7 @@ async def get_event_publisher( if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() if ( - next(get_settings()).interrupt_requests + next(get_server_settings()).interrupt_requests and llama_outer_lock.locked() ): await inner_send_chan.send(dict(data="[DONE]")) @@ -197,7 +177,7 @@ def _logit_bias_tokens_to_input_ids( async def authenticate( - settings: Settings = Depends(get_settings), + settings: Settings = Depends(get_server_settings), authorization: Optional[str] = Depends(bearer_scheme), ): # Skip API key check if it's not set in settings @@ -216,19 +196,18 @@ async def authenticate( ) -@router.post("/v1/completions", summary="Completion") -@router.post("/v1/engines/copilot-codex/completions", include_in_schema=False) +@router.post("/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]) +@router.post("/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)]) async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), - authenticated: str = Depends(authenticate), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - llama = llama(body.model) + llama = llama_proxy(body.model) exclude = { "n", @@ -277,24 +256,21 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion -@router.post("/v1/embeddings", summary="Embedding") +@router.post("/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]) async def create_embedding( request: CreateEmbeddingRequest, - llama: llama_cpp.Llama = Depends(get_llama), - authenticated: str = Depends(authenticate), + llama_proxy: LlamaProxy = 
Depends(get_llama_proxy), ): return await run_in_threadpool( - llama(request.model).create_embedding, **request.model_dump(exclude={"user"}) + llama_proxy(request.model).create_embedding, **request.model_dump(exclude={"user"}) ) -@router.post("/v1/chat/completions", summary="Chat") +@router.post("/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)]) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), - settings: Settings = Depends(get_settings), - authenticated: str = Depends(authenticate), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> llama_cpp.ChatCompletion: exclude = { "n", @@ -302,7 +278,7 @@ async def create_chat_completion( "user", } kwargs = body.model_dump(exclude=exclude) - llama = llama(body.model) + llama = llama_proxy(body.model) if body.logit_bias is not None: kwargs["logit_bias"] = ( _logit_bias_tokens_to_input_ids(llama, body.logit_bias) @@ -341,11 +317,9 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -@router.get("/v1/models", summary="Models") +@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]]) async def get_models( - settings: Settings = Depends(get_settings), - authenticated: str = Depends(authenticate), - llama: llama_cpp.Llama = Depends(get_llama), + llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: return { "object": "list", @@ -356,6 +330,6 @@ async def get_models( "owned_by": "me", "permissions": [], } - for model_alias in llama + for model_alias in llama_proxy ], } diff --git a/llama_cpp/server/cli.py b/llama_cpp/server/cli.py index 750b396cd..8e32d2c0e 100644 --- a/llama_cpp/server/cli.py +++ b/llama_cpp/server/cli.py @@ -6,8 +6,6 @@ from pydantic import BaseModel -from llama_cpp.server.settings import CommandLineSettings - def _get_base_type(annotation: Type[Any]) -> Type[Any]: if getattr(annotation, "__origin__", None) is Literal: diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 19d308366..b9373b7ac 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import Optional, Union, List -from threading import Lock import llama_cpp diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index ea9cf26ec..752b68032 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -131,22 +131,24 @@ class ModelSettings(BaseSettings): class ServerSettings(BaseSettings): + # Uvicorn Settings host: str = Field(default="localhost", description="Listen address") port: int = Field(default=8000, description="Listen port") - interrupt_requests: bool = Field( - default=True, - description="Whether to interrupt requests when a new request is received.", - ) ssl_keyfile: Optional[str] = Field( default=None, description="SSL key file for HTTPS" ) ssl_certfile: Optional[str] = Field( default=None, description="SSL certificate file for HTTPS" ) + # FastAPI Settings api_key: Optional[str] = Field( default=None, description="API key for authentication. 
If set all requests need to be authenticated.", ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) class Settings(ServerSettings, ModelSettings): From 02ab0e2153b5a29cfddd1dead255f931a129e80d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 03:47:10 -0500 Subject: [PATCH 40/44] Fix --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index fbb756f6e..4b08cc6ce 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -317,7 +317,7 @@ def iterator() -> Iterator[llama_cpp.ChatCompletionChunk]: return iterator_or_completion -@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]]) +@router.get("/v1/models", summary="Models", dependencies=[Depends(authenticate)]) async def get_models( llama_proxy: LlamaProxy = Depends(get_llama_proxy), ) -> ModelList: From fd1bf6480efadb5dde58314014d1d817b9de9b94 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 03:48:26 -0500 Subject: [PATCH 41/44] Fix --- llama_cpp/server/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 6c5e82712..a13d4a111 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -68,7 +68,7 @@ def main(): assert server_settings is not None assert model_settings is not None app = create_app( - server_settings_or_none=server_settings, + server_settings=server_settings, model_settings=model_settings, ) uvicorn.run( From ecd84344b2e8ec517413d6c95400658c0d9e8ef7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 04:11:39 -0500 Subject: [PATCH 42/44] Remove unnused CommandLineSettings --- llama_cpp/server/settings.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index 752b68032..53ead7487 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -155,12 +155,6 @@ class Settings(ServerSettings, ModelSettings): pass -class CommandLineSettings(Settings): - config_file: Optional[str] = Field( - default=None, description="Path to a config file to load." 
- ) - - class ConfigFileSettings(ServerSettings): models: List[ModelSettings] = Field( default=[], description="Model configs, overwrites default config" From 52861468b2f51a314ece494732d2084bfaf7aeb6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 04:27:49 -0500 Subject: [PATCH 43/44] Cleanup --- llama_cpp/server/__main__.py | 32 ++++++++++++++++---------------- llama_cpp/server/app.py | 21 ++++++++++++++++----- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index a13d4a111..fadfc5fb4 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -49,10 +49,10 @@ def main(): type=str, help="Path to a config file to load.", ) + server_settings: ServerSettings | None = None + model_settings: list[ModelSettings] = [] + args = parser.parse_args() try: - args = parser.parse_args() - server_settings: ServerSettings | None = None - model_settings: list[ModelSettings] = [] # Load server settings from config_file if provided config_file = os.environ.get("CONFIG_FILE", args.config_file) if config_file: @@ -65,23 +65,23 @@ def main(): else: server_settings = parse_model_from_args(ServerSettings, args) model_settings = [parse_model_from_args(ModelSettings, args)] - assert server_settings is not None - assert model_settings is not None - app = create_app( - server_settings=server_settings, - model_settings=model_settings, - ) - uvicorn.run( - app, - host=os.getenv("HOST", server_settings.host), - port=int(os.getenv("PORT", server_settings.port)), - ssl_keyfile=server_settings.ssl_keyfile, - ssl_certfile=server_settings.ssl_certfile, - ) except Exception as e: print(e, file=sys.stderr) parser.print_help() sys.exit(1) + assert server_settings is not None + assert model_settings is not None + app = create_app( + server_settings=server_settings, + model_settings=model_settings, + ) + uvicorn.run( + app, + host=os.getenv("HOST", server_settings.host), + port=int(os.getenv("PORT", server_settings.port)), + ssl_keyfile=server_settings.ssl_keyfile, + ssl_certfile=server_settings.ssl_certfile, + ) if __name__ == "__main__": diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 4b08cc6ce..a7ce63f76 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -196,8 +196,14 @@ async def authenticate( ) -@router.post("/v1/completions", summary="Completion", dependencies=[Depends(authenticate)]) -@router.post("/v1/engines/copilot-codex/completions", include_in_schema=False, dependencies=[Depends(authenticate)]) +@router.post( + "/v1/completions", summary="Completion", dependencies=[Depends(authenticate)] +) +@router.post( + "/v1/engines/copilot-codex/completions", + include_in_schema=False, + dependencies=[Depends(authenticate)], +) async def create_completion( request: Request, body: CreateCompletionRequest, @@ -256,17 +262,22 @@ def iterator() -> Iterator[llama_cpp.CreateCompletionStreamResponse]: return iterator_or_completion -@router.post("/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)]) +@router.post( + "/v1/embeddings", summary="Embedding", dependencies=[Depends(authenticate)] +) async def create_embedding( request: CreateEmbeddingRequest, llama_proxy: LlamaProxy = Depends(get_llama_proxy), ): return await run_in_threadpool( - llama_proxy(request.model).create_embedding, **request.model_dump(exclude={"user"}) + llama_proxy(request.model).create_embedding, + **request.model_dump(exclude={"user"}), ) -@router.post("/v1/chat/completions", 
summary="Chat", dependencies=[Depends(authenticate)]) +@router.post( + "/v1/chat/completions", summary="Chat", dependencies=[Depends(authenticate)] +) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, From 1b322b4fdcf62748ebd6189e6952053880fdc2a1 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 22 Dec 2023 05:15:41 -0500 Subject: [PATCH 44/44] Support default name for copilot-codex models --- llama_cpp/server/app.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index a7ce63f76..c54e4eb5c 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -213,7 +213,11 @@ async def create_completion( assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - llama = llama_proxy(body.model) + llama = llama_proxy( + body.model + if request.url.path != "/v1/engines/copilot-codex/completions" + else "copilot-codex" + ) exclude = { "n",