====== PrivateGPT ======

Query your own files locally with any GPT model using retrieval-augmented generation (RAG). Only text formats are supported (PDF, DOC, TXT, etc.); tables are handled to a limited extent, and CSV data or large tables are not supported.

https://github.com/zylon-ai/private-gpt

===== CPU only docker =====

A Docker version is available.

<code bash>
# Install rootless docker for the user
curl -fsSL get.docker.com | sh
sudo apt-get install -y uidmap
dockerd-rootless-setuptool.sh install
</code>

<code bash>
# prepare the compose file, using the Mistral-7B-OpenOrca model
# remove the LLAMACPP_* entries to use the image defaults
mkdir ~/aimodels
chmod 777 ~/aimodels/

cat << EOF > docker-privategpt.yml
services:
  # https://hub.docker.com/r/3x3cut0r/privategpt
  privategpt:
    image: 3x3cut0r/privategpt:latest
    container_name: privategpt
    environment:
      LLAMACPP_LLM_HF_REPO_ID: "TheBloke/Mistral-7B-OpenOrca-GGUF"
      LLAMACPP_LLM_HF_MODEL_FILE: "mistral-7b-openorca.Q5_K_M.gguf"
      LLAMACPP_EMBEDDING_HF_MODEL_NAME: "BAAI/bge-large-en-v1.5"
      EMBEDDING_INGEST_MODE: "parallel"
      EMBEDDING_COUNT_WORKERS: "4"
    volumes:
      - /home/$USER/aimodels:/home/worker/app/models
    ports:
      - 8080:8080/tcp
EOF

docker compose -f docker-privategpt.yml up -d
docker logs --follow privategpt
</code>
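Once the logs show that the model has been downloaded and the server is listening, the web UI is reachable on http://localhost:8080. The container can also be exercised through the REST API; the following is a minimal sketch assuming the image exposes the upstream PrivateGPT FastAPI routes (/health, /v1/ingest/file, /v1/completions) and that example.pdf stands in for one of your own files:

<code bash>
# health check - should return {"status":"ok"} once startup has finished
curl -s http://localhost:8080/health

# ingest a document, then ask a question against the ingested context
curl -s -F 'file=@example.pdf' http://localhost:8080/v1/ingest/file

curl -s http://localhost:8080/v1/completions \
  -H 'Content-Type: application/json' \
  -d '{"prompt": "Summarise the ingested document", "use_context": true}'
</code>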
===== NVIDIA docker =====

PrivateGPT in Docker with NVIDIA GPU support needs some adjustments to the settings and docker compose files so that the model variables are picked up properly:

<code bash>
git clone https://github.com/zylon-ai/private-gpt
cd private-gpt/
mkdir -p /opt/privategpt-storage/{local_data,models}
chown 1000:1000 /opt/privategpt-storage -R
</code>

The GPU Dockerfile is based on PR https://github.com/zylon-ai/private-gpt/pull/1655/files and https://github.com/zylon-ai/private-gpt/issues/1405:

<code bash>
cat << 'EOD' > Dockerfile.local.gpu
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 as base

# For tzdata
ENV DEBIAN_FRONTEND="noninteractive" TZ="Etc/UTC"

# Install Python 3.11 and set it as default
RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y python3.11 python3.11-venv python3-pip && \
    ln -sf /usr/bin/python3.11 /usr/bin/python3 && \
    python3 --version

# Install poetry
RUN pip install pipx
RUN python3 -m pipx ensurepath
RUN pipx install poetry
ENV PATH="/root/.local/bin:$PATH"
ENV PATH=".venv/bin/:$PATH"

# Dependencies to build llama-cpp
RUN apt update && apt install -y \
    libopenblas-dev \
    ninja-build \
    build-essential \
    pkg-config \
    wget \
    gcc

# https://python-poetry.org/docs/configuration/#virtualenvsin-project
ENV POETRY_VIRTUALENVS_IN_PROJECT=true

#########################
FROM base as dependencies
#########################
WORKDIR /home/worker/app
COPY pyproject.toml poetry.lock ./

RUN poetry config installer.max-workers 10
RUN poetry install --extras "ui embeddings-huggingface llms-llama-cpp vector-stores-qdrant"

# Enable GPU support: rebuild llama-cpp-python with cuBLAS
ENV LLAMA_CUBLAS=1
RUN CMAKE_ARGS='-DLLAMA_CUBLAS=on' FORCE_CMAKE=1 poetry run pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python

################
FROM base as app
################
ENV PYTHONUNBUFFERED=1
ENV PORT=8080
EXPOSE 8080

# Prepare a non-root user
RUN adduser --system worker
WORKDIR /home/worker/app

RUN mkdir -p local_data; chown -R worker local_data
RUN mkdir -p models; chown -R worker models

COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
COPY --chown=worker private_gpt/ private_gpt
COPY --chown=worker fern/ fern
COPY --chown=worker *.yaml *.md ./
COPY --chown=worker scripts/ scripts
COPY --chown=worker pyproject.toml poetry.lock ./

# Copy the entry point script into the container and make it executable
COPY --chown=worker entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

ENV PYTHONPATH="$PYTHONPATH:/private_gpt/"

#USER worker
#ENTRYPOINT ["/entrypoint.sh", "python", "-m", "private_gpt"]
ENTRYPOINT /entrypoint.sh python -m private_gpt
EOD
</code>

<code bash>
cat << 'EOD' > entrypoint.sh
#!/bin/sh

## Download the embedding and model files
echo "Running setup script"
poetry run python scripts/setup

## Execute the main container command
exec "$@"
EOD
</code>

<code bash>
# overwrite the repository's settings-docker.yaml so the PGPT_* variables are honoured
cat << 'EOD' > settings-docker.yaml
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8080}

llm:
  mode: ${PGPT_LLM_MODE:mock}

embedding:
  mode: ${PGPT_EMBEDDING_MODE:sagemaker}

llamacpp:
  prompt_style: ${PGPT_PROMPT_STYLE:mistral}
  llm_hf_repo_id: ${PGPT_HF_REPO_ID:TheBloke/Mistral-7B-Instruct-v0.2-GGUF}
  llm_hf_model_file: ${PGPT_HF_MODEL_FILE:mistral-7b-instruct-v0.2.Q4_K_M.gguf}

huggingface:
  embedding_hf_model_name: ${PGPT_EMBEDDING_HF_MODEL_NAME:BAAI/bge-small-en-v1.5}

sagemaker:
  llm_endpoint_name: ${PGPT_SAGEMAKER_LLM_ENDPOINT_NAME:}
  embedding_endpoint_name: ${PGPT_SAGEMAKER_EMBEDDING_ENDPOINT_NAME:}

ollama:
  llm_model: ${PGPT_OLLAMA_LLM_MODEL:mistral}
  embedding_model: ${PGPT_OLLAMA_EMBEDDING_MODEL:nomic-embed-text}
  api_base: ${PGPT_OLLAMA_API_BASE:http://ollama:11434}
  embedding_api_base: ${PGPT_OLLAMA_EMBEDDING_API_BASE:http://ollama:11434}
  tfs_z: ${PGPT_OLLAMA_TFS_Z:1.0}
  top_k: ${PGPT_OLLAMA_TOP_K:40}
  top_p: ${PGPT_OLLAMA_TOP_P:0.9}
  repeat_last_n: ${PGPT_OLLAMA_REPEAT_LAST_N:64}
  repeat_penalty: ${PGPT_OLLAMA_REPEAT_PENALTY:1.2}
  request_timeout: ${PGPT_OLLAMA_REQUEST_TIMEOUT:600.0}

ui:
  enabled: true
  path: /
EOD
</code>

<code bash>
cat << 'EOD' > docker-compose-gpu-wulf.yaml
services:
  private-gpt-gpu:
    container_name: private-gpt-gpu
    restart: unless-stopped
    build:
      dockerfile: Dockerfile.local.gpu
    volumes:
      - /opt/privategpt-storage/local_data/:/home/worker/app/local_data
      - /opt/privategpt-storage/models/:/home/worker/app/models
    ports:
      - 8001:8080
    environment:
      PORT: 8080
      PGPT_PROFILES: docker
      PGPT_LLM_MODE: llamacpp
      PGPT_EMBEDDING_MODE: huggingface
      # Microsoft Phi-3 Mini 4k
      PGPT_HF_REPO_ID: "microsoft/Phi-3-mini-4k-instruct-gguf"
      #PGPT_HF_MODEL_FILE: "Phi-3-mini-4k-instruct-fp16.gguf"
      PGPT_HF_MODEL_FILE: "Phi-3-mini-4k-instruct-q4.gguf"
      PGPT_PROMPT_STYLE: "chatml"
      # Meta Llama 3
      #PGPT_HF_REPO_ID: "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
      #PGPT_HF_MODEL_FILE: "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
      #PGPT_PROMPT_STYLE: "llama3"
      # OpenOrca Mistral 7B
      #PGPT_HF_REPO_ID: "TheBloke/Mistral-7B-OpenOrca-GGUF"
      #PGPT_HF_MODEL_FILE: "mistral-7b-openorca.Q5_K_M.gguf"
      #PGPT_PROMPT_STYLE: "mistral"
      PGPT_EMBEDDING_HF_MODEL_NAME: "BAAI/bge-small-en-v1.5"
      #PGPT_EMBEDDING_HF_MODEL_NAME: "BAAI/bge-large-en-v1.5"
      TOKENIZERS_PARALLELISM: "true"
      #PGPT_NGL: 20
      PGPT_MAX_NEW_TOKENS: 512
      PGPT_CONTEXT_WINDOW: 3900
      PGPT_TEMPERATURE: 0.1
      EMBEDDING_INGEST_MODE: "simple"
      EMBEDDING_COUNT_WORKERS: "2"
      HUGGINGFACE_TOKEN: "token to download gated models from huggingface"
      PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:256"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
EOD
</code>

<code bash>
docker build -f Dockerfile.local.gpu .
docker compose -f docker-compose-gpu-wulf.yaml up -d
docker logs -n 20 --follow private-gpt-gpu
nvtop
</code>
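The deploy/devices reservation requires the NVIDIA Container Toolkit on the host. Once the container is up, a quick sanity check that the GPU is visible inside the container and that llama.cpp actually offloads layers could look like this (the exact llama.cpp log wording may differ between versions):

<code bash>
# the GPU must be visible inside the container (needs nvidia-container-toolkit on the host)
docker exec private-gpt-gpu nvidia-smi

# llama.cpp logs how many layers were offloaded when the model is loaded
docker logs private-gpt-gpu 2>&1 | grep -i offloaded

# watch GPU memory and utilisation from the host while asking a question in the UI
nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv -l 2
</code>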
===== NGL settings patch =====

To control the number of layers llama.cpp loads into the GPU, apply this NGL option patch, then add "PGPT_NGL: 20" to the environment section of the docker compose file, where 20 is the number of layers to offload (use -1 for all layers).

<code bash>
cat << 'EOD' > ngl-settings-option.patch
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index baffa4e..e8bddd2 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -57,7 +57,7 @@ class LLMComponent:
                     "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
                     "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
                     "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama llama-cpp
-                    "n_gpu_layers": -1,
+                    "n_gpu_layers": settings.llamacpp.ngl,
                     "offload_kqv": True,
                 }
                 self.llm = LlamaCPP(
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 051cfca..701a1a9 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -145,6 +145,11 @@ class LlamaCPPSettings(BaseModel):
         1.1,
         description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
     )
+    ngl: int = Field(
+        -1,
+        description="Number of layers loaded in GPU (Default: -1)",
+    )
+


 class HuggingFaceSettings(BaseModel):
diff --git a/settings.yaml b/settings.yaml
index e881a55..c4c86cb 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -60,6 +60,7 @@ llamacpp:
   top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
   top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
   repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+  ngl: ${PGPT_NGL:-1} # Sets number of layers offloaded to gpu

 embedding:
   # Should be matching the value above in most cases
EOD

git apply ngl-settings-option.patch
</code>

===== Max New Tokens / Context Size / Temperature settings patch =====

To be able to set Max New Tokens, Context Size and Temperature as variables in the docker compose file, the settings.yaml file needs to be adjusted.

docker compose file additions:

<code yaml>
    environment:
      PGPT_MAX_NEW_TOKENS: 512
      PGPT_CONTEXT_WINDOW: 3900
      PGPT_TEMPERATURE: 0.1
</code>

<code bash>
cat << 'EOD' > token-ctx-temp-settings-option.patch
diff --git a/settings.yaml b/settings.yaml
index e881a55..8666b86 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -37,10 +37,10 @@ ui:
 llm:
   mode: llamacpp
   # Should be matching the selected model
-  max_new_tokens: 512
-  context_window: 3900
+  max_new_tokens: ${PGPT_MAX_NEW_TOKENS:512}
+  context_window: ${PGPT_CONTEXT_WINDOW:3900}
   tokenizer: mistralai/Mistral-7B-Instruct-v0.2
-  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
+  temperature: ${PGPT_TEMPERATURE:0.1} # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)

 rag:
   similarity_top_k: 2
EOD

git apply token-ctx-temp-settings-option.patch
</code>
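settings.yaml and the application code are copied into the image at build time, so after applying either patch the image has to be rebuilt and the container recreated before the new PGPT_* variables take effect; a minimal sketch:

<code bash>
docker compose -f docker-compose-gpu-wulf.yaml build
docker compose -f docker-compose-gpu-wulf.yaml up -d --force-recreate
docker logs -n 20 --follow private-gpt-gpu
</code>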
===== CSS Customisation =====

To enlarge the main input box and fix an issue in mobile/low-height browser windows where the input box wraps to the right, some CSS trickery is required. The last three CSS lines below are added to private_gpt/ui/ui.py for this:

<code python>
    def _build_ui_blocks(self) -> gr.Blocks:
        logger.debug("Creating the UI blocks")
        with gr.Blocks(
            title=UI_TAB_TITLE,
            theme=gr.themes.Soft(primary_hue=slate),
            css=".logo { "
            "display:flex;"
            "background-color: #000;"
            "height: 90px;"
            "border-radius: 8px;"
            "align-content: center;"
            "justify-content: center;"
            "align-items: center;"
            "font-size: xxx-large;"
            "color: #fff;"
            "font-weight: bold;"
            "}"
            ".logo img { height: 100% }"
            ".contain { display: flex !important; flex-direction: column !important; }"
            "#component-0, #component-3, #component-10, #component-8 { height: 100% !important; }"
            "#chatbot { flex-grow: 1 !important; overflow: auto !important;}"
            "#col { height: calc(100vh - 112px - 16px) !important; }"
            # the three lines below are the customisation
            "#component-24 > label:nth-child(1) > textarea:nth-child(2) { min-height: 100px !important; }"
            "#component-24 { min-width: min(260px, 100%) !important;}"
            "#col { min-height:750px !important; }",
        ) as blocks:
            with gr.Row():
</code>