====== PrivateGPT ======
Query your own files locally with any GPT model using retrieval-augmented generation (RAG).
Supports text-based formats only (PDF, DOC, TXT, etc.); table support is limited, and CSV data or large tables are not handled.
https://github.com/zylon-ai/private-gpt
===== CPU only docker =====
A prebuilt Docker image is available.
# Install Docker and set up rootless mode for the current user
curl -fsSL https://get.docker.com | sh
sudo apt-get install -y uidmap
dockerd-rootless-setuptool.sh install
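A quick smoke test confirms the rootless daemon is reachable for the current user (any small image works, hello-world is just an example):
docker info
docker run --rm hello-world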
# Prepare the compose file; this example uses the Mistral-7B-OpenOrca model. Remove the LLAMACPP_* entries to use the image defaults.
mkdir ~/aimodels
chmod 777 ~/aimodels/
cat << EOF >> docker-privategpt.yml
services:
  # https://hub.docker.com/r/3x3cut0r/privategpt
  privategpt:
    image: 3x3cut0r/privategpt:latest
    container_name: privategpt
    environment:
      LLAMACPP_LLM_HF_REPO_ID: "TheBloke/Mistral-7B-OpenOrca-GGUF"
      LLAMACPP_LLM_HF_MODEL_FILE: "mistral-7b-openorca.Q5_K_M.gguf"
      LLAMACPP_EMBEDDING_HF_MODEL_NAME: "BAAI/bge-large-en-v1.5"
      EMBEDDING_INGEST_MODE: "parallel"
      EMBEDDING_COUNT_WORKERS: "4"
    volumes:
      - /home/$USER/aimodels:/home/worker/app/models
    ports:
      - 8080:8080/tcp
EOF
docker compose -f docker-privategpt.yml up -d
docker logs --follow privategpt
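Once the model files have finished downloading, the service is published on port 8080. A quick check from the shell (assuming the image exposes PrivateGPT's /health endpoint):
curl http://localhost:8080/health
# expected: {"status":"ok"} - then open http://localhost:8080/ in a browser to ingest and query files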
===== NVIDIA docker =====
PrivateGPT in Docker with NVIDIA GPU support; the settings and docker compose files need some adjustments so that the model variables are applied properly.
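This assumes the NVIDIA driver and the NVIDIA Container Toolkit are already installed on the host; a quick way to confirm containers can reach the GPU (the CUDA image tag is just an example):
docker run --rm --gpus all nvidia/cuda:12.2.2-base-ubuntu22.04 nvidia-smi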
git clone https://github.com/zylon-ai/private-gpt
cd private-gpt/
mkdir -p /opt/privategpt-storage/{local_data,models}
chown 1000:1000 /opt/privategpt-storage -R
# from PR https://github.com/zylon-ai/private-gpt/pull/1655/files
# and https://github.com/zylon-ai/private-gpt/issues/1405
# quote the heredoc delimiter so $PATH, "$@" and ${VAR:default} strings in the heredocs below are written literally instead of being expanded by the current shell
cat << 'EOD' >> Dockerfile.local.gpu
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 as base
# For tzdata
ENV DEBIAN_FRONTEND="noninteractive" TZ="Etc/UTC"
# Install Python 3.11 and set it as default
RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get update && \
    apt-get install -y python3.11 python3.11-venv python3-pip && \
    ln -sf /usr/bin/python3.11 /usr/bin/python3 && \
    python3 --version
# Install poetry
RUN pip install pipx
RUN python3 -m pipx ensurepath
RUN pipx install poetry
ENV PATH="/root/.local/bin:$PATH"
ENV PATH=".venv/bin/:$PATH"
# Dependencies to build llama-cpp
RUN apt update && apt install -y \
    libopenblas-dev \
    ninja-build \
    build-essential \
    pkg-config \
    wget \
    gcc
# https://python-poetry.org/docs/configuration/#virtualenvsin-project
ENV POETRY_VIRTUALENVS_IN_PROJECT=true
#########################
FROM base as dependencies
#########################
WORKDIR /home/worker/app
COPY pyproject.toml poetry.lock ./
RUN poetry config installer.max-workers 10
RUN poetry install --extras "ui embeddings-huggingface llms-llama-cpp vector-stores-qdrant"
# Enable GPU support
ENV LLAMA_CUBLAS=1
RUN CMAKE_ARGS='-DLLAMA_CUBLAS=on' FORCE_CMAKE=1 poetry run pip install --upgrade --force-reinstall --no-cache-dir llama-cpp-python
################
FROM base as app
################
ENV PYTHONUNBUFFERED=1
ENV PORT=8080
EXPOSE 8080
# Prepare a non-root user
RUN adduser worker
WORKDIR /home/worker/app
RUN mkdir -p local_data; chown -R worker local_data
RUN mkdir -p models; chown -R worker models
COPY --chown=worker --from=dependencies /home/worker/app/.venv/ .venv
COPY --chown=worker private_gpt/ private_gpt
COPY --chown=worker fern/ fern
COPY --chown=worker *.yaml *.md ./
COPY --chown=worker scripts/ scripts
COPY --chown=worker pyproject.toml poetry.lock ./
# Copy the entry point script into the container and make it executable
COPY --chown=worker entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENV PYTHONPATH="$PYTHONPATH:/private_gpt/"
#USER worker
#ENTRYPOINT ["/entrypoint.sh", "python", "-m", "private_gpt"]
ENTRYPOINT /entrypoint.sh python -m private_gpt
EOD
cat << 'EOD' >> entrypoint.sh
#!/bin/sh
## Download the embedding and model files
echo "Running setup script"
poetry run python scripts/setup
## Execute the main container command
exec "$@"
EOD
cat << 'EOD' >> settings-docker.yaml
server:
  env_name: ${APP_ENV:prod}
  port: ${PORT:8080}
llm:
  mode: ${PGPT_LLM_MODE:mock}
embedding:
  mode: ${PGPT_EMBEDDING_MODE:sagemaker}
llamacpp:
  prompt_style: ${PGPT_PROMPT_STYLE:mistral}
  llm_hf_repo_id: ${PGPT_HF_REPO_ID:TheBloke/Mistral-7B-Instruct-v0.2-GGUF}
  llm_hf_model_file: ${PGPT_HF_MODEL_FILE:mistral-7b-instruct-v0.2.Q4_K_M.gguf}
huggingface:
  embedding_hf_model_name: ${PGPT_EMBEDDING_HF_MODEL_NAME:BAAI/bge-small-en-v1.5}
sagemaker:
  llm_endpoint_name: ${PGPT_SAGEMAKER_LLM_ENDPOINT_NAME:}
  embedding_endpoint_name: ${PGPT_SAGEMAKER_EMBEDDING_ENDPOINT_NAME:}
ollama:
  llm_model: ${PGPT_OLLAMA_LLM_MODEL:mistral}
  embedding_model: ${PGPT_OLLAMA_EMBEDDING_MODEL:nomic-embed-text}
  api_base: ${PGPT_OLLAMA_API_BASE:http://ollama:11434}
  embedding_api_base: ${PGPT_OLLAMA_EMBEDDING_API_BASE:http://ollama:11434}
  tfs_z: ${PGPT_OLLAMA_TFS_Z:1.0}
  top_k: ${PGPT_OLLAMA_TOP_K:40}
  top_p: ${PGPT_OLLAMA_TOP_P:0.9}
  repeat_last_n: ${PGPT_OLLAMA_REPEAT_LAST_N:64}
  repeat_penalty: ${PGPT_OLLAMA_REPEAT_PENALTY:1.2}
  request_timeout: ${PGPT_OLLAMA_REQUEST_TIMEOUT:600.0}
ui:
  enabled: true
  path: /
EOD
cat << 'EOD' >> docker-compose-gpu-wulf.yaml
services:
  private-gpt-gpu:
    container_name: private-gpt-gpu
    restart: unless-stopped
    build:
      context: .
      dockerfile: Dockerfile.local.gpu
    volumes:
      - /opt/privategpt-storage/local_data/:/home/worker/app/local_data
      - /opt/privategpt-storage/models/:/home/worker/app/models
    ports:
      - 8001:8080
    environment:
      PORT: 8080
      PGPT_PROFILES: docker
      PGPT_LLM_MODE: llamacpp
      PGPT_EMBEDDING_MODE: huggingface
      # Microsoft Phi-3 Mini 4k
      PGPT_HF_REPO_ID: "microsoft/Phi-3-mini-4k-instruct-gguf"
      #PGPT_HF_MODEL_FILE: "Phi-3-mini-4k-instruct-fp16.gguf"
      PGPT_HF_MODEL_FILE: "Phi-3-mini-4k-instruct-q4.gguf"
      PGPT_PROMPT_STYLE: "chatml"
      # Meta Llama 3
      #PGPT_HF_REPO_ID: "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
      #PGPT_HF_MODEL_FILE: "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"
      #PGPT_PROMPT_STYLE: "llama3"
      # OpenOrca Mistral 7B
      #PGPT_HF_REPO_ID: "TheBloke/Mistral-7B-OpenOrca-GGUF"
      #PGPT_HF_MODEL_FILE: "mistral-7b-openorca.Q5_K_M.gguf"
      #PGPT_PROMPT_STYLE: "mistral"
      PGPT_EMBEDDING_HF_MODEL_NAME: "BAAI/bge-small-en-v1.5"
      #PGPT_EMBEDDING_HF_MODEL_NAME: "BAAI/bge-large-en-v1.5"
      TOKENIZERS_PARALLELISM: "true"
      #PGPT_NGL: 20
      PGPT_MAX_NEW_TOKENS: 512
      PGPT_CONTEXT_WINDOW: 3900
      PGPT_TEMPERATURE: 0.1
      EMBEDDING_INGEST_MODE: "simple"
      EMBEDDING_COUNT_WORKERS: "2"
      HUGGINGFACE_TOKEN: "token to download gated models from huggingface"
      PYTORCH_CUDA_ALLOC_CONF: "max_split_size_mb:256"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
EOD
docker build -f Dockerfile.local.gpu .
docker compose -f docker-compose-gpu-wulf.yaml up -d
docker logs -n 20 --follow private-gpt-gpu
nvtop
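nvtop should show GPU memory being allocated while ingesting or answering. This compose file publishes the service on port 8001, so the same check as in the CPU setup applies (health endpoint assumed):
curl http://localhost:8001/health
# UI: http://localhost:8001/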
===== NGL settings patch =====
To control how many model layers llamacpp loads onto the GPU, apply this NGL option patch, then add "PGPT_NGL: 20" to the docker compose environment section, where 20 is the number of layers to offload and -1 means all layers.
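For reference, the compose addition mirrors the commented-out PGPT_NGL line in the compose file above:
    environment:
      PGPT_NGL: 20  # number of layers offloaded to the GPU, -1 = all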
cat << 'EOD' >> ngl-settings-option.patch
diff --git a/private_gpt/components/llm/llm_component.py b/private_gpt/components/llm/llm_component.py
index baffa4e..e8bddd2 100644
--- a/private_gpt/components/llm/llm_component.py
+++ b/private_gpt/components/llm/llm_component.py
@@ -57,7 +57,7 @@ class LLMComponent:
                     "top_k": settings.llamacpp.top_k,  # ollama and llama-cpp
                     "top_p": settings.llamacpp.top_p,  # ollama and llama-cpp
                     "repeat_penalty": settings.llamacpp.repeat_penalty,  # ollama llama-cpp
-                    "n_gpu_layers": -1,
+                    "n_gpu_layers": settings.llamacpp.ngl,
                     "offload_kqv": True,
                 }
                 self.llm = LlamaCPP(
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
index 051cfca..701a1a9 100644
--- a/private_gpt/settings/settings.py
+++ b/private_gpt/settings/settings.py
@@ -145,6 +145,11 @@ class LlamaCPPSettings(BaseModel):
         1.1,
         description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
     )
+    ngl: int = Field(
+        -1,
+        description="Number of layers loaded in GPU (Default: -1)",
+    )
+
 
 
 class HuggingFaceSettings(BaseModel):
diff --git a/settings.yaml b/settings.yaml
index e881a55..c4c86cb 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -60,6 +60,7 @@ llamacpp:
   top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
   top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
   repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
+  ngl: ${PGPT_NGL:-1} # Sets number of layers offloaded to gpu
 
 embedding:
   # Should be matching the value above in most cases
EOD
git apply ngl-settings-option.patch
===== Max New Tokens / Context Size / Temperature settings patch =====
To set Max New Tokens, Context Size and Temperature as variables in the docker compose file, the settings.yaml file needs to be adjusted.
docker compose file additions:
    environment:
      PGPT_MAX_NEW_TOKENS: 512
      PGPT_CONTEXT_WINDOW: 3900
      PGPT_TEMPERATURE: 0.1
cat << 'EOD' >> token-ctx-temp-settings-option.patch
diff --git a/settings.yaml b/settings.yaml
index e881a55..8666b86 100644
--- a/settings.yaml
+++ b/settings.yaml
@@ -37,10 +37,10 @@ ui:
 llm:
   mode: llamacpp
   # Should be matching the selected model
-  max_new_tokens: 512
-  context_window: 3900
+  max_new_tokens: ${PGPT_MAX_NEW_TOKENS:512}
+  context_window: ${PGPT_CONTEXT_WINDOW:3900}
   tokenizer: mistralai/Mistral-7B-Instruct-v0.2
-  temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
+  temperature: ${PGPT_TEMPERATURE:0.1} # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
 
 rag:
   similarity_top_k: 2
EOD
git apply token-ctx-temp-settings-option.patch
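The patches only take effect in the container after the image is rebuilt, e.g.:
docker compose -f docker-compose-gpu-wulf.yaml up -d --build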
===== CSS Customisation =====
To adjust the main input box and fix the issue of it wrapping to the right in mobile/low-height browser windows, some CSS trickery is required. The last three CSS lines below are added to private_gpt/ui/ui.py for this:
def _build_ui_blocks(self) -> gr.Blocks:
    logger.debug("Creating the UI blocks")
    with gr.Blocks(
        title=UI_TAB_TITLE,
        theme=gr.themes.Soft(primary_hue=slate),
        css=".logo { "
        "display:flex;"
        "background-color: #000;"
        "height: 90px;"
        "border-radius: 8px;"
        "align-content: center;"
        "justify-content: center;"
        "align-items: center;"
        "font-size: xxx-large;"
        "color: #fff;"
        "font-weight: bold;"
        "}"
        ".logo img { height: 100% }"
        ".contain { display: flex !important; flex-direction: column !important; }"
        "#component-0, #component-3, #component-10, #component-8 { height: 100% !important; }"
        "#chatbot { flex-grow: 1 !important; overflow: auto !important;}"
        "#col { height: calc(100vh - 112px - 16px) !important; }"
        "#component-24 > label:nth-child(1) > textarea:nth-child(2) { min-height: 100px !important; }"
        "#component-24 { min-width: min(260px, 100%) !important;}"
        "#col { min-height:750px !important; }",
    ) as blocks:
        with gr.Row():