#!/bin/bash

################################################################################
# Shell script that starts a copy of vLLM with a base model plus all of the
# available aLoRA adapters in this repository.
#
# To run this script:
# 1. Create and activate a Python virtual environment using a tool such as
#    miniforge, uv, or venv.
# 2. Install the fork of vLLM that supports aLoRA on your machine
#    (`VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/tdoublep/vllm.git@alora`)
# 3. Install the Hugging Face CLI (`pip install -U "huggingface_hub[cli]"`)
# 4. Download the intrinsics library by running:
#    hf download ibm-granite/rag-intrinsics-lib --local-dir ./rag-intrinsics-lib
# 5. Edit the constants BASE_MODEL_NAME, BASE_MODEL_ORG, and PORT below as needed.
# 6. Run this script from the root of your local copy of rag-intrinsics-lib.
################################################################################

BASE_MODEL_NAME=granite-3.3-8b-instruct
BASE_MODEL_ORG=ibm-granite
PORT=55555

export VLLM_API_KEY=rag_intrinsics_1234

# Find all aLoRA adapters for the target base model. Each adapter is passed to
# vLLM as "<name>=<path>". Note that this loop can be edited to serve both
# aLoRA and LoRA adapters simultaneously.
ALORAS=""
for item in ./*; do
    # Strip the leading "./" so only the adapter directory name remains.
    name=$(basename -- "${item}")
    if [ -d "./${name}/alora/${BASE_MODEL_NAME}" ]; then
        ALORAS+="${name}=./${name}/alora/${BASE_MODEL_NAME} "
    fi
done

CMD="vllm serve ${BASE_MODEL_ORG}/${BASE_MODEL_NAME} \
    --port ${PORT} \
    --gpu-memory-utilization 0.45 \
    --max-model-len 8192 \
    --enable-lora \
    --enable-activated-lora \
    --enable-prefix-caching \
    --max-lora-rank 64 \
    --lora-modules ${ALORAS}"

echo "${CMD}"
# Word splitting on ${CMD} is intentional so each flag is passed as a separate argument.
${CMD}
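
# Once the server reports it is ready, you can sanity-check it from another
# terminal. The commands below are an illustrative sketch, not part of the
# script: /v1/models and /v1/completions are vLLM's standard OpenAI-compatible
# endpoints, and "<adapter-name>" is a placeholder for one of the adapter
# directory names collected in ${ALORAS} above.
#
#   # List the base model and every adapter that was loaded:
#   curl -H "Authorization: Bearer ${VLLM_API_KEY}" \
#        http://localhost:${PORT}/v1/models
#
#   # Send a completion request that targets a specific adapter:
#   curl -H "Authorization: Bearer ${VLLM_API_KEY}" \
#        -H "Content-Type: application/json" \
#        -d '{"model": "<adapter-name>", "prompt": "...", "max_tokens": 64}' \
#        http://localhost:${PORT}/v1/completions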