import modal

from configs import (
    vllm_image,
    hf_cache_vol,
    vllm_cache_vol,
    MODEL_NAME,
    MODEL_REVISION,
    MINUTE,
    N_GPU,
    API_KEY,
    VLLM_PORT,
    flashinfer_cache_vol,
    CHAT_TEMPLATE,
)

app = modal.App("vibe-shopping-llm")
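
# NOTE: the decorators below are a reconstruction sketch -- serve_llm is clearly
# meant to run as a Modal web server (the test below calls serve_llm.get_web_url()),
# but the GPU type, timeouts, concurrency cap, secret wiring, and cache mount
# paths here are assumptions, not taken from the original file.
@app.function(
    image=vllm_image,
    gpu=f"H100:{N_GPU}",  # assumed GPU type; only the count (N_GPU) comes from configs
    volumes={
        "/root/.cache/huggingface": hf_cache_vol,
        "/root/.cache/vllm": vllm_cache_vol,
        "/root/.cache/flashinfer": flashinfer_cache_vol,  # assumed mount path
    },
    secrets=[modal.Secret.from_dict({"API_KEY": API_KEY})],  # assumption: how API_KEY reaches os.environ
    timeout=20 * MINUTE,  # assumed
)
@modal.concurrent(max_inputs=32)  # assumed concurrency cap
@modal.web_server(port=VLLM_PORT, startup_timeout=10 * MINUTE)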
def serve_llm():
    import subprocess
    import os

    import torch

    min_pixels = 128 * 28 * 28  # min 128 tokens
    max_pixels = 500 * 28 * 28  # max 500 tokens (~640x640 image)
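    # Pin TORCH_CUDA_ARCH_LIST to the attached GPU's compute capability so any
    # JIT-compiled CUDA kernels are built only for that architecture.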
    major, minor = torch.cuda.get_device_capability()
    cmd = [
        "env",
        f"TORCH_CUDA_ARCH_LIST={major}.{minor}",
        "vllm",
        "serve",
        MODEL_NAME,
        "--revision",
        MODEL_REVISION,
        "--uvicorn-log-level=info",
        "--tool-call-parser",
        "hermes",
        "--enable-auto-tool-choice",
        "--limit-mm-per-prompt",
        "image=100",
        "--chat-template",
        CHAT_TEMPLATE,
        "--tensor-parallel-size",
        str(N_GPU),
        "--enforce-eager",
        # Minimize token usage
        "--mm-processor-kwargs",
        f'{{"min_pixels": {min_pixels}, "max_pixels": {max_pixels}, "use_fast": true}}',
        # Extend context length to 65536 tokens via YaRN (currently disabled)
        # "--rope-scaling",
        # '{"rope_type":"yarn","factor":2.0,"original_max_position_embeddings":32768}',
        "--max-model-len",
        "32768",
        "--host",
        "0.0.0.0",
        "--port",
        str(VLLM_PORT),
        "--api-key",
        os.environ["API_KEY"],
    ]

    subprocess.Popen(cmd)


###### ------ FOR TESTING PURPOSES ONLY ------ ######
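# Assumed entrypoint decorator (not in the original excerpt) so the smoke test
# can be run locally with `modal run`.
@app.local_entrypoint()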
def test(test_timeout=25 * MINUTE, twice: bool = True):
    import json
    import os
    import time
    import urllib.request  # import the submodule explicitly; `import urllib` alone does not expose urllib.request

    import dotenv

    dotenv.load_dotenv()
    if "OPENAI_API_KEY" not in os.environ:
        raise ValueError("OPENAI_API_KEY environment variable is not set.")
| print(f"Running health check for server at {serve_llm.get_web_url()}") | |
| up, start, delay = False, time.time(), 10 | |
| while not up: | |
| try: | |
| with urllib.request.urlopen( | |
| serve_llm.get_web_url() + "/health" | |
| ) as response: | |
| if response.getcode() == 200: | |
| up = True | |
| except Exception: | |
| if time.time() - start > test_timeout: | |
| break | |
| time.sleep(delay) | |
| assert up, f"Failed health check for server at {serve_llm.get_web_url()}" | |
| print(f"Successful health check for server at {serve_llm.get_web_url()}") | |
    messages = [{"role": "user", "content": "Testing! Is this thing on?"}]
    print(f"Sending a sample message to {serve_llm.get_web_url()}", *messages, sep="\n")
    headers = {
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
        "Content-Type": "application/json",
    }
    payload = json.dumps({"messages": messages, "model": MODEL_NAME})
    req = urllib.request.Request(
        serve_llm.get_web_url() + "/v1/chat/completions",
        data=payload.encode("utf-8"),
        headers=headers,
        method="POST",
    )
    with urllib.request.urlopen(req) as response:
        print(json.loads(response.read().decode()))

    if twice:
        print("Sending the same message again to test caching.")
        with urllib.request.urlopen(req) as response:
            print(json.loads(response.read().decode()))
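
# Usage sketch (assuming this file is saved as serve_llm.py; the filename is illustrative):
#   modal deploy serve_llm.py        # deploy the vLLM web server
#   modal run serve_llm.py::test     # run the health-check and chat smoke test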