refactor: align llm_zerogpu.py and llm_modal.py with each other
src/unpredictable_lord/llm_modal.py
CHANGED

@@ -18,8 +18,11 @@ MOUNT_DIR = "/data"
 
 # https://huggingface.co/openai/gpt-oss-20b
 MODEL_IDENTIFIER = "openai/gpt-oss-20b"
-
-#
+
+# MAX_MODEL_TOKENS >= Input + Output
+MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k(128*1024) tokens
+MAX_OUTPUT_TOKENS = 512
+
 
 # https://modal.com/docs/guide/gpu#specifying-gpu-type
 GPU_NAME = "L4"
@@ -39,10 +42,6 @@ GPU = f"{GPU_NAME}:{GPU_NUM}"
 # | L4 | 24 GB | $0.80 /h |
 # | T4 | 16 GB | $0.59 /h |
 
-# MAX_MODEL_TOKENS >= Input + Output
-MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k(128*1024) tokens
-MAX_OUTPUT_TOKENS = 512
-
 image = (
     # https://hub.docker.com/r/nvidia/cuda/tags?name=12.8
     # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04
@@ -104,9 +103,6 @@ def load_model():
 @app.function(
     gpu=GPU,
     volumes={MOUNT_DIR: MOUNT_VOLUME},
-    # secrets=[modal.Secret.from_name("huggingface-secret")],
-    # scaledown_window=15 * 60,
-    # timeout=30 * 60,
 )
 def generate_stream(input_tokens):
     """
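The token-budget constants now sit next to MODEL_IDENTIFIER in both files, which makes the invariant MAX_MODEL_TOKENS >= Input + Output easy to enforce at the call site. A minimal sketch of how a caller could apply that budget before generation; the helper clamp_input_tokens is hypothetical and not part of this diff:

# Hypothetical helper; mirrors the constants now shared by both modules.
MAX_MODEL_TOKENS = 64 * 1024   # total context-window budget
MAX_OUTPUT_TOKENS = 512        # portion reserved for generated tokens


def clamp_input_tokens(input_tokens: list[int]) -> list[int]:
    """Trim the prompt so prompt + output fits inside MAX_MODEL_TOKENS."""
    budget = MAX_MODEL_TOKENS - MAX_OUTPUT_TOKENS
    # Keep the most recent tokens when the prompt overflows the budget.
    return input_tokens[-budget:]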
src/unpredictable_lord/llm_zerogpu.py
CHANGED

@@ -1,3 +1,5 @@
+import logging
+import subprocess
 from threading import Thread
 
 import openai_harmony as oh
@@ -7,27 +9,45 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from unpredictable_lord.tokenstreamer import TokenStreamer
 
+logger = logging.getLogger(__name__)
+
 # https://huggingface.co/openai/gpt-oss-20b
 MODEL_IDENTIFIER = "openai/gpt-oss-20b"
-# https://huggingface.co/openai/gpt-oss-120b
-# MODEL_IDENTIFIER = "openai/gpt-oss-120b"
 
 # MAX_MODEL_TOKENS >= Input + Output
 MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k(128*1024) tokens
 MAX_OUTPUT_TOKENS = 512
 
 
-# Global model and tokenizer (loaded once)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_IDENTIFIER,
-    torch_dtype="auto",
-    device_map="auto",
-)
+# Global model and tokenizer (loaded once)
+model = None
+tokenizer = None
+stop_token_ids = None
+
+
+def load_model():
+    """Load model and tokenizer into global variables."""
+    global model, tokenizer, stop_token_ids
+
+    if model is not None:
+        return
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_IDENTIFIER,
+        torch_dtype="auto",
+        device_map="auto",
+    )
+
+    # Load stop token IDs
+    _encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+    stop_token_ids = _encoding.stop_tokens_for_assistant_actions()
+
+    # Show GPU information
+    subprocess.run(["nvidia-smi"])
+
 
-
-encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
-stop_token_ids = encoding.stop_tokens_for_assistant_actions()
+load_model()
 
 
 @spaces.GPU
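On the ZeroGPU side, the module-level load_model() call runs once at import time, so the weights, the Harmony stop tokens, and the nvidia-smi report are all ready before any @spaces.GPU function executes; the "if model is not None: return" guard makes repeat calls free. A sketch of how the loaded globals typically feed a streaming generate call follows; the diff does not show the call site, so the streamer wiring and keyword choices here are assumptions, with transformers' TextIteratorStreamer standing in for the repo's own TokenStreamer:

# Assumes model, tokenizer, and stop_token_ids were populated by load_model().
import torch
from threading import Thread
from transformers import TextIteratorStreamer


def generate_stream(input_tokens):
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    kwargs = dict(
        input_ids=torch.tensor([input_tokens], device=model.device),
        max_new_tokens=MAX_OUTPUT_TOKENS,
        eos_token_id=stop_token_ids,  # Harmony stop tokens end the assistant turn
        streamer=streamer,
    )
    # Run generation in a background thread so text can be consumed as it arrives.
    Thread(target=model.generate, kwargs=kwargs).start()
    yield from streamer  # yields decoded text chunks as they are generated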