Commit 0300eff by ryomo · Parent(s): 0ddba75

refactor: update llm_zerogpu.py and llm_modal.py to align with each other

src/unpredictable_lord/llm_modal.py CHANGED
@@ -18,8 +18,11 @@ MOUNT_DIR = "/data"
 
 # https://huggingface.co/openai/gpt-oss-20b
 MODEL_IDENTIFIER = "openai/gpt-oss-20b"
-# https://huggingface.co/openai/gpt-oss-120b
-# MODEL_IDENTIFIER = "openai/gpt-oss-120b"
+
+# MAX_MODEL_TOKENS >= Input + Output
+MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k(128*1024) tokens
+MAX_OUTPUT_TOKENS = 512
+
 
 # https://modal.com/docs/guide/gpu#specifying-gpu-type
 GPU_NAME = "L4"
@@ -39,10 +42,6 @@ GPU = f"{GPU_NAME}:{GPU_NUM}"
 # | L4 | 24 GB | $0.80 /h |
 # | T4 | 16 GB | $0.59 /h |
 
-# MAX_MODEL_TOKENS >= Input + Output
-MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k(128*1024) tokens
-MAX_OUTPUT_TOKENS = 512
-
 image = (
     # https://hub.docker.com/r/nvidia/cuda/tags?name=12.8
     # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04
@@ -104,9 +103,6 @@ def load_model():
 @app.function(
     gpu=GPU,
     volumes={MOUNT_DIR: MOUNT_VOLUME},
-    # secrets=[modal.Secret.from_name("huggingface-secret")],
-    # scaledown_window=15 * 60,
-    # timeout=30 * 60,
 )
 def generate_stream(input_tokens):
     """
src/unpredictable_lord/llm_zerogpu.py CHANGED
@@ -1,3 +1,5 @@
+import logging
+import subprocess
 from threading import Thread
 
 import openai_harmony as oh
@@ -7,27 +9,45 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from unpredictable_lord.tokenstreamer import TokenStreamer
 
+logger = logging.getLogger(__name__)
+
 # https://huggingface.co/openai/gpt-oss-20b
 MODEL_IDENTIFIER = "openai/gpt-oss-20b"
-# https://huggingface.co/openai/gpt-oss-120b
-# MODEL_IDENTIFIER = "openai/gpt-oss-120b"
 
 # MAX_MODEL_TOKENS >= Input + Output
 MAX_MODEL_TOKENS = 64 * 1024  # gpt-oss models support up to 128k(128*1024) tokens
 MAX_OUTPUT_TOKENS = 512
 
 
-# Global model and tokenizer (loaded once at module import)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_IDENTIFIER,
-    torch_dtype="auto",
-    device_map="auto",
-)
+# Global model and tokenizer (loaded once)
+model = None
+tokenizer = None
+stop_token_ids = None
+
+
+def load_model():
+    """Load model and tokenizer into global variables."""
+    global model, tokenizer, stop_token_ids
+
+    if model is not None:
+        return
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_IDENTIFIER)
+    model = AutoModelForCausalLM.from_pretrained(
+        MODEL_IDENTIFIER,
+        torch_dtype="auto",
+        device_map="auto",
+    )
+
+    # Load stop token IDs
+    _encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
+    stop_token_ids = _encoding.stop_tokens_for_assistant_actions()
+
+    # Show GPU information
+    subprocess.run(["nvidia-smi"])
+
 
-# Load stop token IDs
-encoding = oh.load_harmony_encoding(oh.HarmonyEncodingName.HARMONY_GPT_OSS)
-stop_token_ids = encoding.stop_tokens_for_assistant_actions()
+load_model()
 
 
 @spaces.GPU
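On the llm_zerogpu.py side, bare import-time loading is replaced by a guarded load_model() that is still invoked once at import, so the module keeps the same model / tokenizer / stop_token_ids globals while repeated calls stay idempotent. A minimal, self-contained sketch of that guard pattern, with a dummy object() standing in for the expensive from_pretrained() call:

    model = None

    def load_model():
        global model
        if model is not None:  # already loaded: later calls return immediately
            return
        model = object()  # stand-in for AutoModelForCausalLM.from_pretrained(...)

    load_model()  # eager call at import time, as in llm_zerogpu.py
    load_model()  # no-op; the guard makes repeat calls safe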