mahesh1209 committed
Commit fc871ce · verified · 1 parent: f07d740

Update app.py

Files changed (1): app.py (+7 -6)
app.py CHANGED
@@ -1,16 +1,18 @@
+
 import gradio as gr
 from llama_cpp import Llama
 import os
 
-# 🔽 Download GGUF model if not present
-model_url = "https://huggingface.co/TheBloke/Meditron-7B-GGUF/resolve/main/meditron-7b.Q4_K_M.gguf"
-model_path = "meditron-7b.Q4_K_M.gguf"
+# 🔽 Download smaller GGUF model (Q4_0 for faster load)
+model_url = "https://huggingface.co/TheBloke/Meditron-7B-GGUF/resolve/main/meditron-7b.Q4_0.gguf"
+model_path = "meditron-7b.Q4_0.gguf"
 if not os.path.exists(model_path):
     os.system(f"wget -O {model_path} {model_url}")
 
-# ⚙️ Load model with GPU acceleration (T4 sweet spot)
+# ⚙️ Load model with GPU acceleration (T4 optimized)
 try:
-    llm = Llama(model_path=model_path, n_gpu_layers=35, n_ctx=2048, verbose=False)
+    llm = Llama(model_path=model_path, n_gpu_layers=28, n_ctx=2048, verbose=False)
+    llm("Hello", max_tokens=1)  # 🔥 Warmup to reduce latency
     backend = "GPU"
 except Exception:
     llm = Llama(model_path=model_path, n_ctx=2048)
@@ -39,4 +41,3 @@ gr.Interface(
     title="🩺 Medical Diagnosis Chatbot",
     description=f"Enter symptoms to get possible diagnoses. Powered by Meditron-7B ({backend} mode)."
 ).launch()
-
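
For reference, a minimal standalone sketch of the loading pattern this commit lands, assuming llama-cpp-python is built with CUDA support and huggingface_hub is installed. hf_hub_download is swapped in here for the wget call as a more robust download path; it is not part of the committed code, and the backend = "CPU" line in the fallback branch sits outside the diff's context and is assumed.

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the Q4_0 GGUF from the Hub (cached locally across runs).
model_path = hf_hub_download(
    repo_id="TheBloke/Meditron-7B-GGUF",
    filename="meditron-7b.Q4_0.gguf",
)

try:
    # Offload 28 transformer layers to the GPU; a 7B Q4_0 model at this
    # setting leaves some VRAM headroom on a 16 GB T4.
    llm = Llama(model_path=model_path, n_gpu_layers=28, n_ctx=2048, verbose=False)
    llm("Hello", max_tokens=1)  # warmup: pay the first-call setup cost up front
    backend = "GPU"
except Exception:
    # CPU-only fallback if CUDA offload is unavailable (assumed branch).
    llm = Llama(model_path=model_path, n_ctx=2048)
    backend = "CPU"

Dropping n_gpu_layers from 35 to 28 trades some generation speed for VRAM headroom; the right value depends on the card and context size, so treat 28 as a starting point rather than a fixed constant.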