import gradio as gr
from llama_cpp import Llama
import os

# 🔽 Download smaller GGUF model (Q4_0 quantization for a faster load)
model_url = "https://huggingface.co/TheBloke/Meditron-7B-GGUF/resolve/main/meditron-7b.Q4_0.gguf"
model_path = "meditron-7b.Q4_0.gguf"

if not os.path.exists(model_path):
    # wget assumes a Linux-like environment (e.g. Colab or a Hugging Face Space)
    exit_code = os.system(f"wget -O {model_path} {model_url}")
    if exit_code != 0:
        raise RuntimeError(f"Failed to download model from {model_url}")

# ⚙️ Load model with GPU acceleration (T4 optimized)
# Note: a CPU-only build of llama-cpp-python silently ignores n_gpu_layers
# rather than raising, so this fallback only catches hard load errors.
try:
    llm = Llama(model_path=model_path, n_gpu_layers=28, n_ctx=2048, verbose=False)
    llm("Hello", max_tokens=1)  # 🔥 Warmup call to reduce first-request latency
    backend = "GPU"
except Exception:
    llm = Llama(model_path=model_path, n_ctx=2048)
    backend = "CPU"

# 🧠 Diagnosis function
def diagnose(symptoms):
    if not symptoms.strip():
        return "⚠️ Please enter symptoms to receive a diagnosis."

    prompt = f"""You are a cautious and knowledgeable medical diagnosis assistant.
You do not provide definitive diagnoses, only possible conditions and recommended next steps.
Always advise users to consult a licensed physician.

Symptoms: {symptoms}

Diagnosis:"""

    try:
        # "\n\n" in the stop list ends generation at the first blank line,
        # keeping answers to a single paragraph
        output = llm(prompt, max_tokens=512, stop=["User:", "\n\n"])
        return output["choices"][0]["text"].strip()
    except Exception as e:
        return f"⚠️ Model error: {str(e)}"

# 🎨 Gradio UI
gr.Interface(
    fn=diagnose,
    inputs=gr.Textbox(lines=5, placeholder="e.g. fever, cough, fatigue..."),
    outputs="text",
    title="🩺 Medical Diagnosis Chatbot",
    description=f"Enter symptoms to get possible diagnoses. Powered by Meditron-7B ({backend} mode).",
).launch()
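
# Usage sketch (assumed environment: a CUDA-capable machine such as a Colab
# T4; the filename app.py is an assumption, not part of the original script):
#   pip install gradio llama-cpp-python
#   python app.py
# Gradio then serves the UI on a local URL; pass share=True to launch()
# for a temporary public link.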