SARM-Demo / app.py
Schrieffer2sy's picture
init
1748050
raw
history blame
4.94 kB
import gradio as gr
import torch
from transformers import AutoTokenizer
from sarm_llama import LlamaSARM
# --- 1. Load Model and Tokenizer ---
# The model and tokenizer files are downloaded automatically from the
# Hugging Face Hub, so the model repository must be public.
MODEL_ID = "schrieffer/SARM-4B"

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Loading model: {MODEL_ID} on {DEVICE}...")

# Keyword arguments forwarded verbatim to LlamaSARM.from_pretrained.
# trust_remote_code=True is required because SARM has a custom architecture.
_sarm_load_options = {
    "sae_hidden_state_source_layer": 16,
    "sae_latent_size": 65536,
    "sae_k": 192,
    "device_map": DEVICE,
    "trust_remote_code": True,
    "torch_dtype": torch.bfloat16,
}
model = LlamaSARM.from_pretrained(MODEL_ID, **_sarm_load_options)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

print("Model loaded successfully!")
# --- 2. Define the Inference Function ---
# This function will be called by Gradio.
def get_reward_score(prompt: str, response: str) -> float:
    """Return the SARM reward score for a prompt/response pair.

    Args:
        prompt: The user-side text of the conversation.
        response: The assistant-side text to be evaluated.

    Returns:
        The scalar taken from the model's ``logits`` output, rounded to four
        decimal places. ``0.0`` is returned instead when either input is
        missing or blank, or when scoring raises an exception.
    """
    # Treat None, "", and whitespace-only text alike: there is nothing to
    # score, so skip the model call entirely.
    for text in (prompt, response):
        if not text or (isinstance(text, str) and not text.strip()):
            return 0.0

    try:
        # Use the same chat template as used during model training.
        messages = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ]
        input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            score = model(input_ids).logits.item()
        return round(score, 4)
    except Exception as e:
        # Deliberate best-effort fallback: log the failure and return 0 rather
        # than surfacing an error in the UI.
        print(f"Error: {e}")
        return 0.0
# --- 3. Create and Launch the Gradio Interface ---
# Introductory Markdown shown at the top of the page. This is a raw string so
# that the Markdown escape `\*` is kept literally instead of being parsed as an
# (invalid) Python escape sequence.
_INTRO_MARKDOWN = r"""
# SARM: Interpretable Reward Model Demo
This is an interactive demo for the **SARM-4B** model (Sparse Autoencoder-enhanced Reward Model).
SARM is a novel reward model architecture that enhances interpretability by integrating a pretrained Sparse Autoencoder (SAE). It maps the internal hidden states of a large language model into a sparse and human-understandable feature space, making the resulting reward scores transparent and conceptually meaningful.
**How to use this Demo:**
1. Enter a **Prompt** (e.g., a question) in the left textbox below.
2. Enter a corresponding **Response** in the right textbox.
3. Click the "Calculate Reward Score" button.
The model will output a scalar score that evaluates the quality of the response. **A higher score indicates that the SARM model considers the response to be of better quality.**
---
*SARM Architecture*
![](https://huggingface.co/schrieffer/SARM-4B/resolve/main/sarm-framework.png?raw=true)
+ **Authors** (* indicates equal contribution)
Shuyi Zhang\*, Wei Shi\*, Sihang Li\*, Jiayi Liao, Tao Liang, Hengxing Cai, Xiang Wang
+ **Paper**: [Interpretable Reward Model via Sparse Autoencoder](https://arxiv.org/abs/2508.08746)
+ **Model**: [schrieffer/SARM-4B](https://huggingface.co/schrieffer/SARM-4B)
+ Finetuned from model: [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
+ **Code Repository:** [https://github.com/schrieffer-z/sarm](https://github.com/schrieffer-z/sarm)
"""

# Use gr.Blocks() for a more flexible layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Prompt on the left, response on the right.
    # NOTE(review): the button and score are assumed to sit below the row
    # rather than inside it -- confirm against the intended layout.
    with gr.Row():
        prompt_input = gr.Textbox(
            lines=3,
            label="Prompt / Question",
            placeholder="e.g., Can you explain the theory of relativity in simple terms?",
        )
        response_input = gr.Textbox(
            lines=5,
            label="Response to be Evaluated",
            placeholder="e.g., Of course! Albert Einstein's theory of relativity...",
        )

    calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
    score_output = gr.Number(label="Reward Score", info="A higher score is better.")

    # Define the button's click behavior.
    calculate_btn.click(
        fn=get_reward_score,
        inputs=[prompt_input, response_input],
        outputs=score_output,
    )

    # Clickable example pairs wired to the same inputs, output and function
    # as the button.
    gr.Examples(
        examples=[
            ["What is the capital of France?", "The capital of France is Paris."],
            ["What is the capital of France?", "Berlin is a large city in Germany."],
            ["Write a short poem about the moon.", "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light."],
            ["Write a short poem about the moon.", "The moon is a rock."],
        ],
        inputs=[prompt_input, response_input],
        outputs=score_output,
        fn=get_reward_score,
        cache_examples=True,  # Cache the results of the examples to speed up loading.
    )

# Launch the application.
demo.launch()