Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import AutoTokenizer | |
| from sarm_llama import LlamaSARM | |
# --- 1. Load Model and Tokenizer ---
# The model files are downloaded automatically from the Hugging Face Hub the
# first time this runs, so the model repository has to be public.
MODEL_ID = "schrieffer/SARM-4B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model: {MODEL_ID} on {DEVICE}...")

# Keyword arguments forwarded to LlamaSARM.from_pretrained().
# The three sae_* entries configure the sparse autoencoder part of SARM.
# trust_remote_code=True is required because SARM has a custom architecture.
_SARM_LOAD_KWARGS = {
    "sae_hidden_state_source_layer": 16,
    "sae_latent_size": 65536,
    "sae_k": 192,
    "device_map": DEVICE,
    "trust_remote_code": True,
    "torch_dtype": torch.bfloat16,
}

model = LlamaSARM.from_pretrained(MODEL_ID, **_SARM_LOAD_KWARGS)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

print("Model loaded successfully!")
# --- 2. Define the Inference Function ---
# Gradio calls this function to score a prompt/response pair.
def get_reward_score(prompt: str, response: str) -> float:
    """Score a prompt/response pair with the SARM model.

    Args:
        prompt: Text placed in the ``user`` turn of the conversation.
        response: Text placed in the ``assistant`` turn, i.e. the answer
            being evaluated.

    Returns:
        The model's scalar output rounded to four decimal places. ``0.0`` is
        returned instead when either argument is empty, or when anything in
        the scoring path raises an exception.
    """
    # Nothing to score unless both fields are filled in.
    if not (prompt and response):
        return 0.0

    try:
        # Format the pair with the same chat template used during training.
        conversation = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ]
        encoded = tokenizer.apply_chat_template(conversation, return_tensors="pt")
        token_ids = encoded.to(DEVICE)

        # Pure inference: no gradients are needed.
        with torch.no_grad():
            reward = model(token_ids).logits.item()

        return round(reward, 4)
    except Exception as err:
        # Best-effort fallback: the failure is only printed to the server log
        # and the UI receives 0 rather than an error message.
        print(f"Error: {err}")
        return 0.0
# --- 3. Create and Launch the Gradio Interface ---
# Introductory Markdown shown at the top of the page.
# This is a raw string on purpose: the author list contains ``\*`` (a literal
# backslash + asterisk so Markdown prints a plain "*"). In a normal string
# literal ``\*`` is an invalid escape sequence, which triggers a SyntaxWarning
# on Python 3.12+. The raw string keeps the exact same runtime text.
_INTRO_MARKDOWN = r"""
# SARM: Interpretable Reward Model Demo
This is an interactive demo for the **SARM-4B** model (Sparse Autoencoder-enhanced Reward Model).
SARM is a novel reward model architecture that enhances interpretability by integrating a pretrained Sparse Autoencoder (SAE). It maps the internal hidden states of a large language model into a sparse and human-understandable feature space, making the resulting reward scores transparent and conceptually meaningful.
**How to use this Demo:**
1. Enter a **Prompt** (e.g., a question) in the left textbox below.
2. Enter a corresponding **Response** in the right textbox.
3. Click the "Calculate Reward Score" button.
The model will output a scalar score that evaluates the quality of the response. **A higher score indicates that the SARM model considers the response to be of better quality.**
---
*SARM Architecture*

+ **Authors** (* indicates equal contribution)
Shuyi Zhang\*, Wei Shi\*, Sihang Li\*, Jiayi Liao, Tao Liang, Hengxing Cai, Xiang Wang
+ **Paper**: [Interpretable Reward Model via Sparse Autoencoder](https://arxiv.org/abs/2508.08746)
+ **Model**: [schrieffer/SARM-4B](https://huggingface.co/schrieffer/SARM-4B)
+ Finetuned from model: [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
+ **Code Repository:** [https://github.com/schrieffer-z/sarm](https://github.com/schrieffer-z/sarm)
"""

# Use gr.Blocks() for a more flexible layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(_INTRO_MARKDOWN)

    # Prompt and response textboxes share one row (left / right).
    with gr.Row():
        prompt_input = gr.Textbox(lines=3, label="Prompt / Question", placeholder="e.g., Can you explain the theory of relativity in simple terms?")
        response_input = gr.Textbox(lines=5, label="Response to be Evaluated", placeholder="e.g., Of course! Albert Einstein's theory of relativity...")

    calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
    score_output = gr.Number(label="Reward Score", info="A higher score is better.")

    # Define the button's click behavior.
    calculate_btn.click(
        fn=get_reward_score,
        inputs=[prompt_input, response_input],
        outputs=score_output
    )

    # Clickable example pairs: for each prompt, a good and a poor response.
    gr.Examples(
        examples=[
            ["What is the capital of France?", "The capital of France is Paris."],
            ["What is the capital of France?", "Berlin is a large city in Germany."],
            ["Write a short poem about the moon.", "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light."],
            ["Write a short poem about the moon.", "The moon is a rock."]
        ],
        inputs=[prompt_input, response_input],
        outputs=score_output,
        fn=get_reward_score,
        cache_examples=True  # Cache the results of the examples to speed up loading.
    )

# Launch the application.
demo.launch()