Spaces:

Schrieffer
/

SARM-Demo

Sleeping

App Files Files Community

SARM-Demo / app.py

Schrieffer2sy

init

1748050 4 months ago

raw

history blame

4.94 kB

	import gradio as gr
	import torch
	from transformers import AutoTokenizer
	from sarm_llama import LlamaSARM

	# --- 1. Load Model and Tokenizer ---
	# The weights and tokenizer files are fetched from the Hugging Face Hub by
	# `from_pretrained`, so the model repository has to be publicly readable.

	MODEL_ID = "schrieffer/SARM-4B"
	if torch.cuda.is_available():
	    DEVICE = "cuda"
	else:
	    DEVICE = "cpu"

	print(f"Loading model: {MODEL_ID} on {DEVICE}...")

	# The model is loaded in bfloat16 directly onto DEVICE. The sae_* arguments
	# configure the sparse autoencoder inside SARM; trust_remote_code=True is
	# passed because, per the original author, SARM has a custom architecture.
	model = LlamaSARM.from_pretrained(
	    MODEL_ID,
	    torch_dtype=torch.bfloat16,
	    device_map=DEVICE,
	    trust_remote_code=True,
	    sae_hidden_state_source_layer=16,
	    sae_latent_size=65536,
	    sae_k=192,
	)
	tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)

	print("Model loaded successfully!")

	# --- 2. Define the Inference Function ---
	# This function will be called by Gradio.

	def get_reward_score(prompt: str, response: str) -> float:
	    """Score a prompt/response pair with the SARM reward model.

	    Args:
	        prompt: The user turn of the conversation.
	        response: The assistant turn to be evaluated.

	    Returns:
	        The model's scalar logit rounded to 4 decimal places. ``0.0`` is
	        returned when either input is empty, and also when scoring raises
	        (the error is printed rather than propagated).
	    """
	    # Nothing to score unless both sides of the exchange are present.
	    if not (prompt and response):
	        return 0.0

	    try:
	        # Render the exchange with the tokenizer's chat template, i.e. the
	        # same formatting the model saw during training.
	        conversation = [
	            {"role": "user", "content": prompt},
	            {"role": "assistant", "content": response},
	        ]
	        token_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
	        token_ids = token_ids.to(DEVICE)

	        # Inference only: no gradients are needed.
	        with torch.no_grad():
	            output = model(token_ids)
	            raw_score = output.logits.item()

	        return round(raw_score, 4)
	    except Exception as e:
	        print(f"Error: {e}")
	        # Best-effort fallback: the UI shows 0 instead of an error message.
	        return 0.0

	# --- 3. Create and Launch the Gradio Interface ---

	# gr.Blocks() is used (rather than gr.Interface) for a more flexible layout.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	    # Raw string: the Markdown contains backslash-escaped asterisks (\*) that
	    # must reach the renderer verbatim; in a normal string literal they are
	    # invalid escape sequences and trigger a SyntaxWarning on Python 3.12+.
	    gr.Markdown(
	        r"""
	# SARM: Interpretable Reward Model Demo

	This is an interactive demo for the SARM-4B model (Sparse Autoencoder-enhanced Reward Model).

	SARM is a novel reward model architecture that enhances interpretability by integrating a pretrained Sparse Autoencoder (SAE). It maps the internal hidden states of a large language model into a sparse and human-understandable feature space, making the resulting reward scores transparent and conceptually meaningful.

	How to use this Demo:
	1. Enter a Prompt (e.g., a question) in the left textbox below.
	2. Enter a corresponding Response in the right textbox.
	3. Click the "Calculate Reward Score" button.

	The model will output a scalar score that evaluates the quality of the response. A higher score indicates that the SARM model considers the response to be of better quality.

	---

	SARM Architecture
	![](https://huggingface.co/schrieffer/SARM-4B/resolve/main/sarm-framework.png?raw=true)

	+ Authors (* indicates equal contribution)

	Shuyi Zhang\*, Wei Shi\*, Sihang Li\*, Jiayi Liao, Tao Liang, Hengxing Cai, Xiang Wang
	+ Paper: [Interpretable Reward Model via Sparse Autoencoder](https://arxiv.org/abs/2508.08746)

	+ Model: [schrieffer/SARM-4B](https://huggingface.co/schrieffer/SARM-4B)

	+ Finetuned from model: [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)

	+ Code Repository: [https://github.com/schrieffer-z/sarm](https://github.com/schrieffer-z/sarm)
	"""
	    )

	    # Prompt and response sit side by side in a single row.
	    with gr.Row():
	        prompt_input = gr.Textbox(
	            lines=3,
	            label="Prompt / Question",
	            placeholder="e.g., Can you explain the theory of relativity in simple terms?",
	        )
	        response_input = gr.Textbox(
	            lines=5,
	            label="Response to be Evaluated",
	            placeholder="e.g., Of course! Albert Einstein's theory of relativity...",
	        )

	    calculate_btn = gr.Button("Calculate Reward Score", variant="primary")
	    score_output = gr.Number(label="Reward Score", info="A higher score is better.")

	    # Clicking the button scores the current prompt/response pair.
	    calculate_btn.click(
	        fn=get_reward_score,
	        inputs=[prompt_input, response_input],
	        outputs=score_output,
	    )

	    # Each prompt appears twice, paired with two different responses, so the
	    # resulting scores can be compared.
	    gr.Examples(
	        examples=[
	            ["What is the capital of France?", "The capital of France is Paris."],
	            ["What is the capital of France?", "Berlin is a large city in Germany."],
	            ["Write a short poem about the moon.", "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light."],
	            ["Write a short poem about the moon.", "The moon is a rock."],
	        ],
	        inputs=[prompt_input, response_input],
	        outputs=score_output,
	        fn=get_reward_score,
	        cache_examples=True,  # Cache the results of the examples to speed up loading.
	    )

	# Launch the application.
	demo.launch()