# Install these first if needed:
# pip install gradio bert-score transformers
import gradio as gr
from bert_score import score


def compute_bertscore(candidate, reference, model_type, lang, rescale_with_baseline):
    if not candidate.strip() or not reference.strip():
        return "–", "–", "Please enter BOTH reference and candidate text."
    # BERTScore expects lists of strings
    cands = [candidate]
    refs = [reference]
    P, R, F1 = score(
        cands,
        refs,
        lang=lang,
        model_type=model_type,
        rescale_with_baseline=rescale_with_baseline,
    )
    precision = f"{P[0].item():.4f}"
    recall = f"{R[0].item():.4f}"
    f1 = f"{F1[0].item():.4f}"
    return precision, recall, f1


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # BERTScore Demo

        **BERTScore** evaluates the quality of generated text by comparing contextualized
        embeddings from models like BERT against a reference text.

        Unlike n-gram metrics (e.g., BLEU), BERTScore focuses on **semantic similarity**
        and is often better at capturing whether **meaning is preserved**.

        1. Enter a **reference** text (ground truth).
        2. Enter a **candidate** text (model output or paraphrase).
        3. Click **Compute BERTScore**.
        """
    )

    with gr.Row():
        reference_input = gr.Textbox(
            label="Reference Text (Ground Truth)",
            lines=5,
            placeholder="e.g., The quick brown fox jumps over the lazy dog.",
        )
        candidate_input = gr.Textbox(
            label="Candidate Text (Generated/Paraphrased)",
            lines=5,
            placeholder="e.g., A fast brown fox leaps over a sleepy dog.",
        )

    with gr.Row():
        model_type = gr.Dropdown(
            label="Embedding Model",
            choices=[
                "microsoft/deberta-large-mnli",
                "bert-base-uncased",
                "roberta-large",
            ],
            value="microsoft/deberta-large-mnli",
            info="Recommended: microsoft/deberta-large-mnli for English",
        )
        lang = gr.Dropdown(
            label="Language Code",
            choices=["en", "de", "fr", "es", "zh"],
            value="en",
            info="Language of the texts (ISO code).",
        )
        rescale_with_baseline = gr.Checkbox(
            label="Rescale with Baseline (recommended for comparing scores)",
            value=True,
        )

    compute_button = gr.Button("Compute BERTScore", variant="primary")

    with gr.Row():
        precision_output = gr.Textbox(label="Precision", interactive=False, value="–")
        recall_output = gr.Textbox(label="Recall", interactive=False, value="–")
        f1_output = gr.Textbox(
            label="F1 (Main BERTScore Metric)", interactive=False, value="–"
        )

    compute_button.click(
        fn=compute_bertscore,
        inputs=[candidate_input, reference_input, model_type, lang, rescale_with_baseline],
        outputs=[precision_output, recall_output, f1_output],
    )


if __name__ == "__main__":
    demo.launch()
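For reference, the button callback above reduces to a single call to bert_score.score. The following is a minimal sketch of the same computation outside the UI, using the demo's default settings and its placeholder sentences; the first run downloads the embedding model, and rescaling assumes the library ships a baseline for the chosen model/language pair.

from bert_score import score

# Demo defaults: DeBERTa-large-MNLI embeddings, English, rescaled against the
# library's precomputed baseline so typical scores spread over a wider range.
cands = ["A fast brown fox leaps over a sleepy dog."]
refs = ["The quick brown fox jumps over the lazy dog."]

P, R, F1 = score(
    cands,
    refs,
    lang="en",
    model_type="microsoft/deberta-large-mnli",
    rescale_with_baseline=True,
)

# Each return value is a tensor with one entry per candidate/reference pair;
# F1 is the figure usually reported as "the" BERTScore.
print(f"P={P[0].item():.4f}  R={R[0].item():.4f}  F1={F1[0].item():.4f}")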