# BertScorer / app.py
# (Hugging Face Space page header — author: eaglelandsonce, commit c0663ad "Create app.py")
# Install these first if needed:
# pip install gradio bert-score transformers
import gradio as gr
from bert_score import score
def compute_bertscore(candidate, reference, model_type, lang, rescale_with_baseline):
    """Compute BERTScore for a single candidate/reference pair.

    Parameters
    ----------
    candidate : str | None
        Generated/paraphrased text to evaluate (model output).
    reference : str | None
        Ground-truth text to compare against.
    model_type : str
        Hugging Face model identifier used for contextual embeddings.
    lang : str
        ISO language code; used by bert-score for baseline rescaling.
    rescale_with_baseline : bool
        Whether to rescale raw scores against the per-language baseline.

    Returns
    -------
    tuple[str, str, str]
        (precision, recall, f1) formatted to 4 decimal places, or
        ("β€”", "β€”", <error message>) when either text is missing.
    """
    # Gradio can deliver None for a cleared textbox; guard before .strip()
    # so we return the friendly message instead of raising AttributeError.
    if not (candidate or "").strip() or not (reference or "").strip():
        return "β€”", "β€”", "Please enter BOTH reference and candidate text."
    # BERTScore expects lists of strings
    cands = [candidate]
    refs = [reference]
    P, R, F1 = score(
        cands,
        refs,
        lang=lang,
        model_type=model_type,
        rescale_with_baseline=rescale_with_baseline,
    )
    # Each returned tensor holds one element (single pair); format to 4 dp.
    precision = f"{P[0].item():.4f}"
    recall = f"{R[0].item():.4f}"
    f1 = f"{F1[0].item():.4f}"
    return precision, recall, f1
# Build the Gradio UI: two text inputs, scoring options, and three
# read-only output boxes wired to compute_bertscore via the button.
with gr.Blocks() as demo:
    gr.Markdown(
        """
# πŸ” BERTScore Demo
**BERTScore** evaluates the quality of generated text by comparing contextualized
embeddings from models like BERT against a reference text.
Unlike n-gram metrics (e.g., BLEU), BERTScore focuses on **semantic similarity**
and is often better at capturing whether **meaning is preserved**.
1. Enter a **reference** text (ground truth).
2. Enter a **candidate** text (model output or paraphrase).
3. Click **Compute BERTScore**.
"""
    )

    # Input texts, side by side.
    with gr.Row():
        ref_box = gr.Textbox(
            label="Reference Text (Ground Truth)",
            lines=5,
            placeholder="e.g., The quick brown fox jumps over the lazy dog.",
        )
        cand_box = gr.Textbox(
            label="Candidate Text (Generated/Paraphrased)",
            lines=5,
            placeholder="e.g., A fast brown fox leaps over a sleepy dog.",
        )

    # Scoring configuration: embedding model, language, baseline rescaling.
    with gr.Row():
        model_dd = gr.Dropdown(
            label="Embedding Model",
            choices=[
                "microsoft/deberta-large-mnli",
                "bert-base-uncased",
                "roberta-large",
            ],
            value="microsoft/deberta-large-mnli",
            info="Recommended: microsoft/deberta-large-mnli for English",
        )
        lang_dd = gr.Dropdown(
            label="Language Code",
            choices=["en", "de", "fr", "es", "zh"],
            value="en",
            info="Language of the texts (ISO code).",
        )
        baseline_cb = gr.Checkbox(
            label="Rescale with Baseline (recommended for comparing scores)",
            value=True,
        )

    run_btn = gr.Button("Compute BERTScore", variant="primary")

    # Read-only result boxes; "β€”" is the placeholder until a score is computed.
    with gr.Row():
        prec_out = gr.Textbox(
            label="Precision", interactive=False, value="β€”"
        )
        rec_out = gr.Textbox(
            label="Recall", interactive=False, value="β€”"
        )
        f1_out = gr.Textbox(
            label="F1 (Main BERTScore Metric)", interactive=False, value="β€”"
        )

    # Note the input order: candidate first, then reference, matching the
    # compute_bertscore signature.
    run_btn.click(
        fn=compute_bertscore,
        inputs=[cand_box, ref_box, model_dd, lang_dd, baseline_cb],
        outputs=[prec_out, rec_out, f1_out],
    )

if __name__ == "__main__":
    demo.launch()