# Install these first if needed:
#   pip install gradio bert-score transformers

import gradio as gr
from bert_score import score


def compute_bertscore(candidate, reference, model_type, lang, rescale_with_baseline):
    """Compute BERTScore precision/recall/F1 for one candidate/reference pair.

    Args:
        candidate: Generated (or paraphrased) text to evaluate.
        reference: Ground-truth text to compare against.
        model_type: Hugging Face model id used for contextual embeddings.
        lang: ISO language code (used to look up the rescaling baseline).
        rescale_with_baseline: If True, rescale raw scores with the
            precomputed baseline for this (model_type, lang) pair.

    Returns:
        A (precision, recall, f1) tuple of strings formatted to 4 decimal
        places, or dash placeholders plus a human-readable message when
        input is missing or scoring fails.
    """
    if not candidate.strip() or not reference.strip():
        return "—", "—", "Please enter BOTH reference and candidate text."

    # BERTScore expects lists of strings (one entry per segment).
    cands = [candidate]
    refs = [reference]

    try:
        P, R, F1 = score(
            cands,
            refs,
            lang=lang,
            model_type=model_type,
            rescale_with_baseline=rescale_with_baseline,
        )
    except Exception as exc:
        # Model download failures or a missing rescale baseline for this
        # (model, lang) combination would otherwise crash the callback with
        # an opaque Gradio error; surface the reason in the UI instead,
        # reusing the same message-in-the-F1-box convention as the
        # empty-input path above.
        return "—", "—", f"Scoring failed: {exc}"

    # Each returned tensor holds one score per input pair; take the first.
    precision = f"{P[0].item():.4f}"
    recall = f"{R[0].item():.4f}"
    f1 = f"{F1[0].item():.4f}"
    return precision, recall, f1


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 🔍 BERTScore Demo

        **BERTScore** evaluates the quality of generated text by comparing
        contextualized embeddings from models like BERT against a reference
        text. Unlike n-gram metrics (e.g., BLEU), BERTScore focuses on
        **semantic similarity** and is often better at capturing whether
        **meaning is preserved**.

        1. Enter a **reference** text (ground truth).
        2. Enter a **candidate** text (model output or paraphrase).
        3. Click **Compute BERTScore**.
        """
    )

    with gr.Row():
        reference_input = gr.Textbox(
            label="Reference Text (Ground Truth)",
            lines=5,
            placeholder="e.g., The quick brown fox jumps over the lazy dog.",
        )
        candidate_input = gr.Textbox(
            label="Candidate Text (Generated/Paraphrased)",
            lines=5,
            placeholder="e.g., A fast brown fox leaps over a sleepy dog.",
        )

    with gr.Row():
        model_type = gr.Dropdown(
            label="Embedding Model",
            choices=[
                "microsoft/deberta-large-mnli",
                "bert-base-uncased",
                "roberta-large",
            ],
            value="microsoft/deberta-large-mnli",
            info="Recommended: microsoft/deberta-large-mnli for English",
        )
        lang = gr.Dropdown(
            label="Language Code",
            choices=["en", "de", "fr", "es", "zh"],
            value="en",
            info="Language of the texts (ISO code).",
        )
        rescale_with_baseline = gr.Checkbox(
            label="Rescale with Baseline (recommended for comparing scores)",
            value=True,
        )

    compute_button = gr.Button("Compute BERTScore", variant="primary")

    with gr.Row():
        precision_output = gr.Textbox(
            label="Precision", interactive=False, value="—"
        )
        recall_output = gr.Textbox(
            label="Recall", interactive=False, value="—"
        )
        f1_output = gr.Textbox(
            label="F1 (Main BERTScore Metric)", interactive=False, value="—"
        )

    # NOTE: candidate comes first in the callback signature, so the input
    # order here deliberately differs from the on-screen (reference-first)
    # layout.
    compute_button.click(
        fn=compute_bertscore,
        inputs=[candidate_input, reference_input, model_type, lang, rescale_with_baseline],
        outputs=[precision_output, recall_output, f1_output],
    )


if __name__ == "__main__":
    demo.launch()