# Install these first if needed:
# pip install gradio bert-score transformers
import gradio as gr
from bert_score import score


def compute_bertscore(candidate, reference, model_type, lang, rescale_with_baseline):
    if not candidate.strip() or not reference.strip():
        return "–", "–", "Please enter BOTH reference and candidate text."
    # BERTScore expects lists of strings
    cands = [candidate]
    refs = [reference]
    P, R, F1 = score(
        cands,
        refs,
        lang=lang,
        model_type=model_type,
        rescale_with_baseline=rescale_with_baseline,
    )
    precision = f"{P[0].item():.4f}"
    recall = f"{R[0].item():.4f}"
    f1 = f"{F1[0].item():.4f}"
    return precision, recall, f1


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # BERTScore Demo

        **BERTScore** evaluates the quality of generated text by comparing contextualized
        embeddings from models like BERT against a reference text.

        Unlike n-gram metrics (e.g., BLEU), BERTScore focuses on **semantic similarity**
        and is often better at capturing whether **meaning is preserved**.

        1. Enter a **reference** text (ground truth).
        2. Enter a **candidate** text (model output or paraphrase).
        3. Click **Compute BERTScore**.
        """
    )

    with gr.Row():
        reference_input = gr.Textbox(
            label="Reference Text (Ground Truth)",
            lines=5,
            placeholder="e.g., The quick brown fox jumps over the lazy dog.",
        )
        candidate_input = gr.Textbox(
            label="Candidate Text (Generated/Paraphrased)",
            lines=5,
            placeholder="e.g., A fast brown fox leaps over a sleepy dog.",
        )

    with gr.Row():
        model_type = gr.Dropdown(
            label="Embedding Model",
            choices=[
                "microsoft/deberta-large-mnli",
                "bert-base-uncased",
                "roberta-large",
            ],
            value="microsoft/deberta-large-mnli",
            info="Recommended: microsoft/deberta-large-mnli for English",
        )
        lang = gr.Dropdown(
            label="Language Code",
            choices=["en", "de", "fr", "es", "zh"],
            value="en",
            info="Language of the texts (ISO code).",
        )
        rescale_with_baseline = gr.Checkbox(
            label="Rescale with Baseline (recommended for comparing scores)",
            value=True,
        )

    compute_button = gr.Button("Compute BERTScore", variant="primary")

    with gr.Row():
        precision_output = gr.Textbox(label="Precision", interactive=False, value="–")
        recall_output = gr.Textbox(label="Recall", interactive=False, value="–")
        f1_output = gr.Textbox(
            label="F1 (Main BERTScore Metric)", interactive=False, value="–"
        )

    compute_button.click(
        fn=compute_bertscore,
        inputs=[candidate_input, reference_input, model_type, lang, rescale_with_baseline],
        outputs=[precision_output, recall_output, f1_output],
    )


if __name__ == "__main__":
    demo.launch()
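For reference, the button callback above reduces to a single call to bert_score.score. The following is a minimal sketch of the same computation outside the UI, using the demo's default settings and its placeholder sentences; the first run downloads the embedding model, and rescaling assumes the library ships a baseline for the chosen model/language pair.

from bert_score import score

# Demo defaults: DeBERTa-large-MNLI embeddings, English, rescaled against the
# library's precomputed baseline so typical scores spread over a wider range.
cands = ["A fast brown fox leaps over a sleepy dog."]
refs = ["The quick brown fox jumps over the lazy dog."]

P, R, F1 = score(
    cands,
    refs,
    lang="en",
    model_type="microsoft/deberta-large-mnli",
    rescale_with_baseline=True,
)

# Each return value is a tensor with one entry per candidate/reference pair;
# F1 is the figure usually reported as "the" BERTScore.
print(f"P={P[0].item():.4f}  R={R[0].item():.4f}  F1={F1[0].item():.4f}")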