iproskurina committed on
Commit
fc291fb
·
verified ·
1 Parent(s): aebe201

Upload 4 files

Files changed (4)
  1. app.py +227 -0
  2. code_eval_board.csv +13 -0
  3. eval_instruct_lms.csv +9 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,227 @@
+ # -*- coding: utf-8 -*-
+ # Works with Gradio <= 3.44.4 (supports sortable=True)
+
+ import gradio as gr
+ import pandas as pd
+
+
+ # ---------- utils ----------
+ def model_hyperlink_md(link: str, name: str) -> str:
+     """Render a model name as a markdown link."""
+     return f"[{name}]({link})"
+
+
+ def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
+     """Fold each row's URL into its Model cell and drop the raw Links column."""
+     if "Links" not in df.columns:
+         raise ValueError("CSV must include a 'Links' column.")
+     df = df.copy()
+     df["Model"] = df.apply(lambda r: model_hyperlink_md(r["Links"], r["Model"]), axis=1)
+     return df.drop(columns=["Links"])
+
+
+ def datatypes_with_markdown(df: pd.DataFrame):
+     """Mark the Model column as markdown so its links render; everything else as str."""
+     return ["markdown" if c == "Model" else "str" for c in df.columns]
+
+
+ # ---------- load data ----------
+ BASE_CSV = "code_eval_board.csv"
+ INSTRUCT_CSV = "eval_instruct_lms.csv"
+
+ base_df_raw = pd.read_csv(BASE_CSV)
+ inst_df_raw = pd.read_csv(INSTRUCT_CSV)
+
+ base_df = make_clickable_and_drop_links(base_df_raw)
+ inst_df = make_clickable_and_drop_links(inst_df_raw)
+
+ base_dtypes = datatypes_with_markdown(base_df)
+ inst_dtypes = datatypes_with_markdown(inst_df)
+
+ # ---------- css ----------
+ custom_css = """
+ .gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
+ #base-table a, #inst-table a {
+     color: #2a7ae2 !important;
+     text-decoration: underline dotted !important;
+     text-underline-offset: 3px;
+ }
+ #base-table a:hover, #inst-table a:hover {
+     color: #1e5bbf !important;
+     text-decoration: underline solid !important;
+ }
+ """
+
+ # ---------- app ----------
+ demo = gr.Blocks(css=custom_css)
+
+ with demo:
+     # ---------- HEADER ----------
+     gr.HTML(
+         """<div id='header' style='text-align:center; margin-top:16px;'>
+         <div id='title-row'
+              style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
+             <img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
+                  alt='Diké' width='80'
+                  style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
+             <div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
+                 <h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
+                 <p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
+                     Bias and Fairness in Compressed LLMs
+                 </p>
+             </div>
+         </div>
+
+         <p id='subtitle'
+            style='font-size:14px; color:#8a9aad; margin-top:12px;
+                   max-width:1000px; margin-left:auto; margin-right:auto;
+                   line-height:1.6; text-align:justify;'>
+             Inspired by the
+             <a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/'
+                target='_blank'
+                style='color:#5a8dee; text-decoration:none; font-weight:500;'>
+                 🤗 Open LLM Leaderboard
+             </a> and the
+             <a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard'
+                target='_blank'
+                style='color:#5a8dee; text-decoration:none; font-weight:500;'>
+                 Optimum Leaderboard 🏋️
+             </a>, we compare the performance of compressed LLMs across
+             <b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks. The leaderboard is released as part of the
+             <a href='https://www.anr-dike.fr/' target='_blank'
+                style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
+         </p>
+         </div>"""
+     )
+
+     # ---------- TABS ----------
+     with gr.Tabs():
+         # TAB 1: Base LLMs
+         with gr.TabItem("🟢 Base LLMs Evaluation"):
+             with gr.Row():
+                 base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)
+
+             def base_search_fn(q):
+                 if not q or not q.strip():
+                     return base_df
+                 # regex=False: treat the query as a literal string, so characters
+                 # like '(' or '+' in a query don't raise a regex error.
+                 mask = base_df["Model"].str.contains(q, case=False, regex=False)
+                 return base_df[mask]
+
+             base_table = gr.Dataframe(
+                 value=base_df,
+                 datatype=base_dtypes,
+                 interactive=False,
+                 sortable=True,
+                 elem_id="base-table",
+             )
+             base_search.submit(base_search_fn, base_search, base_table)
+
+         # TAB 2: Instruction-tuned LLMs
+         with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
+             with gr.Row():
+                 inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)
+
+             def inst_search_fn(q):
+                 if not q or not q.strip():
+                     return inst_df
+                 # Literal match, as above.
+                 mask = inst_df["Model"].str.contains(q, case=False, regex=False)
+                 return inst_df[mask]
+
+             inst_table = gr.Dataframe(
+                 value=inst_df,
+                 datatype=inst_dtypes,
+                 interactive=False,
+                 sortable=True,
+                 elem_id="inst-table",
+             )
+             inst_search.submit(inst_search_fn, inst_search, inst_table)
+
+         # TAB 3: About
+         with gr.TabItem("📘 About"):
+             gr.HTML("""
+             <div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
+                 <h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
+                 <p>
+                     The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b>
+                     on the <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
+                     Each benchmark measures a specific social or ethical aspect of model behavior.
+                 </p>
+
+                 <ul style='list-style-type: " "; padding-left: 1em;'>
+
+                     <li><b>Perplexity (PPL)</b> - General measure of model fluency and language modeling quality,
+                     evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>.
+                     Lower values indicate better language modeling performance.</li>
+
+                     <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
+                     Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>.
+                     Metric: Accuracy.</li>
+
+                     <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
+                     question-answering contexts across 11 protected categories
+                     (<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).
+                     Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
+
+                     <li><b>CrowS-Pairs</b> - Minimal pairs of more- and less-stereotyped sentences
+                     (<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
+                     Metric: % of stereotyped continuations.</li>
+
+                     <li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
+                     (<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).
+                     Metric: Sentiment skew across identity descriptors.</li>
+
+                     <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability
+                     (<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).
+                     Metric: Variance of log-perplexity across identity groups.</li>
+
+                     <li><b>StereoSet</b> - Triplets of stereotype, anti-stereotype, and unrelated continuations
+                     across gender, race, religion, and profession
+                     (<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).
+                     Metrics: Stereotype Score, Language Modeling Score.</li>
+
+                     <li><b>ETHICS</b> - Morality judgments across five ethical principles;
+                     we use the <i>Commonsense Morality</i> subset
+                     (<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).
+                     Metric: Accuracy.</li>
+
+                     <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
+                     (<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
+                     Metrics: Moral preference accuracy, Refusal rate.</li>
+
+                     <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation
+                     (<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Histoires Morales dataset</a>).
+                     Metrics: Moral preference accuracy, Refusal rate.</li>
+
+                     <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
+                     (<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).
+                     Metric: Average toxicity probability.</li>
+
+                     <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts
+                     (<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).
+                     Metric: Unsafe response rate.</li>
+
+                 </ul>
+
+                 <p style='margin-top:1.5em;'>
+                     All evaluations are implemented via the
+                     <a href='https://github.com/EleutherAI/lm-evaluation-harness'
+                        target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a>
+                     and follow consistent zero-shot protocols.
+                 </p>
+             </div>
+             """)
+
+     # ---------- FOOTER ----------
+     gr.HTML(
+         """
+         <div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
+             <b>Notes</b><br>
+             • Click column headers to sort ascending/descending<br>
+             • Model names are clickable links to Hugging Face pages<br><br>
+             Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
+         </div>
+         """
+     )
+
+ demo.launch(server_name="0.0.0.0", server_port=7860)
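
A minimal sketch of the clickable-table pattern app.py relies on, for readers adapting it elsewhere: fold each URL into a markdown link, drop the raw Links column, and declare the Model column's datatype as "markdown" so gr.Dataframe renders the link. The toy one-row table below is illustrative only, and the sketch assumes the same gradio==3.44.4 pin from requirements.txt.

    import gradio as gr
    import pandas as pd

    # Toy one-row table standing in for code_eval_board.csv.
    toy = pd.DataFrame({
        "Model": ["Llama-3-8B"],
        "PPL": [6.11],
        "Links": ["https://huggingface.co/meta-llama/Llama-3-8B"],
    })

    # Same transform as make_clickable_and_drop_links above.
    toy["Model"] = toy.apply(lambda r: f"[{r['Model']}]({r['Links']})", axis=1)
    toy = toy.drop(columns=["Links"])

    with gr.Blocks() as mini:
        # "markdown" on the Model column is what turns the cell into a link.
        gr.Dataframe(value=toy, datatype=["markdown", "number"], interactive=False)

    mini.launch()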
code_eval_board.csv ADDED
@@ -0,0 +1,13 @@
+ T,Model,Compression Recipe,PPL,HellaSwag,BBQ (Acc),BBQ (Bias Ambig.),BBQ (Bias Disambig.),CrowS-Pairs,HolisticBias Sentiment,SoFA,StereoSet,Links
+ 🟢,Llama-3-3B,base,7.55,73.67,41.02,4.91,4.47,64.54,31.26,0.198,65.19,https://huggingface.co/meta-llama/Llama-3-3B
+ 🔶,Llama-3-3B-Q,GPTQ 4-bit,7.99,71.23,40.42,5.20,3.97,64.24,22.31,0.200,65.31,https://huggingface.co/iproskurina/llama-3-3b-gptqmodel-4bit
+ 🟢,Llama-3-8B,base,6.11,78.88,43.86,6.27,3.10,66.29,18.30,0.205,66.42,https://huggingface.co/meta-llama/Llama-3-8B
+ 🔶,Llama-3-8B-Q,GPTQ 4-bit,6.49,77.93,42.45,6.14,3.15,65.92,13.05,0.203,65.89,https://huggingface.co/iproskurina/llama-3-8b-gptqmodel-4bit
+ 🟢,Qwen2.5-7B,base,6.63,78.88,49.32,15.85,3.23,64.24,16.87,0.672,64.96,https://huggingface.co/Qwen/Qwen2.5-7B
+ 🔶,Qwen2.5-7B-Q,GPTQ 4-bit,6.90,78.01,48.74,14.21,3.46,64.66,18.94,0.623,64.44,https://huggingface.co/iproskurina/qwen2.5-7b-gptqmodel-4bit
+ 🟢,Opt-6.7B,base,10.24,67.18,32.08,2.34,3.43,69.05,20.11,0.270,67.08,https://huggingface.co/facebook/opt-6.7b
+ 🔶,Opt-6.7B-Q,GPTQ 4-bit,10.39,-,-,-,-,68.39,20.99,0.271,-,https://huggingface.co/iproskurina/opt-6.7b-int4-c4
+ 🟢,Mistral-7B,base,5.50,80.31,43.81,7.27,3.14,66.29,17.90,0.524,64.00,https://huggingface.co/mistralai/Mistral-7B-v0.3
+ 🔶,Mistral-7B-Q,GPTQ 4-bit,5.64,80.08,43.19,6.06,3.48,66.89,23.70,0.768,63.75,https://huggingface.co/iproskurina/mistral-7b-gptqmodel-4bit
+ 🟢,Gemma-3-4B,base,7.12,75.77,38.89,5.47,4.82,63.76,8.08,1.558,65.41,https://huggingface.co/google/gemma-3-4b
+ 🔶,Gemma-3-4B-Q,GPTQ 4-bit,7.53,74.45,37.88,5.82,4.39,64.60,7.16,1.908,65.09,https://huggingface.co/iproskurina/gemma-3-4b-gptqmodel-4bit
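
A hedged sketch of reading this table programmatically, e.g. to see how much perplexity each model loses to quantization. Column names come from the header above; pairing each quantized row with its base model by stripping the "-Q" suffix is an assumption for illustration, though it holds for every row here.

    import pandas as pd

    df = pd.read_csv("code_eval_board.csv")
    base = df[df["T"] == "🟢"].copy()   # base models
    quant = df[df["T"] == "🔶"].copy()  # GPTQ 4-bit models
    # Align quantized rows with their base model (removesuffix needs pandas >= 1.4).
    quant["Model"] = quant["Model"].str.removesuffix("-Q")
    # Positive delta = perplexity got worse after quantization.
    delta = quant.set_index("Model")["PPL"] - base.set_index("Model")["PPL"]
    print(delta.sort_values())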
eval_instruct_lms.csv ADDED
@@ -0,0 +1,9 @@
+ T,Model,Compression Recipe,PPL,ETHICS-Commonsense,Moral Stories (Moral Preference),Moral Stories (Refusal rate),Histoires Morales (Moral Preference),Histoires Morales (Refusal rate),RealToxicityPrompts,HarmBench,Links
+ 🟢,Aya-expanse-8B,base,7.82,65.41,71.24,3.1,94.42,0.9,10.1,8.5,https://huggingface.co/CohereLabs/aya-expanse-8b
+ 🔶,Aya-expanse-8B-Q,GPTQ 4-bit,8.03,58.04,68.35,3.7,42.28,6.7,11.3,9.5,https://huggingface.co/iproskurina/aya-expanse-8b-gptqmodel-4bit
+ 🟢,Llama-3.1-8B-Instruct,base,6.99,60.21,95.44,0.1,94.17,0.2,3.0,12.5,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+ 🔶,Llama-3.1-8B-Instruct-Q,GPTQ 4-bit,7.22,58.64,99.97,0.0,93.63,0.3,3.4,12.5,https://huggingface.co/iproskurina/llama-3.1-8b-instruct-gptqmodel-4bit
+ 🟢,Mistral-7B-Instruct-v0.3,base,5.75,68.70,95.27,0.0,93.33,0.0,6.3,29.5,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+ 🔶,Mistral-7B-Instruct-v0.3-Q,GPTQ 4-bit,5.80,70.27,95.79,0.0,93.83,0.0,7.5,41.5,https://huggingface.co/iproskurina/mistral-7b-instruct-v0.3-gptqmodel-4bit
+ 🟢,Qwen2.5-7B-Instruct,base,7.14,73.41,91.94,0.3,88.56,0.6,4.0,3.0,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
+ 🔶,Qwen2.5-7B-Instruct-Q,GPTQ 4-bit,7.31,72.64,94.32,0.1,88.42,0.7,4.1,2.5,https://huggingface.co/iproskurina/qwen2.5-7b-instruct-gptqmodel-4bit
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ plotly
+ gradio==3.44.4
+ huggingface_hub
+ pandas
+ transformers
+ matplotlib
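
Assuming app.py is the Space entrypoint, running "pip install -r requirements.txt" and then "python app.py" should serve the leaderboard locally on port 7860, matching the launch call above, which binds to 0.0.0.0. Note that plotly, huggingface_hub, transformers, and matplotlib are pinned here but not imported by app.py.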