iproskurina committed on
Commit
fc291fb
·
verified ·
1 Parent(s): aebe201

Upload 4 files

Files changed (4)
  1. app.py +227 -0
  2. code_eval_board.csv +13 -0
  3. eval_instruct_lms.csv +9 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,227 @@
+ # -*- coding: utf-8 -*-
+ # Works with Gradio <= 3.44.4 (supports sortable=True)
+
+ import gradio as gr
+ import pandas as pd
+
+
+ # ---------- utils ----------
+ def model_hyperlink_md(link: str, name: str) -> str:
+     """Render a model name as a markdown link."""
+     return f"[{name}]({link})"
+
+
+ def make_clickable_and_drop_links(df: pd.DataFrame) -> pd.DataFrame:
+     """Fold each row's URL into its Model cell and drop the raw Links column."""
+     if "Links" not in df.columns:
+         raise ValueError("CSV must include a 'Links' column.")
+     df = df.copy()
+     df["Model"] = df.apply(lambda r: model_hyperlink_md(r["Links"], r["Model"]), axis=1)
+     return df.drop(columns=["Links"])
+
+
+ def datatypes_with_markdown(df: pd.DataFrame):
+     """Mark the Model column as markdown so its links render; everything else as str."""
+     return ["markdown" if c == "Model" else "str" for c in df.columns]
+
+
+ # ---------- load data ----------
+ BASE_CSV = "code_eval_board.csv"
+ INSTRUCT_CSV = "eval_instruct_lms.csv"
+
+ base_df_raw = pd.read_csv(BASE_CSV)
+ inst_df_raw = pd.read_csv(INSTRUCT_CSV)
+
+ base_df = make_clickable_and_drop_links(base_df_raw)
+ inst_df = make_clickable_and_drop_links(inst_df_raw)
+
+ base_dtypes = datatypes_with_markdown(base_df)
+ inst_dtypes = datatypes_with_markdown(inst_df)
+
+ # ---------- css ----------
+ custom_css = """
+ .gradio-container {font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;}
+ #base-table a, #inst-table a {
+     color: #2a7ae2 !important;
+     text-decoration: underline dotted !important;
+     text-underline-offset: 3px;
+ }
+ #base-table a:hover, #inst-table a:hover {
+     color: #1e5bbf !important;
+     text-decoration: underline solid !important;
+ }
+ """
+
+ # ---------- app ----------
+ demo = gr.Blocks(css=custom_css)
+
+ with demo:
+     # ---------- HEADER ----------
+     gr.HTML(
+         """<div id='header' style='text-align:center; margin-top:16px;'>
+         <div id='title-row'
+              style='display:flex; align-items:center; justify-content:center; gap:16px; flex-wrap:wrap;'>
+             <img src='https://legendaryladieshub.com/wp-content/uploads/2023/12/Dike_Greek-goddess-of-justice-and-moral-order_by-LLH-300x300.jpeg'
+                  alt='Diké' width='80'
+                  style='border-radius:50%; object-fit:cover; box-shadow:0 0 8px rgba(0,0,0,0.4); background:transparent;'>
+             <div style='display:flex; flex-direction:column; align-items:center; text-align:center;'>
+                 <h1 style='font-size:30px; margin:0; font-weight:650;'>Open Diké Leaderboard</h1>
+                 <p style='font-size:18px; margin:4px 0; color:#6c7a89;'>
+                     Bias and Fairness in Compressed LLMs
+                 </p>
+             </div>
+         </div>
+
+         <p id='subtitle'
+            style='font-size:14px; color:#8a9aad; margin-top:12px;
+                   max-width:1000px; margin-left:auto; margin-right:auto;
+                   line-height:1.6; text-align:justify;'>
+             Inspired by the
+             <a href='https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/'
+                target='_blank'
+                style='color:#5a8dee; text-decoration:none; font-weight:500;'>
+                 🤗 Open LLM Leaderboard
+             </a> and the
+             <a href='https://huggingface.co/spaces/optimum/llm-perf-leaderboard'
+                target='_blank'
+                style='color:#5a8dee; text-decoration:none; font-weight:500;'>
+                 Optimum Leaderboard 🏋️
+             </a>, we compare the performance of compressed LLMs across
+             <b>fairness</b>, <b>toxicity</b>, <b>ethics</b>, and <b>safety</b> benchmarks. The leaderboard is released as part of the
+             <a href='https://www.anr-dike.fr/' target='_blank'
+                style='color:#5a8dee; text-decoration:none; font-weight:500;'>⚖️ Diké Project</a>.
+         </p>
+         </div>"""
+     )
+
+     # ---------- TABS ----------
+     with gr.Tabs():
+         # TAB 1: Base LLMs
+         with gr.TabItem("🟢 Base LLMs Evaluation"):
+             with gr.Row():
+                 base_search = gr.Textbox(placeholder="🔍 Search base models...", show_label=False)
+
+             def base_search_fn(q):
+                 if not q or not q.strip():
+                     return base_df
+                 # regex=False: treat the query as a literal string, so characters
+                 # like '(' or '+' in a query don't raise a regex error.
+                 mask = base_df["Model"].str.contains(q, case=False, regex=False)
+                 return base_df[mask]
+
+             base_table = gr.Dataframe(
+                 value=base_df,
+                 datatype=base_dtypes,
+                 interactive=False,
+                 sortable=True,
+                 elem_id="base-table",
+             )
+             base_search.submit(base_search_fn, base_search, base_table)
+
+         # TAB 2: Instruction-tuned LLMs
+         with gr.TabItem("🔶 Instruction-tuned LLMs Evaluation"):
+             with gr.Row():
+                 inst_search = gr.Textbox(placeholder="🔍 Search instruction-tuned models...", show_label=False)
+
+             def inst_search_fn(q):
+                 if not q or not q.strip():
+                     return inst_df
+                 # Literal match, as above.
+                 mask = inst_df["Model"].str.contains(q, case=False, regex=False)
+                 return inst_df[mask]
+
+             inst_table = gr.Dataframe(
+                 value=inst_df,
+                 datatype=inst_dtypes,
+                 interactive=False,
+                 sortable=True,
+                 elem_id="inst-table",
+             )
+             inst_search.submit(inst_search_fn, inst_search, inst_table)
+
+         # TAB 3: About
+         with gr.TabItem("📘 About"):
+             gr.HTML("""
+             <div style='max-width:900px; margin:0 auto; text-align:justify; color:#4b5563; line-height:1.6;'>
+                 <h3 style='text-align:center;'>📊 Benchmarks and Metrics Overview</h3>
+                 <p>
+                     The Diké Leaderboard evaluates the impact of <b>quantization</b> and <b>compression</b>
+                     on the <b>bias, fairness, ethics, and safety</b> of large language models (LLMs).
+                     Each benchmark measures a specific social or ethical aspect of model behavior.
+                 </p>
+
+                 <ul style='list-style-type: " "; padding-left: 1em;'>
+
+                     <li><b>Perplexity (PPL)</b> - General measure of model fluency and language modeling quality,
+                     evaluated on <a href='https://huggingface.co/datasets/Salesforce/wikitext' target='_blank' style='color:#2563eb;'>WikiText-2</a>.
+                     Lower values indicate better language modeling performance.</li>
+
+                     <li><b>HellaSwag</b> - Measures general reasoning and commonsense knowledge via multiple-choice story completion.
+                     Dataset: <a href='https://huggingface.co/datasets/Rowan/hellaswag' target='_blank' style='color:#2563eb;'>HellaSwag</a>.
+                     Metric: Accuracy.</li>
+
+                     <li><b>BBQ (Bias Benchmark for QA)</b> - Evaluates bias in ambiguous and disambiguated
+                     question-answering contexts across 11 protected categories
+                     (<a href='https://github.com/nyu-mll/BBQ' target='_blank' style='color:#2563eb;'>BBQ dataset</a>).
+                     Metrics: Accuracy, Bias (ambiguous), Bias (disambiguated).</li>
+
+                     <li><b>CrowS-Pairs</b> - Minimal pairs of more- and less-stereotyped sentences
+                     (<a href='https://huggingface.co/datasets/nyu-mll/crows_pairs' target='_blank' style='color:#2563eb;'>CrowS-Pairs dataset</a>).
+                     Metric: % of stereotyped continuations.</li>
+
+                     <li><b>HolisticBias</b> - 13 demographic axes with sentiment prompts
+                     (<a href='https://huggingface.co/datasets/fairnlp/holistic-bias' target='_blank' style='color:#2563eb;'>HolisticBias dataset</a>).
+                     Metric: Sentiment skew across identity descriptors.</li>
+
+                     <li><b>SoFA (Social Fairness Dataset)</b> - 1.49M bias probes covering religion, gender, race, and disability
+                     (<a href='https://huggingface.co/datasets/copenlu/sofa' target='_blank' style='color:#2563eb;'>SoFA dataset</a>).
+                     Metric: Variance of log-perplexity across identity groups.</li>
+
+                     <li><b>StereoSet</b> - Triplets of stereotype, anti-stereotype, and unrelated continuations
+                     across gender, race, religion, and profession
+                     (<a href='https://github.com/moinnadeem/StereoSet' target='_blank' style='color:#2563eb;'>StereoSet dataset</a>).
+                     Metrics: Stereotype Score, Language Modeling Score.</li>
+
+                     <li><b>ETHICS</b> - Morality judgments across five ethical principles;
+                     we use the <i>Commonsense Morality</i> subset
+                     (<a href='https://huggingface.co/datasets/hendrycks/ethics' target='_blank' style='color:#2563eb;'>ETHICS dataset</a>).
+                     Metric: Accuracy.</li>
+
+                     <li><b>Moral Stories</b> - First-person scenarios for moral vs. immoral action selection
+                     (<a href='https://huggingface.co/datasets/demelin/moral_stories' target='_blank' style='color:#2563eb;'>Moral Stories dataset</a>).
+                     Metrics: Moral preference accuracy, Refusal rate.</li>
+
+                     <li><b>Histoires Morales</b> - French extension of Moral Stories for cross-lingual ethics evaluation
+                     (<a href='https://huggingface.co/datasets/LabHC/histoires_morales' target='_blank' style='color:#2563eb;'>Histoires Morales dataset</a>).
+                     Metrics: Moral preference accuracy, Refusal rate.</li>
+
+                     <li><b>RealToxicityPrompts</b> - Measures generation toxicity given neutral prompts
+                     (<a href='https://huggingface.co/datasets/allenai/real-toxicity-prompts' target='_blank' style='color:#2563eb;'>RealToxicityPrompts</a>).
+                     Metric: Average toxicity probability.</li>
+
+                     <li><b>HarmBench</b> - Evaluates safety by measuring model responses to harmful or unethical prompts
+                     (<a href='https://huggingface.co/datasets/walledai/HarmBench' target='_blank' style='color:#2563eb;'>HarmBench</a>).
+                     Metric: Unsafe response rate.</li>
+
+                 </ul>
+
+                 <p style='margin-top:1.5em;'>
+                     All evaluations are implemented via the
+                     <a href='https://github.com/EleutherAI/lm-evaluation-harness'
+                        target='_blank' style='color:#5a8dee;'>LM Evaluation Harness</a>
+                     and follow consistent zero-shot protocols.
+                 </p>
+             </div>
+             """)
+
+     # ---------- FOOTER ----------
+     gr.HTML(
+         """
+         <div style='text-align:center; margin-top:30px; font-size:14px; color:#777;'>
+             <b>Notes</b><br>
+             • Click column headers to sort ascending/descending<br>
+             • Model names are clickable links to Hugging Face pages<br><br>
+             Part of the <a href='https://www.anr-dike.fr/' target='_blank' style='color:#5a8dee;'>⚖️ Diké Project</a>.
+         </div>
+         """
+     )
+
+ demo.launch(server_name="0.0.0.0", server_port=7860)
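
A minimal sketch of the clickable-table pattern app.py relies on, for readers adapting it elsewhere: fold each URL into a markdown link, drop the raw Links column, and declare the Model column's datatype as "markdown" so gr.Dataframe renders the link. The toy one-row table below is illustrative only, and the sketch assumes the same gradio==3.44.4 pin from requirements.txt.

    import gradio as gr
    import pandas as pd

    # Toy one-row table standing in for code_eval_board.csv.
    toy = pd.DataFrame({
        "Model": ["Llama-3-8B"],
        "PPL": [6.11],
        "Links": ["https://huggingface.co/meta-llama/Llama-3-8B"],
    })

    # Same transform as make_clickable_and_drop_links above.
    toy["Model"] = toy.apply(lambda r: f"[{r['Model']}]({r['Links']})", axis=1)
    toy = toy.drop(columns=["Links"])

    with gr.Blocks() as mini:
        # "markdown" on the Model column is what turns the cell into a link.
        gr.Dataframe(value=toy, datatype=["markdown", "number"], interactive=False)

    mini.launch()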
code_eval_board.csv ADDED
@@ -0,0 +1,13 @@
+ T,Model,Compression Recipe,PPL,HellaSwag,BBQ (Acc),BBQ (Bias Ambig.),BBQ (Bias Disambig.),CrowS-Pairs,HolisticBias Sentiment,SoFA,StereoSet,Links
+ 🟢,Llama-3-3B,base,7.55,73.67,41.02,4.91,4.47,64.54,31.26,0.198,65.19,https://huggingface.co/meta-llama/Llama-3-3B
+ 🔶,Llama-3-3B-Q,GPTQ 4-bit,7.99,71.23,40.42,5.20,3.97,64.24,22.31,0.200,65.31,https://huggingface.co/iproskurina/llama-3-3b-gptqmodel-4bit
+ 🟢,Llama-3-8B,base,6.11,78.88,43.86,6.27,3.10,66.29,18.30,0.205,66.42,https://huggingface.co/meta-llama/Llama-3-8B
+ 🔶,Llama-3-8B-Q,GPTQ 4-bit,6.49,77.93,42.45,6.14,3.15,65.92,13.05,0.203,65.89,https://huggingface.co/iproskurina/llama-3-8b-gptqmodel-4bit
+ 🟢,Qwen2.5-7B,base,6.63,78.88,49.32,15.85,3.23,64.24,16.87,0.672,64.96,https://huggingface.co/Qwen/Qwen2.5-7B
+ 🔶,Qwen2.5-7B-Q,GPTQ 4-bit,6.90,78.01,48.74,14.21,3.46,64.66,18.94,0.623,64.44,https://huggingface.co/iproskurina/qwen2.5-7b-gptqmodel-4bit
+ 🟢,Opt-6.7B,base,10.24,67.18,32.08,2.34,3.43,69.05,20.11,0.270,67.08,https://huggingface.co/facebook/opt-6.7b
+ 🔶,Opt-6.7B-Q,GPTQ 4-bit,10.39,-,-,-,-,68.39,20.99,0.271,-,https://huggingface.co/iproskurina/opt-6.7b-int4-c4
+ 🟢,Mistral-7B,base,5.50,80.31,43.81,7.27,3.14,66.29,17.90,0.524,64.00,https://huggingface.co/mistralai/Mistral-7B-v0.3
+ 🔶,Mistral-7B-Q,GPTQ 4-bit,5.64,80.08,43.19,6.06,3.48,66.89,23.70,0.768,63.75,https://huggingface.co/iproskurina/mistral-7b-gptqmodel-4bit
+ 🟢,Gemma-3-4B,base,7.12,75.77,38.89,5.47,4.82,63.76,8.08,1.558,65.41,https://huggingface.co/google/gemma-3-4b
+ 🔶,Gemma-3-4B-Q,GPTQ 4-bit,7.53,74.45,37.88,5.82,4.39,64.60,7.16,1.908,65.09,https://huggingface.co/iproskurina/gemma-3-4b-gptqmodel-4bit
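
A hedged sketch of reading this table programmatically, e.g. to see how much perplexity each model loses to quantization. Column names come from the header above; pairing each quantized row with its base model by stripping the "-Q" suffix is an assumption for illustration, though it holds for every row here.

    import pandas as pd

    df = pd.read_csv("code_eval_board.csv")
    base = df[df["T"] == "🟢"].copy()   # base models
    quant = df[df["T"] == "🔶"].copy()  # GPTQ 4-bit models
    # Align quantized rows with their base model (removesuffix needs pandas >= 1.4).
    quant["Model"] = quant["Model"].str.removesuffix("-Q")
    # Positive delta = perplexity got worse after quantization.
    delta = quant.set_index("Model")["PPL"] - base.set_index("Model")["PPL"]
    print(delta.sort_values())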
eval_instruct_lms.csv ADDED
@@ -0,0 +1,9 @@
+ T,Model,Compression Recipe,PPL,ETHICS-Commonsense,Moral Stories (Moral Preference),Moral Stories (Refusal rate),Histoires Morales (Moral Preference),Histoires Morales (Refusal rate),RealToxicityPrompts,HarmBench,Links
+ 🟢,Aya-expanse-8B,base,7.82,65.41,71.24,3.1,94.42,0.9,10.1,8.5,https://huggingface.co/CohereLabs/aya-expanse-8b
+ 🔶,Aya-expanse-8B-Q,GPTQ 4-bit,8.03,58.04,68.35,3.7,42.28,6.7,11.3,9.5,https://huggingface.co/iproskurina/aya-expanse-8b-gptqmodel-4bit
+ 🟢,Llama-3.1-8B-Instruct,base,6.99,60.21,95.44,0.1,94.17,0.2,3.0,12.5,https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+ 🔶,Llama-3.1-8B-Instruct-Q,GPTQ 4-bit,7.22,58.64,99.97,0.0,93.63,0.3,3.4,12.5,https://huggingface.co/iproskurina/llama-3.1-8b-instruct-gptqmodel-4bit
+ 🟢,Mistral-7B-Instruct-v0.3,base,5.75,68.70,95.27,0.0,93.33,0.0,6.3,29.5,https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3
+ 🔶,Mistral-7B-Instruct-v0.3-Q,GPTQ 4-bit,5.80,70.27,95.79,0.0,93.83,0.0,7.5,41.5,https://huggingface.co/iproskurina/mistral-7b-instruct-v0.3-gptqmodel-4bit
+ 🟢,Qwen2.5-7B-Instruct,base,7.14,73.41,91.94,0.3,88.56,0.6,4.0,3.0,https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
+ 🔶,Qwen2.5-7B-Instruct-Q,GPTQ 4-bit,7.31,72.64,94.32,0.1,88.42,0.7,4.1,2.5,https://huggingface.co/iproskurina/qwen2.5-7b-instruct-gptqmodel-4bit
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ plotly
+ gradio==3.44.4
+ huggingface_hub
+ pandas
+ transformers
+ matplotlib
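
Assuming app.py is the Space entrypoint, running "pip install -r requirements.txt" and then "python app.py" should serve the leaderboard locally on port 7860, matching the launch call above, which binds to 0.0.0.0. Note that plotly, huggingface_hub, transformers, and matplotlib are pinned here but not imported by app.py.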