RobertoBarrosoLuque committed on
Commit
69ab3a1
·
1 Parent(s): ff3bc98

Add 32B SFT

Browse files
data/evaluation_results.csv CHANGED
@@ -2,6 +2,9 @@ model,category,accuracy,precision,recall,num_samples
2
  Qwen2.5-VL-32B-BASE,masterCategory,0.909,0.9196051103650724,0.909,1000
3
  Qwen2.5-VL-32B-BASE,gender,0.546,0.9259626959624715,0.546,1000
4
  Qwen2.5-VL-32B-BASE,subCategory,0.432,0.7070035848765855,0.432,1000
 
 
 
5
  Qwen2-VL-72B-BASE,masterCategory,0.968968968968969,0.9711267688093789,0.968968968968969,999
6
  Qwen2-VL-72B-BASE,gender,0.7607607607607607,0.9354341592843324,0.7607607607607607,999
7
  Qwen2-VL-72B-BASE,subCategory,0.34134134134134136,0.6784829173652965,0.34134134134134136,999
 
2
  Qwen2.5-VL-32B-BASE,masterCategory,0.909,0.9196051103650724,0.909,1000
3
  Qwen2.5-VL-32B-BASE,gender,0.546,0.9259626959624715,0.546,1000
4
  Qwen2.5-VL-32B-BASE,subCategory,0.432,0.7070035848765855,0.432,1000
5
+ Qwen2.5-VL-32B-SFT,masterCategory,0.7898550724637681,0.8517880287498527,0.7898550724637681,690
6
+ Qwen2.5-VL-32B-SFT,gender,0.9,0.8784316770186336,0.9,690
7
+ Qwen2.5-VL-32B-SFT,subCategory,0.9173913043478261,0.9378322745486548,0.9173913043478261,690
8
  Qwen2-VL-72B-BASE,masterCategory,0.968968968968969,0.9711267688093789,0.968968968968969,999
9
  Qwen2-VL-72B-BASE,gender,0.7607607607607607,0.9354341592843324,0.7607607607607607,999
10
  Qwen2-VL-72B-BASE,subCategory,0.34134134134134136,0.6784829173652965,0.34134134134134136,999
generate_eval_results.py CHANGED
@@ -14,6 +14,7 @@ test_df = pd.read_csv(DATA_PATH / "test.csv")
14
  # Define model prediction files and their display names
15
  model_files = {
16
  "Qwen2.5-VL-32B-BASE": "df_pred_FireworksAI_qwen2p5-vl-32b-instruct-ralh0ben.csv",
 
17
  "Qwen2-VL-72B-BASE": "df_pred_FireworksAI_qwen2-vl-72b-BASE-instruct-yaxztv7t.csv",
18
  "Qwen2-VL-72B-SFT": "df_pred_FireworksAI_qwen-72b-SFT-fashion-catalog-oueqouqs.csv",
19
  "GPT-5-Mini": "df_pred_OpenAI_gpt-5-mini-2025-08-07.csv",
 
14
  # Define model prediction files and their display names
15
  model_files = {
16
  "Qwen2.5-VL-32B-BASE": "df_pred_FireworksAI_qwen2p5-vl-32b-instruct-ralh0ben.csv",
17
+ "Qwen2.5-VL-32B-SFT": "df_pred_FireworksAI_qwen-32b-SFT-fashion-catalog-c6fhxibo.csv",
18
  "Qwen2-VL-72B-BASE": "df_pred_FireworksAI_qwen2-vl-72b-BASE-instruct-yaxztv7t.csv",
19
  "Qwen2-VL-72B-SFT": "df_pred_FireworksAI_qwen-72b-SFT-fashion-catalog-oueqouqs.csv",
20
  "GPT-5-Mini": "df_pred_OpenAI_gpt-5-mini-2025-08-07.csv",
notebooks/02-model-evals.ipynb CHANGED
@@ -245,10 +245,59 @@
245
  "3. Run test set through deployment for base model and save results"
246
  ]
247
  },
 
 
 
 
 
 
 
 
248
  {
249
  "cell_type": "code",
250
  "execution_count": null,
251
- "id": "19",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  "metadata": {},
253
  "outputs": [],
254
  "source": [
@@ -258,7 +307,7 @@
258
  {
259
  "cell_type": "code",
260
  "execution_count": null,
261
- "id": "20",
262
  "metadata": {},
263
  "outputs": [],
264
  "source": [
@@ -268,7 +317,7 @@
268
  {
269
  "cell_type": "code",
270
  "execution_count": null,
271
- "id": "21",
272
  "metadata": {},
273
  "outputs": [],
274
  "source": [
@@ -290,7 +339,7 @@
290
  },
291
  {
292
  "cell_type": "markdown",
293
- "id": "22",
294
  "metadata": {},
295
  "source": [
296
  "#### Run test set through closed source model"
@@ -299,7 +348,7 @@
299
  {
300
  "cell_type": "code",
301
  "execution_count": null,
302
- "id": "23",
303
  "metadata": {},
304
  "outputs": [],
305
  "source": [
@@ -322,7 +371,7 @@
322
  },
323
  {
324
  "cell_type": "markdown",
325
- "id": "24",
326
  "metadata": {},
327
  "source": [
328
  "### Compare eval metrics across models"
@@ -331,7 +380,7 @@
331
  {
332
  "cell_type": "code",
333
  "execution_count": null,
334
- "id": "25",
335
  "metadata": {},
336
  "outputs": [],
337
  "source": [
@@ -352,7 +401,7 @@
352
  {
353
  "cell_type": "code",
354
  "execution_count": null,
355
- "id": "26",
356
  "metadata": {},
357
  "outputs": [],
358
  "source": [
@@ -367,7 +416,7 @@
367
  {
368
  "cell_type": "code",
369
  "execution_count": null,
370
- "id": "27",
371
  "metadata": {},
372
  "outputs": [],
373
  "source": [
@@ -404,7 +453,7 @@
404
  {
405
  "cell_type": "code",
406
  "execution_count": null,
407
- "id": "28",
408
  "metadata": {},
409
  "outputs": [],
410
  "source": [
@@ -414,7 +463,7 @@
414
  {
415
  "cell_type": "code",
416
  "execution_count": null,
417
- "id": "29",
418
  "metadata": {},
419
  "outputs": [],
420
  "source": [
 
245
  "3. Run test set through deployment for base model and save results"
246
  ]
247
  },
248
+ {
249
+ "cell_type": "markdown",
250
+ "id": "19",
251
+ "metadata": {},
252
+ "source": [
253
+ "#### Run evals on Qwen 32B SFT\n"
254
+ ]
255
+ },
256
  {
257
  "cell_type": "code",
258
  "execution_count": null,
259
+ "id": "20",
260
+ "metadata": {},
261
+ "outputs": [],
262
+ "source": [
263
+ "! firectl -a pyroworks create deployment accounts/pyroworks/models/qwen-32b-fashion-catalog --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "id": "21",
270
+ "metadata": {},
271
+ "outputs": [],
272
+ "source": [
273
+ "# Run with concurrent requests using await directly in Jupyter\n",
274
+ "df_predictions_qwen_32b_fine_tuned = await run_inference_on_dataframe_async(\n",
275
+ " df_test,\n",
276
+ " model=\"accounts/pyroworks/deployedModels/qwen-32b-fashion-catalog-c6fhxibo\",\n",
277
+ " provider=\"FireworksAI\",\n",
278
+ " api_key=FIREWORKS_API_KEY,\n",
279
+ " max_concurrent_requests=20, # Adjust based on rate limits\n",
280
+ ")\n",
281
+ "\n",
282
+ "results_qwen_fine_tuned_32b = evaluate_all_categories(\n",
283
+ " df_ground_truth=df_test,\n",
284
+ " df_predictions=df_predictions_qwen_32b_fine_tuned,\n",
285
+ " categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
286
+ ")"
287
+ ]
288
+ },
289
+ {
290
+ "cell_type": "markdown",
291
+ "id": "22",
292
+ "metadata": {},
293
+ "source": [
294
+ "#### Run evals on Qwen 72B SFT"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "id": "23",
301
  "metadata": {},
302
  "outputs": [],
303
  "source": [
 
307
  {
308
  "cell_type": "code",
309
  "execution_count": null,
310
+ "id": "24",
311
  "metadata": {},
312
  "outputs": [],
313
  "source": [
 
317
  {
318
  "cell_type": "code",
319
  "execution_count": null,
320
+ "id": "25",
321
  "metadata": {},
322
  "outputs": [],
323
  "source": [
 
339
  },
340
  {
341
  "cell_type": "markdown",
342
+ "id": "26",
343
  "metadata": {},
344
  "source": [
345
  "#### Run test set through closed source model"
 
348
  {
349
  "cell_type": "code",
350
  "execution_count": null,
351
+ "id": "27",
352
  "metadata": {},
353
  "outputs": [],
354
  "source": [
 
371
  },
372
  {
373
  "cell_type": "markdown",
374
+ "id": "28",
375
  "metadata": {},
376
  "source": [
377
  "### Compare eval metrics across models"
 
380
  {
381
  "cell_type": "code",
382
  "execution_count": null,
383
+ "id": "29",
384
  "metadata": {},
385
  "outputs": [],
386
  "source": [
 
401
  {
402
  "cell_type": "code",
403
  "execution_count": null,
404
+ "id": "30",
405
  "metadata": {},
406
  "outputs": [],
407
  "source": [
 
416
  {
417
  "cell_type": "code",
418
  "execution_count": null,
419
+ "id": "31",
420
  "metadata": {},
421
  "outputs": [],
422
  "source": [
 
453
  {
454
  "cell_type": "code",
455
  "execution_count": null,
456
+ "id": "32",
457
  "metadata": {},
458
  "outputs": [],
459
  "source": [
 
463
  {
464
  "cell_type": "code",
465
  "execution_count": null,
466
+ "id": "33",
467
  "metadata": {},
468
  "outputs": [],
469
  "source": [