Spaces:
Running
Running
"""
Script to generate evaluation results CSV for all model predictions.

Reads the ground truth from ``data/test.csv`` (relative to this file),
evaluates every prediction CSV listed in ``model_files`` against it, and
writes the combined metrics to ``data/evaluation_results.csv``.

Prediction files that do not exist are skipped with a warning. If no metrics
are collected at all, nothing is written, so an existing results file is not
replaced by an empty one.
"""
from pathlib import Path

import pandas as pd

from src.modules.evals import evaluate_all_categories, extract_metrics

DATA_PATH = Path(__file__).parent / "data"

# Horizontal rule used to frame the console output.
SEPARATOR = "=" * 60

# Load test.csv (ground truth)
test_df = pd.read_csv(DATA_PATH / "test.csv")

# Define model prediction files and their display names
model_files = {
    "Qwen2.5-VL-32B-BASE": "df_pred_FireworksAI_qwen2p5-vl-32b-instruct-ralh0ben.csv",
    "Qwen2.5-VL-32B-SFT": "df_pred_FireworksAI_qwen-32b-SFT-fashion-catalog-c6fhxibo.csv",
    "Qwen2-VL-72B-BASE": "df_pred_FireworksAI_qwen2-vl-72b-BASE-instruct-yaxztv7t.csv",
    "Qwen2-VL-72B-SFT": "df_pred_FireworksAI_qwen-72b-SFT-fashion-catalog-oueqouqs.csv",
    "GPT-5-Mini": "df_pred_OpenAI_gpt-5-mini-2025-08-07.csv",
}

# Collect all metrics
all_metrics = []

for model_name, filename in model_files.items():
    pred_file = DATA_PATH / filename

    # Best effort: a missing prediction file skips that model only.
    if not pred_file.exists():
        print(f"Warning: {filename} not found, skipping...")
        continue

    print(f"\nEvaluating {model_name}...")
    print(SEPARATOR)

    # Load predictions
    pred_df = pd.read_csv(pred_file)

    # Evaluate all categories; rows are matched on the "id" column.
    results = evaluate_all_categories(
        df_ground_truth=test_df,
        df_predictions=pred_df,
        id_col="id",
        categories=["masterCategory", "gender", "subCategory"],
    )

    # Extract metrics for this model.
    # NOTE(review): presumably a list of per-category metric records --
    # confirm against src.modules.evals.extract_metrics.
    model_metrics = extract_metrics(results, model_name)
    all_metrics.extend(model_metrics)

# Create DataFrame with all metrics
metrics_df = pd.DataFrame(all_metrics)
output_file = DATA_PATH / "evaluation_results.csv"

if all_metrics:
    # Save to CSV
    metrics_df.to_csv(output_file, index=False)

    print(f"\n{SEPARATOR}")
    print(f"Evaluation complete! Results saved to: {output_file}")
    print(SEPARATOR)
    print("\nSummary:")
    print(metrics_df.to_string(index=False))
else:
    # Nothing was evaluated: do not clobber an earlier results file with an
    # empty CSV, and do not report success.
    print(f"\n{SEPARATOR}")
    print(f"No metrics were collected; {output_file} was not written.")
    print(SEPARATOR)