# catalog-extract / generate_eval_results.py
# Author: RobertoBarrosoLuque
# Commit: Add 32B SFT (69ab3a1)
"""
Script to generate evaluation results CSV for all model predictions.

Loads the ground-truth test set and one prediction CSV per model, evaluates
each model across the fashion-catalog categories, and writes the combined
per-model metrics to ``data/evaluation_results.csv``.
"""
import pandas as pd
from pathlib import Path

from src.modules.evals import evaluate_all_categories, extract_metrics

# All data files (ground truth, per-model predictions, output) live in the
# "data" directory next to this script.
DATA_PATH = Path(__file__).parent / "data"

# Load test.csv (ground truth); rows are matched to predictions via "id".
test_df = pd.read_csv(DATA_PATH / "test.csv")

# Map of display name -> prediction CSV filename for each model to evaluate.
model_files = {
    "Qwen2.5-VL-32B-BASE": "df_pred_FireworksAI_qwen2p5-vl-32b-instruct-ralh0ben.csv",
    "Qwen2.5-VL-32B-SFT": "df_pred_FireworksAI_qwen-32b-SFT-fashion-catalog-c6fhxibo.csv",
    "Qwen2-VL-72B-BASE": "df_pred_FireworksAI_qwen2-vl-72b-BASE-instruct-yaxztv7t.csv",
    "Qwen2-VL-72B-SFT": "df_pred_FireworksAI_qwen-72b-SFT-fashion-catalog-oueqouqs.csv",
    "GPT-5-Mini": "df_pred_OpenAI_gpt-5-mini-2025-08-07.csv",
}

# Collect metrics for every model; each entry is one metric row for the
# final DataFrame.
all_metrics = []
for model_name, filename in model_files.items():
    pred_file = DATA_PATH / filename
    if not pred_file.exists():
        # Fix: the original printed the literal placeholder "(unknown)"
        # instead of identifying which prediction file was missing.
        print(f"Warning: {pred_file} not found, skipping...")
        continue

    print(f"\nEvaluating {model_name}...")
    print("=" * 60)

    # Load this model's predictions.
    pred_df = pd.read_csv(pred_file)

    # Evaluate predictions against ground truth for all target categories.
    results = evaluate_all_categories(
        df_ground_truth=test_df,
        df_predictions=pred_df,
        id_col="id",
        categories=["masterCategory", "gender", "subCategory"],
    )

    # Flatten the evaluation results into metric rows tagged with the
    # model's display name.
    model_metrics = extract_metrics(results, model_name)
    all_metrics.extend(model_metrics)

# Combine every model's metrics into a single DataFrame.
metrics_df = pd.DataFrame(all_metrics)

# Save to CSV (no index column — rows are already self-describing).
output_file = DATA_PATH / "evaluation_results.csv"
metrics_df.to_csv(output_file, index=False)

print(f"\n{'=' * 60}")
print(f"Evaluation complete! Results saved to: {output_file}")
print(f"{'=' * 60}")
print("\nSummary:")
print(metrics_df.to_string(index=False))