RobertoBarrosoLuque committed
Commit ff3bc98 · 1 Parent(s): 6739aa3

Add interactive plots

data/evaluation_results.csv ADDED
@@ -0,0 +1,13 @@
+ model,category,accuracy,precision,recall,num_samples
+ Qwen2.5-VL-32B-BASE,masterCategory,0.909,0.9196051103650724,0.909,1000
+ Qwen2.5-VL-32B-BASE,gender,0.546,0.9259626959624715,0.546,1000
+ Qwen2.5-VL-32B-BASE,subCategory,0.432,0.7070035848765855,0.432,1000
+ Qwen2-VL-72B-BASE,masterCategory,0.968968968968969,0.9711267688093789,0.968968968968969,999
+ Qwen2-VL-72B-BASE,gender,0.7607607607607607,0.9354341592843324,0.7607607607607607,999
+ Qwen2-VL-72B-BASE,subCategory,0.34134134134134136,0.6784829173652965,0.34134134134134136,999
+ Qwen2-VL-72B-SFT,masterCategory,0.993993993993994,0.9940108529582213,0.993993993993994,999
+ Qwen2-VL-72B-SFT,gender,0.9169169169169169,0.9144956029794004,0.9169169169169169,999
+ Qwen2-VL-72B-SFT,subCategory,0.9419419419419419,0.9512743495222181,0.9419419419419419,999
+ GPT-5-Mini,masterCategory,0.981,0.9810138759482104,0.981,1000
+ GPT-5-Mini,gender,0.907,0.9260515702929443,0.907,1000
+ GPT-5-Mini,subCategory,0.897,0.944355065421394,0.897,1000
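
For readers who want to inspect these numbers directly, here is a small sketch (not part of the commit) that loads the CSV above and pivots it into a model × category accuracy table; the path is assumed to be data/evaluation_results.csv relative to the repo root:

    import pandas as pd

    # Load the evaluation results added in this commit
    df = pd.read_csv("data/evaluation_results.csv")

    # Pivot to a model x category accuracy table for a quick side-by-side comparison
    accuracy_table = df.pivot(index="model", columns="category", values="accuracy")
    print(accuracy_table.round(3))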
generate_eval_results.py ADDED
@@ -0,0 +1,61 @@
+ """
+ Script to generate evaluation results CSV for all model predictions
+ """
+
+ import pandas as pd
+ from pathlib import Path
+ from src.modules.evals import evaluate_all_categories, extract_metrics
+
+ DATA_PATH = Path(__file__).parent / "data"
+
+ # Load test.csv (ground truth)
+ test_df = pd.read_csv(DATA_PATH / "test.csv")
+
+ # Define model prediction files and their display names
+ model_files = {
+     "Qwen2.5-VL-32B-BASE": "df_pred_FireworksAI_qwen2p5-vl-32b-instruct-ralh0ben.csv",
+     "Qwen2-VL-72B-BASE": "df_pred_FireworksAI_qwen2-vl-72b-BASE-instruct-yaxztv7t.csv",
+     "Qwen2-VL-72B-SFT": "df_pred_FireworksAI_qwen-72b-SFT-fashion-catalog-oueqouqs.csv",
+     "GPT-5-Mini": "df_pred_OpenAI_gpt-5-mini-2025-08-07.csv",
+ }
+
+ # Collect all metrics
+ all_metrics = []
+
+ for model_name, filename in model_files.items():
+     pred_file = DATA_PATH / filename
+
+     if not pred_file.exists():
+         print(f"Warning: {filename} not found, skipping...")
+         continue
+
+     print(f"\nEvaluating {model_name}...")
+     print("=" * 60)
+
+     # Load predictions
+     pred_df = pd.read_csv(pred_file)
+
+     # Evaluate all categories
+     results = evaluate_all_categories(
+         df_ground_truth=test_df,
+         df_predictions=pred_df,
+         id_col="id",
+         categories=["masterCategory", "gender", "subCategory"],
+     )
+
+     # Extract metrics for this model
+     model_metrics = extract_metrics(results, model_name)
+     all_metrics.extend(model_metrics)
+
+ # Create DataFrame with all metrics
+ metrics_df = pd.DataFrame(all_metrics)
+
+ # Save to CSV
+ output_file = DATA_PATH / "evaluation_results.csv"
+ metrics_df.to_csv(output_file, index=False)
+
+ print(f"\n{'=' * 60}")
+ print(f"Evaluation complete! Results saved to: {output_file}")
+ print(f"{'=' * 60}")
+ print("\nSummary:")
+ print(metrics_df.to_string(index=False))
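
For context, this script relies on evaluate_all_categories and extract_metrics from src.modules.evals, which are not part of this commit. A minimal sketch of the shapes the script appears to assume (hypothetical implementations; the real module may differ):

    # Hypothetical sketch of the interfaces generate_eval_results.py assumes.
    # The actual implementations live in src/modules/evals.py and may differ.
    from sklearn.metrics import accuracy_score, precision_score, recall_score

    def evaluate_all_categories(df_ground_truth, df_predictions, id_col="id", categories=()):
        """Assumed to return {category: {"accuracy", "precision", "recall", "num_samples"}}."""
        merged = df_ground_truth.merge(df_predictions, on=id_col, suffixes=("_true", "_pred"))
        results = {}
        for cat in categories:
            y_true, y_pred = merged[f"{cat}_true"], merged[f"{cat}_pred"]
            results[cat] = {
                "accuracy": accuracy_score(y_true, y_pred),
                "precision": precision_score(y_true, y_pred, average="weighted", zero_division=0),
                "recall": recall_score(y_true, y_pred, average="weighted", zero_division=0),
                "num_samples": len(merged),
            }
        return results

    def extract_metrics(results, model_name):
        """Assumed to flatten results into rows matching evaluation_results.csv."""
        return [{"model": model_name, "category": cat, **m} for cat, m in results.items()]

Weighted-average precision/recall over the merged test rows would also explain why recall equals accuracy in the CSV above.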
notebooks/01-eda-and-fine-tuning.ipynb CHANGED
@@ -273,10 +273,47 @@
  "| `--eval-auto-carveout` | Auto validation split | Always include |"
  ]
  },
+ {
+ "cell_type": "markdown",
+ "id": "22",
+ "metadata": {},
+ "source": [
+ "##### Fine tune Qwen 2.5 VL 32B"
+ ]
+ },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "22",
+ "id": "23",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! firectl -a pyroworks create sftj --base-model accounts/fireworks/models/qwen2p5-vl-32b-instruct --dataset accounts/pyroworks/datasets/fashion-catalog-train --output-model qwen-32b-fashion-catalog --display-name \"Qwen2.5-32b-fashion-catalog\" --epochs 3 --learning-rate 0.0001 --early-stop"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "24",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "### Check status of job\n",
+ "! firectl -a pyroworks get sftj j588i1qm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "25",
+ "metadata": {},
+ "source": [
+ "##### Fine tune Qwen 2.5 VL 72B"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "26",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -286,7 +323,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "23",
+ "id": "27",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -297,7 +334,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "24",
+ "id": "28",
  "metadata": {},
  "outputs": [],
  "source": []
notebooks/02-model-evals.ipynb CHANGED
@@ -57,6 +57,14 @@
  "cell_type": "markdown",
  "id": "3",
  "metadata": {},
+ "source": [
+ "**Note: if using this notebook, make sure to replace \"pyroworks\" with your account name**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4",
+ "metadata": {},
  "source": [
  "#### Run example image through a serverless Qwen VL model to test"
  ]
@@ -64,7 +72,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "4",
+ "id": "5",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -80,7 +88,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "5",
+ "id": "6",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -96,7 +104,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "6",
+ "id": "7",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -105,7 +113,7 @@
  },
  {
  "cell_type": "markdown",
- "id": "7",
+ "id": "8",
  "metadata": {},
  "source": [
  "*Important*: If you are following through this notebook make sure to replace \"pyroworks\" with your account name"
@@ -113,19 +121,80 @@
  },
  {
  "cell_type": "markdown",
- "id": "8",
+ "id": "9",
  "metadata": {},
  "source": [
  "#### Run test set through base OSS model\n",
- "1. Create a deployment for accounts/fireworks/models/qwen2-vl-72b-instruct\n",
+ "1. Create a deployment for the model for faster inference\n",
  "2. Check deployment status\n",
- "3. Run test set through deployment for base model and save results"
+ "3. Run test set through deployment for base model and save results\n",
+ "\n",
+ "NOTE: make sure to delete or scale down the deployment when done to avoid costs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "10",
+ "metadata": {},
+ "source": [
+ "##### Run inference on Qwen 2.5 VL 32B"
  ]
  },
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "9",
+ "id": "11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! firectl create deployment accounts/fireworks/models/qwen2p5-vl-32b-instruct --min-replica-count 1 --max-replica-count 1 --accelerator-type NVIDIA_H100_80GB"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "12",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! firectl -a pyroworks get deployment itmxuke2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "13",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_predictions_qwen_base_32b = await run_inference_on_dataframe_async(\n",
+ "    df_test,\n",
+ "    model=\"accounts/pyroworks/deployedModels/qwen2p5-vl-32b-instruct-ralh0ben\",\n",
+ "    provider=\"FireworksAI\",\n",
+ "    api_key=FIREWORKS_API_KEY,\n",
+ "    max_concurrent_requests=20,  # Adjust based on rate limits\n",
+ ")\n",
+ "\n",
+ "results_qwen_base_32b = evaluate_all_categories(\n",
+ "    df_ground_truth=df_test,\n",
+ "    df_predictions=df_predictions_qwen_base_32b,\n",
+ "    categories=[\"masterCategory\", \"gender\", \"subCategory\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "14",
+ "metadata": {},
+ "source": [
+ "##### Run inference on Qwen 2.5 VL 72B"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "15",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -135,7 +204,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "10",
+ "id": "16",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -145,7 +214,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "11",
+ "id": "17",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -167,7 +236,7 @@
  },
  {
  "cell_type": "markdown",
- "id": "12",
+ "id": "18",
  "metadata": {},
  "source": [
  "#### Run test set through fine tuned FW Qwen model\n",
@@ -179,7 +248,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "13",
+ "id": "19",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -189,7 +258,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "14",
+ "id": "20",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -199,7 +268,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "15",
+ "id": "21",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -221,7 +290,7 @@
  },
  {
  "cell_type": "markdown",
- "id": "16",
+ "id": "22",
  "metadata": {},
  "source": [
  "#### Run test set through closed source model"
@@ -230,7 +299,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "17",
+ "id": "23",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -253,7 +322,7 @@
  },
  {
  "cell_type": "markdown",
- "id": "18",
+ "id": "24",
  "metadata": {},
  "source": [
  "### Compare eval metrics across models"
@@ -262,7 +331,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "19",
+ "id": "25",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -283,7 +352,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "20",
+ "id": "26",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -298,7 +367,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "21",
+ "id": "27",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -335,7 +404,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "22",
+ "id": "28",
  "metadata": {},
  "outputs": [],
  "source": [
@@ -345,7 +414,7 @@
  {
  "cell_type": "code",
  "execution_count": null,
- "id": "23",
+ "id": "29",
  "metadata": {},
  "outputs": [],
  "source": [
src/app.py CHANGED
@@ -11,6 +11,11 @@ from dotenv import load_dotenv
  from src.modules.vlm_inference import analyze_product_image
  from src.modules.data_processing import pil_to_base64
  from src.modules.evals import run_inference_on_dataframe
+ from src.modules.viz import (
+     load_evaluation_data,
+     create_accuracy_plot,
+     create_precision_recall_plot,
+ )

  load_dotenv()

@@ -25,6 +30,7 @@ MAX_CONCURRENT_REQUESTS = 10

  FILE_PATH = Path(__file__).parents[1]
  ASSETS_PATH = FILE_PATH / "assets"
+ DATA_PATH = FILE_PATH / "data"
  _NOTEBOOK_PATH = "https://huggingface.co/spaces/fireworks-ai/catalog-extract/blob/main/notebooks/01-eda-and-fine-tuning.ipynb"

  # Prompt style display names
@@ -56,13 +62,10 @@ def analyze_single_image(
          return "No image provided", "", "", ""

      try:
-         # Convert PIL Image to base64
          img_b64 = pil_to_base64(image_input)

-         # Determine provider from model name
          model_id = AVAILABLE_MODELS[model_name]
          api_key = os.getenv("FIREWORKS_API_KEY")
-         # Map display name to prompt key
          prompt_style = (
              PROMPT_STYLES.get(prompt_style_display) if prompt_style_display else None
          )
@@ -304,7 +307,7 @@ def create_demo_interface():
              outputs=[image_input],
          )

-         # Tab 3: Model Evaluation (show uploaded charts)
+         # Tab 3: Model Evaluation (interactive charts)
          with gr.TabItem("📈 Model Performance"):
              gr.Markdown(
                  """
@@ -316,17 +319,60 @@
                  """
              )

-             # Display uploaded evaluation charts
-             with gr.Row():
-                 gr.Image(
-                     value=str(ASSETS_PATH / "Accuracy.png"),
-                     interactive=False,
-                     show_label=False,
-                 )
-                 gr.Image(
-                     value=str(ASSETS_PATH / "Accuracy-precision-recall.png"),
-                     interactive=False,
-                     show_label=False,
-                 )
+             eval_df = load_evaluation_data(DATA_PATH)
+
+             if eval_df is not None:
+                 all_models = eval_df["model"].unique().tolist()
+                 all_categories = eval_df["category"].unique().tolist()
+
+                 with gr.Row():
+                     model_filter = gr.CheckboxGroup(
+                         choices=all_models,
+                         value=all_models,
+                         label="Select Models to Display",
+                         interactive=True,
+                     )
+                     category_filter = gr.CheckboxGroup(
+                         choices=all_categories,
+                         value=all_categories,
+                         label="Select Categories to Display",
+                         interactive=True,
+                     )
+                 with gr.Row():
+                     accuracy_plot = gr.Plot()
+
+                 with gr.Row():
+                     precision_recall_plot = gr.Plot()
+
+                 def update_plots(selected_models, selected_categories):
+                     acc_fig = create_accuracy_plot(
+                         eval_df, selected_models, selected_categories
+                     )
+                     pr_fig = create_precision_recall_plot(
+                         eval_df, selected_models, selected_categories
+                     )
+                     return acc_fig, pr_fig
+
+                 model_filter.change(
+                     fn=update_plots,
+                     inputs=[model_filter, category_filter],
+                     outputs=[accuracy_plot, precision_recall_plot],
+                 )
+
+                 category_filter.change(
+                     fn=update_plots,
+                     inputs=[model_filter, category_filter],
+                     outputs=[accuracy_plot, precision_recall_plot],
+                 )
+
+                 demo.load(
+                     fn=update_plots,
+                     inputs=[model_filter, category_filter],
+                     outputs=[accuracy_plot, precision_recall_plot],
+                 )
+             else:
+                 gr.Markdown(
+                     "⚠️ Evaluation data not found. Please run `python generate_eval_results.py` first."
+                 )

          with gr.Row():
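
The wiring above follows a standard Gradio pattern: the CheckboxGroup values feed a callback that rebuilds the matplotlib figures, and demo.load renders the initial state. A stripped-down, self-contained sketch of the same pattern (toy data, not the app's actual code):

    import gradio as gr
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    SCORES = {"model-a": 0.9, "model-b": 0.7}  # placeholder data

    def make_plot(selected):
        # Rebuild the bar chart from whichever models are currently checked.
        fig, ax = plt.subplots()
        ax.bar(selected, [SCORES[m] for m in selected])
        ax.set_ylim(0, 1.0)
        return fig

    with gr.Blocks() as demo:
        picker = gr.CheckboxGroup(choices=list(SCORES), value=list(SCORES), label="Models")
        plot = gr.Plot()
        # Re-render whenever the selection changes, and once on page load.
        picker.change(fn=make_plot, inputs=picker, outputs=plot)
        demo.load(fn=make_plot, inputs=picker, outputs=plot)

    if __name__ == "__main__":
        demo.launch()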
src/modules/data_processing.py CHANGED
@@ -36,13 +36,9 @@ def image_to_base64(img_bytes):
  def create_training_example(row):
      """Create a training example with both classification and description tasks"""

-     # Convert image to base64
      img_b64 = image_to_base64(row["image"])
-
-     # Create multi-task prompt combining classification and description
      user_prompt = "Analyze this fashion product image and provide: 1) Master category, 2) Gender, 3) Sub-category, and 4) A detailed description."

-     # Create structured response with all classification info
      assistant_response = f"""
  Master Category: {row['masterCategory']}
  Gender: {row['gender']}
@@ -50,7 +46,6 @@ def create_training_example(row):

  Description: This is a {row['gender'].lower()} {row['subCategory'].lower()} from the {row['masterCategory'].lower()} category."""

-     # Format as OpenAI-compatible messages
      return {
          "messages": [
              {
src/modules/viz.py ADDED
@@ -0,0 +1,224 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib
+
+ matplotlib.use("Agg")
+
+
+ def load_evaluation_data(data_path) -> pd.DataFrame:
+     """Load evaluation results from CSV"""
+     eval_file = data_path / "evaluation_results.csv"
+     if eval_file.exists():
+         return pd.read_csv(eval_file)
+     return None
+
+
+ def get_model_style(model_name):
+     """
+     Get color and hatch pattern for a model
+
+     Color scheme:
+     - GPT models: Gray (#808080)
+     - Qwen2.5-VL-32B: Light purple (#9B87E8) - BASE solid, SFT with pattern
+     - Qwen2-VL-72B: Medium blue (#5B7FD8) - BASE solid, SFT with pattern
+
+     Returns:
+         tuple: (color, hatch_pattern)
+     """
+     if "GPT" in model_name or "gpt" in model_name:
+         return "#808080", None
+
+     if "Qwen2.5" in model_name or "qwen2p5" in model_name or "32B" in model_name:
+         if "SFT" in model_name:
+             return "#9B87E8", "///"
+         else:
+             return "#9B87E8", None
+
+     if "Qwen2" in model_name or "72B" in model_name:
+         if "SFT" in model_name:
+             return "#5B7FD8", "///"
+         else:
+             return "#5B7FD8", None
+
+     return "#6B4DB8", None
+
+
+ def create_accuracy_plot(
+     eval_df: pd.DataFrame,
+     selected_models: list = None,
+     selected_categories: list = None,
+ ):
+     """
+     Create bar chart of accuracy by category, colored by model
+
+     Args:
+         eval_df: DataFrame with evaluation results
+         selected_models: List of models to display (None for all)
+         selected_categories: List of categories to display (None for all)
+
+     Returns:
+         matplotlib figure
+     """
+     if eval_df is None:
+         return None
+
+     # Filter data
+     df_filtered = eval_df.copy()
+     if selected_models:
+         df_filtered = df_filtered[df_filtered["model"].isin(selected_models)]
+     if selected_categories:
+         df_filtered = df_filtered[df_filtered["category"].isin(selected_categories)]
+
+     # Create figure
+     fig, ax = plt.subplots(figsize=(12, 6))
+
+     # Get unique categories and models
+     categories = df_filtered["category"].unique()
+     models = df_filtered["model"].unique()
+
+     # Set up bar positions
+     x = range(len(categories))
+     width = 0.8 / len(models)
+
+     for i, model in enumerate(models):
+         model_data = df_filtered[df_filtered["model"] == model]
+         accuracies = [
+             model_data[model_data["category"] == cat]["accuracy"].values[0]
+             for cat in categories
+         ]
+
+         color, hatch = get_model_style(model)
+
+         offset = (i - len(models) / 2) * width + width / 2
+         ax.bar(
+             [xi + offset for xi in x],
+             accuracies,
+             width,
+             label=model,
+             color=color,
+             hatch=hatch,
+             alpha=0.8,
+             edgecolor="white",
+             linewidth=1.2,
+         )
+
+     # Customize plot
+     ax.set_xlabel("Category", fontsize=12, fontweight="bold")
+     ax.set_ylabel("Accuracy", fontsize=12, fontweight="bold")
+     ax.set_title("Model Accuracy by Category", fontsize=14, fontweight="bold")
+     ax.set_xticks(x)
+     ax.set_xticklabels(categories, rotation=0)
+     ax.set_ylim(0, 1.0)
+     ax.legend(loc="lower right", framealpha=0.9)
+     ax.grid(axis="y", alpha=0.3, linestyle="--")
+
+     plt.tight_layout()
+     return fig
+
+
+ def create_precision_recall_plot(
+     eval_df: pd.DataFrame,
+     selected_models: list = None,
+     selected_categories: list = None,
+ ):
+     """
+     Create subplot with precision and recall by category, colored by model
+
+     Args:
+         eval_df: DataFrame with evaluation results
+         selected_models: List of models to display (None for all)
+         selected_categories: List of categories to display (None for all)
+
+     Returns:
+         matplotlib figure
+     """
+     if eval_df is None:
+         return None
+
+     # Filter data
+     df_filtered = eval_df.copy()
+     if selected_models:
+         df_filtered = df_filtered[df_filtered["model"].isin(selected_models)]
+     if selected_categories:
+         df_filtered = df_filtered[df_filtered["category"].isin(selected_categories)]
+
+     # Create figure with subplots
+     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+
+     # Get unique categories and models
+     categories = df_filtered["category"].unique()
+     models = df_filtered["model"].unique()
+
+     # Set up bar positions
+     x = range(len(categories))
+     width = 0.8 / len(models)
+
+     # Plot precision bars
+     for i, model in enumerate(models):
+         model_data = df_filtered[df_filtered["model"] == model]
+         precisions = [
+             model_data[model_data["category"] == cat]["precision"].values[0]
+             for cat in categories
+         ]
+
+         # Get color and pattern for this model
+         color, hatch = get_model_style(model)
+
+         offset = (i - len(models) / 2) * width + width / 2
+         ax1.bar(
+             [xi + offset for xi in x],
+             precisions,
+             width,
+             label=model,
+             color=color,
+             hatch=hatch,
+             alpha=0.8,
+             edgecolor="white",
+             linewidth=1.2,
+         )
+
+     # Customize precision plot
+     ax1.set_xlabel("Category", fontsize=12, fontweight="bold")
+     ax1.set_ylabel("Precision", fontsize=12, fontweight="bold")
+     ax1.set_title("Model Precision by Category", fontsize=14, fontweight="bold")
+     ax1.set_xticks(x)
+     ax1.set_xticklabels(categories, rotation=0)
+     ax1.set_ylim(0, 1.0)
+     ax1.legend(loc="lower right", framealpha=0.9)
+     ax1.grid(axis="y", alpha=0.3, linestyle="--")
+
+     # Plot recall bars
+     for i, model in enumerate(models):
+         model_data = df_filtered[df_filtered["model"] == model]
+         recalls = [
+             model_data[model_data["category"] == cat]["recall"].values[0]
+             for cat in categories
+         ]
+
+         # Get color and pattern for this model
+         color, hatch = get_model_style(model)
+
+         offset = (i - len(models) / 2) * width + width / 2
+         ax2.bar(
+             [xi + offset for xi in x],
+             recalls,
+             width,
+             label=model,
+             color=color,
+             hatch=hatch,
+             alpha=0.8,
+             edgecolor="white",
+             linewidth=1.2,
+         )
+
+     ax2.set_xlabel("Category", fontsize=12, fontweight="bold")
+     ax2.set_ylabel("Recall", fontsize=12, fontweight="bold")
+     ax2.set_title("Model Recall by Category", fontsize=14, fontweight="bold")
+     ax2.set_xticks(x)
+     ax2.set_xticklabels(categories, rotation=0)
+     ax2.set_ylim(0, 1.0)
+     ax2.legend(loc="lower right", framealpha=0.9)
+     ax2.grid(axis="y", alpha=0.3, linestyle="--")
+
+     plt.tight_layout()
+     return fig
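
Since these helpers return plain matplotlib figures, they can also be exercised outside Gradio. A small sketch (assuming the repo layout above and running from the repo root):

    from pathlib import Path
    from src.modules.viz import load_evaluation_data, create_accuracy_plot, create_precision_recall_plot

    data_path = Path("data")
    eval_df = load_evaluation_data(data_path)  # returns None if evaluation_results.csv is missing

    if eval_df is not None:
        # Passing None for both filters plots every model and category in the CSV.
        create_accuracy_plot(eval_df, None, None).savefig("accuracy.png", dpi=150)
        create_precision_recall_plot(eval_df, None, None).savefig("precision_recall.png", dpi=150)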