Spaces:

fireworks-ai
/

catalog-extract

Running

App Files Files Community

RobertoBarrosoLuque committed on Oct 3

Commit

4818cbb

1 Parent(s): 360f329

Data exploration ready

Browse files

Files changed (4) hide show

.pre-commit-config.yaml +0 -6
notebooks/eda-and-fine-tuning.ipynb +232 -0
requirements.txt +5 -0
src/preprocessing/data_processing.py +56 -0

.pre-commit-config.yaml CHANGED Viewed

@@ -40,9 +40,3 @@ repos:
     hooks:
       - id: black
         args: ["--target-version", "py311"]
-  - repo: https://github.com/Yelp/detect-secrets
-    rev: v1.5.0
-    hooks:
-      - id: detect-secrets
-        exclude: ^(graphql-mock/pnpm-lock\.yaml|.*\.ipynb)$

     hooks:
       - id: black
         args: ["--target-version", "py311"]

notebooks/eda-and-fine-tuning.ipynb ADDED Viewed

	@@ -0,0 +1,232 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from PIL import Image\n",
+    "import io\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1",
+   "metadata": {},
+   "source": [
+    "## Understand the data and split into train and test\n",
+    "1. Shape of dataset\n",
+    "2. Distribution / balance of categories\n",
+    "3. Train-test split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds =load_dataset(\"ceyda/fashion-products-small\")\n",
+    "df = ds['train'].to_pandas()\n",
+    "print(f\"Shape of dataset: {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### For expediency we will randomly sample only 10,000 total rows\n",
+    "sample_size = 10000\n",
+    "df = df.sample(n=sample_size, random_state=42)\n",
+    "print(f\"Shape of dataset after sampling: {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_category_distribution_by_percent(df, col):\n",
+    "    count_df = df.groupby(col)[\"id\"].count().reset_index(name=\"count\")\n",
+    "    _denominator = df.shape[0]\n",
+    "    count_df.loc[:, \"percent\"] = (count_df[\"count\"] / _denominator) * 100\n",
+    "    return count_df.sort_values(by=\"percent\", ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "m_cat = get_category_distribution_by_percent(df, \"masterCategory\")\n",
+    "m_cat"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_category_distribution_by_percent(df, \"subCategory\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "get_category_distribution_by_percent(df, \"gender\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8",
+   "metadata": {},
+   "source": [
+    "As seen above, the dataset is imbalanced, especially around masterCategory. Let's filter out any masterCategory that makes up less than 2% of the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cat_less_than_2_percent = m_cat.loc[m_cat.loc[:, \"percent\"] < 2, \"masterCategory\"].values\n",
+    "print(f\"Starting with {df.shape}\")\n",
+    "df = df.loc[~df.loc[:, \"masterCategory\"].isin(cat_less_than_2_percent)]\n",
+    "print(f\"Finished with {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train, df_test = train_test_split(df, test_size=0.15, random_state=42)\n",
+    "print(f\"Train shape: {df_train.shape}\")\n",
+    "print(f\"Test shape: {df_test.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Save datasets in /data folder\n",
+    "df_train.to_csv(\"../data/train.csv\", index=False)\n",
+    "df_test.to_csv(\"../data/test.csv\", index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12",
+   "metadata": {},
+   "source": [
+    "## Fine tuning"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "from io import BytesIO\n",
+    "\n",
+    "def pil_to_base64(pil_image):\n",
+    "    \"\"\"Convert PIL Image to base64 string\"\"\"\n",
+    "    buffered = BytesIO()\n",
+    "    pil_image.save(buffered, format=\"PNG\")\n",
+    "    img_str = base64.b64encode(buffered.getvalue()).decode()\n",
+    "    return f\"data:image/png;base64,{img_str}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_bytes = df_train['image'][0]['bytes']\n",
+    "img = Image.open(io.BytesIO(img_bytes))\n",
+    "plt.imshow(img)\n",
+    "plt.axis('off')\n",
+    "plt.title(ds['train'][0].get('productDisplayName', 'Product'))\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "15",
+   "metadata": {},
+   "source": [
+    "Convert dataset to Fireworks jsonl as specified in [the docs](https://fireworks.ai/docs/fine-tuning/fine-tuning-vlm#supervised-fine-tuning-for-vlms-sft)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_train.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

requirements.txt CHANGED Viewed

@@ -2,3 +2,8 @@ huggingface_hub==0.34.3
 fireworks-ai==0.19.18
 gradio==5.42.0
 python-dotenv==1.0.0

 fireworks-ai==0.19.18
 gradio==5.42.0
 python-dotenv==1.0.0
+ipython
+scikit-learn
+jupyter
+altair
+matplotlib

src/preprocessing/data_processing.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import base64
+from io import BytesIO
def pil_to_base64(pil_image, image_format="PNG"):
    """Serialize a PIL image into a base64 ``data:`` URL.

    Args:
        pil_image: Object exposing PIL's ``save(fp, format=...)`` method.
        image_format: Encoder name handed to ``save``. Defaults to ``"PNG"``,
            the value that used to be hard-coded, so existing callers get the
            same output. The MIME subtype is the lower-cased format name
            (``"JPEG"`` -> ``image/jpeg``, ``"WEBP"`` -> ``image/webp``).

    Returns:
        str: ``data:image/<format>;base64,<payload>``.
    """
    buffered = BytesIO()
    pil_image.save(buffered, format=image_format)
    # Base64 output is pure ASCII, so the decode can never fail.
    img_str = base64.b64encode(buffered.getvalue()).decode("ascii")
    # Keep the declared MIME type in sync with the encoder actually used.
    return f"data:image/{image_format.lower()};base64,{img_str}"
def _sniff_image_mime(data, default="image/jpeg"):
    """Return the MIME type implied by the leading magic bytes of ``data``."""
    if data.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    if data.startswith(b"\xff\xd8\xff"):
        return "image/jpeg"
    if data.startswith((b"GIF87a", b"GIF89a")):
        return "image/gif"
    # WebP is a RIFF container whose form type sits at bytes 8..11.
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return default


def image_to_base64(img_bytes):
    """Convert encoded image bytes to a base64 ``data:`` URL with a MIME type.

    Args:
        img_bytes: Raw encoded image bytes, or a dict carrying them under the
            ``"bytes"`` key.

    Returns:
        str: ``data:<mime>;base64,<payload>``. The MIME type is detected from
        the payload's magic bytes (PNG, JPEG, GIF, WebP). Unrecognized
        payloads keep the previous ``image/jpeg`` label.

    Raises:
        TypeError: If no bytes-like payload is available, e.g. the dict's
            ``"bytes"`` entry is ``None``.
    """
    if isinstance(img_bytes, dict) and "bytes" in img_bytes:
        img_bytes = img_bytes["bytes"]
    if not isinstance(img_bytes, (bytes, bytearray, memoryview)):
        # b64encode would raise TypeError here anyway; say what was wrong.
        raise TypeError(
            f"image_to_base64 expected image bytes, got {type(img_bytes).__name__}"
        )
    raw = bytes(img_bytes)
    # Label the payload with its real format instead of always claiming JPEG.
    mime_type = _sniff_image_mime(raw)
    b64_string = base64.b64encode(raw).decode("utf-8")
    return f"data:{mime_type};base64,{b64_string}"
def create_training_example(row):
    """Create a training example with both classification and description tasks.

    Args:
        row: Mapping indexable by ``"image"``, ``"masterCategory"``,
            ``"gender"`` and ``"subCategory"``. The three category values must
            be strings, since ``.lower()`` is called on them.

    Returns:
        dict: ``{"messages": [...]}`` holding a system message, a user message
        (image data URL plus text prompt) and the assistant target.
    """
    # Convert image to base64
    img_b64 = image_to_base64(row["image"])

    # Create multi-task prompt combining classification and description
    user_prompt = "Analyze this fashion product image and provide: 1) Master category, 2) Gender, 3) Sub-category, and 4) A detailed description."

    master_category = row["masterCategory"]
    gender = row["gender"]
    sub_category = row["subCategory"]

    # Build the target line by line. A triple-quoted f-string would put the
    # source indentation and a leading newline into every training label.
    assistant_response = "\n".join(
        [
            f"Master Category: {master_category}",
            f"Gender: {gender}",
            f"Sub-category: {sub_category}",
            f"Description: This is a {gender.lower()} {sub_category.lower()} "
            f"from the {master_category.lower()} category.",
        ]
    )

    # Format as OpenAI-compatible messages
    return {
        "messages": [
            {
                "role": "system",
                "content": "You are a fashion product analyst. Classify products and generate detailed descriptions based on images.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": img_b64}},
                    {"type": "text", "text": user_prompt},
                ],
            },
            {"role": "assistant", "content": assistant_response},
        ]
    }