Spaces:

mostlyai
/

synthetic-sdk-demo

Sleeping

App Files Community

ZennyKenny committed on Sep 23

Commit

2ecf2d7

verified ·

1 Parent(s): 33611d7

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -94

app.py CHANGED Viewed

@@ -33,12 +33,12 @@ class SyntheticDataGenerator:
     def initialize_mostly_ai(self) -> Tuple[bool, str]:
         if not MOSTLY_AI_AVAILABLE:
-            return False, "mostly ai sdk not installed. please install with: pip install mostlyai[local]"
         try:
             self.mostly = MostlyAI(local=True, local_port=8080)
-            return True, "mostly ai sdk initialized successfully."
         except Exception as e:
-            return False, f"failed to initialize mostly ai sdk: {str(e)}"
     def train_generator(
         self,
@@ -59,7 +59,7 @@ class SyntheticDataGenerator:
         weight_decay: float = 0.0001,
     ) -> Tuple[bool, str]:
         if not self.mostly:
-            return False, "mostly ai sdk not initialized. please initialize the sdk first."
         try:
             self.original_data = data
             train_config = {
@@ -86,33 +86,33 @@ class SyntheticDataGenerator:
                 ]
             }
             self.generator = self.mostly.train(config=train_config)
-            return True, f"training completed successfully. model name: {name}"
         except Exception as e:
-            return False, f"training failed with error: {str(e)}"
     def generate_synthetic_data(self, size: int) -> Tuple[Optional[pd.DataFrame], str]:
         if not self.generator:
-            return None, "no trained generator available. please train a model first."
         try:
             synthetic_data = self.mostly.generate(self.generator, size=size)
             df = synthetic_data.data()
-            return df, f"synthetic data generated successfully. {len(df)} records created."
         except Exception as e:
-            return None, f"synthetic data generation failed with error: {str(e)}"
     def estimate_memory_usage(self, df: pd.DataFrame) -> str:
         if df is None or df.empty:
-            return "no data available to analyze."
         memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
         rows, cols = len(df), len(df.columns)
         estimated_training_mb = memory_mb * 4
-        status = "good" if memory_mb < 100 else ("large" if memory_mb < 500 else "very large")
         return f"""
-memory usage estimate:
-- data size: {memory_mb:.1f} mb
-- estimated training memory: {estimated_training_mb:.1f} mb
-- status: {status}
-- rows: {rows:,} | columns: {cols}
         """.strip()
@@ -123,7 +123,7 @@ _last_synth_df: Optional[pd.DataFrame] = None
 def initialize_sdk() -> str:
     ok, msg = generator.initialize_mostly_ai()
-    return ("success: " if ok else "error: ") + msg
 def train_model(
@@ -144,7 +144,7 @@ def train_model(
     weight_decay: float,
 ) -> str:
     if data is None or data.empty:
-        return "error: no data provided. please upload or create sample data first."
     # enforce backend caps regardless of ui inputs
     epochs = min(int(epochs), MAX_EPOCHS)
@@ -167,7 +167,7 @@ def train_model(
         dropout_rate=dropout_rate,
         weight_decay=weight_decay,
     )
-    return ("success: " if ok else "error: ") + msg
 def generate_data(size: int) -> Tuple[Optional[pd.DataFrame], str]:
@@ -175,13 +175,13 @@ def generate_data(size: int) -> Tuple[Optional[pd.DataFrame], str]:
     synth_df, message = generator.generate_synthetic_data(size)
     if synth_df is not None:
         _last_synth_df = synth_df.copy()
-        return synth_df, f"success: {message}"
     else:
-        return None, f"error: {message}"
 def download_csv_prepare() -> Optional[str]:
-    """return a path to the latest synthetic csv; used as output to gr.file"""
     global _last_synth_df
     if _last_synth_df is None or _last_synth_df.empty:
         return None
@@ -203,14 +203,14 @@ def create_comparison_plot(original_df: pd.DataFrame, synthetic_df: pd.DataFrame
     for i, col in enumerate(numeric_cols[: n_rows * n_cols]):
         row = i // n_cols + 1
         col_idx = i % n_cols + 1
-        fig.add_trace(go.Histogram(x=original_df[col], name=f"original {col}", opacity=0.7, nbinsx=20), row=row, col=col_idx)
-        fig.add_trace(go.Histogram(x=synthetic_df[col], name=f"synthetic {col}", opacity=0.7, nbinsx=20), row=row, col=col_idx)
-    fig.update_layout(title="original vs synthetic data comparison", height=300 * n_rows, showlegend=True)
     return fig
 def create_interface():
-    with gr.Blocks(title="mostly ai synthetic data generator", theme=gr.themes.Soft()) as demo:
         gr.Image(
             value="https://img.mailinblue.com/8225865/images/content_library/original/6880d164e4e4ea1a183ad4c0.png",
             show_label=False,
@@ -225,131 +225,133 @@ def create_interface():
         A Python toolkit for generating high-fidelity, privacy-safe synthetic data. This is a limited demo space intended to showcase the features of the [Synthetic Data SDK](https://github.com/mostly-ai/mostlyai).
-        **Demo Space Limitations**: Datasets are supported up to **{MAX_ROWS:,} rows** and **{MAX_COLS} columns**.
         Training is supported up to **≤ {MAX_EPOCHS} epochs** and **≤ {MAX_TRAINING_MINUTES} minutes**.
-    """
         )
-        with gr.Tab("quick start"):
-            gr.Markdown("### initialize the sdk and upload your data")
             with gr.Row():
                 with gr.Column():
-                    init_btn = gr.Button("initialize mostly ai sdk", variant="primary")
-                    init_status = gr.Textbox(label="initialization status", interactive=False)
                 with gr.Column():
                     gr.Markdown(
                         """
-                    **next steps:**
-                    1. initialize the sdk
-                    2. go to the "upload data and train model" tab to upload your csv file
-                    3. train a model on your data
-                    4. generate synthetic data
                     """
                     )
-        with gr.Tab("upload data and train model"):
-            gr.Markdown("### upload your csv file to generate synthetic data")
             gr.Markdown(
                 f"""
-            **file requirements & limits:**
-            - format: csv with header row
-            - size: optimized for hugging face spaces (2 vcpu, 16gb ram)
-            - **this app will automatically trim to the first {MAX_ROWS:,} rows and first {MAX_COLS} columns.**
             """
             )
-            file_upload = gr.File(label="upload csv file", file_types=[".csv"], file_count="single")
-            uploaded_data = gr.Dataframe(label="uploaded (trimmed) data", interactive=False)
-            memory_info = gr.Markdown(label="memory usage info", visible=False)
             with gr.Row():
                 with gr.Column(scale=1):
                     model_name = gr.Textbox(
-                        value="my synthetic model",
-                        label="generator name",
-                        placeholder="enter a name for your generator",
-                        info="appears in training runs and saved generators."
                     )
                     epochs = gr.Slider(
-                        1, MAX_EPOCHS, value=MAX_EPOCHS, step=1, label=f"training epochs (≤ {MAX_EPOCHS})",
-                        info=f"maximum number of passes over the training data. capped at {MAX_EPOCHS}."
                     )
                     max_training_time = gr.Slider(
                         1, MAX_TRAINING_MINUTES, value=MAX_TRAINING_MINUTES, step=1,
-                        label=f"maximum training time (minutes, ≤ {MAX_TRAINING_MINUTES})",
-                        info=f"upper bound in minutes; training stops if exceeded. capped at {MAX_TRAINING_MINUTES}."
                     )
                     batch_size = gr.Slider(
-                        8, 1024, value=32, step=8, label="batch size",
-                        info="number of rows per optimization step. larger can speed up but needs more memory."
                     )
                     value_protection = gr.Checkbox(
-                        label="value protection",
-                        info="adds protections to reduce memorization of unique or sensitive values.",
                         value=False
                     )
                     rare_category_protection = gr.Checkbox(
-                        label="rare category protection",
-                        info="prevents overfitting to infrequent categories to improve privacy and robustness.",
                         value=False
                     )
                 with gr.Column(scale=1):
                     flexible_generation = gr.Checkbox(
-                        label="flexible generation",
-                        info="allows generation when inputs slightly differ from training schema.",
                         value=True
                     )
                     model_size = gr.Dropdown(
                         choices=["SMALL", "MEDIUM", "LARGE"],
                         value="MEDIUM",
-                        label="model size",
-                        info="sets model capacity. larger can improve fidelity but uses more compute."
                     )
                     target_accuracy = gr.Slider(
-                        0.50, 0.999, value=0.95, step=0.001, label="target accuracy",
-                        info="stop early when validation accuracy reaches this threshold."
                     )
                     validation_split = gr.Slider(
-                        0.05, 0.5, value=0.2, step=0.01, label="validation split",
-                        info="fraction of the dataset held out for validation during training."
                     )
                     early_stopping_patience = gr.Slider(
-                        0, 50, value=10, step=1, label="early stopping patience (epochs)",
-                        info="stop if no validation improvement after this many epochs."
                     )
                 with gr.Column(scale=1):
                     learning_rate = gr.Number(
-                        value=0.001, precision=6, label="learning rate",
-                        info="step size for the optimizer. typical range: 1e-4 to 1e-2."
                     )
                     dropout_rate = gr.Slider(
-                        0.0, 0.6, value=0.1, step=0.01, label="dropout rate",
-                        info="regularization to reduce overfitting by randomly dropping units."
                     )
                     weight_decay = gr.Number(
-                        value=0.0001, precision=6, label="weight decay",
-                        info="l2 regularization strength applied to model weights."
                     )
-                    train_btn = gr.Button("train model", variant="primary")
-                    train_status = gr.Textbox(label="training status", interactive=False)
-        with gr.Tab("generate data"):
-            gr.Markdown("### generate synthetic data from your trained model")
             with gr.Row():
                 with gr.Column():
-                    gen_size = gr.Slider(10, 1000, value=100, step=10, label="number of records to generate",
-                                         info="how many synthetic rows to create in the table.")
-                    generate_btn = gr.Button("generate synthetic data", variant="primary")
                 with gr.Column():
-                    gen_status = gr.Textbox(label="generation status", interactive=False)
-            synthetic_data = gr.Dataframe(label="synthetic data", interactive=False)
             with gr.Row():
-                csv_download_btn = gr.Button("download csv", variant="secondary")
                 with gr.Group(visible=False) as csv_group:
-                    csv_file = gr.File(label="synthetic csv", interactive=False)
-                comparison_plot = gr.Plot(label="data comparison")
         init_btn.click(initialize_sdk, outputs=[init_status])
@@ -383,7 +385,7 @@ def create_interface():
         def process_uploaded_file(file):
             if file is None:
-                return None, "no file uploaded.", gr.update(visible=False)
             try:
                 df = pd.read_csv(file.name)
                 original_shape = df.shape
@@ -393,12 +395,15 @@ def create_interface():
                     df = df.iloc[:MAX_ROWS].copy()
                 trimmed_note = ""
                 if df.shape != original_shape:
-                    trimmed_note = f" (trimmed to {df.shape[0]:,} rows × {df.shape[1]} columns from {original_shape[0]:,} × {original_shape[1]})"
-                success_msg = f"file uploaded successfully.{trimmed_note}"
                 mem_info = generator.estimate_memory_usage(df)
                 return df, success_msg, gr.update(value=mem_info, visible=True)
             except Exception as e:
-                return None, f"error reading file: {str(e)}", gr.update(visible=False)
         file_upload.change(process_uploaded_file, inputs=[file_upload], outputs=[uploaded_data, train_status, memory_info])
@@ -406,5 +411,5 @@ def create_interface():
 if __name__ == "__main__":
-    demo = create_interface()
-    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

     def initialize_mostly_ai(self) -> Tuple[bool, str]:
         if not MOSTLY_AI_AVAILABLE:
+            return False, "Mostly AI SDK not installed. Please install with: pip install mostlyai[local]."
         try:
             self.mostly = MostlyAI(local=True, local_port=8080)
+            return True, "Mostly AI SDK initialized successfully."
         except Exception as e:
+            return False, f"Failed to initialize Mostly AI SDK: {str(e)}"
     def train_generator(
         self,
         weight_decay: float = 0.0001,
     ) -> Tuple[bool, str]:
         if not self.mostly:
+            return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
         try:
             self.original_data = data
             train_config = {
                 ]
             }
             self.generator = self.mostly.train(config=train_config)
+            return True, f"Training completed successfully. Model name: {name}"
         except Exception as e:
+            return False, f"Training failed with error: {str(e)}"
     def generate_synthetic_data(self, size: int) -> Tuple[Optional[pd.DataFrame], str]:
         if not self.generator:
+            return None, "No trained generator available. Please train a model first."
         try:
             synthetic_data = self.mostly.generate(self.generator, size=size)
             df = synthetic_data.data()
+            return df, f"Synthetic data generated successfully. {len(df)} records created."
         except Exception as e:
+            return None, f"Synthetic data generation failed with error: {str(e)}"
     def estimate_memory_usage(self, df: pd.DataFrame) -> str:
         if df is None or df.empty:
+            return "No data available to analyze."
         memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
         rows, cols = len(df), len(df.columns)
         estimated_training_mb = memory_mb * 4
+        status = "Good" if memory_mb < 100 else ("Large" if memory_mb < 500 else "Very Large")
         return f"""
+**Memory Usage Estimate**
+- Data size: {memory_mb:.1f} MB
+- Estimated training memory: {estimated_training_mb:.1f} MB
+- Status: {status}
+- Rows: {rows:,} | Columns: {cols}
         """.strip()
 def initialize_sdk() -> str:
     ok, msg = generator.initialize_mostly_ai()
+    return ("Success: " if ok else "Error: ") + msg
 def train_model(
     weight_decay: float,
 ) -> str:
     if data is None or data.empty:
+        return "Error: No data provided. Please upload or create sample data first."
     # enforce backend caps regardless of ui inputs
     epochs = min(int(epochs), MAX_EPOCHS)
         dropout_rate=dropout_rate,
         weight_decay=weight_decay,
     )
+    return ("Success: " if ok else "Error: ") + msg
 def generate_data(size: int) -> Tuple[Optional[pd.DataFrame], str]:
     synth_df, message = generator.generate_synthetic_data(size)
     if synth_df is not None:
         _last_synth_df = synth_df.copy()
+        return synth_df, f"Success: {message}"
     else:
+        return None, f"Error: {message}"
 def download_csv_prepare() -> Optional[str]:
+    """Return a path to the latest synthetic CSV; used as output to gr.File."""
     global _last_synth_df
     if _last_synth_df is None or _last_synth_df.empty:
         return None
     for i, col in enumerate(numeric_cols[: n_rows * n_cols]):
         row = i // n_cols + 1
         col_idx = i % n_cols + 1
+        fig.add_trace(go.Histogram(x=original_df[col], name=f"Original {col}", opacity=0.7, nbinsx=20), row=row, col=col_idx)
+        fig.add_trace(go.Histogram(x=synthetic_df[col], name=f"Synthetic {col}", opacity=0.7, nbinsx=20), row=row, col=col_idx)
+    fig.update_layout(title="Original vs. Synthetic Data Comparison", height=300 * n_rows, showlegend=True)
     return fig
 def create_interface():
+    with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
         gr.Image(
             value="https://img.mailinblue.com/8225865/images/content_library/original/6880d164e4e4ea1a183ad4c0.png",
             show_label=False,
         A Python toolkit for generating high-fidelity, privacy-safe synthetic data. This is a limited demo space intended to showcase the features of the [Synthetic Data SDK](https://github.com/mostly-ai/mostlyai).
+        **Demo Space Limitations:** Datasets are supported up to **{MAX_ROWS:,} rows** and **{MAX_COLS} columns**.
         Training is supported up to **≤ {MAX_EPOCHS} epochs** and **≤ {MAX_TRAINING_MINUTES} minutes**.
+        """
         )
+        with gr.Tab("Quick Start"):
+            gr.Markdown("### Initialize the SDK and Upload Your Data")
             with gr.Row():
                 with gr.Column():
+                    init_btn = gr.Button("Initialize Mostly AI SDK", variant="primary")
+                    init_status = gr.Textbox(label="Initialization Status", interactive=False)
                 with gr.Column():
                     gr.Markdown(
                         """
+                    **Next Steps**
+                    1. Initialize the SDK.
+                    2. Go to the “Upload Data and Train Model” tab to upload your CSV file.
+                    3. Train a model on your data.
+                    4. Generate synthetic data.
                     """
                     )
+        with gr.Tab("Upload Data and Train Model"):
+            gr.Markdown("### Upload Your CSV File to Generate Synthetic Data")
             gr.Markdown(
                 f"""
+            **File Requirements & Limits**
+            - Format: CSV with a header row.
+            - Size: Optimized for Hugging Face Spaces (2 vCPU, 16 GB RAM).
+            - This app will automatically trim to the first **{MAX_ROWS:,}** rows and first **{MAX_COLS}** columns.
             """
             )
+            file_upload = gr.File(label="Upload CSV File", file_types=[".csv"], file_count="single")
+            uploaded_data = gr.Dataframe(label="Uploaded (Trimmed) Data", interactive=False)
+            memory_info = gr.Markdown(label="Memory Usage Info", visible=False)
             with gr.Row():
                 with gr.Column(scale=1):
                     model_name = gr.Textbox(
+                        value="My Synthetic Model",
+                        label="Generator Name",
+                        placeholder="Enter a name for your generator.",
+                        info="Appears in training runs and saved generators."
                     )
                     epochs = gr.Slider(
+                        1, MAX_EPOCHS, value=MAX_EPOCHS, step=1, label=f"Training Epochs (≤ {MAX_EPOCHS})",
+                        info=f"Maximum number of passes over the training data. Capped at {MAX_EPOCHS}."
                     )
                     max_training_time = gr.Slider(
                         1, MAX_TRAINING_MINUTES, value=MAX_TRAINING_MINUTES, step=1,
+                        label=f"Maximum Training Time (minutes, ≤ {MAX_TRAINING_MINUTES})",
+                        info=f"Upper bound in minutes; training stops if exceeded. Capped at {MAX_TRAINING_MINUTES}."
                     )
                     batch_size = gr.Slider(
+                        8, 1024, value=32, step=8, label="Batch Size",
+                        info="Number of rows per optimization step. Larger can speed up but requires more memory."
                     )
                     value_protection = gr.Checkbox(
+                        label="Value Protection",
+                        info="Adds protections to reduce memorization of unique or sensitive values.",
                         value=False
                     )
                     rare_category_protection = gr.Checkbox(
+                        label="Rare Category Protection",
+                        info="Prevents overfitting to infrequent categories to improve privacy and robustness.",
                         value=False
                     )
                 with gr.Column(scale=1):
                     flexible_generation = gr.Checkbox(
+                        label="Flexible Generation",
+                        info="Allows generation when inputs slightly differ from the training schema.",
                         value=True
                     )
                     model_size = gr.Dropdown(
                         choices=["SMALL", "MEDIUM", "LARGE"],
                         value="MEDIUM",
+                        label="Model Size",
+                        info="Sets model capacity. Larger can improve fidelity but uses more compute."
                     )
                     target_accuracy = gr.Slider(
+                        0.50, 0.999, value=0.95, step=0.001, label="Target Accuracy",
+                        info="Stop early when validation accuracy reaches this threshold."
                     )
                     validation_split = gr.Slider(
+                        0.05, 0.5, value=0.2, step=0.01, label="Validation Split",
+                        info="Fraction of the dataset held out for validation during training."
                     )
                     early_stopping_patience = gr.Slider(
+                        0, 50, value=10, step=1, label="Early Stopping Patience (epochs)",
+                        info="Stop if no validation improvement after this many epochs."
                     )
                 with gr.Column(scale=1):
                     learning_rate = gr.Number(
+                        value=0.001, precision=6, label="Learning Rate",
+                        info="Step size for the optimizer. Typical range: 1e-4 to 1e-2."
                     )
                     dropout_rate = gr.Slider(
+                        0.0, 0.6, value=0.1, step=0.01, label="Dropout Rate",
+                        info="Regularization to reduce overfitting by randomly dropping units."
                     )
                     weight_decay = gr.Number(
+                        value=0.0001, precision=6, label="Weight Decay",
+                        info="L2 regularization strength applied to model weights."
                     )
+                    train_btn = gr.Button("Train Model", variant="primary")
+                    train_status = gr.Textbox(label="Training Status", interactive=False)
+        with gr.Tab("Generate Data"):
+            gr.Markdown("### Generate Synthetic Data From Your Trained Model")
             with gr.Row():
                 with gr.Column():
+                    gen_size = gr.Slider(
+                        10, 1000, value=100, step=10, label="Number of Records to Generate",
+                        info="How many synthetic rows to create in the table."
+                    )
+                    generate_btn = gr.Button("Generate Synthetic Data", variant="primary")
                 with gr.Column():
+                    gen_status = gr.Textbox(label="Generation Status", interactive=False)
+            synthetic_data = gr.Dataframe(label="Synthetic Data", interactive=False)
             with gr.Row():
+                csv_download_btn = gr.Button("Download CSV", variant="secondary")
                 with gr.Group(visible=False) as csv_group:
+                    csv_file = gr.File(label="Synthetic CSV", interactive=False)
+                comparison_plot = gr.Plot(label="Data Comparison")
         init_btn.click(initialize_sdk, outputs=[init_status])
         def process_uploaded_file(file):
             if file is None:
+                return None, "No file uploaded.", gr.update(visible=False)
             try:
                 df = pd.read_csv(file.name)
                 original_shape = df.shape
                     df = df.iloc[:MAX_ROWS].copy()
                 trimmed_note = ""
                 if df.shape != original_shape:
+                    trimmed_note = (
+                        f" (trimmed to {df.shape[0]:,} rows × {df.shape[1]} columns "
+                        f"from {original_shape[0]:,} × {original_shape[1]})"
+                    )
+                success_msg = f"File uploaded successfully.{trimmed_note}"
                 mem_info = generator.estimate_memory_usage(df)
                 return df, success_msg, gr.update(value=mem_info, visible=True)
             except Exception as e:
+                return None, f"Error reading file: {str(e)}", gr.update(visible=False)
         file_upload.change(process_uploaded_file, inputs=[file_upload], outputs=[uploaded_data, train_status, memory_info])
 if __name__ == "__main__":
+    sdk_demo = create_interface()
+    sdk_demo.launch(server_name="0.0.0.0", server_port=7860, share=True)