Spaces:

mostlyai
/

synthetic-sdk-demo

Sleeping

App Files Community

ZennyKenny committed on Sep 15

Commit

5362aed

verified ·

1 Parent(s): 5426d51

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -48

app.py CHANGED Viewed

@@ -31,7 +31,7 @@ class SyntheticDataGenerator:
         try:
             self.mostly = MostlyAI(local=True, local_port=8080)
-            return True, "Mostly AI SDK initialized successfully!"
         except Exception as e:
             return False, f"Failed to initialize Mostly AI SDK: {str(e)}"
@@ -39,7 +39,7 @@ class SyntheticDataGenerator:
     def train_generator(self, data: pd.DataFrame, name: str, epochs: int = 10, max_training_time: int = 60, batch_size: int = 32, value_protection: bool = True) -> Tuple[bool, str]:
         """Train the synthetic data generator"""
         if not self.mostly:
-            return False, "Mostly AI SDK not initialized"
         try:
             self.original_data = data
@@ -62,37 +62,37 @@ class SyntheticDataGenerator:
             self.generator = self.mostly.train(
                 config = train_config
             )
-            return True, f"Generator trained successfully! Model: {name}"
         except Exception as e:
-            return False, f"Training failed: {str(e)}"
     def generate_synthetic_data(self, size: int) -> Tuple[pd.DataFrame, str]:
         """Generate synthetic data"""
         if not self.generator:
-            return None, "No trained generator available"
         try:
             synthetic_data = self.mostly.generate(self.generator, size=size)
             df = synthetic_data.data()
-            return df, f"Generated {len(df)} synthetic records successfully!"
         except Exception as e:
-            return None, f"Generation failed: {str(e)}"
     def get_quality_report(self) -> str:
         """Get quality assurance report"""
         if not self.generator:
-            return "No trained generator available"
         try:
             report = self.generator.reports(display=False)
             return str(report)
         except Exception as e:
-            return f"Failed to generate report: {str(e)}"
     def estimate_memory_usage(self, df: pd.DataFrame) -> str:
         """Estimate memory usage for the dataset"""
         if df is None or df.empty:
-            return "No data to analyze"
         # Calculate approximate memory usage
         memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
@@ -101,10 +101,15 @@ class SyntheticDataGenerator:
         # Estimate training memory (roughly 3-5x the data size)
         estimated_training_mb = memory_mb * 4
-        status = "✅ Good" if memory_mb < 100 else "⚠️ Large" if memory_mb < 500 else "❌ Very Large"
         return f"""
-**Memory Usage Estimate:**
 - Data size: {memory_mb:.1f} MB
 - Estimated training memory: {estimated_training_mb:.1f} MB
 - Status: {status}
@@ -118,30 +123,30 @@ generator = SyntheticDataGenerator()
 def initialize_sdk() -> Tuple[str, str]:
     """Initialize the Mostly AI SDK"""
     success, message = generator.initialize_mostly_ai()
-    status = "✅ Success" if success else "❌ Error"
     return status, message
 def train_model(data: pd.DataFrame, model_name: str, epochs: int, max_training_time: int, batch_size: int, value_protection: bool) -> Tuple[str, str]:
     """Train the synthetic data generator"""
     if data is None or data.empty:
-        return "❌ Error", "Please upload or create sample data first"
     success, message = generator.train_generator(data, model_name, epochs, max_training_time, batch_size, value_protection)
-    status = "✅ Success" if success else "❌ Error"
     return status, message
 def generate_data(size: int) -> Tuple[pd.DataFrame, str]:
     """Generate synthetic data"""
     if generator.generator is None:
-        return None, "❌ Please train a model first"
     synthetic_df, message = generator.generate_synthetic_data(size)
     if synthetic_df is not None:
-        status = "✅ Success"
     else:
-        status = "❌ Error"
-    return synthetic_df, f"{status} - {message}"
 def get_quality_report() -> str:
     """Get quality report"""
@@ -214,12 +219,33 @@ def download_csv(df: pd.DataFrame) -> str:
 def create_interface():
     with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
-        # 🎭 MOSTLY AI Synthetic Data Generator
-        Generate high-quality synthetic data using the Mostly AI SDK. Upload your own CSV files to generate synthetic data that preserves the statistical properties of your original dataset.
         """)
-        with gr.Tab("🚀 Quick Start"):
             gr.Markdown("### Initialize the SDK and upload your data")
             with gr.Row():
@@ -236,13 +262,13 @@ def create_interface():
                     4. Generate synthetic data
                     """)
-        with gr.Tab("📊 Upload Data and Train Model"):
             gr.Markdown("### Upload your CSV file to generate synthetic data")
             gr.Markdown("""
-            **📋 File Requirements:**
-            - **Format:** CSV with header row
-            - **Size:** Optimized for Hugging Face Spaces (2 vCPU, 16GB RAM)
             """)
             file_upload = gr.File(
@@ -274,7 +300,7 @@ def create_interface():
             get_report_btn = gr.Button("Get Quality Report", variant="secondary")
-        with gr.Tab("🎲 Generate Data"):
             gr.Markdown("### Generate synthetic data from your trained model")
             with gr.Row():
@@ -331,37 +357,20 @@ def create_interface():
         # Handle file upload with size and column limits
         def process_uploaded_file(file):
             if file is None:
-                return None, "No file uploaded", gr.update(visible=False)
             try:
                 # Read the CSV file
                 df = pd.read_csv(file.name)
-                # # Check column limit (max 20 columns)
-                # if len(df.columns) > 20:
-                #     return None, f"❌ Too many columns! Maximum allowed: 20, found: {len(df.columns)}. Please reduce the number of columns in your CSV file.", gr.update(visible=False)
-                # # Check row limit (max 10,000 records)
-                # if len(df) > 10000:
-                #     return None, f"❌ Too many records! Maximum allowed: 10,000, found: {len(df)}. Please reduce the number of rows in your CSV file.", gr.update(visible=False)
-                # # Check minimum requirements
-                # if len(df) < 1000:
-                #     return None, f"❌ Too few records! Minimum required: 1,000, found: {len(df)}. Please provide more data for training.", gr.update(visible=False)
-                # if len(df.columns) < 2:
-                #     return None, f"❌ Too few columns! Minimum required: 2, found: {len(df.columns)}. Please provide more columns for training.", gr.update(visible=False)
-                # Success message with file info
-                success_msg = f"✅ File uploaded successfully! {len(df)} rows × {len(df.columns)} columns"
-                # Generate memory usage info
                 memory_info = generator.estimate_memory_usage(df)
                 return df, success_msg, gr.update(value=memory_info, visible=True)
             except Exception as e:
-                return None, f"❌ Error reading file: {str(e)}", gr.update(visible=False)
         file_upload.change(
             process_uploaded_file,
@@ -377,4 +386,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=True
-    )

         try:
             self.mostly = MostlyAI(local=True, local_port=8080)
+            return True, "Mostly AI SDK initialized successfully."
         except Exception as e:
             return False, f"Failed to initialize Mostly AI SDK: {str(e)}"
     def train_generator(self, data: pd.DataFrame, name: str, epochs: int = 10, max_training_time: int = 60, batch_size: int = 32, value_protection: bool = True) -> Tuple[bool, str]:
         """Train the synthetic data generator"""
         if not self.mostly:
+            return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
         try:
             self.original_data = data
             self.generator = self.mostly.train(
                 config = train_config
             )
+            return True, f"Training completed successfully. Model name: {name}"
         except Exception as e:
+            return False, f"Training failed with error: {str(e)}"
     def generate_synthetic_data(self, size: int) -> Tuple[pd.DataFrame, str]:
         """Generate synthetic data"""
         if not self.generator:
+            return None, "No trained generator available. Please train a model first."
         try:
             synthetic_data = self.mostly.generate(self.generator, size=size)
             df = synthetic_data.data()
+            return df, f"Synthetic data generated successfully. {len(df)} records created."
         except Exception as e:
+            return None, f"Synthetic data generation failed with error: {str(e)}"
     def get_quality_report(self) -> str:
         """Get quality assurance report"""
         if not self.generator:
+            return "No trained generator available. Please train a model first."
         try:
             report = self.generator.reports(display=False)
             return str(report)
         except Exception as e:
+            return f"Failed to generate quality report: {str(e)}"
     def estimate_memory_usage(self, df: pd.DataFrame) -> str:
         """Estimate memory usage for the dataset"""
         if df is None or df.empty:
+            return "No data available to analyze."
         # Calculate approximate memory usage
         memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
         # Estimate training memory (roughly 3-5x the data size)
         estimated_training_mb = memory_mb * 4
+        if memory_mb < 100:
+            status = "Good"
+        elif memory_mb < 500:
+            status = "Large"
+        else:
+            status = "Very Large"
         return f"""
+Memory Usage Estimate:
 - Data size: {memory_mb:.1f} MB
 - Estimated training memory: {estimated_training_mb:.1f} MB
 - Status: {status}
 def initialize_sdk() -> Tuple[str, str]:
     """Initialize the Mostly AI SDK"""
     success, message = generator.initialize_mostly_ai()
+    status = "Success" if success else "Error"
     return status, message
 def train_model(data: pd.DataFrame, model_name: str, epochs: int, max_training_time: int, batch_size: int, value_protection: bool) -> Tuple[str, str]:
     """Train the synthetic data generator"""
     if data is None or data.empty:
+        return "Error", "No data provided. Please upload or create sample data first."
     success, message = generator.train_generator(data, model_name, epochs, max_training_time, batch_size, value_protection)
+    status = "Success" if success else "Error"
     return status, message
 def generate_data(size: int) -> Tuple[pd.DataFrame, str]:
     """Generate synthetic data"""
     if generator.generator is None:
+        return None, "Error: No trained model available. Please train a model first."
     synthetic_df, message = generator.generate_synthetic_data(size)
     if synthetic_df is not None:
+        status = "Success"
     else:
+        status = "Error"
+    return synthetic_df, f"{status}: {message}"
 def get_quality_report() -> str:
     """Get quality report"""
 def create_interface():
     with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
+        # MOSTLY AI Synthetic Data Generator
+        [Documentation](https://mostly-ai.github.io/mostlyai/) | [Technical White Paper](https://arxiv.org/abs/2508.00718) | [Usage Examples](https://mostly-ai.github.io/mostlyai/usage/) | [Free Cloud Service](https://app.mostly.ai/)
+        A Python toolkit for generating high-fidelity, privacy-safe synthetic data.
+        **Modes of operation:**
+        - **LOCAL mode** trains and generates synthetic data on your own compute resources.
+        - **CLIENT mode** connects to a remote MOSTLY AI platform for training and generation.
+        - Generators trained locally can be imported to the platform for sharing and collaboration.
+        **Key resources managed by the SDK:**
+        - **Generators**: Train on your tabular or language data assets.
+        - **Synthetic datasets**: Generate any number of synthetic samples as needed.
+        - **Connectors**: Connect to organizational data sources for reading and writing data.
+        **Common intents and API primitives:**
+        - Train a generator: `g = mostly.train(config)`
+        - Generate records: `sd = mostly.generate(g, config)`
+        - Probe generator: `df = mostly.probe(g, config)`
+        - Connect to data source: `c = mostly.connect(config)`
         """)
+        # display image above tabs
+        gr.Image(value="https://img.mailinblue.com/8225865/images/content_library/original/6880d164e4e4ea1a183ad4c0.png", show_label=False, elem_id="header-image")
+        with gr.Tab("Quick Start"):
             gr.Markdown("### Initialize the SDK and upload your data")
             with gr.Row():
                     4. Generate synthetic data
                     """)
+        with gr.Tab("Upload Data and Train Model"):
             gr.Markdown("### Upload your CSV file to generate synthetic data")
             gr.Markdown("""
+            **File Requirements:**
+            - Format: CSV with header row
+            - Size: Optimized for Hugging Face Spaces (2 vCPU, 16GB RAM)
             """)
             file_upload = gr.File(
             get_report_btn = gr.Button("Get Quality Report", variant="secondary")
+        with gr.Tab("Generate Data"):
             gr.Markdown("### Generate synthetic data from your trained model")
             with gr.Row():
         # Handle file upload with size and column limits
         def process_uploaded_file(file):
             if file is None:
+                return None, "No file uploaded.", gr.update(visible=False)
             try:
                 # Read the CSV file
                 df = pd.read_csv(file.name)
+                success_msg = f"File uploaded successfully. {len(df)} rows × {len(df.columns)} columns"
                 memory_info = generator.estimate_memory_usage(df)
                 return df, success_msg, gr.update(value=memory_info, visible=True)
             except Exception as e:
+                return None, f"Error reading file: {str(e)}", gr.update(visible=False)
         file_upload.change(
             process_uploaded_file,
         server_name="0.0.0.0",
         server_port=7860,
         share=True
+    )