ZennyKenny committed on
Commit
5362aed
·
verified ·
1 Parent(s): 5426d51

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -48
app.py CHANGED
@@ -31,7 +31,7 @@ class SyntheticDataGenerator:
31
 
32
  try:
33
  self.mostly = MostlyAI(local=True, local_port=8080)
34
- return True, "Mostly AI SDK initialized successfully!"
35
  except Exception as e:
36
  return False, f"Failed to initialize Mostly AI SDK: {str(e)}"
37
 
@@ -39,7 +39,7 @@ class SyntheticDataGenerator:
39
  def train_generator(self, data: pd.DataFrame, name: str, epochs: int = 10, max_training_time: int = 60, batch_size: int = 32, value_protection: bool = True) -> Tuple[bool, str]:
40
  """Train the synthetic data generator"""
41
  if not self.mostly:
42
- return False, "Mostly AI SDK not initialized"
43
 
44
  try:
45
  self.original_data = data
@@ -62,37 +62,37 @@ class SyntheticDataGenerator:
62
  self.generator = self.mostly.train(
63
  config = train_config
64
  )
65
- return True, f"Generator trained successfully! Model: {name}"
66
  except Exception as e:
67
- return False, f"Training failed: {str(e)}"
68
 
69
  def generate_synthetic_data(self, size: int) -> Tuple[pd.DataFrame, str]:
70
  """Generate synthetic data"""
71
  if not self.generator:
72
- return None, "No trained generator available"
73
 
74
  try:
75
  synthetic_data = self.mostly.generate(self.generator, size=size)
76
  df = synthetic_data.data()
77
- return df, f"Generated {len(df)} synthetic records successfully!"
78
  except Exception as e:
79
- return None, f"Generation failed: {str(e)}"
80
 
81
  def get_quality_report(self) -> str:
82
  """Get quality assurance report"""
83
  if not self.generator:
84
- return "No trained generator available"
85
 
86
  try:
87
  report = self.generator.reports(display=False)
88
  return str(report)
89
  except Exception as e:
90
- return f"Failed to generate report: {str(e)}"
91
 
92
  def estimate_memory_usage(self, df: pd.DataFrame) -> str:
93
  """Estimate memory usage for the dataset"""
94
  if df is None or df.empty:
95
- return "No data to analyze"
96
 
97
  # Calculate approximate memory usage
98
  memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
@@ -101,10 +101,15 @@ class SyntheticDataGenerator:
101
  # Estimate training memory (roughly 3-5x the data size)
102
  estimated_training_mb = memory_mb * 4
103
 
104
- status = "✅ Good" if memory_mb < 100 else "⚠️ Large" if memory_mb < 500 else "❌ Very Large"
 
 
 
 
 
105
 
106
  return f"""
107
- **Memory Usage Estimate:**
108
  - Data size: {memory_mb:.1f} MB
109
  - Estimated training memory: {estimated_training_mb:.1f} MB
110
  - Status: {status}
@@ -118,30 +123,30 @@ generator = SyntheticDataGenerator()
118
  def initialize_sdk() -> Tuple[str, str]:
119
  """Initialize the Mostly AI SDK"""
120
  success, message = generator.initialize_mostly_ai()
121
- status = "Success" if success else "Error"
122
  return status, message
123
 
124
  def train_model(data: pd.DataFrame, model_name: str, epochs: int, max_training_time: int, batch_size: int, value_protection: bool) -> Tuple[str, str]:
125
  """Train the synthetic data generator"""
126
  if data is None or data.empty:
127
- return "Error", "Please upload or create sample data first"
128
 
129
  success, message = generator.train_generator(data, model_name, epochs, max_training_time, batch_size, value_protection)
130
- status = "Success" if success else "Error"
131
  return status, message
132
 
133
  def generate_data(size: int) -> Tuple[pd.DataFrame, str]:
134
  """Generate synthetic data"""
135
  if generator.generator is None:
136
- return None, " Please train a model first"
137
 
138
  synthetic_df, message = generator.generate_synthetic_data(size)
139
  if synthetic_df is not None:
140
- status = "Success"
141
  else:
142
- status = "Error"
143
 
144
- return synthetic_df, f"{status} - {message}"
145
 
146
  def get_quality_report() -> str:
147
  """Get quality report"""
@@ -214,12 +219,33 @@ def download_csv(df: pd.DataFrame) -> str:
214
  def create_interface():
215
  with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
216
  gr.Markdown("""
217
- # 🎭 MOSTLY AI Synthetic Data Generator
 
 
 
 
 
 
 
 
 
218
 
219
- Generate high-quality synthetic data using the Mostly AI SDK. Upload your own CSV files to generate synthetic data that preserves the statistical properties of your original dataset.
 
 
 
 
 
 
 
 
 
220
  """)
221
 
222
- with gr.Tab("🚀 Quick Start"):
 
 
 
223
  gr.Markdown("### Initialize the SDK and upload your data")
224
 
225
  with gr.Row():
@@ -236,13 +262,13 @@ def create_interface():
236
  4. Generate synthetic data
237
  """)
238
 
239
- with gr.Tab("📊 Upload Data and Train Model"):
240
  gr.Markdown("### Upload your CSV file to generate synthetic data")
241
 
242
  gr.Markdown("""
243
- **📋 File Requirements:**
244
- - **Format:** CSV with header row
245
- - **Size:** Optimized for Hugging Face Spaces (2 vCPU, 16GB RAM)
246
  """)
247
 
248
  file_upload = gr.File(
@@ -274,7 +300,7 @@ def create_interface():
274
 
275
  get_report_btn = gr.Button("Get Quality Report", variant="secondary")
276
 
277
- with gr.Tab("🎲 Generate Data"):
278
  gr.Markdown("### Generate synthetic data from your trained model")
279
 
280
  with gr.Row():
@@ -331,37 +357,20 @@ def create_interface():
331
  # Handle file upload with size and column limits
332
  def process_uploaded_file(file):
333
  if file is None:
334
- return None, "No file uploaded", gr.update(visible=False)
335
 
336
  try:
337
  # Read the CSV file
338
  df = pd.read_csv(file.name)
339
 
340
- # # Check column limit (max 20 columns)
341
- # if len(df.columns) > 20:
342
- # return None, f"❌ Too many columns! Maximum allowed: 20, found: {len(df.columns)}. Please reduce the number of columns in your CSV file.", gr.update(visible=False)
343
 
344
- # # Check row limit (max 10,000 records)
345
- # if len(df) > 10000:
346
- # return None, f"❌ Too many records! Maximum allowed: 10,000, found: {len(df)}. Please reduce the number of rows in your CSV file.", gr.update(visible=False)
347
-
348
- # # Check minimum requirements
349
- # if len(df) < 1000:
350
- # return None, f"❌ Too few records! Minimum required: 1,000, found: {len(df)}. Please provide more data for training.", gr.update(visible=False)
351
-
352
- # if len(df.columns) < 2:
353
- # return None, f"❌ Too few columns! Minimum required: 2, found: {len(df.columns)}. Please provide more columns for training.", gr.update(visible=False)
354
-
355
- # Success message with file info
356
- success_msg = f"✅ File uploaded successfully! {len(df)} rows × {len(df.columns)} columns"
357
-
358
- # Generate memory usage info
359
  memory_info = generator.estimate_memory_usage(df)
360
 
361
  return df, success_msg, gr.update(value=memory_info, visible=True)
362
 
363
  except Exception as e:
364
- return None, f"Error reading file: {str(e)}", gr.update(visible=False)
365
 
366
  file_upload.change(
367
  process_uploaded_file,
@@ -377,4 +386,4 @@ if __name__ == "__main__":
377
  server_name="0.0.0.0",
378
  server_port=7860,
379
  share=True
380
- )
 
31
 
32
  try:
33
  self.mostly = MostlyAI(local=True, local_port=8080)
34
+ return True, "Mostly AI SDK initialized successfully."
35
  except Exception as e:
36
  return False, f"Failed to initialize Mostly AI SDK: {str(e)}"
37
 
 
39
  def train_generator(self, data: pd.DataFrame, name: str, epochs: int = 10, max_training_time: int = 60, batch_size: int = 32, value_protection: bool = True) -> Tuple[bool, str]:
40
  """Train the synthetic data generator"""
41
  if not self.mostly:
42
+ return False, "Mostly AI SDK not initialized. Please initialize the SDK first."
43
 
44
  try:
45
  self.original_data = data
 
62
  self.generator = self.mostly.train(
63
  config = train_config
64
  )
65
+ return True, f"Training completed successfully. Model name: {name}"
66
  except Exception as e:
67
+ return False, f"Training failed with error: {str(e)}"
68
 
69
  def generate_synthetic_data(self, size: int) -> Tuple[pd.DataFrame, str]:
70
  """Generate synthetic data"""
71
  if not self.generator:
72
+ return None, "No trained generator available. Please train a model first."
73
 
74
  try:
75
  synthetic_data = self.mostly.generate(self.generator, size=size)
76
  df = synthetic_data.data()
77
+ return df, f"Synthetic data generated successfully. {len(df)} records created."
78
  except Exception as e:
79
+ return None, f"Synthetic data generation failed with error: {str(e)}"
80
 
81
  def get_quality_report(self) -> str:
82
  """Get quality assurance report"""
83
  if not self.generator:
84
+ return "No trained generator available. Please train a model first."
85
 
86
  try:
87
  report = self.generator.reports(display=False)
88
  return str(report)
89
  except Exception as e:
90
+ return f"Failed to generate quality report: {str(e)}"
91
 
92
  def estimate_memory_usage(self, df: pd.DataFrame) -> str:
93
  """Estimate memory usage for the dataset"""
94
  if df is None or df.empty:
95
+ return "No data available to analyze."
96
 
97
  # Calculate approximate memory usage
98
  memory_mb = df.memory_usage(deep=True).sum() / (1024 * 1024)
 
101
  # Estimate training memory (roughly 3-5x the data size)
102
  estimated_training_mb = memory_mb * 4
103
 
104
+ if memory_mb < 100:
105
+ status = "Good"
106
+ elif memory_mb < 500:
107
+ status = "Large"
108
+ else:
109
+ status = "Very Large"
110
 
111
  return f"""
112
+ Memory Usage Estimate:
113
  - Data size: {memory_mb:.1f} MB
114
  - Estimated training memory: {estimated_training_mb:.1f} MB
115
  - Status: {status}
 
123
  def initialize_sdk() -> Tuple[str, str]:
124
  """Initialize the Mostly AI SDK"""
125
  success, message = generator.initialize_mostly_ai()
126
+ status = "Success" if success else "Error"
127
  return status, message
128
 
129
  def train_model(data: pd.DataFrame, model_name: str, epochs: int, max_training_time: int, batch_size: int, value_protection: bool) -> Tuple[str, str]:
130
  """Train the synthetic data generator"""
131
  if data is None or data.empty:
132
+ return "Error", "No data provided. Please upload or create sample data first."
133
 
134
  success, message = generator.train_generator(data, model_name, epochs, max_training_time, batch_size, value_protection)
135
+ status = "Success" if success else "Error"
136
  return status, message
137
 
138
  def generate_data(size: int) -> Tuple[pd.DataFrame, str]:
139
  """Generate synthetic data"""
140
  if generator.generator is None:
141
+ return None, "Error: No trained model available. Please train a model first."
142
 
143
  synthetic_df, message = generator.generate_synthetic_data(size)
144
  if synthetic_df is not None:
145
+ status = "Success"
146
  else:
147
+ status = "Error"
148
 
149
+ return synthetic_df, f"{status}: {message}"
150
 
151
  def get_quality_report() -> str:
152
  """Get quality report"""
 
219
  def create_interface():
220
  with gr.Blocks(title="MOSTLY AI Synthetic Data Generator", theme=gr.themes.Soft()) as demo:
221
  gr.Markdown("""
222
+ # MOSTLY AI Synthetic Data Generator
223
+
224
+ [Documentation](https://mostly-ai.github.io/mostlyai/) | [Technical White Paper](https://arxiv.org/abs/2508.00718) | [Usage Examples](https://mostly-ai.github.io/mostlyai/usage/) | [Free Cloud Service](https://app.mostly.ai/)
225
+
226
+ A Python toolkit for generating high-fidelity, privacy-safe synthetic data.
227
+
228
+ **Modes of operation:**
229
+ - **LOCAL mode** trains and generates synthetic data on your own compute resources.
230
+ - **CLIENT mode** connects to a remote MOSTLY AI platform for training and generation.
231
+ - Generators trained locally can be imported to the platform for sharing and collaboration.
232
 
233
+ **Key resources managed by the SDK:**
234
+ - **Generators**: Train on your tabular or language data assets.
235
+ - **Synthetic datasets**: Generate any number of synthetic samples as needed.
236
+ - **Connectors**: Connect to organizational data sources for reading and writing data.
237
+
238
+ **Common intents and API primitives:**
239
+ - Train a generator: `g = mostly.train(config)`
240
+ - Generate records: `sd = mostly.generate(g, config)`
241
+ - Probe generator: `df = mostly.probe(g, config)`
242
+ - Connect to data source: `c = mostly.connect(config)`
243
  """)
244
 
245
+ # display image above tabs
246
+ gr.Image(value="https://img.mailinblue.com/8225865/images/content_library/original/6880d164e4e4ea1a183ad4c0.png", show_label=False, elem_id="header-image")
247
+
248
+ with gr.Tab("Quick Start"):
249
  gr.Markdown("### Initialize the SDK and upload your data")
250
 
251
  with gr.Row():
 
262
  4. Generate synthetic data
263
  """)
264
 
265
+ with gr.Tab("Upload Data and Train Model"):
266
  gr.Markdown("### Upload your CSV file to generate synthetic data")
267
 
268
  gr.Markdown("""
269
+ **File Requirements:**
270
+ - Format: CSV with header row
271
+ - Size: Optimized for Hugging Face Spaces (2 vCPU, 16GB RAM)
272
  """)
273
 
274
  file_upload = gr.File(
 
300
 
301
  get_report_btn = gr.Button("Get Quality Report", variant="secondary")
302
 
303
+ with gr.Tab("Generate Data"):
304
  gr.Markdown("### Generate synthetic data from your trained model")
305
 
306
  with gr.Row():
 
357
  # Handle file upload with size and column limits
358
  def process_uploaded_file(file):
359
  if file is None:
360
+ return None, "No file uploaded.", gr.update(visible=False)
361
 
362
  try:
363
  # Read the CSV file
364
  df = pd.read_csv(file.name)
365
 
366
+ success_msg = f"File uploaded successfully. {len(df)} rows × {len(df.columns)} columns"
 
 
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  memory_info = generator.estimate_memory_usage(df)
369
 
370
  return df, success_msg, gr.update(value=memory_info, visible=True)
371
 
372
  except Exception as e:
373
+ return None, f"Error reading file: {str(e)}", gr.update(visible=False)
374
 
375
  file_upload.change(
376
  process_uploaded_file,
 
386
  server_name="0.0.0.0",
387
  server_port=7860,
388
  share=True
389
+ )