baiganinn committed on
Commit 3fdaf15 · 1 Parent(s): a9b3cfe

fix: bugs in workflow

Files changed (2)
  1. app.py +43 -11
  2. mapping.py +51 -2
app.py CHANGED
@@ -43,8 +43,24 @@ def predict_exoplanets(uploaded_file):
     if uploaded_file is None:
         return None, "Error: Please upload a CSV file", None
 
-    # Read uploaded file
-    df_uploaded = pd.read_csv(uploaded_file.name, comment='#')
+    # Read uploaded file with robust error handling
+    try:
+        df_uploaded = pd.read_csv(uploaded_file.name, comment='#', low_memory=False)
+    except Exception as e:
+        try:
+            # Try with different encoding
+            df_uploaded = pd.read_csv(uploaded_file.name, comment='#', encoding='latin1', low_memory=False)
+        except Exception as e2:
+            return None, f"Error reading CSV file: {str(e)}\nAlternative attempt: {str(e2)}", None
+
+    # Ensure all columns are properly formatted (no multi-dimensional data)
+    for col in df_uploaded.columns:
+        # Check if column contains lists or arrays
+        if df_uploaded[col].dtype == 'object':
+            first_val = df_uploaded[col].dropna().iloc[0] if len(df_uploaded[col].dropna()) > 0 else None
+            if isinstance(first_val, (list, tuple)):
+                # Flatten lists/tuples - take first element
+                df_uploaded[col] = df_uploaded[col].apply(lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) > 0 else x)
 
     info_msg = f"Loaded rows: {len(df_uploaded)}\n"
     info_msg += f"Columns in uploaded dataset: {len(df_uploaded.columns)}\n\n"
@@ -72,17 +88,29 @@ def predict_exoplanets(uploaded_file):
     # Prepare X with correct columns
     info_msg += f"Creating DataFrame with {len(expected_features)} columns...\n"
 
-    # Create empty DataFrame with correct columns
-    X = pd.DataFrame(index=df_mapped.index, columns=expected_features)
+    # Create empty DataFrame with correct columns filled with zeros
+    X = pd.DataFrame(0.0, index=df_mapped.index, columns=expected_features)
 
     # Fill columns that exist in df_mapped
     for col in expected_features:
         if col in df_mapped.columns:
-            X[col] = df_mapped[col].values
-        else:
-            X[col] = 0.0  # Fill missing columns with zeros
-
-    # Convert all columns to numeric format
+            try:
+                # Convert to numeric, handling any data type
+                col_data = pd.to_numeric(df_mapped[col], errors='coerce')
+                # Ensure we have a 1D Series, flatten if needed
+                if hasattr(col_data, 'values'):
+                    col_values = col_data.values
+                    if len(col_values.shape) > 1:
+                        info_msg += f"Warning: Column '{col}' has shape {col_values.shape}, flattening...\n"
+                        col_values = col_values.flatten()[:len(X)]  # Take only first N values
+                    X[col] = col_values
+                else:
+                    X[col] = col_data
+            except Exception as e:
+                info_msg += f"Warning: Could not convert column '{col}': {str(e)}\n"
+                X[col] = 0.0
+
+    # Ensure all columns are numeric
     X = X.apply(pd.to_numeric, errors='coerce')
 
     # Calculate statistics
@@ -94,8 +122,12 @@ def predict_exoplanets(uploaded_file):
 
     info_msg += f"DEBUG: X.shape = {X.shape}, expected: ({len(df_mapped)}, {len(expected_features)})\n"
 
-    # Fill NaN with mean values
-    X = X.fillna(X.mean().fillna(0))
+    # Fill NaN with column means, then with 0 for any remaining NaN
+    X = X.fillna(X.mean())
+    X = X.fillna(0)
+
+    # Ensure no infinite values
+    X = X.replace([float('inf'), float('-inf')], 0)
 
     info_msg += f"DEBUG: After fillna X.shape = {X.shape}\n"
 
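A minimal standalone sketch of the feature-matrix preparation this commit converges on in app.py: a zero-initialized frame, per-column numeric coercion, mean/zero fills, and infinity cleanup. The helper name build_feature_matrix and the example feature names below are hypothetical illustrations, not part of the Space's code:

import numpy as np
import pandas as pd

def build_feature_matrix(df_mapped, expected_features):
    """Assemble model input with exactly the expected columns (hypothetical helper).

    Missing features stay at 0.0; present ones are coerced to numeric, then NaN
    values are filled with column means (0 as a fallback) and infinities are
    zeroed, mirroring the logic added in this commit.
    """
    X = pd.DataFrame(0.0, index=df_mapped.index, columns=expected_features)
    for col in expected_features:
        if col in df_mapped.columns:
            X[col] = pd.to_numeric(df_mapped[col], errors='coerce')
    X = X.fillna(X.mean()).fillna(0)
    return X.replace([np.inf, -np.inf], 0)

# Hypothetical usage with made-up KOI-style feature names:
# X = build_feature_matrix(df_mapped, ["koi_period", "koi_depth", "koi_prad"])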
 
mapping.py CHANGED
@@ -313,6 +313,34 @@ Mapping:"""
 
     if valid_mapping:
         df_mapped = df_mapped.rename(columns=valid_mapping)
+
+    # Ensure all columns are properly flattened and converted to numeric where possible
+    for col in df_mapped.columns:
+        try:
+            # Get the column as a Series
+            col_data = df_mapped[col]
+
+            # Check if it's actually a Series (not a DataFrame)
+            if not isinstance(col_data, pd.Series):
+                continue
+
+            # Check if column has object dtype or might contain complex data
+            if col_data.dtype == 'object':
+                try:
+                    # Try to convert to numeric
+                    df_mapped[col] = pd.to_numeric(col_data, errors='coerce')
+                except:
+                    pass
+
+            # Ensure column is 1D
+            if hasattr(col_data, 'values'):
+                col_values = col_data.values
+                if len(col_values.shape) > 1:
+                    # Flatten multi-dimensional arrays
+                    df_mapped[col] = col_values.flatten()[:len(df_mapped)]
+        except Exception as e:
+            # Skip problematic columns
+            continue
 
     return df_mapped
 
@@ -330,9 +358,30 @@ Mapping:"""
     # Copy the dataframe so the original is not modified
     df_work = uploaded_df.copy()
 
+    # Clean up column names - remove extra spaces, special characters
+    df_work.columns = df_work.columns.str.strip()
+
+    # Handle any multi-dimensional columns before mapping
+    for col in df_work.columns:
+        if df_work[col].dtype == 'object':
+            # Check if column contains complex structures
+            first_val = df_work[col].dropna().iloc[0] if len(df_work[col].dropna()) > 0 else None
+
+            if isinstance(first_val, (list, tuple)):
+                # Flatten lists/tuples - take first element
+                df_work[col] = df_work[col].apply(
+                    lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) > 0 else (x if not isinstance(x, (list, tuple)) else None)
+                )
+            elif isinstance(first_val, str):
+                # Try to convert string representations of numbers
+                try:
+                    df_work[col] = pd.to_numeric(df_work[col], errors='ignore')
+                except:
+                    pass
+
     # Convert coordinates to degrees if they are in text format
     coord_columns = [col for col in df_work.columns if any(
-        keyword in col.lower() for keyword in ['ra', 'dec', 'coord', 'right_ascension', 'declination']
+        keyword in col.lower() for keyword in ['ra', 'dec', 'coord', 'right_ascension', 'declination', 'rastr', 'decstr']
     )]
 
     for col in coord_columns:
@@ -362,7 +411,7 @@ Mapping:"""
     # Check which target columns are missing
     missing_cols = set(target_columns) - set(mapped_df.columns)
    if missing_cols:
-        info_msg += f"\nWarning: Missing {len(missing_cols)} target columns (will be filled with NaN)\n"
+        info_msg += f"\nWarning: Missing {len(missing_cols)} target columns (will be filled with zeros)\n"
 
     return mapped_df, mapping, info_msg
 
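mapping.py now normalizes object-dtype columns both before the header mapping (on df_work) and after renaming (on df_mapped). A minimal sketch of that preprocessing, extracted for illustration: the helper name clean_object_columns is hypothetical, and where the commit uses pd.to_numeric(..., errors='ignore') (deprecated in recent pandas), the sketch uses errors='coerce' with a guard so text columns are not wiped out:

import pandas as pd

def clean_object_columns(df):
    """Normalize object-dtype columns before header mapping (hypothetical helper)."""
    df = df.copy()
    # Strip stray whitespace from column names
    df.columns = df.columns.str.strip()
    for col in df.columns:
        if df[col].dtype == 'object':
            non_null = df[col].dropna()
            first_val = non_null.iloc[0] if len(non_null) > 0 else None
            if isinstance(first_val, (list, tuple)):
                # Reduce list/tuple cells to their first element
                df[col] = df[col].apply(
                    lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) > 0
                    else (x if not isinstance(x, (list, tuple)) else None)
                )
            elif isinstance(first_val, str):
                # Convert only if every non-null value parses as a number,
                # which mimics the errors='ignore' behaviour without the deprecation
                converted = pd.to_numeric(df[col], errors='coerce')
                if converted.notna().sum() == len(non_null):
                    df[col] = converted
    return df

# Hypothetical usage:
# df_clean = clean_object_columns(pd.read_csv("koi_catalog.csv", comment='#'))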