Spaces:
Sleeping
Sleeping
fix: bugs in workflow
Browse files
- app.py +43 -11
- mapping.py +51 -2
app.py
CHANGED
|
@@ -43,8 +43,24 @@ def predict_exoplanets(uploaded_file):
|
|
| 43 |
if uploaded_file is None:
|
| 44 |
return None, "Error: Please upload a CSV file", None
|
| 45 |
|
| 46 |
-
# Read uploaded file
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
info_msg = f"Loaded rows: {len(df_uploaded)}\n"
|
| 50 |
info_msg += f"Columns in uploaded dataset: {len(df_uploaded.columns)}\n\n"
|
|
@@ -72,17 +88,29 @@ def predict_exoplanets(uploaded_file):
|
|
| 72 |
# Prepare X with correct columns
|
| 73 |
info_msg += f"Creating DataFrame with {len(expected_features)} columns...\n"
|
| 74 |
|
| 75 |
-
# Create empty DataFrame with correct columns
|
| 76 |
-
X = pd.DataFrame(index=df_mapped.index, columns=expected_features)
|
| 77 |
|
| 78 |
# Fill columns that exist in df_mapped
|
| 79 |
for col in expected_features:
|
| 80 |
if col in df_mapped.columns:
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
X = X.apply(pd.to_numeric, errors='coerce')
|
| 87 |
|
| 88 |
# Calculate statistics
|
|
@@ -94,8 +122,12 @@ def predict_exoplanets(uploaded_file):
|
|
| 94 |
|
| 95 |
info_msg += f"DEBUG: X.shape = {X.shape}, expected: ({len(df_mapped)}, {len(expected_features)})\n"
|
| 96 |
|
| 97 |
-
# Fill NaN with
|
| 98 |
-
X = X.fillna(X.mean()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
info_msg += f"DEBUG: After fillna X.shape = {X.shape}\n"
|
| 101 |
|
|
|
|
| 43 |
if uploaded_file is None:
|
| 44 |
return None, "Error: Please upload a CSV file", None
|
| 45 |
|
| 46 |
+
# Read uploaded file with robust error handling
|
| 47 |
+
try:
|
| 48 |
+
df_uploaded = pd.read_csv(uploaded_file.name, comment='#', low_memory=False)
|
| 49 |
+
except Exception as e:
|
| 50 |
+
try:
|
| 51 |
+
# Try with different encoding
|
| 52 |
+
df_uploaded = pd.read_csv(uploaded_file.name, comment='#', encoding='latin1', low_memory=False)
|
| 53 |
+
except Exception as e2:
|
| 54 |
+
return None, f"Error reading CSV file: {str(e)}\nAlternative attempt: {str(e2)}", None
|
| 55 |
+
|
| 56 |
+
# Ensure all columns are properly formatted (no multi-dimensional data)
|
| 57 |
+
for col in df_uploaded.columns:
|
| 58 |
+
# Check if column contains lists or arrays
|
| 59 |
+
if df_uploaded[col].dtype == 'object':
|
| 60 |
+
first_val = df_uploaded[col].dropna().iloc[0] if len(df_uploaded[col].dropna()) > 0 else None
|
| 61 |
+
if isinstance(first_val, (list, tuple)):
|
| 62 |
+
# Flatten lists/tuples - take first element
|
| 63 |
+
df_uploaded[col] = df_uploaded[col].apply(lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) > 0 else x)
|
| 64 |
|
| 65 |
info_msg = f"Loaded rows: {len(df_uploaded)}\n"
|
| 66 |
info_msg += f"Columns in uploaded dataset: {len(df_uploaded.columns)}\n\n"
|
|
|
|
| 88 |
# Prepare X with correct columns
|
| 89 |
info_msg += f"Creating DataFrame with {len(expected_features)} columns...\n"
|
| 90 |
|
| 91 |
+
# Create empty DataFrame with correct columns filled with zeros
|
| 92 |
+
X = pd.DataFrame(0.0, index=df_mapped.index, columns=expected_features)
|
| 93 |
|
| 94 |
# Fill columns that exist in df_mapped
|
| 95 |
for col in expected_features:
|
| 96 |
if col in df_mapped.columns:
|
| 97 |
+
try:
|
| 98 |
+
# Convert to numeric, handling any data type
|
| 99 |
+
col_data = pd.to_numeric(df_mapped[col], errors='coerce')
|
| 100 |
+
# Ensure we have a 1D Series, flatten if needed
|
| 101 |
+
if hasattr(col_data, 'values'):
|
| 102 |
+
col_values = col_data.values
|
| 103 |
+
if len(col_values.shape) > 1:
|
| 104 |
+
info_msg += f"Warning: Column '{col}' has shape {col_values.shape}, flattening...\n"
|
| 105 |
+
col_values = col_values.flatten()[:len(X)] # Take only first N values
|
| 106 |
+
X[col] = col_values
|
| 107 |
+
else:
|
| 108 |
+
X[col] = col_data
|
| 109 |
+
except Exception as e:
|
| 110 |
+
info_msg += f"Warning: Could not convert column '{col}': {str(e)}\n"
|
| 111 |
+
X[col] = 0.0
|
| 112 |
+
|
| 113 |
+
# Ensure all columns are numeric
|
| 114 |
X = X.apply(pd.to_numeric, errors='coerce')
|
| 115 |
|
| 116 |
# Calculate statistics
|
|
|
|
| 122 |
|
| 123 |
info_msg += f"DEBUG: X.shape = {X.shape}, expected: ({len(df_mapped)}, {len(expected_features)})\n"
|
| 124 |
|
| 125 |
+
# Fill NaN with column means, then with 0 for any remaining NaN
|
| 126 |
+
X = X.fillna(X.mean())
|
| 127 |
+
X = X.fillna(0)
|
| 128 |
+
|
| 129 |
+
# Ensure no infinite values
|
| 130 |
+
X = X.replace([float('inf'), float('-inf')], 0)
|
| 131 |
|
| 132 |
info_msg += f"DEBUG: After fillna X.shape = {X.shape}\n"
|
| 133 |
|
mapping.py
CHANGED
|
@@ -313,6 +313,34 @@ Mapping:"""
|
|
| 313 |
|
| 314 |
if valid_mapping:
|
| 315 |
df_mapped = df_mapped.rename(columns=valid_mapping)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
|
| 317 |
return df_mapped
|
| 318 |
|
|
@@ -330,9 +358,30 @@ Mapping:"""
|
|
| 330 |
# Копируем датафрейм чтобы не изменять оригинал
|
| 331 |
df_work = uploaded_df.copy()
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
# Конвертируем координаты в градусы если они в текстовом формате
|
| 334 |
coord_columns = [col for col in df_work.columns if any(
|
| 335 |
-
keyword in col.lower() for keyword in ['ra', 'dec', 'coord', 'right_ascension', 'declination']
|
| 336 |
)]
|
| 337 |
|
| 338 |
for col in coord_columns:
|
|
@@ -362,7 +411,7 @@ Mapping:"""
|
|
| 362 |
# Check which target columns are missing
|
| 363 |
missing_cols = set(target_columns) - set(mapped_df.columns)
|
| 364 |
if missing_cols:
|
| 365 |
-
info_msg += f"\nWarning: Missing {len(missing_cols)} target columns (will be filled with
|
| 366 |
|
| 367 |
return mapped_df, mapping, info_msg
|
| 368 |
|
|
|
|
| 313 |
|
| 314 |
if valid_mapping:
|
| 315 |
df_mapped = df_mapped.rename(columns=valid_mapping)
|
| 316 |
+
|
| 317 |
+
# Ensure all columns are properly flattened and converted to numeric where possible
|
| 318 |
+
for col in df_mapped.columns:
|
| 319 |
+
try:
|
| 320 |
+
# Get the column as a Series
|
| 321 |
+
col_data = df_mapped[col]
|
| 322 |
+
|
| 323 |
+
# Check if it's actually a Series (not a DataFrame)
|
| 324 |
+
if not isinstance(col_data, pd.Series):
|
| 325 |
+
continue
|
| 326 |
+
|
| 327 |
+
# Check if column has object dtype or might contain complex data
|
| 328 |
+
if col_data.dtype == 'object':
|
| 329 |
+
try:
|
| 330 |
+
# Try to convert to numeric
|
| 331 |
+
df_mapped[col] = pd.to_numeric(col_data, errors='coerce')
|
| 332 |
+
except:
|
| 333 |
+
pass
|
| 334 |
+
|
| 335 |
+
# Ensure column is 1D
|
| 336 |
+
if hasattr(col_data, 'values'):
|
| 337 |
+
col_values = col_data.values
|
| 338 |
+
if len(col_values.shape) > 1:
|
| 339 |
+
# Flatten multi-dimensional arrays
|
| 340 |
+
df_mapped[col] = col_values.flatten()[:len(df_mapped)]
|
| 341 |
+
except Exception as e:
|
| 342 |
+
# Skip problematic columns
|
| 343 |
+
continue
|
| 344 |
|
| 345 |
return df_mapped
|
| 346 |
|
|
|
|
| 358 |
# Копируем датафрейм чтобы не изменять оригинал
|
| 359 |
df_work = uploaded_df.copy()
|
| 360 |
|
| 361 |
+
# Clean up column names - remove extra spaces, special characters
|
| 362 |
+
df_work.columns = df_work.columns.str.strip()
|
| 363 |
+
|
| 364 |
+
# Handle any multi-dimensional columns before mapping
|
| 365 |
+
for col in df_work.columns:
|
| 366 |
+
if df_work[col].dtype == 'object':
|
| 367 |
+
# Check if column contains complex structures
|
| 368 |
+
first_val = df_work[col].dropna().iloc[0] if len(df_work[col].dropna()) > 0 else None
|
| 369 |
+
|
| 370 |
+
if isinstance(first_val, (list, tuple)):
|
| 371 |
+
# Flatten lists/tuples - take first element
|
| 372 |
+
df_work[col] = df_work[col].apply(
|
| 373 |
+
lambda x: x[0] if isinstance(x, (list, tuple)) and len(x) > 0 else (x if not isinstance(x, (list, tuple)) else None)
|
| 374 |
+
)
|
| 375 |
+
elif isinstance(first_val, str):
|
| 376 |
+
# Try to convert string representations of numbers
|
| 377 |
+
try:
|
| 378 |
+
df_work[col] = pd.to_numeric(df_work[col], errors='ignore')
|
| 379 |
+
except:
|
| 380 |
+
pass
|
| 381 |
+
|
| 382 |
# Конвертируем координаты в градусы если они в текстовом формате
|
| 383 |
coord_columns = [col for col in df_work.columns if any(
|
| 384 |
+
keyword in col.lower() for keyword in ['ra', 'dec', 'coord', 'right_ascension', 'declination', 'rastr', 'decstr']
|
| 385 |
)]
|
| 386 |
|
| 387 |
for col in coord_columns:
|
|
|
|
| 411 |
# Check which target columns are missing
|
| 412 |
missing_cols = set(target_columns) - set(mapped_df.columns)
|
| 413 |
if missing_cols:
|
| 414 |
+
info_msg += f"\nWarning: Missing {len(missing_cols)} target columns (will be filled with zeros)\n"
|
| 415 |
|
| 416 |
return mapped_df, mapping, info_msg
|
| 417 |
|