Spaces:

chenguittiMaroua
/

asm-app

Sleeping

chenguittiMaroua committed on Apr 14

Commit

3388479

verified ·

1 Parent(s): 1a97873

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -725,8 +725,30 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     try:
-        file_ext, content = await process_uploaded_file(file)
-        text = extract_text(content, file_ext)
         if not text.strip():
             raise HTTPException(400, "No extractable text found")

 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     try:
+        # Check file type
+        filename = file.filename.lower()
+        # Process different file types
+        if filename.endswith(('.txt', '.md')):
+            text = (await file.read()).decode('utf-8')
+        elif filename.endswith('.docx'):
+            doc = Document(file.file)
+            text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
+        elif filename.endswith('.pptx'):
+            prs = Presentation(file.file)
+            text = []
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    if hasattr(shape, "text"):
+                        text.append(shape.text)
+            text = "\n".join(text)
+        elif filename.endswith('.pdf'):
+            content = await file.read()
+            text = extract_text(content, 'pdf')  # Your existing PDF extraction
+        else:
+            # Fallback to textract for other formats (rtf, etc.)
+            content = await file.read()
+            text = textract.process(content).decode('utf-8')
         if not text.strip():
             raise HTTPException(400, "No extractable text found")