Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Community

chenguittiMaroua committed on Apr 14

Commit

13000ac

verified ·

1 Parent(s): 3538165

Update main.py

Browse files

Files changed (1) hide show

main.py +15 -100

main.py CHANGED Viewed

@@ -723,121 +723,36 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
-SUPPORTED_EXTENSIONS: Dict[str, str] = {
-    '.txt': 'text/plain',
-    '.md': 'text/markdown',
-    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-    '.pdf': 'application/pdf',
-    '.rtf': 'application/rtf',
-    '.odt': 'application/vnd.oasis.opendocument.text'
-}
-SUPPORTED_EXTENSIONS: Dict[str, str] = {
-    '.txt': 'text/plain',
-    '.md': 'text/markdown',
-    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
-    '.pdf': 'application/pdf',
-    '.rtf': 'application/rtf',
-    '.odt': 'application/vnd.oasis.opendocument.text'
-}
 @app.post("/summarize")
 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     try:
-        # Validate file type
-        if not file.filename:
-            raise HTTPException(status_code=400, detail="No filename provided")
-        filename = file.filename.lower()
-        file_ext = next((ext for ext in SUPPORTED_EXTENSIONS if filename.endswith(ext)), None)
-        if not file_ext:
-            supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
-            raise HTTPException(
-                status_code=400,
-                detail=f"Unsupported file format. Supported formats: {supported_formats}"
-            )
-        # Process file based on extension
-        text = await extract_text_from_file(file, file_ext)
         if not text.strip():
-            raise HTTPException(
-                status_code=400,
-                detail="The document appears to be empty or contains no extractable text"
-            )
         # Clean and chunk text
-        text = clean_text(text)
-        summary = await generate_summary(text)
-        return {"summary": summary}
-    except HTTPException as he:
-        logger.warning(f"Client error: {he.detail}")
         raise
     except Exception as e:
-        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
         raise HTTPException(500, "Document summarization failed")
-async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
-    """Extract text from different file formats"""
-    try:
-        if file_ext in ('.txt', '.md', '.rtf'):
-            return (await file.read()).decode('utf-8')
-        elif file_ext == '.docx':
-            doc = Document(file.file)
-            return "\n".join([paragraph.text for paragraph in doc.paragraphs])
-        elif file_ext == '.pptx':
-            prs = Presentation(file.file)
-            text = []
-            for slide in prs.slides:
-                for shape in slide.shapes:
-                    if hasattr(shape, "text"):
-                        text.append(shape.text)
-            return "\n".join(text)
-        elif file_ext == '.pdf':
-            content = await file.read()
-            return extract_text(content, 'pdf')
-        elif file_ext == '.odt':
-            content = await file.read()
-            try:
-                return content.decode('utf-8')
-            except UnicodeDecodeError:
-                raise HTTPException(
-                    status_code=400,
-                    detail="ODT file parsing requires additional libraries"
-                )
-    except Exception as e:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Failed to extract text from file: {str(e)}"
-        )
-def clean_text(text: str) -> str:
-    """Clean and normalize text"""
-    text = re.sub(r'\s+', ' ', text).strip()
-    return text
-async def generate_summary(text: str, chunk_size: int = 1000) -> str:
-    """Generate summary from text in chunks"""
-    chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
-    summarizer = get_summarizer()
-    summaries = []
-    for chunk in chunks:
-        summary = summarizer(
-            chunk,
-            max_length=150,
-            min_length=50,
-            do_sample=False
-        )[0]["summary_text"]
-        summaries.append(summary)
-    return " ".join(summaries)
 @app.post("/qa")
 @limiter.limit("5/minute")
 async def question_answering(

 @app.post("/summarize")
 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     try:
+        file_ext, content = await process_uploaded_file(file)
+        text = extract_text(content, file_ext)
         if not text.strip():
+            raise HTTPException(400, "No extractable text found")
         # Clean and chunk text
+        text = re.sub(r'\s+', ' ', text).strip()
+        chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
+        # Summarize each chunk
+        summarizer = get_summarizer()
+        summaries = []
+        for chunk in chunks:
+            summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
+            summaries.append(summary)
+        return {"summary": " ".join(summaries)}
+    except HTTPException:
         raise
     except Exception as e:
+        logger.error(f"Summarization failed: {str(e)}")
         raise HTTPException(500, "Document summarization failed")
 @app.post("/qa")
 @limiter.limit("5/minute")
 async def question_answering(