chenguittiMaroua committed on
Commit
8c7e530
·
verified ·
1 Parent(s): 0c9d79d

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +45 -48
main.py CHANGED
@@ -725,82 +725,79 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
725
 
726
 
727
 
 
 
 
 
 
 
728
@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """
    Summarize content from various file types (PDF, Word, Excel, PowerPoint, Images).

    Returns:
        dict: {"summary": str} — a concise summary of the document's main points.

    Raises:
        HTTPException: 400 when no extractable text is found,
                       500 on any other processing failure.
    """
    try:
        # Validate the upload and read its extension + raw bytes.
        file_ext, content = await process_uploaded_file(file)

        # Extract plain text from the binary content.
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Collapse all whitespace runs to single spaces.
        text = re.sub(r'\s+', ' ', text).strip()

        # Sentence-aware chunking: pack whole sentences into ~1000-char chunks
        # so the summarizer never sees a sentence split mid-way.
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks = []
        current_parts = []  # list of sentences; joined once (avoids quadratic +=)
        current_len = 0

        for sentence in sentences:
            if current_len + len(sentence) <= 1000:
                current_parts.append(sentence)
                current_len += len(sentence)
            else:
                # Guard: an oversized first sentence must not emit an empty chunk.
                if current_parts:
                    chunks.append(" ".join(current_parts))
                current_parts = [sentence]
                current_len = len(sentence)
        if current_parts:
            chunks.append(" ".join(current_parts))

        # Cached summarization pipeline (loaded once per process).
        summarizer = get_summarizer()

        # Summarize each chunk; degrade gracefully on per-chunk failure.
        summaries = []
        for chunk in chunks:
            try:
                summary = summarizer(
                    chunk,
                    max_length=150,
                    min_length=50,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
                summaries.append(summary)
            except Exception as chunk_error:
                logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
                # Fallback: include the first 3 sentences of the chunk.
                fallback = " ".join(chunk.split('.')[:3]) + "."
                summaries.append(fallback)

        # Combine per-chunk summaries and normalize whitespace.
        combined_summary = " ".join(summaries)
        combined_summary = re.sub(r'\s+', ' ', combined_summary).strip()

        # If the combined summary is still long, do one best-effort refinement
        # pass. A refinement failure keeps the unrefined summary rather than
        # turning an otherwise-successful request into a 500.
        if len(combined_summary.split()) > 300:
            try:
                combined_summary = summarizer(
                    combined_summary,
                    max_length=200,
                    min_length=100,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
            except Exception as refine_error:
                logger.warning(f"Failed to summarize chunk: {str(refine_error)}")

        return {"summary": combined_summary}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
 
804
  @app.post("/qa")
805
  @limiter.limit("5/minute")
806
  async def question_answering(
 
725
 
726
 
727
 
728
from concurrent.futures import ThreadPoolExecutor
import asyncio

# Global thread pool for CPU-bound tasks (shared by all requests so we do not
# pay thread-pool construction cost per request).
executor = ThreadPoolExecutor(max_workers=4)

@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """
    Optimized document summarization with parallel processing.

    Extraction and per-chunk summarization run on the shared `executor` via
    `run_in_executor`, so the event loop is never blocked.

    Returns:
        dict: {"summary": str}

    Raises:
        HTTPException: 400 when no extractable text is found,
                       500 on any other processing failure.
    """
    try:
        # 1. Fast file processing (validation + raw bytes).
        file_ext, content = await process_uploaded_file(file)

        # 2. Off-load CPU-bound text extraction to the shared pool.
        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(executor, extract_text, content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # 3. Efficient text cleaning: collapse whitespace runs.
        text = re.sub(r'\s+', ' ', text).strip()

        # 4. Smart chunking with sentence boundaries (~800-char chunks for
        # faster per-chunk processing).
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sent_length = len(sentence)
            if current_length + sent_length <= 800:
                current_chunk.append(sentence)
                current_length += sent_length
            else:
                # Guard: an oversized first sentence must not emit an empty chunk.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sent_length
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        # 5. Parallel summarization on the shared executor.
        summarizer = get_summarizer()

        def summarize_chunk(chunk):
            # Per-chunk fallback: a single failing chunk degrades to its first
            # sentences instead of failing the entire request (a bare
            # pool.map/gather would propagate the first exception).
            try:
                return summarizer(
                    chunk,
                    max_length=120,
                    min_length=40,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
            except Exception as chunk_error:
                logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
                return " ".join(chunk.split('.')[:3]) + "."

        # Fan out without blocking the event loop. (A per-request
        # `with ThreadPoolExecutor(...)` + blocking pool.map would stall the
        # loop and raise ValueError when `chunks` is empty.)
        summaries = await asyncio.gather(
            *(loop.run_in_executor(executor, summarize_chunk, chunk)
              for chunk in chunks)
        )

        # 6. Fast final combination.
        combined = ' '.join(summaries)

        # Optional: single refinement pass if still long; input size bounded.
        if len(combined.split()) > 200:
            combined = await loop.run_in_executor(
                executor, summarize_chunk, combined[:3000]
            )

        return {"summary": combined}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
800
+
801
  @app.post("/qa")
802
  @limiter.limit("5/minute")
803
  async def question_answering(