chenguittiMaroua committed on
Commit
8c7e530
·
verified ·
1 Parent(s): 0c9d79d

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +45 -48
main.py CHANGED
@@ -725,82 +725,79 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
725
 
726
 
727
 
 
 
 
 
 
 
728
@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """
    Summarize content from various file types (PDF, Word, Excel, PowerPoint, Images).

    Returns:
        dict: {"summary": str} — a concise summary of the document's main points.

    Raises:
        HTTPException: 400 when no extractable text is found,
                       500 on any other processing failure.
    """
    try:
        # Validate the upload and read its extension + raw bytes.
        file_ext, content = await process_uploaded_file(file)

        # Extract plain text from the binary content.
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Collapse all whitespace runs to single spaces.
        text = re.sub(r'\s+', ' ', text).strip()

        # Sentence-aware chunking: pack whole sentences into ~1000-char chunks
        # so the summarizer never sees a sentence split mid-way.
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks = []
        current_parts = []  # list of sentences; joined once (avoids quadratic +=)
        current_len = 0

        for sentence in sentences:
            if current_len + len(sentence) <= 1000:
                current_parts.append(sentence)
                current_len += len(sentence)
            else:
                # Guard: an oversized first sentence must not emit an empty chunk.
                if current_parts:
                    chunks.append(" ".join(current_parts))
                current_parts = [sentence]
                current_len = len(sentence)
        if current_parts:
            chunks.append(" ".join(current_parts))

        # Cached summarization pipeline (loaded once per process).
        summarizer = get_summarizer()

        # Summarize each chunk; degrade gracefully on per-chunk failure.
        summaries = []
        for chunk in chunks:
            try:
                summary = summarizer(
                    chunk,
                    max_length=150,
                    min_length=50,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
                summaries.append(summary)
            except Exception as chunk_error:
                logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
                # Fallback: include the first 3 sentences of the chunk.
                fallback = " ".join(chunk.split('.')[:3]) + "."
                summaries.append(fallback)

        # Combine per-chunk summaries and normalize whitespace.
        combined_summary = " ".join(summaries)
        combined_summary = re.sub(r'\s+', ' ', combined_summary).strip()

        # If the combined summary is still long, do one best-effort refinement
        # pass. A refinement failure keeps the unrefined summary rather than
        # turning an otherwise-successful request into a 500.
        if len(combined_summary.split()) > 300:
            try:
                combined_summary = summarizer(
                    combined_summary,
                    max_length=200,
                    min_length=100,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
            except Exception as refine_error:
                logger.warning(f"Failed to summarize chunk: {str(refine_error)}")

        return {"summary": combined_summary}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
 
804
  @app.post("/qa")
805
  @limiter.limit("5/minute")
806
  async def question_answering(
 
725
 
726
 
727
 
728
from concurrent.futures import ThreadPoolExecutor
import asyncio

# Global thread pool for CPU-bound tasks (shared by all requests so we do not
# pay thread-pool construction cost per request).
executor = ThreadPoolExecutor(max_workers=4)

@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """
    Optimized document summarization with parallel processing.

    Extraction and per-chunk summarization run on the shared `executor` via
    `run_in_executor`, so the event loop is never blocked.

    Returns:
        dict: {"summary": str}

    Raises:
        HTTPException: 400 when no extractable text is found,
                       500 on any other processing failure.
    """
    try:
        # 1. Fast file processing (validation + raw bytes).
        file_ext, content = await process_uploaded_file(file)

        # 2. Off-load CPU-bound text extraction to the shared pool.
        loop = asyncio.get_event_loop()
        text = await loop.run_in_executor(executor, extract_text, content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # 3. Efficient text cleaning: collapse whitespace runs.
        text = re.sub(r'\s+', ' ', text).strip()

        # 4. Smart chunking with sentence boundaries (~800-char chunks for
        # faster per-chunk processing).
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        chunks = []
        current_chunk = []
        current_length = 0

        for sentence in sentences:
            sent_length = len(sentence)
            if current_length + sent_length <= 800:
                current_chunk.append(sentence)
                current_length += sent_length
            else:
                # Guard: an oversized first sentence must not emit an empty chunk.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sent_length
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        # 5. Parallel summarization on the shared executor.
        summarizer = get_summarizer()

        def summarize_chunk(chunk):
            # Per-chunk fallback: a single failing chunk degrades to its first
            # sentences instead of failing the entire request (a bare
            # pool.map/gather would propagate the first exception).
            try:
                return summarizer(
                    chunk,
                    max_length=120,
                    min_length=40,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
            except Exception as chunk_error:
                logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
                return " ".join(chunk.split('.')[:3]) + "."

        # Fan out without blocking the event loop. (A per-request
        # `with ThreadPoolExecutor(...)` + blocking pool.map would stall the
        # loop and raise ValueError when `chunks` is empty.)
        summaries = await asyncio.gather(
            *(loop.run_in_executor(executor, summarize_chunk, chunk)
              for chunk in chunks)
        )

        # 6. Fast final combination.
        combined = ' '.join(summaries)

        # Optional: single refinement pass if still long; input size bounded.
        if len(combined.split()) > 200:
            combined = await loop.run_in_executor(
                executor, summarize_chunk, combined[:3000]
            )

        return {"summary": combined}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
800
+
801
  @app.post("/qa")
802
  @limiter.limit("5/minute")
803
  async def question_answering(