Hugging Face Spaces — Space status: Sleeping
Commit: "Update main.py" (Browse files)
File changed: main.py
@@ -725,82 +725,79 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
|
|
| 725 |
|
| 726 |
|
| 727 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 728 |
@app.post("/summarize")
|
| 729 |
@limiter.limit("5/minute")
|
| 730 |
async def summarize_document(request: Request, file: UploadFile = File(...)):
|
| 731 |
-
"""
|
| 732 |
-
Summarize content from various file types (PDF, Word, Excel, PowerPoint, Images)
|
| 733 |
-
Returns a concise summary of the document's main points.
|
| 734 |
-
"""
|
| 735 |
try:
|
| 736 |
-
#
|
| 737 |
file_ext, content = await process_uploaded_file(file)
|
| 738 |
|
| 739 |
-
#
|
| 740 |
-
|
|
|
|
| 741 |
|
| 742 |
if not text.strip():
|
| 743 |
raise HTTPException(400, "No extractable text found")
|
| 744 |
-
|
| 745 |
-
#
|
| 746 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 747 |
|
| 748 |
-
#
|
| 749 |
-
sentences = re.split(r'(?<=[.!?])
|
| 750 |
chunks = []
|
| 751 |
-
current_chunk =
|
|
|
|
| 752 |
|
| 753 |
for sentence in sentences:
|
| 754 |
-
|
| 755 |
-
|
|
|
|
|
|
|
| 756 |
else:
|
| 757 |
-
chunks.append(
|
| 758 |
-
current_chunk = sentence
|
|
|
|
| 759 |
if current_chunk:
|
| 760 |
-
chunks.append(
|
| 761 |
-
|
| 762 |
-
#
|
| 763 |
summarizer = get_summarizer()
|
| 764 |
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
chunk,
|
| 771 |
-
max_length=150,
|
| 772 |
-
min_length=50,
|
| 773 |
-
do_sample=False,
|
| 774 |
-
truncation=True
|
| 775 |
-
)[0]["summary_text"]
|
| 776 |
-
summaries.append(summary)
|
| 777 |
-
except Exception as chunk_error:
|
| 778 |
-
logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
|
| 779 |
-
# Fallback: include the first 3 sentences of the chunk
|
| 780 |
-
fallback = " ".join(chunk.split('.')[:3]) + "."
|
| 781 |
-
summaries.append(fallback)
|
| 782 |
-
|
| 783 |
-
# Combine and clean the final summary
|
| 784 |
-
combined_summary = " ".join(summaries)
|
| 785 |
-
combined_summary = re.sub(r'\s+', ' ', combined_summary).strip()
|
| 786 |
-
|
| 787 |
-
# If summary is too long, summarize it again
|
| 788 |
-
if len(combined_summary.split()) > 300:
|
| 789 |
-
combined_summary = summarizer(
|
| 790 |
-
combined_summary,
|
| 791 |
-
max_length=200,
|
| 792 |
-
min_length=100,
|
| 793 |
do_sample=False,
|
| 794 |
truncation=True
|
| 795 |
)[0]["summary_text"]
|
| 796 |
|
| 797 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 798 |
|
| 799 |
except HTTPException:
|
| 800 |
raise
|
| 801 |
except Exception as e:
|
| 802 |
logger.error(f"Summarization failed: {str(e)}", exc_info=True)
|
| 803 |
raise HTTPException(500, "Document summarization failed")
|
|
|
|
| 804 |
@app.post("/qa")
|
| 805 |
@limiter.limit("5/minute")
|
| 806 |
async def question_answering(
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
import asyncio

# Global thread pool for CPU-bound work (text extraction, model inference),
# shared across requests so the asyncio event loop is never blocked.
executor = ThreadPoolExecutor(max_workers=4)


@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """Summarize an uploaded document (PDF, Word, Excel, PowerPoint, image).

    The file's text is extracted off the event loop, split into
    sentence-aligned chunks, and each chunk is summarized concurrently
    in the shared thread pool. Returns ``{"summary": <text>}``.

    Raises:
        HTTPException(400): when no extractable text is found.
        HTTPException(500): on any unexpected summarization failure.
    """
    try:
        # 1. Fast file processing
        file_ext, content = await process_uploaded_file(file)

        # 2. Text extraction in the thread pool — extract_text is CPU-bound
        #    and would otherwise stall every other in-flight request.
        loop = asyncio.get_running_loop()  # get_event_loop() is deprecated inside coroutines
        text = await loop.run_in_executor(executor, extract_text, content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # 3. Collapse all whitespace runs to single spaces.
        text = re.sub(r'\s+', ' ', text).strip()

        # 4. Chunk on sentence boundaries, capping each chunk near 800 chars
        #    (smaller chunks summarize faster).
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        chunks = []
        current_chunk = []
        current_length = 0
        for sentence in sentences:
            sent_length = len(sentence)
            # Always accept the first sentence of a chunk, even when it alone
            # exceeds the cap — otherwise an empty-string chunk is emitted and
            # handed to the model.
            if not current_chunk or current_length + sent_length <= 800:
                current_chunk.append(sentence)
                current_length += sent_length
            else:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_length = sent_length
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        # 5. Parallel summarization
        summarizer = get_summarizer()

        def summarize_chunk(chunk):
            """Summarize one chunk; fall back to its first sentences on error."""
            try:
                return summarizer(
                    chunk,
                    max_length=120,  # smaller output for faster processing
                    min_length=40,
                    do_sample=False,
                    truncation=True
                )[0]["summary_text"]
            except Exception as chunk_error:
                logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
                # Best-effort fallback: the chunk's first three sentences,
                # so one bad chunk doesn't fail the whole request.
                return " ".join(chunk.split('.')[:3]) + "."

        # Fan the chunks out to the shared executor and await them, instead of
        # blocking the event loop with a synchronous pool.map(). This also
        # avoids ThreadPoolExecutor(max_workers=0) when chunks is empty.
        summaries = await asyncio.gather(
            *(loop.run_in_executor(executor, summarize_chunk, chunk) for chunk in chunks)
        )

        # 6. Combine the per-chunk summaries.
        combined = ' '.join(summaries)

        # Optional single refinement pass when the combined text is long;
        # input is capped at 3000 chars so the model call stays bounded.
        if len(combined.split()) > 200:
            combined = await loop.run_in_executor(
                executor, summarize_chunk, combined[:3000]
            )

        return {"summary": combined}

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization failed: {str(e)}", exc_info=True)
        raise HTTPException(500, "Document summarization failed")
@app.post("/qa")
|
| 802 |
@limiter.limit("5/minute")
|
| 803 |
async def question_answering(
|