Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -723,121 +723,36 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
|
|
| 723 |
|
| 724 |
|
| 725 |
|
| 726 |
-
# Supported upload formats: file extension (with leading dot) -> MIME type.
# Defined exactly once; a second identical assignment would only re-bind the
# same name to an equal dict.
SUPPORTED_EXTENSIONS: Dict[str, str] = {
    '.txt': 'text/plain',
    '.md': 'text/markdown',
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    '.pdf': 'application/pdf',
    '.rtf': 'application/rtf',
    '.odt': 'application/vnd.oasis.opendocument.text',
}
|
| 745 |
|
| 746 |
@app.post("/summarize")
|
| 747 |
@limiter.limit("5/minute")
|
| 748 |
async def summarize_document(request: Request, file: UploadFile = File(...)):
|
| 749 |
try:
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
raise HTTPException(status_code=400, detail="No filename provided")
|
| 753 |
-
|
| 754 |
-
filename = file.filename.lower()
|
| 755 |
-
file_ext = next((ext for ext in SUPPORTED_EXTENSIONS if filename.endswith(ext)), None)
|
| 756 |
-
|
| 757 |
-
if not file_ext:
|
| 758 |
-
supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
|
| 759 |
-
raise HTTPException(
|
| 760 |
-
status_code=400,
|
| 761 |
-
detail=f"Unsupported file format. Supported formats: {supported_formats}"
|
| 762 |
-
)
|
| 763 |
-
|
| 764 |
-
# Process file based on extension
|
| 765 |
-
text = await extract_text_from_file(file, file_ext)
|
| 766 |
|
| 767 |
if not text.strip():
|
| 768 |
-
raise HTTPException(
|
| 769 |
-
status_code=400,
|
| 770 |
-
detail="The document appears to be empty or contains no extractable text"
|
| 771 |
-
)
|
| 772 |
|
| 773 |
# Clean and chunk text
|
| 774 |
-
text =
|
| 775 |
-
|
| 776 |
|
| 777 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
|
| 779 |
-
|
| 780 |
-
|
|
|
|
| 781 |
raise
|
| 782 |
except Exception as e:
|
| 783 |
-
logger.error(f"Summarization failed: {str(e)}"
|
| 784 |
raise HTTPException(500, "Document summarization failed")
|
| 785 |
-
|
| 786 |
-
async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
    """Extract plain text from an uploaded file, dispatching on its extension.

    Args:
        file: The uploaded file; read through ``await file.read()`` or the
            underlying ``file.file`` object depending on the format.
        file_ext: Extension including the dot; compared against lower-case
            literals such as ``'.pdf'``.

    Returns:
        The extracted text.

    Raises:
        HTTPException: 400 when the extension has no handler, when an ODT
            payload cannot be decoded as UTF-8, or when extraction fails for
            any other reason.
    """
    try:
        if file_ext in ('.txt', '.md', '.rtf'):
            # NOTE(review): .rtf bytes are decoded as-is, so any RTF markup
            # stays in the returned text.
            return (await file.read()).decode('utf-8')

        if file_ext == '.docx':
            doc = Document(file.file)
            return "\n".join(paragraph.text for paragraph in doc.paragraphs)

        if file_ext == '.pptx':
            prs = Presentation(file.file)
            slide_texts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    # Only shapes exposing a ``text`` attribute contribute.
                    if hasattr(shape, "text"):
                        slide_texts.append(shape.text)
            return "\n".join(slide_texts)

        if file_ext == '.pdf':
            content = await file.read()
            return extract_text(content, 'pdf')

        if file_ext == '.odt':
            content = await file.read()
            # NOTE(review): ODT files are normally ZIP containers, so this
            # decode is expected to fail for real documents - confirm.
            try:
                return content.decode('utf-8')
            except UnicodeDecodeError as exc:
                raise HTTPException(
                    status_code=400,
                    detail="ODT file parsing requires additional libraries"
                ) from exc

        # No branch matched: fail explicitly instead of returning None.
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file format: {file_ext}"
        )
    except HTTPException:
        # Deliberate HTTP errors raised above must reach the client unchanged
        # rather than being re-wrapped by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=400,
            detail=f"Failed to extract text from file: {str(e)}"
        ) from e
|
| 819 |
-
|
| 820 |
-
def clean_text(text: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends."""
    # str.split() without a separator splits on whitespace runs and discards
    # leading/trailing whitespace, so no regular expression is required.
    return " ".join(text.split())
|
| 824 |
-
|
| 825 |
-
async def generate_summary(text: str, chunk_size: int = 1000) -> str:
    """Summarize *text* chunk by chunk and join the partial summaries.

    Args:
        text: Text to summarize.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        The per-chunk summaries joined with single spaces, or ``""`` when
        *text* is empty.

    Raises:
        ValueError: If *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        # A negative step would silently produce no chunks (empty summary).
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not text:
        # Nothing to summarize, so do not fetch the summarizer at all.
        return ""

    # Fixed-width character slices; a boundary may fall inside a word.
    chunks = [
        text[start:start + chunk_size]
        for start in range(0, len(text), chunk_size)
    ]
    summarizer = get_summarizer()
    # NOTE(review): the summarizer is called synchronously inside a
    # coroutine - confirm that blocking here is acceptable.
    return " ".join(
        summarizer(
            chunk,
            max_length=150,
            min_length=50,
            do_sample=False
        )[0]["summary_text"]
        for chunk in chunks
    )
|
| 839 |
-
|
| 840 |
-
|
| 841 |
@app.post("/qa")
|
| 842 |
@limiter.limit("5/minute")
|
| 843 |
async def question_answering(
|
|
|
|
| 723 |
|
| 724 |
|
| 725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 726 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """Summarize the text of an uploaded document.

    The upload is passed to ``process_uploaded_file`` and ``extract_text``;
    the resulting text is whitespace-normalised, cut into 1000-character
    chunks, and each chunk is summarized separately.

    Args:
        request: Incoming request. Not used in the body; presumably required
            by the rate-limit decorator - confirm.
        file: Uploaded document.

    Returns:
        ``{"summary": ...}`` where the value is the chunk summaries joined
        with single spaces.

    Raises:
        HTTPException: 400 when no text can be extracted; any HTTPException
            raised by the helpers propagates unchanged; 500 for every other
            failure.
    """
    try:
        file_ext, content = await process_uploaded_file(file)
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Clean and chunk text
        text = re.sub(r'\s+', ' ', text).strip()
        chunks = [text[i:i + 1000] for i in range(0, len(text), 1000)]

        # Summarize each chunk
        summarizer = get_summarizer()
        summaries = [
            summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
            for chunk in chunks
        ]

        return {"summary": " ".join(summaries)}

    except HTTPException:
        # Intentional HTTP errors pass through untouched.
        raise
    except Exception as e:
        # logger.exception records the traceback together with the message.
        logger.exception(f"Summarization failed: {str(e)}")
        raise HTTPException(500, "Document summarization failed") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 756 |
@app.post("/qa")
|
| 757 |
@limiter.limit("5/minute")
|
| 758 |
async def question_answering(
|