chenguittiMaroua committed on
Commit
3388479
·
verified ·
1 Parent(s): 1a97873

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +24 -2
main.py CHANGED
@@ -725,8 +725,30 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
725
  @limiter.limit("5/minute")
726
  async def summarize_document(request: Request, file: UploadFile = File(...)):
727
  try:
728
- file_ext, content = await process_uploaded_file(file)
729
- text = extract_text(content, file_ext)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
730
 
731
  if not text.strip():
732
  raise HTTPException(400, "No extractable text found")
 
725
  @limiter.limit("5/minute")
726
  async def summarize_document(request: Request, file: UploadFile = File(...)):
727
  try:
728
+ # Check file type
729
+ filename = file.filename.lower()
730
+
731
+ # Process different file types
732
+ if filename.endswith(('.txt', '.md')):
733
+ text = (await file.read()).decode('utf-8')
734
+ elif filename.endswith('.docx'):
735
+ doc = Document(file.file)
736
+ text = "\n".join([paragraph.text for paragraph in doc.paragraphs])
737
+ elif filename.endswith('.pptx'):
738
+ prs = Presentation(file.file)
739
+ text = []
740
+ for slide in prs.slides:
741
+ for shape in slide.shapes:
742
+ if hasattr(shape, "text"):
743
+ text.append(shape.text)
744
+ text = "\n".join(text)
745
+ elif filename.endswith('.pdf'):
746
+ content = await file.read()
747
+ text = extract_text(content, 'pdf') # Your existing PDF extraction
748
+ else:
749
+ # Fallback to textract for other formats (rtf, etc.)
750
+ content = await file.read()
751
+ text = textract.process(content).decode('utf-8')
752
 
753
  if not text.strip():
754
  raise HTTPException(400, "No extractable text found")