chenguittiMaroua committed on
Commit
13000ac
·
verified ·
1 Parent(s): 3538165

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +15 -100
main.py CHANGED
@@ -723,121 +723,36 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
723
 
724
 
725
 
726
- SUPPORTED_EXTENSIONS: Dict[str, str] = {
727
- '.txt': 'text/plain',
728
- '.md': 'text/markdown',
729
- '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
730
- '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
731
- '.pdf': 'application/pdf',
732
- '.rtf': 'application/rtf',
733
- '.odt': 'application/vnd.oasis.opendocument.text'
734
- }
735
 
736
- SUPPORTED_EXTENSIONS: Dict[str, str] = {
737
- '.txt': 'text/plain',
738
- '.md': 'text/markdown',
739
- '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
740
- '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
741
- '.pdf': 'application/pdf',
742
- '.rtf': 'application/rtf',
743
- '.odt': 'application/vnd.oasis.opendocument.text'
744
- }
745
 
746
  @app.post("/summarize")
747
  @limiter.limit("5/minute")
748
  async def summarize_document(request: Request, file: UploadFile = File(...)):
749
  try:
750
- # Validate file type
751
- if not file.filename:
752
- raise HTTPException(status_code=400, detail="No filename provided")
753
-
754
- filename = file.filename.lower()
755
- file_ext = next((ext for ext in SUPPORTED_EXTENSIONS if filename.endswith(ext)), None)
756
-
757
- if not file_ext:
758
- supported_formats = ", ".join(SUPPORTED_EXTENSIONS.keys())
759
- raise HTTPException(
760
- status_code=400,
761
- detail=f"Unsupported file format. Supported formats: {supported_formats}"
762
- )
763
-
764
- # Process file based on extension
765
- text = await extract_text_from_file(file, file_ext)
766
 
767
  if not text.strip():
768
- raise HTTPException(
769
- status_code=400,
770
- detail="The document appears to be empty or contains no extractable text"
771
- )
772
 
773
  # Clean and chunk text
774
- text = clean_text(text)
775
- summary = await generate_summary(text)
776
 
777
- return {"summary": summary}
 
 
 
 
 
778
 
779
- except HTTPException as he:
780
- logger.warning(f"Client error: {he.detail}")
 
781
  raise
782
  except Exception as e:
783
- logger.error(f"Summarization failed: {str(e)}", exc_info=True)
784
  raise HTTPException(500, "Document summarization failed")
785
-
786
- async def extract_text_from_file(file: UploadFile, file_ext: str) -> str:
787
- """Extract text from different file formats"""
788
- try:
789
- if file_ext in ('.txt', '.md', '.rtf'):
790
- return (await file.read()).decode('utf-8')
791
- elif file_ext == '.docx':
792
- doc = Document(file.file)
793
- return "\n".join([paragraph.text for paragraph in doc.paragraphs])
794
- elif file_ext == '.pptx':
795
- prs = Presentation(file.file)
796
- text = []
797
- for slide in prs.slides:
798
- for shape in slide.shapes:
799
- if hasattr(shape, "text"):
800
- text.append(shape.text)
801
- return "\n".join(text)
802
- elif file_ext == '.pdf':
803
- content = await file.read()
804
- return extract_text(content, 'pdf')
805
- elif file_ext == '.odt':
806
- content = await file.read()
807
- try:
808
- return content.decode('utf-8')
809
- except UnicodeDecodeError:
810
- raise HTTPException(
811
- status_code=400,
812
- detail="ODT file parsing requires additional libraries"
813
- )
814
- except Exception as e:
815
- raise HTTPException(
816
- status_code=400,
817
- detail=f"Failed to extract text from file: {str(e)}"
818
- )
819
-
820
- def clean_text(text: str) -> str:
821
- """Clean and normalize text"""
822
- text = re.sub(r'\s+', ' ', text).strip()
823
- return text
824
-
825
- async def generate_summary(text: str, chunk_size: int = 1000) -> str:
826
- """Generate summary from text in chunks"""
827
- chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
828
- summarizer = get_summarizer()
829
- summaries = []
830
- for chunk in chunks:
831
- summary = summarizer(
832
- chunk,
833
- max_length=150,
834
- min_length=50,
835
- do_sample=False
836
- )[0]["summary_text"]
837
- summaries.append(summary)
838
- return " ".join(summaries)
839
-
840
-
841
  @app.post("/qa")
842
  @limiter.limit("5/minute")
843
  async def question_answering(
 
723
 
724
 
725
 
 
 
 
 
 
 
 
 
 
726
 
 
 
 
 
 
 
 
 
 
727
 
728
  @app.post("/summarize")
729
  @limiter.limit("5/minute")
730
  async def summarize_document(request: Request, file: UploadFile = File(...)):
731
  try:
732
+ file_ext, content = await process_uploaded_file(file)
733
+ text = extract_text(content, file_ext)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
734
 
735
  if not text.strip():
736
+ raise HTTPException(400, "No extractable text found")
 
 
 
737
 
738
  # Clean and chunk text
739
+ text = re.sub(r'\s+', ' ', text).strip()
740
+ chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
741
 
742
+ # Summarize each chunk
743
+ summarizer = get_summarizer()
744
+ summaries = []
745
+ for chunk in chunks:
746
+ summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
747
+ summaries.append(summary)
748
 
749
+ return {"summary": " ".join(summaries)}
750
+
751
+ except HTTPException:
752
  raise
753
  except Exception as e:
754
+ logger.error(f"Summarization failed: {str(e)}")
755
  raise HTTPException(500, "Document summarization failed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756
  @app.post("/qa")
757
  @limiter.limit("5/minute")
758
  async def question_answering(