Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

chenguittiMaroua committed on Apr 10, 2025

Commit

c6e8137

verified ·

1 Parent(s): 56f1984

Update main.py

Browse files

Files changed (1) hide show

main.py +155 -143

main.py CHANGED Viewed

@@ -1,10 +1,11 @@
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from transformers import pipeline
 import io
 import fitz  # PyMuPDF
-from PIL import Image, UnidentifiedImageError
 import pandas as pd
 import uvicorn
 from docx import Document
@@ -12,8 +13,13 @@ from pptx import Presentation
 import pytesseract
 import logging
 import re
-from typing import Tuple
-import traceback
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -21,6 +27,10 @@ logger = logging.getLogger(__name__)
 app = FastAPI()
 # CORS Configuration
 app.add_middleware(
     CORSMiddleware,
@@ -32,176 +42,172 @@ app.add_middleware(
 # Constants
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
 SUPPORTED_FILE_TYPES = {
-    "docx": "Word Document",
-    "xlsx": "Excel Spreadsheet",
-    "pptx": "PowerPoint",
-    "pdf": "PDF",
-    "jpg": "JPEG Image",
-    "jpeg": "JPEG Image",
-    "png": "PNG Image"
 }
-# Initialize models at startup
-try:
-    logger.info("Loading ML models...")
-    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-    qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
-    image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
-    logger.info("Models loaded successfully")
-except Exception as e:
-    logger.error(f"Failed to load models: {str(e)}")
-    raise RuntimeError("Model initialization failed")
-async def validate_file(file: UploadFile) -> Tuple[str, bytes]:
-    """Validate file type and size"""
     if not file.filename:
         raise HTTPException(400, "No filename provided")
     file_ext = file.filename.split('.')[-1].lower()
     if file_ext not in SUPPORTED_FILE_TYPES:
-        raise HTTPException(400, f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES.values())}")
     content = await file.read()
     if len(content) > MAX_FILE_SIZE:
         raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
-    await file.seek(0)
     return file_ext, content
-def extract_text_from_pdf(content: bytes) -> str:
-    """Extract text from PDF with error handling"""
-    try:
-        with fitz.open(stream=content, filetype="pdf") as doc:
-            if doc.is_encrypted:
-                if not doc.authenticate(""):  # Try empty password
-                    raise ValueError("Encrypted PDF - cannot extract text")
-            return "\n".join(page.get_text("text") for page in doc)
-    except Exception as e:
-        logger.error(f"PDF extraction failed: {str(e)}")
-        raise ValueError(f"Failed to process PDF: {str(e)}")
-def extract_text_from_docx(content: bytes) -> str:
-    """Extract text from Word document"""
-    try:
-        doc = Document(io.BytesIO(content))
-        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
-    except Exception as e:
-        logger.error(f"DOCX extraction failed: {str(e)}")
-        raise ValueError("Failed to process Word document")
-def extract_text_from_excel(content: bytes) -> str:
-    """Extract text from Excel (first sheet only)"""
-    try:
-        df = pd.read_excel(io.BytesIO(content), sheet_name=0)
-        return "\n".join(df.iloc[:, 0].dropna().astype(str).tolist())
-    except Exception as e:
-        logger.error(f"Excel extraction failed: {str(e)}")
-        raise ValueError("Failed to process Excel file")
-def extract_text_from_pptx(content: bytes) -> str:
-    """Extract text from PowerPoint"""
     try:
-        ppt = Presentation(io.BytesIO(content))
-        return "\n".join(shape.text for slide in ppt.slides
-                        for shape in slide.shapes if hasattr(shape, "text"))
-    except Exception as e:
-        logger.error(f"PPTX extraction failed: {str(e)}")
-        raise ValueError("Failed to process PowerPoint file")
-def extract_text_from_image(content: bytes) -> str:
-    """Extract text from image using OCR or captioning"""
-    try:
-        image = Image.open(io.BytesIO(content))
-        # First try OCR
-        try:
-            text = pytesseract.image_to_string(image, timeout=10)  # 10 second timeout
-            if text.strip():
-                return text
-        except Exception as ocr_error:
-            logger.warning(f"OCR failed: {str(ocr_error)}")
-        # Fallback to image captioning
-        try:
-            caption = image_captioner(image)[0]['generated_text']
-            return f"Image description: {caption}"
-        except Exception as caption_error:
-            logger.error(f"Image captioning failed: {str(caption_error)}")
-            raise ValueError("Could not process image")
-    except UnidentifiedImageError:
-        raise ValueError("Invalid image file")
     except Exception as e:
-        logger.error(f"Image processing failed: {str(e)}")
-        raise ValueError("Failed to process image")
-EXTRACTION_FUNCTIONS = {
-    "pdf": extract_text_from_pdf,
-    "docx": extract_text_from_docx,
-    "xlsx": extract_text_from_excel,
-    "pptx": extract_text_from_pptx,
-    "jpg": extract_text_from_image,
-    "jpeg": extract_text_from_image,
-    "png": extract_text_from_image
-}
 @app.post("/summarize")
-async def summarize_document(file: UploadFile = File(...)):
     try:
-        file_ext, content = await validate_file(file)
-        # Get the appropriate extraction function
-        extractor = EXTRACTION_FUNCTIONS.get(file_ext)
-        if not extractor:
-            raise HTTPException(400, "Unsupported file type")
-        # Extract text
-        text = extractor(content)
         if not text.strip():
             raise HTTPException(400, "No extractable text found")
-        # Clean and summarize
-        clean_text = re.sub(r'\s+', ' ', text).strip()[:3000]  # Limit to 3000 chars
-        summary = summarizer(clean_text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
-        return {"summary": summary}
-    except HTTPException as he:
-        raise he
-    except ValueError as ve:
-        logger.error(f"Processing error: {str(ve)}")
-        raise HTTPException(422, detail=str(ve))
     except Exception as e:
-        logger.error(f"Unexpected error: {str(e)}\n{traceback.format_exc()}")
-        raise HTTPException(500, detail=f"Document processing failed: {str(e)}")
 @app.post("/qa")
 async def question_answering(
     file: UploadFile = File(...),
     question: str = Form(...),
     language: str = Form("fr")
 ):
     try:
-        file_ext, content = await validate_file(file)
-        # Get the appropriate extraction function
-        extractor = EXTRACTION_FUNCTIONS.get(file_ext)
-        if not extractor:
-            raise HTTPException(400, "Unsupported file type")
-        # Extract text
-        text = extractor(content)
         if not text.strip():
             raise HTTPException(400, "No extractable text found")
-        # Clean text
-        clean_text = re.sub(r'\s+', ' ', text).strip()[:3000]  # Limit to 3000 chars
-        # Check for theme questions
         theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
         if any(kw in question.lower() for kw in theme_keywords):
             try:
-                theme = summarizer(clean_text, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
                 return {
                     "question": question,
                     "answer": f"Le document traite principalement de : {theme}",
@@ -209,7 +215,7 @@ async def question_answering(
                     "language": language
                 }
             except Exception:
-                theme = clean_text[:200] + ("..." if len(clean_text) > 200 else "")
                 return {
                     "question": question,
                     "answer": f"D'après le document : {theme}",
@@ -217,24 +223,30 @@ async def question_answering(
                     "language": language,
                     "warning": "theme_summary_fallback"
                 }
         # Standard QA
-        result = qa_model(question=question, context=clean_text)
         return {
             "question": question,
             "answer": result["answer"],
             "confidence": result["score"],
             "language": language
         }
-    except HTTPException as he:
-        raise he
-    except ValueError as ve:
-        logger.error(f"Processing error: {str(ve)}")
-        raise HTTPException(422, detail=str(ve))
     except Exception as e:
-        logger.error(f"Unexpected error: {str(e)}\n{traceback.format_exc()}")
-        raise HTTPException(500, detail=f"Question answering failed: {str(e)}")
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)

+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from transformers import pipeline
+from typing import Tuple
 import io
 import fitz  # PyMuPDF
+from PIL import Image
 import pandas as pd
 import uvicorn
 from docx import Document
 import pytesseract
 import logging
 import re
+from slowapi import Limiter
+from slowapi.util import get_remote_address
+from slowapi.errors import RateLimitExceeded
+from slowapi.middleware import SlowAPIMiddleware
+# Initialize rate limiter
+# Requests are bucketed by whatever key get_remote_address derives from the
+# incoming request (presumably the client address — confirm in slowapi docs).
+limiter = Limiter(key_func=get_remote_address)
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 app = FastAPI()
+# Apply rate limiting middleware
+app.state.limiter = limiter
+app.add_middleware(SlowAPIMiddleware)
 # CORS Configuration
 app.add_middleware(
     CORSMiddleware,
 # Constants
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+# Lower-case filename extensions accepted by process_uploaded_file.
 SUPPORTED_FILE_TYPES = {
+    "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
 }
+# Model caching
+# Each slot stays None until the matching get_* function loads its pipeline
+# on first use; later calls reuse the stored instance.
+summarizer = None
+qa_model = None
+image_captioner = None
+def get_summarizer():
+    """Return the shared summarization pipeline, creating it on the first call."""
+    global summarizer
+    if summarizer is not None:
+        return summarizer
+    # First use: build the pipeline once and keep it in the module-level slot.
+    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    return summarizer
+def get_qa_model():
+    """Return the shared question-answering pipeline, creating it on the first call."""
+    global qa_model
+    if qa_model is not None:
+        return qa_model
+    # First use: build the pipeline once and keep it in the module-level slot.
+    qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
+    return qa_model
+def get_image_captioner():
+    """Return the shared image-captioning pipeline, creating it on the first call."""
+    global image_captioner
+    if image_captioner is not None:
+        return image_captioner
+    # First use: build the pipeline once and keep it in the module-level slot.
+    image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+    return image_captioner
+async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
+    """Validate an upload and return its extension together with its raw bytes.
+
+    Args:
+        file: The uploaded file received by an endpoint.
+
+    Returns:
+        A ``(file_ext, content)`` tuple: the lower-cased text after the last
+        dot of the filename, and the file content.
+
+    Raises:
+        HTTPException: 400 when the filename is missing or the extension is
+            not in SUPPORTED_FILE_TYPES; 413 when the content exceeds
+            MAX_FILE_SIZE; 422 when a PDF cannot be opened, stays locked after
+            trying the empty password, or has more than 50 pages.
+    """
     if not file.filename:
         raise HTTPException(400, "No filename provided")
     file_ext = file.filename.split('.')[-1].lower()
     if file_ext not in SUPPORTED_FILE_TYPES:
+        # sorted(): set iteration order is arbitrary; keep the message stable.
+        raise HTTPException(400, f"Unsupported file type. Supported: {', '.join(sorted(SUPPORTED_FILE_TYPES))}")
+    # Read at most one byte past the limit so an oversized upload is rejected
+    # without first being loaded into memory in full.
+    content = await file.read(MAX_FILE_SIZE + 1)
     if len(content) > MAX_FILE_SIZE:
         raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
+    # Special validation for PDFs
+    if file_ext == "pdf":
+        try:
+            with fitz.open(stream=content, filetype="pdf") as doc:
+                # Try the empty password before giving up on an encrypted file.
+                if doc.is_encrypted and not doc.authenticate(""):
+                    raise ValueError("Encrypted PDF - cannot extract text")
+                if len(doc) > 50:
+                    raise ValueError("PDF too large (max 50 pages)")
+        except Exception as e:
+            logger.error(f"PDF validation failed: {str(e)}")
+            # Chain the cause so the original traceback is not lost.
+            raise HTTPException(422, detail=f"Invalid PDF file: {str(e)}") from e
+    await file.seek(0)  # Reset file pointer for processing
     return file_ext, content
+def extract_text(content: bytes, file_ext: str) -> str:
+    """Extract plain text from the bytes of an uploaded document or image.
+
+    Args:
+        content: Raw file content.
+        file_ext: Lower-case extension selecting the extraction strategy
+            (docx, xlsx/xls, pptx, pdf, jpg/jpeg/png).
+
+    Returns:
+        The extracted text. For images this is the OCR output, or a generated
+        caption when OCR yields nothing.
+
+    Raises:
+        HTTPException: 422 when extraction fails for any reason, including an
+            extension that has no extraction branch.
+    """
     try:
+        if file_ext == "docx":
+            doc = Document(io.BytesIO(content))
+            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
+        elif file_ext in {"xlsx", "xls"}:
+            # sheet_name=None loads every sheet as {sheet name: DataFrame}.
+            df = pd.read_excel(io.BytesIO(content), sheet_name=None)
+            all_text = []
+            for sheet_name, sheet_data in df.items():
+                sheet_text = []
+                for column in sheet_data.columns:
+                    sheet_text.extend(sheet_data[column].dropna().astype(str).tolist())
+                all_text.append(f"Sheet: {sheet_name}\n" + "\n".join(sheet_text))
+            return "\n\n".join(all_text)
+        elif file_ext == "pptx":
+            ppt = Presentation(io.BytesIO(content))
+            text = []
+            for slide in ppt.slides:
+                for shape in slide.shapes:
+                    if hasattr(shape, "text") and shape.text.strip():
+                        text.append(shape.text)
+            return "\n".join(text)
+        elif file_ext == "pdf":
+            # Context manager closes the document even if a page fails to parse.
+            with fitz.open(stream=content, filetype="pdf") as pdf:
+                return "\n".join(page.get_text("text") for page in pdf)
+        elif file_ext in {"jpg", "jpeg", "png"}:
+            image = Image.open(io.BytesIO(content))
+            # First try OCR. A crash in OCR must not block the captioning
+            # fallback, so it is treated the same as "no text found".
+            try:
+                text = pytesseract.image_to_string(image, config='--psm 6')
+            except Exception as ocr_e:
+                logger.warning(f"OCR failed, falling back to captioning: {str(ocr_e)}")
+                text = ""
+            if text.strip():
+                return text
+            # OCR produced nothing usable: describe the image instead.
+            try:
+                captioner = get_image_captioner()
+                result = captioner(image)
+                return result[0]['generated_text']
+            except Exception as img_e:
+                logger.error(f"Image processing failed: {str(img_e)}")
+                raise ValueError("Could not extract text or caption from image") from img_e
+        else:
+            # Without this branch the function would return None and callers
+            # would fail later on text.strip().
+            raise ValueError(f"Unsupported file type: {file_ext}")
     except Exception as e:
+        logger.error(f"Text extraction failed for {file_ext}: {str(e)}")
+        raise HTTPException(422, f"Failed to extract text from {file_ext} file") from e
 @app.post("/summarize")
+@limiter.limit("5/minute")
+async def summarize_document(request: Request, file: UploadFile = File(...)):
+    """Summarize an uploaded document chunk by chunk.
+
+    The extracted text is whitespace-normalized, cut into 1000-character
+    pieces, each piece is summarized, and the partial summaries are joined
+    with spaces. ``request`` is not used in the body; it is part of the
+    signature alongside the @limiter.limit decorator.
+
+    Returns:
+        ``{"summary": <joined summaries>}``.
+
+    Raises:
+        HTTPException: 400 when no text could be extracted, 500 on any
+            unexpected failure; HTTPExceptions from helpers pass through.
+    """
     try:
+        ext, raw = await process_uploaded_file(file)
+        extracted = extract_text(raw, ext)
+        if not extracted.strip():
             raise HTTPException(400, "No extractable text found")
+        # Clean and chunk text
+        normalized = re.sub(r'\s+', ' ', extracted).strip()
+        pieces = []
+        for start in range(0, len(normalized), 1000):
+            pieces.append(normalized[start:start + 1000])
+        # Summarize each chunk
+        model = get_summarizer()
+        partials = [
+            model(piece, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
+            for piece in pieces
+        ]
+        return {"summary": " ".join(partials)}
+    except HTTPException:
+        raise
     except Exception as e:
+        logger.error(f"Summarization failed: {str(e)}")
+        raise HTTPException(500, "Document summarization failed")
 @app.post("/qa")
+@limiter.limit("5/minute")
 async def question_answering(
+    request: Request,
     file: UploadFile = File(...),
     question: str = Form(...),
     language: str = Form("fr")
 ):
+    """Answer a question about an uploaded document.
+
+    Questions containing one of the theme keywords are answered with a short
+    summary of the document (or its first 200 characters if summarization
+    fails). All other questions go to the extractive QA model, using the
+    first 3000 characters of the cleaned text as context.
+
+    Args:
+        request: Not used in the body; part of the signature alongside the
+            @limiter.limit decorator.
+        file: The uploaded document or image.
+        question: The user's question.
+        language: Echoed back in the response; defaults to "fr".
+
+    Returns:
+        A dict that includes the question, the answer and the language;
+        standard QA answers also carry the model's confidence score.
+
+    Raises:
+        HTTPException: 400 when no text could be extracted, 500 on any
+            unexpected failure; HTTPExceptions from helpers pass through.
+    """
     try:
+        file_ext, content = await process_uploaded_file(file)
+        text = extract_text(content, file_ext)
         if not text.strip():
             raise HTTPException(400, "No extractable text found")
+        # Clean and truncate text
+        text = re.sub(r'\s+', ' ', text).strip()[:5000]
+        # Theme detection
         theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
         if any(kw in question.lower() for kw in theme_keywords):
             try:
+                summarizer = get_summarizer()
+                # Keep the length bounds consistent: for short documents
+                # len(text)//4 falls below 30 (or to 0), which would request
+                # min_length > max_length from the model.
+                max_len = max(1, min(100, len(text)//4))
+                min_len = min(30, max_len)
+                summary_output = summarizer(
+                    text,
+                    max_length=max_len,
+                    min_length=min_len,
+                    do_sample=False,
+                    truncation=True
+                )
+                theme = summary_output[0].get("summary_text", text[:200] + "...")
                 return {
                     "question": question,
                     "answer": f"Le document traite principalement de : {theme}",
                     "language": language
                 }
+            except Exception as summary_error:
+                # Best-effort fallback to the opening text, but record why the
+                # summary failed instead of swallowing the error silently.
+                logger.warning(f"Theme summary failed, using fallback: {str(summary_error)}")
+                theme = text[:200] + ("..." if len(text) > 200 else "")
                 return {
                     "question": question,
                     "answer": f"D'après le document : {theme}",
                     "language": language,
                     "warning": "theme_summary_fallback"
                 }
         # Standard QA
+        qa = get_qa_model()
+        result = qa(question=question, context=text[:3000])
         return {
             "question": question,
             "answer": result["answer"],
             "confidence": result["score"],
             "language": language
         }
+    except HTTPException:
+        raise
     except Exception as e:
+        logger.error(f"QA processing failed: {str(e)}")
+        raise HTTPException(500, detail=f"Analysis failed: {str(e)}")
+@app.exception_handler(RateLimitExceeded)
+async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
+    """Turn a RateLimitExceeded error into an HTTP 429 JSON response."""
+    payload = {"detail": "Too many requests. Please try again later."}
+    return JSONResponse(content=payload, status_code=429)
+# When this file is executed directly, serve the app on all interfaces, port 7860.
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)