roshini7sn committed
Commit d8dbb12 · verified · 1 Parent(s): 2caf310

Update app.py

Files changed (1):
  1. app.py +99 -119
app.py CHANGED
@@ -1,6 +1,6 @@
-# =========================================================
+# =========================
 # GLOBAL CACHE
-# =========================================================
+# =========================
 CACHE = {
     "last_text_hash": None,
     "chunks": None,
@@ -32,11 +32,13 @@ from transformers import (
     M2M100ForConditionalGeneration,
 )
 
-# =========================================================
+# =========================
 # CONFIG
-# =========================================================
+# =========================
 EMBED_MODEL = "intfloat/e5-small-v2"
 LLM_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
+
+# ⭐ Fully open-source translation model (works everywhere)
 TRANS_MODEL_ID = "facebook/nllb-200-distilled-600M"
 
 CHUNK_SIZE = 1500
@@ -44,103 +46,73 @@ CHUNK_OVERLAP = 300
 MIN_SECTION_LEN = 300
 
 
-# =========================================================
+# =========================
 # CLEAN TEXT
-# =========================================================
+# =========================
 def clean_text(text):
     return " ".join(text.replace("\r", "\n").split())
 
 
-# =========================================================
-# SAFE HTML DECODER (Brotli-proof)
-# =========================================================
-def safe_decode_content(url, resp):
-    """Decode HTML safely. If Brotli fails, retry without compression."""
-    enc = resp.headers.get("Content-Encoding", "").lower()
-
-    try:
-        if "br" in enc:
-            import brotli
-            try:
-                # Try standard brotli decode
-                return brotli.decompress(resp.content).decode("utf-8", errors="ignore")
-            except Exception:
-                # Retry plain text request
-                retry = requests.get(
-                    url,
-                    headers={
-                        "User-Agent": "Mozilla/5.0",
-                        "Accept-Encoding": "identity"
-                    },
-                    timeout=20,
-                )
-                return retry.text
-
-        return resp.text
-
-    except Exception as e:
-        raise RuntimeError(f"Decompression error: {e}")
-
-
-# =========================================================
+# =========================
 # PDF INGEST
-# =========================================================
+# =========================
 def extract_text_from_pdf(path):
     reader = PdfReader(path)
     text = ""
     for page in reader.pages:
-        text += "\n" + (page.extract_text() or "")
+        page_text = page.extract_text() or ""
+        text += "\n" + page_text
     return clean_text(text)
 
 
 def extract_pdf_from_url(url):
-    r = requests.get(url, timeout=20)
+    r = requests.get(url, timeout=10)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
     tmp.write(r.content)
     tmp.flush()
-    text = extract_text_from_pdf(tmp.name)
+    txt = extract_text_from_pdf(tmp.name)
     tmp.close()
-    return text
+    return txt
 
 
-# =========================================================
-# OTHER FILE INGEST
-# =========================================================
+# =========================
+# DOCX / TXT / CSV / HTML INGEST
+# =========================
 def extract_docx_from_url(url):
-    r = requests.get(url, timeout=20)
+    r = requests.get(url, timeout=10)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
     tmp.write(r.content)
     tmp.flush()
     doc = docx.Document(tmp.name)
+    text = "\n".join(p.text for p in doc.paragraphs)
     tmp.close()
-    return clean_text("\n".join(p.text for p in doc.paragraphs))
+    return clean_text(text)
 
 
 def extract_txt_from_url(url):
-    return clean_text(requests.get(url, timeout=20).text)
+    return clean_text(requests.get(url, timeout=10).text)
 
 
 def extract_csv_from_url(url):
-    df = pd.read_csv(StringIO(requests.get(url, timeout=20).text))
+    df = pd.read_csv(StringIO(requests.get(url, timeout=10).text))
     return clean_text(df.to_string())
 
 
 def extract_html_from_url(url):
-    headers = {
-        "User-Agent": "Mozilla/5.0",
-        "Accept-Encoding": "gzip, deflate, br"
-    }
-
-    resp = requests.get(url, headers=headers, timeout=20)
-
-    html = safe_decode_content(url, resp)
-    soup = BeautifulSoup(html, "html.parser")
+    downloaded = trafilatura.fetch_url(url)
+    if downloaded:
+        extracted = trafilatura.extract(downloaded)
+        if extracted:
+            return clean_text(extracted)
+
+    resp = requests.get(url, timeout=10)
+    soup = BeautifulSoup(resp.text, "html.parser")
     return clean_text(soup.get_text(separator=" "))
 
 
-# =========================================================
+# =========================
 # FILE TYPE DETECTION
-# =========================================================
+# =========================
 def detect_filetype(url, headers):
     u = url.lower()
     c = headers.get("Content-Type", "").lower()
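Note on the new HTML path above: it tries trafilatura first and only falls back to raw BeautifulSoup text when extraction yields nothing. A minimal standalone sketch of that flow, assuming trafilatura is imported elsewhere in app.py and using a placeholder URL:

import trafilatura

downloaded = trafilatura.fetch_url("https://example.com/article")   # returns None if the fetch fails
extracted = trafilatura.extract(downloaded) if downloaded else None  # None when no main content is found
print(extracted or "fall back to requests + BeautifulSoup")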
@@ -156,13 +128,13 @@ def detect_filetype(url, headers):
     return "html"
 
 
-# =========================================================
+# =========================
 # SECTION-AWARE CHUNKING
-# =========================================================
+# =========================
 SECTION_KEYWORDS = [
-    "abstract", "introduction", "method", "methodology",
-    "materials and methods", "results", "discussion",
-    "conclusion", "conclusions", "background"
+    "introduction", "method", "methodology", "materials and methods",
+    "results", "discussion", "conclusion", "conclusions", "abstract",
+    "background", "analysis"
 ]
 
 
@@ -182,9 +154,7 @@ def is_heading(line):
 
 def split_into_sections(text):
     lines = text.split("\n")
-    sections = []
-    buf = []
-    title = "Document"
+    sections, title, buf = [], "Document", []
 
     for line in lines:
         if is_heading(line):
@@ -196,37 +166,51 @@ def split_into_sections(text):
         buf.append(line)
 
     if buf:
-        sections.append((title, "\n".join(buf)))
+        body = "\n".join(buf)
+        if len(body) > MIN_SECTION_LEN:
+            sections.append((title, body))
 
-    return sections if sections else [("Document", text)]
+    if not sections:
+        return [("Document", text)]
+
+    return sections
 
 
 def chunk_text(text):
     sections = split_into_sections(text)
 
+    if len(sections) == 1:
+        chunks = []
+        start = 0
+        while start < len(text):
+            end = min(start + CHUNK_SIZE, len(text))
+            chunks.append(text[start:end])
+            start += CHUNK_SIZE - CHUNK_OVERLAP
+        return chunks
+
     chunks = []
     for _, body in sections:
         paragraphs = [p.strip() for p in re.split(r"\n\s*\n", body) if p.strip()]
-        cur = ""
+        current = ""
 
         for para in paragraphs:
-            if not cur:
-                cur = para
-            elif len(cur) + len(para) + 2 <= CHUNK_SIZE:
-                cur += "\n\n" + para
+            if not current:
+                current = para
+            elif len(current) + len(para) + 2 <= CHUNK_SIZE:
+                current += "\n\n" + para
             else:
-                chunks.append(cur)
-                cur = para
+                chunks.append(current)
+                current = para
 
-        if cur:
-            chunks.append(cur)
+        if current:
+            chunks.append(current)
 
     return chunks
 
 
-# =========================================================
-# SEMANTIC SEARCH
-# =========================================================
+# =========================
+# SEMANTIC SEARCH (KNN)
+# =========================
 class SemanticSearch:
     def __init__(self, model):
         self.embedder = SentenceTransformer(model)
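For the single-section fallback added in chunk_text: with CHUNK_SIZE = 1500 and CHUNK_OVERLAP = 300, each window starts 1200 characters after the previous one, so consecutive chunks share 300 characters. A quick check of the arithmetic on a hypothetical 4000-character document:

CHUNK_SIZE, CHUNK_OVERLAP = 1500, 300
n, start = 4000, 0  # n is a made-up document length
while start < n:
    print((start, min(start + CHUNK_SIZE, n)))  # (0, 1500) (1200, 2700) (2400, 3900) (3600, 4000)
    start += CHUNK_SIZE - CHUNK_OVERLAP        # stride = 1200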
@@ -240,10 +224,12 @@ class SemanticSearch:
         h = hashlib.md5("".join(chunks).encode()).hexdigest()
 
         if CACHE["last_text_hash"] == h:
+            print("⚡ Using cached embeddings")
             self.chunks = CACHE["chunks"]
             self.knn = CACHE["knn"]
             return
 
+        print("➡ Rebuilding embeddings…")
         self.chunks = chunks
         emb = self.embedder.encode(chunks, convert_to_numpy=True)
 
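The cache test above keys on an MD5 digest of the concatenated chunk text, so changing any chunk forces a full re-embed. A standalone illustration of the key (chunk strings are hypothetical):

import hashlib

chunks = ["first chunk", "second chunk"]
key = hashlib.md5("".join(chunks).encode()).hexdigest()
print(key)  # identical chunks -> identical key -> cached embeddings reused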
@@ -266,10 +252,11 @@ class SemanticSearch:
 vs = None
 
 
-# =========================================================
-# LOAD LOCAL QWEN
-# =========================================================
+# =========================
+# LOAD QWEN FOR RAG
+# =========================
 print("Loading Qwen 0.5B…")
+
 q_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
 q_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL).to("cpu")
 q_model.eval()
@@ -292,15 +279,15 @@ def run_llm(system, user):
         temperature=0.6,
         eos_token_id=q_tokenizer.eos_token_id,
     )
-
     gen = out[0][inp["input_ids"].shape[1]:]
     return q_tokenizer.decode(gen, skip_special_tokens=True).strip()
 
 
-# =========================================================
-# TRANSLATION (NLLB)
-# =========================================================
-print("Loading NLLB translator…")
+# =========================
+# LOAD NLLB TRANSLATOR
+# =========================
+print("Loading NLLB-200 translator…")
+
 trans_tokenizer = NllbTokenizer.from_pretrained(TRANS_MODEL_ID)
 trans_model = M2M100ForConditionalGeneration.from_pretrained(TRANS_MODEL_ID).to("cpu")
 
@@ -326,20 +313,23 @@ def translate_to_indic(text, lang):
 
     try:
         tgt = LANG_CODES[lang]
+
         inputs = trans_tokenizer(text, return_tensors="pt").to("cpu")
         output = trans_model.generate(
             **inputs,
             forced_bos_token_id=trans_tokenizer.convert_tokens_to_ids(tgt),
-            max_new_tokens=300,
+            max_new_tokens=300
         )
         return trans_tokenizer.batch_decode(output, skip_special_tokens=True)[0]
-    except:
+
+    except Exception as e:
+        print("Translation error:", e)
         return text
 
 
-# =========================================================
+# =========================
 # RAG PROMPT
-# =========================================================
+# =========================
 def build_prompt(question, retrieved):
     ctx = "\n\n---\n\n".join([c for c, _ in retrieved])
     return f"""
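LANG_CODES is referenced here but defined outside the hunks shown. Since NLLB-200 targets are FLORES-200 language codes, a plausible reconstruction (an assumption, not the committed code) would be:

LANG_CODES = {  # hypothetical mapping; the actual dict lives elsewhere in app.py
    "Hindi": "hin_Deva", "Telugu": "tel_Telu", "Tamil": "tam_Taml",
    "Kannada": "kan_Knda", "Malayalam": "mal_Mlym", "Bengali": "ben_Beng",
    "Marathi": "mar_Deva", "Gujarati": "guj_Gujr", "Odia": "ory_Orya",
    "Punjabi": "pan_Guru", "Assamese": "asm_Beng",
}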
@@ -351,8 +341,8 @@ Follow these rules:
 1. Use ONLY the context. Do not add external knowledge.
 2. If the context does not contain the answer, say:
    "I don't know based on this document."
-3. When possible, give the answer in short, clear points.
-4. Always answer in English.
+3. When possible, structure your answer into short, clear points.
+4. Keep the answer concise, factual, and in English.
 
 CONTEXT:
 {ctx}
@@ -360,13 +350,13 @@ CONTEXT:
 QUESTION:
 {question}
 
-Write your answer below:
+Write your answer below (in English):
 """.strip()
 
 
-# =========================================================
+# =========================
 # SOURCE DISPLAY
-# =========================================================
+# =========================
 def highlight_sources(retrieved):
     html = "<h4>📚 Source Passages</h4>"
     for i, (chunk, score) in enumerate(retrieved):
@@ -379,9 +369,9 @@ def highlight_sources(retrieved):
     return html
 
 
-# =========================================================
+# =========================
 # ANSWER FUNCTION
-# =========================================================
+# =========================
 def answer_question(q, lang):
     global vs
     if vs is None:
@@ -396,9 +386,9 @@ def answer_question(q, lang):
     return final, highlight_sources(retrieved)
 
 
-# =========================================================
+# =========================
 # LOADERS
-# =========================================================
+# =========================
 def load_pdf_ui(file, lang):
     global vs
     if not file:
@@ -419,7 +409,7 @@ def load_url_ui(url, lang):
         return "Enter a URL."
 
     try:
-        head = requests.head(url, allow_redirects=True, timeout=20)
+        head = requests.head(url, allow_redirects=True, timeout=10)
         ftype = detect_filetype(url, head.headers)
 
         if ftype == "pdf":
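detect_filetype decides from the URL suffix and the HEAD response's Content-Type; the shorter 10-second timeout now applies here as well. A minimal sketch with a placeholder URL:

import requests

head = requests.head("https://example.com/paper.pdf", allow_redirects=True, timeout=10)
print(head.headers.get("Content-Type", ""))  # e.g. "application/pdf"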
@@ -443,20 +433,17 @@ def load_url_ui(url, lang):
     return f"URL loaded with {len(chunks)} chunks."
 
 
-# =========================================================
+# =========================
 # UI
-# =========================================================
+# =========================
 def create_app():
     with gr.Blocks() as demo:
 
         gr.Markdown("<h1>📘 Multilingual Chat with PDF / URL</h1>")
 
         lang = gr.Dropdown(
-            [
-                "auto", "English", "Hindi", "Telugu", "Tamil", "Kannada",
-                "Malayalam", "Bengali", "Marathi", "Gujarati",
-                "Odia", "Punjabi", "Assamese",
-            ],
+            ["auto", "English", "Hindi", "Telugu", "Tamil", "Kannada", "Malayalam",
+             "Bengali", "Marathi", "Gujarati", "Odia", "Punjabi", "Assamese"],
             value="auto",
             label="Answer Language"
         )
@@ -477,13 +464,6 @@ def create_app():
 
         gr.Button("Ask").click(answer_question, [q, lang], [a, cits])
 
-        gr.Markdown("### ✨ Example Questions")
-        with gr.Row():
-            gr.Button("Summarize the document").click(lambda: "Summarize the document", None, q)
-            gr.Button("What are the key findings?").click(lambda: "What are the key findings?", None, q)
-            gr.Button("Explain the methodology").click(lambda: "Explain the methodology used", None, q)
-            gr.Button("What conclusions are given?").click(lambda: "What are the conclusions?", None, q)
-
     return demo
 
 
469