roshini7sn committed (verified)
Commit 9dc84ae · Parent(s): d8dbb12

Update app.py

Files changed (1):
  1. app.py  +116 -37
app.py CHANGED
@@ -25,6 +25,7 @@ from sentence_transformers import SentenceTransformer
 from sklearn.neighbors import NearestNeighbors
 
 import torch
+import brotli
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -38,7 +39,7 @@ from transformers import (
 EMBED_MODEL = "intfloat/e5-small-v2"
 LLM_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
 
-# Fully open-source translation model (works everywhere)
+# Translation model (open, no auth required)
 TRANS_MODEL_ID = "facebook/nllb-200-distilled-600M"
 
 CHUNK_SIZE = 1500
@@ -49,14 +50,14 @@ MIN_SECTION_LEN = 300
 # =========================
 # CLEAN TEXT
 # =========================
-def clean_text(text):
+def clean_text(text: str) -> str:
     return " ".join(text.replace("\r", "\n").split())
 
 
 # =========================
 # PDF INGEST
 # =========================
-def extract_text_from_pdf(path):
+def extract_text_from_pdf(path: str) -> str:
     reader = PdfReader(path)
     text = ""
     for page in reader.pages:
@@ -65,8 +66,8 @@ def extract_text_from_pdf(path):
     return clean_text(text)
 
 
-def extract_pdf_from_url(url):
-    r = requests.get(url, timeout=10)
+def extract_pdf_from_url(url: str) -> str:
+    r = requests.get(url, timeout=20)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
     tmp.write(r.content)
     tmp.flush()
@@ -76,44 +77,100 @@ def extract_pdf_from_url(url):
 
 
 # =========================
-# DOCX / TXT / CSV / HTML INGEST
+# DOCX / TXT / CSV INGEST
 # =========================
-def extract_docx_from_url(url):
-    r = requests.get(url, timeout=10)
+def extract_docx_from_url(url: str) -> str:
+    r = requests.get(url, timeout=20)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
     tmp.write(r.content)
     tmp.flush()
-    doc = docx.Document(tmp.name)
-    text = "\n".join(p.text for p in doc.paragraphs)
+    document = docx.Document(tmp.name)
+    text = "\n".join(p.text for p in document.paragraphs)
     tmp.close()
     return clean_text(text)
 
 
-def extract_txt_from_url(url):
-    return clean_text(requests.get(url, timeout=10).text)
+def extract_txt_from_url(url: str) -> str:
+    return clean_text(requests.get(url, timeout=20).text)
 
 
-def extract_csv_from_url(url):
-    df = pd.read_csv(StringIO(requests.get(url, timeout=10).text))
+def extract_csv_from_url(url: str) -> str:
+    df = pd.read_csv(StringIO(requests.get(url, timeout=20).text))
     return clean_text(df.to_string())
 
 
-def extract_html_from_url(url):
-    downloaded = trafilatura.fetch_url(url)
-    if downloaded:
-        extracted = trafilatura.extract(downloaded)
-        if extracted:
-            return clean_text(extracted)
+# =========================
+# ROBUST HTML + IN-PAGE PDF HANDLER
+# =========================
+def extract_html_from_url(url: str) -> str:
+    """
+    Robust extractor for research sites:
+    - Handles brotli (br) encoding
+    - Detects <a href="...pdf"> links inside HTML and downloads PDF
+    - Falls back to cleaned HTML text
+    """
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0 Safari/537.36"
+        ),
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate, br",
+    }
+
+    # 1) Fetch HTML
+    try:
+        resp = requests.get(url, headers=headers, timeout=20)
+
+        if resp.headers.get("Content-Encoding") == "br":
+            html = brotli.decompress(resp.content).decode("utf-8", errors="ignore")
+        else:
+            html = resp.text
+    except Exception as e:
+        return f"Error loading HTML: {e}"
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    # 2) Try to find a PDF link inside the page
+    pdf_links = [a["href"] for a in soup.find_all("a", href=True)
+                 if ".pdf" in a["href"].lower()]
 
-    resp = requests.get(url, timeout=10)
-    soup = BeautifulSoup(resp.text, "html.parser")
-    return clean_text(soup.get_text(separator=" "))
+    if pdf_links:
+        pdf_url = pdf_links[0]
+        if pdf_url.startswith("/"):
+            from urllib.parse import urljoin
+            pdf_url = urljoin(url, pdf_url)
+
+        try:
+            pdf_resp = requests.get(pdf_url, headers=headers, timeout=20)
+            if pdf_resp.status_code == 200:
+                tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+                tmp.write(pdf_resp.content)
+                tmp.flush()
+                text = extract_text_from_pdf(tmp.name)
+                tmp.close()
+                return text
+        except Exception:
+            # If PDF fails, fall back to HTML extraction
+            pass
+
+    # 3) Try trafilatura for main text
+    extracted = trafilatura.extract(html)
+    if extracted and len(extracted) > 200:
+        return clean_text(extracted)
+
+    # 4) Raw HTML fallback
+    for bad in soup(["script", "style", "noscript"]):
+        bad.decompose()
+
+    return clean_text(soup.get_text(" ", strip=True))
 
 
 # =========================
 # FILE TYPE DETECTION
 # =========================
-def detect_filetype(url, headers):
+def detect_filetype(url: str, headers) -> str:
     u = url.lower()
     c = headers.get("Content-Type", "").lower()
 
@@ -138,7 +195,7 @@ SECTION_KEYWORDS = [
 ]
 
 
-def is_heading(line):
+def is_heading(line: str) -> bool:
     line = line.strip()
     if not line or len(line) > 120:
         return False
@@ -152,7 +209,7 @@ def is_heading(line):
     return False
 
 
-def split_into_sections(text):
+def split_into_sections(text: str):
     lines = text.split("\n")
     sections, title, buf = [], "Document", []
 
@@ -176,9 +233,10 @@ def split_into_sections(text):
     return sections
 
 
-def chunk_text(text):
+def chunk_text(text: str):
     sections = split_into_sections(text)
 
+    # fallback: sliding window if no good sections
     if len(sections) == 1:
         chunks = []
         start = 0
@@ -212,7 +270,7 @@ def chunk_text(text):
 # SEMANTIC SEARCH (KNN)
 # =========================
 class SemanticSearch:
-    def __init__(self, model):
+    def __init__(self, model: str):
         self.embedder = SentenceTransformer(model)
         self.knn = None
         self.chunks = []
@@ -263,12 +321,14 @@ q_model.eval()
 
 
 @torch.no_grad()
-def run_llm(system, user):
+def run_llm(system: str, user: str) -> str:
     messages = [
         {"role": "system", "content": system},
         {"role": "user", "content": user},
     ]
-    text = q_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    text = q_tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
     inp = q_tokenizer(text, return_tensors="pt").to("cpu")
 
     out = q_model.generate(
@@ -307,21 +367,19 @@ LANG_CODES = {
 }
 
 
-def translate_to_indic(text, lang):
+def translate_to_indic(text: str, lang: str) -> str:
     if lang == "English" or lang == "auto":
         return text
 
     try:
         tgt = LANG_CODES[lang]
-
         inputs = trans_tokenizer(text, return_tensors="pt").to("cpu")
         output = trans_model.generate(
             **inputs,
             forced_bos_token_id=trans_tokenizer.convert_tokens_to_ids(tgt),
-            max_new_tokens=300
+            max_new_tokens=300,
         )
         return trans_tokenizer.batch_decode(output, skip_special_tokens=True)[0]
-
     except Exception as e:
         print("Translation error:", e)
         return text
@@ -409,7 +467,7 @@ def load_url_ui(url, lang):
         return "Enter a URL."
 
     try:
-        head = requests.head(url, allow_redirects=True, timeout=10)
+        head = requests.head(url, allow_redirects=True, timeout=20)
         ftype = detect_filetype(url, head.headers)
 
         if ftype == "pdf":
@@ -442,8 +500,11 @@ def create_app():
         gr.Markdown("<h1>📘 Multilingual Chat with PDF / URL</h1>")
 
         lang = gr.Dropdown(
-            ["auto", "English", "Hindi", "Telugu", "Tamil", "Kannada", "Malayalam",
-             "Bengali", "Marathi", "Gujarati", "Odia", "Punjabi", "Assamese"],
+            [
+                "auto", "English", "Hindi", "Telugu", "Tamil",
+                "Kannada", "Malayalam", "Bengali", "Marathi",
+                "Gujarati", "Odia", "Punjabi", "Assamese"
+            ],
             value="auto",
             label="Answer Language"
         )
@@ -464,6 +525,24 @@ def create_app():
 
         gr.Button("Ask").click(answer_question, [q, lang], [a, cits])
 
+        # Example Questions
+        gr.Markdown("### ✨ Example Questions")
+
+        with gr.Row():
+            ex1 = gr.Button("Give a summary of this document")
+            ex2 = gr.Button("What are the key findings?")
+            ex3 = gr.Button("Explain the methodology used")
+            ex4 = gr.Button("List the main conclusions")
+            ex5 = gr.Button("Explain in simple terms")
+            ex6 = gr.Button("What is the significance of this study?")
+
+        ex1.click(lambda: "Give a summary of this document", None, q)
+        ex2.click(lambda: "What are the key findings?", None, q)
+        ex3.click(lambda: "Explain the methodology used", None, q)
+        ex4.click(lambda: "List the main conclusions", None, q)
+        ex5.click(lambda: "Explain this in simple terms", None, q)
+        ex6.click(lambda: "What is the significance of this study?", None, q)
+
     return demo
 
 
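A quick way to exercise the behavior this commit adds is to call the new extract_html_from_url cascade directly. The snippet below is a minimal smoke-test sketch, not part of the commit: the URL is a placeholder, it assumes app.py is importable from the working directory, and importing app also loads the e5, Qwen, and NLLB models declared at module level.

    # Minimal smoke-test sketch for the new HTML / in-page-PDF extraction path (illustrative only).
    # Assumptions: app.py is on the import path; the URL below is a placeholder;
    # importing app also downloads/loads the embedding, LLM, and translation models.
    from app import extract_html_from_url

    url = "https://example.com/paper-landing-page"  # hypothetical page that links to a PDF

    text = extract_html_from_url(url)

    # Per the new code path: text of a linked PDF if one is found, otherwise
    # trafilatura output, otherwise raw-HTML text, or an "Error loading HTML: ..." string.
    print(len(text))
    print(text[:300])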