roshini7sn committed (verified)
Commit 9dc84ae · Parent(s): d8dbb12

Update app.py

Files changed (1):
  1. app.py  +116 -37
app.py CHANGED
@@ -25,6 +25,7 @@ from sentence_transformers import SentenceTransformer
 from sklearn.neighbors import NearestNeighbors
 
 import torch
+import brotli
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
@@ -38,7 +39,7 @@ from transformers import (
 EMBED_MODEL = "intfloat/e5-small-v2"
 LLM_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
 
-# Fully open-source translation model (works everywhere)
+# Translation model (open, no auth required)
 TRANS_MODEL_ID = "facebook/nllb-200-distilled-600M"
 
 CHUNK_SIZE = 1500
@@ -49,14 +50,14 @@ MIN_SECTION_LEN = 300
 # =========================
 # CLEAN TEXT
 # =========================
-def clean_text(text):
+def clean_text(text: str) -> str:
     return " ".join(text.replace("\r", "\n").split())
 
 
 # =========================
 # PDF INGEST
 # =========================
-def extract_text_from_pdf(path):
+def extract_text_from_pdf(path: str) -> str:
     reader = PdfReader(path)
     text = ""
     for page in reader.pages:
@@ -65,8 +66,8 @@ def extract_text_from_pdf(path):
     return clean_text(text)
 
 
-def extract_pdf_from_url(url):
-    r = requests.get(url, timeout=10)
+def extract_pdf_from_url(url: str) -> str:
+    r = requests.get(url, timeout=20)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
     tmp.write(r.content)
     tmp.flush()
@@ -76,44 +77,100 @@ def extract_pdf_from_url(url):
 
 
 # =========================
-# DOCX / TXT / CSV / HTML INGEST
+# DOCX / TXT / CSV INGEST
 # =========================
-def extract_docx_from_url(url):
-    r = requests.get(url, timeout=10)
+def extract_docx_from_url(url: str) -> str:
+    r = requests.get(url, timeout=20)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
     tmp.write(r.content)
     tmp.flush()
-    doc = docx.Document(tmp.name)
-    text = "\n".join(p.text for p in doc.paragraphs)
+    document = docx.Document(tmp.name)
+    text = "\n".join(p.text for p in document.paragraphs)
     tmp.close()
     return clean_text(text)
 
 
-def extract_txt_from_url(url):
-    return clean_text(requests.get(url, timeout=10).text)
+def extract_txt_from_url(url: str) -> str:
+    return clean_text(requests.get(url, timeout=20).text)
 
 
-def extract_csv_from_url(url):
-    df = pd.read_csv(StringIO(requests.get(url, timeout=10).text))
+def extract_csv_from_url(url: str) -> str:
+    df = pd.read_csv(StringIO(requests.get(url, timeout=20).text))
     return clean_text(df.to_string())
 
 
-def extract_html_from_url(url):
-    downloaded = trafilatura.fetch_url(url)
-    if downloaded:
-        extracted = trafilatura.extract(downloaded)
-        if extracted:
-            return clean_text(extracted)
+# =========================
+# ROBUST HTML + IN-PAGE PDF HANDLER
+# =========================
+def extract_html_from_url(url: str) -> str:
+    """
+    Robust extractor for research sites:
+    - Handles brotli (br) encoding
+    - Detects <a href="...pdf"> links inside HTML and downloads PDF
+    - Falls back to cleaned HTML text
+    """
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0 Safari/537.36"
+        ),
+        "Accept": "*/*",
+        "Accept-Encoding": "gzip, deflate, br",
+    }
+
+    # 1) Fetch HTML
+    try:
+        resp = requests.get(url, headers=headers, timeout=20)
+
+        if resp.headers.get("Content-Encoding") == "br":
+            html = brotli.decompress(resp.content).decode("utf-8", errors="ignore")
+        else:
+            html = resp.text
+    except Exception as e:
+        return f"Error loading HTML: {e}"
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    # 2) Try to find a PDF link inside the page
+    pdf_links = [a["href"] for a in soup.find_all("a", href=True)
+                 if ".pdf" in a["href"].lower()]
 
-    resp = requests.get(url, timeout=10)
-    soup = BeautifulSoup(resp.text, "html.parser")
-    return clean_text(soup.get_text(separator=" "))
+    if pdf_links:
+        pdf_url = pdf_links[0]
+        if pdf_url.startswith("/"):
+            from urllib.parse import urljoin
+            pdf_url = urljoin(url, pdf_url)
+
+        try:
+            pdf_resp = requests.get(pdf_url, headers=headers, timeout=20)
+            if pdf_resp.status_code == 200:
+                tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+                tmp.write(pdf_resp.content)
+                tmp.flush()
+                text = extract_text_from_pdf(tmp.name)
+                tmp.close()
+                return text
+        except Exception:
+            # If PDF fails, fall back to HTML extraction
+            pass
+
+    # 3) Try trafilatura for main text
+    extracted = trafilatura.extract(html)
+    if extracted and len(extracted) > 200:
+        return clean_text(extracted)
+
+    # 4) Raw HTML fallback
+    for bad in soup(["script", "style", "noscript"]):
+        bad.decompose()
+
+    return clean_text(soup.get_text(" ", strip=True))
 
 
 # =========================
 # FILE TYPE DETECTION
 # =========================
-def detect_filetype(url, headers):
+def detect_filetype(url: str, headers) -> str:
     u = url.lower()
     c = headers.get("Content-Type", "").lower()
 
@@ -138,7 +195,7 @@ SECTION_KEYWORDS = [
 ]
 
 
-def is_heading(line):
+def is_heading(line: str) -> bool:
     line = line.strip()
     if not line or len(line) > 120:
         return False
@@ -152,7 +209,7 @@ def is_heading(line):
     return False
 
 
-def split_into_sections(text):
+def split_into_sections(text: str):
     lines = text.split("\n")
     sections, title, buf = [], "Document", []
 
@@ -176,9 +233,10 @@ def split_into_sections(text):
     return sections
 
 
-def chunk_text(text):
+def chunk_text(text: str):
     sections = split_into_sections(text)
 
+    # fallback: sliding window if no good sections
     if len(sections) == 1:
         chunks = []
         start = 0
@@ -212,7 +270,7 @@ def chunk_text(text):
 # SEMANTIC SEARCH (KNN)
 # =========================
 class SemanticSearch:
-    def __init__(self, model):
+    def __init__(self, model: str):
         self.embedder = SentenceTransformer(model)
         self.knn = None
         self.chunks = []
@@ -263,12 +321,14 @@ q_model.eval()
 
 
 @torch.no_grad()
-def run_llm(system, user):
+def run_llm(system: str, user: str) -> str:
     messages = [
         {"role": "system", "content": system},
         {"role": "user", "content": user},
     ]
-    text = q_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    text = q_tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
     inp = q_tokenizer(text, return_tensors="pt").to("cpu")
 
     out = q_model.generate(
@@ -307,21 +367,19 @@ LANG_CODES = {
 }
 
 
-def translate_to_indic(text, lang):
+def translate_to_indic(text: str, lang: str) -> str:
     if lang == "English" or lang == "auto":
         return text
 
     try:
         tgt = LANG_CODES[lang]
-
         inputs = trans_tokenizer(text, return_tensors="pt").to("cpu")
         output = trans_model.generate(
             **inputs,
             forced_bos_token_id=trans_tokenizer.convert_tokens_to_ids(tgt),
-            max_new_tokens=300
+            max_new_tokens=300,
         )
         return trans_tokenizer.batch_decode(output, skip_special_tokens=True)[0]
-
     except Exception as e:
         print("Translation error:", e)
         return text
@@ -409,7 +467,7 @@ def load_url_ui(url, lang):
         return "Enter a URL."
 
     try:
-        head = requests.head(url, allow_redirects=True, timeout=10)
+        head = requests.head(url, allow_redirects=True, timeout=20)
         ftype = detect_filetype(url, head.headers)
 
         if ftype == "pdf":
@@ -442,8 +500,11 @@ def create_app():
         gr.Markdown("<h1>📘 Multilingual Chat with PDF / URL</h1>")
 
         lang = gr.Dropdown(
-            ["auto", "English", "Hindi", "Telugu", "Tamil", "Kannada", "Malayalam",
-             "Bengali", "Marathi", "Gujarati", "Odia", "Punjabi", "Assamese"],
+            [
+                "auto", "English", "Hindi", "Telugu", "Tamil",
+                "Kannada", "Malayalam", "Bengali", "Marathi",
+                "Gujarati", "Odia", "Punjabi", "Assamese"
+            ],
             value="auto",
             label="Answer Language"
         )
@@ -464,6 +525,24 @@ def create_app():
 
         gr.Button("Ask").click(answer_question, [q, lang], [a, cits])
 
+        # Example Questions
+        gr.Markdown("### ✨ Example Questions")
+
+        with gr.Row():
+            ex1 = gr.Button("Give a summary of this document")
+            ex2 = gr.Button("What are the key findings?")
+            ex3 = gr.Button("Explain the methodology used")
+            ex4 = gr.Button("List the main conclusions")
+            ex5 = gr.Button("Explain in simple terms")
+            ex6 = gr.Button("What is the significance of this study?")
+
+        ex1.click(lambda: "Give a summary of this document", None, q)
+        ex2.click(lambda: "What are the key findings?", None, q)
+        ex3.click(lambda: "Explain the methodology used", None, q)
+        ex4.click(lambda: "List the main conclusions", None, q)
+        ex5.click(lambda: "Explain this in simple terms", None, q)
+        ex6.click(lambda: "What is the significance of this study?", None, q)
+
     return demo
 
 
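A quick way to exercise the behavior this commit adds is to call the new extract_html_from_url cascade directly. The snippet below is a minimal smoke-test sketch, not part of the commit: the URL is a placeholder, it assumes app.py is importable from the working directory, and importing app also loads the e5, Qwen, and NLLB models declared at module level.

    # Minimal smoke-test sketch for the new HTML / in-page-PDF extraction path (illustrative only).
    # Assumptions: app.py is on the import path; the URL below is a placeholder;
    # importing app also downloads/loads the embedding, LLM, and translation models.
    from app import extract_html_from_url

    url = "https://example.com/paper-landing-page"  # hypothetical page that links to a PDF

    text = extract_html_from_url(url)

    # Per the new code path: text of a linked PDF if one is found, otherwise
    # trafilatura output, otherwise raw-HTML text, or an "Error loading HTML: ..." string.
    print(len(text))
    print(text[:300])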