roshini7sn committed
Commit d8dbb12 · verified · 1 Parent(s): 2caf310

Update app.py

Files changed (1):
  1. app.py +99 -119
app.py CHANGED
@@ -1,6 +1,6 @@
-# =========================================================
+# =========================
 # GLOBAL CACHE
-# =========================================================
+# =========================
 CACHE = {
     "last_text_hash": None,
     "chunks": None,
@@ -32,11 +32,13 @@ from transformers import (
     M2M100ForConditionalGeneration,
 )
 
-# =========================================================
+# =========================
 # CONFIG
-# =========================================================
+# =========================
 EMBED_MODEL = "intfloat/e5-small-v2"
 LLM_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
+
+# ⭐ Fully open-source translation model (works everywhere)
 TRANS_MODEL_ID = "facebook/nllb-200-distilled-600M"
 
 CHUNK_SIZE = 1500
@@ -44,103 +46,73 @@ CHUNK_OVERLAP = 300
 MIN_SECTION_LEN = 300
 
 
-# =========================================================
+# =========================
 # CLEAN TEXT
-# =========================================================
+# =========================
 def clean_text(text):
     return " ".join(text.replace("\r", "\n").split())
 
 
-# =========================================================
-# SAFE HTML DECODER (Brotli-proof)
-# =========================================================
-def safe_decode_content(url, resp):
-    """Decode HTML safely. If Brotli fails, retry without compression."""
-    enc = resp.headers.get("Content-Encoding", "").lower()
-
-    try:
-        if "br" in enc:
-            import brotli
-            try:
-                # Try standard brotli decode
-                return brotli.decompress(resp.content).decode("utf-8", errors="ignore")
-            except Exception:
-                # Retry plain text request
-                retry = requests.get(
-                    url,
-                    headers={
-                        "User-Agent": "Mozilla/5.0",
-                        "Accept-Encoding": "identity"
-                    },
-                    timeout=20,
-                )
-                return retry.text
-
-        return resp.text
-
-    except Exception as e:
-        raise RuntimeError(f"Decompression error: {e}")
-
-
-# =========================================================
+# =========================
 # PDF INGEST
-# =========================================================
+# =========================
 def extract_text_from_pdf(path):
     reader = PdfReader(path)
     text = ""
     for page in reader.pages:
-        text += "\n" + (page.extract_text() or "")
+        page_text = page.extract_text() or ""
+        text += "\n" + page_text
     return clean_text(text)
 
 
 def extract_pdf_from_url(url):
-    r = requests.get(url, timeout=20)
+    r = requests.get(url, timeout=10)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
     tmp.write(r.content)
     tmp.flush()
-    text = extract_text_from_pdf(tmp.name)
+    txt = extract_text_from_pdf(tmp.name)
     tmp.close()
-    return text
+    return txt
 
 
-# =========================================================
-# OTHER FILE INGEST
-# =========================================================
+# =========================
+# DOCX / TXT / CSV / HTML INGEST
+# =========================
 def extract_docx_from_url(url):
-    r = requests.get(url, timeout=20)
+    r = requests.get(url, timeout=10)
     tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
     tmp.write(r.content)
     tmp.flush()
     doc = docx.Document(tmp.name)
+    text = "\n".join(p.text for p in doc.paragraphs)
     tmp.close()
-    return clean_text("\n".join(p.text for p in doc.paragraphs))
+    return clean_text(text)
 
 
 def extract_txt_from_url(url):
-    return clean_text(requests.get(url, timeout=20).text)
+    return clean_text(requests.get(url, timeout=10).text)
 
 
 def extract_csv_from_url(url):
-    df = pd.read_csv(StringIO(requests.get(url, timeout=20).text))
+    df = pd.read_csv(StringIO(requests.get(url, timeout=10).text))
     return clean_text(df.to_string())
 
 
 def extract_html_from_url(url):
-    headers = {
-        "User-Agent": "Mozilla/5.0",
-        "Accept-Encoding": "gzip, deflate, br"
-    }
-
-    resp = requests.get(url, headers=headers, timeout=20)
-
-    html = safe_decode_content(url, resp)
-    soup = BeautifulSoup(html, "html.parser")
+    downloaded = trafilatura.fetch_url(url)
+    if downloaded:
+        extracted = trafilatura.extract(downloaded)
+        if extracted:
+            return clean_text(extracted)
+
+    resp = requests.get(url, timeout=10)
+    soup = BeautifulSoup(resp.text, "html.parser")
     return clean_text(soup.get_text(separator=" "))
 
 
-# =========================================================
+# =========================
 # FILE TYPE DETECTION
-# =========================================================
+# =========================
 def detect_filetype(url, headers):
     u = url.lower()
     c = headers.get("Content-Type", "").lower()
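Note on the new HTML path above: it tries trafilatura first and only falls back to raw BeautifulSoup text when extraction yields nothing. A minimal standalone sketch of that flow, assuming trafilatura is imported elsewhere in app.py and using a placeholder URL:

import trafilatura

downloaded = trafilatura.fetch_url("https://example.com/article")   # returns None if the fetch fails
extracted = trafilatura.extract(downloaded) if downloaded else None  # None when no main content is found
print(extracted or "fall back to requests + BeautifulSoup")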
@@ -156,13 +128,13 @@ def detect_filetype(url, headers):
     return "html"
 
 
-# =========================================================
+# =========================
 # SECTION-AWARE CHUNKING
-# =========================================================
+# =========================
 SECTION_KEYWORDS = [
-    "abstract", "introduction", "method", "methodology",
-    "materials and methods", "results", "discussion",
-    "conclusion", "conclusions", "background"
+    "introduction", "method", "methodology", "materials and methods",
+    "results", "discussion", "conclusion", "conclusions", "abstract",
+    "background", "analysis"
 ]
 
 
@@ -182,9 +154,7 @@ def is_heading(line):
 
 def split_into_sections(text):
     lines = text.split("\n")
-    sections = []
-    buf = []
-    title = "Document"
+    sections, title, buf = [], "Document", []
 
     for line in lines:
         if is_heading(line):
@@ -196,37 +166,51 @@ def split_into_sections(text):
         buf.append(line)
 
     if buf:
-        sections.append((title, "\n".join(buf)))
+        body = "\n".join(buf)
+        if len(body) > MIN_SECTION_LEN:
+            sections.append((title, body))
 
-    return sections if sections else [("Document", text)]
+    if not sections:
+        return [("Document", text)]
+
+    return sections
 
 
 def chunk_text(text):
     sections = split_into_sections(text)
 
+    if len(sections) == 1:
+        chunks = []
+        start = 0
+        while start < len(text):
+            end = min(start + CHUNK_SIZE, len(text))
+            chunks.append(text[start:end])
+            start += CHUNK_SIZE - CHUNK_OVERLAP
+        return chunks
+
     chunks = []
     for _, body in sections:
         paragraphs = [p.strip() for p in re.split(r"\n\s*\n", body) if p.strip()]
-        cur = ""
+        current = ""
 
         for para in paragraphs:
-            if not cur:
-                cur = para
-            elif len(cur) + len(para) + 2 <= CHUNK_SIZE:
-                cur += "\n\n" + para
+            if not current:
+                current = para
+            elif len(current) + len(para) + 2 <= CHUNK_SIZE:
+                current += "\n\n" + para
             else:
-                chunks.append(cur)
-                cur = para
+                chunks.append(current)
+                current = para
 
-        if cur:
-            chunks.append(cur)
+        if current:
+            chunks.append(current)
 
     return chunks
 
 
-# =========================================================
-# SEMANTIC SEARCH
-# =========================================================
+# =========================
+# SEMANTIC SEARCH (KNN)
+# =========================
 class SemanticSearch:
     def __init__(self, model):
         self.embedder = SentenceTransformer(model)
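For the single-section fallback added in chunk_text: with CHUNK_SIZE = 1500 and CHUNK_OVERLAP = 300, each window starts 1200 characters after the previous one, so consecutive chunks share 300 characters. A quick check of the arithmetic on a hypothetical 4000-character document:

CHUNK_SIZE, CHUNK_OVERLAP = 1500, 300
n, start = 4000, 0  # n is a made-up document length
while start < n:
    print((start, min(start + CHUNK_SIZE, n)))  # (0, 1500) (1200, 2700) (2400, 3900) (3600, 4000)
    start += CHUNK_SIZE - CHUNK_OVERLAP        # stride = 1200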
@@ -240,10 +224,12 @@ class SemanticSearch:
         h = hashlib.md5("".join(chunks).encode()).hexdigest()
 
         if CACHE["last_text_hash"] == h:
+            print("⚡ Using cached embeddings")
             self.chunks = CACHE["chunks"]
             self.knn = CACHE["knn"]
             return
 
+        print("➡ Rebuilding embeddings…")
         self.chunks = chunks
         emb = self.embedder.encode(chunks, convert_to_numpy=True)
 
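The cache test above keys on an MD5 digest of the concatenated chunk text, so changing any chunk forces a full re-embed. A standalone illustration of the key (chunk strings are hypothetical):

import hashlib

chunks = ["first chunk", "second chunk"]
key = hashlib.md5("".join(chunks).encode()).hexdigest()
print(key)  # identical chunks -> identical key -> cached embeddings reused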
@@ -266,10 +252,11 @@ class SemanticSearch:
 vs = None
 
 
-# =========================================================
-# LOAD LOCAL QWEN
-# =========================================================
+# =========================
+# LOAD QWEN FOR RAG
+# =========================
 print("Loading Qwen 0.5B…")
+
 q_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
 q_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL).to("cpu")
 q_model.eval()
@@ -292,15 +279,15 @@ def run_llm(system, user):
         temperature=0.6,
         eos_token_id=q_tokenizer.eos_token_id,
     )
-
     gen = out[0][inp["input_ids"].shape[1]:]
     return q_tokenizer.decode(gen, skip_special_tokens=True).strip()
 
 
-# =========================================================
-# TRANSLATION (NLLB)
-# =========================================================
-print("Loading NLLB translator…")
+# =========================
+# LOAD NLLB TRANSLATOR
+# =========================
+print("Loading NLLB-200 translator…")
+
 trans_tokenizer = NllbTokenizer.from_pretrained(TRANS_MODEL_ID)
 trans_model = M2M100ForConditionalGeneration.from_pretrained(TRANS_MODEL_ID).to("cpu")
 
@@ -326,20 +313,23 @@ def translate_to_indic(text, lang):
 
     try:
         tgt = LANG_CODES[lang]
+
         inputs = trans_tokenizer(text, return_tensors="pt").to("cpu")
         output = trans_model.generate(
             **inputs,
             forced_bos_token_id=trans_tokenizer.convert_tokens_to_ids(tgt),
-            max_new_tokens=300,
+            max_new_tokens=300
         )
         return trans_tokenizer.batch_decode(output, skip_special_tokens=True)[0]
-    except:
+
+    except Exception as e:
+        print("Translation error:", e)
         return text
 
 
-# =========================================================
+# =========================
 # RAG PROMPT
-# =========================================================
+# =========================
 def build_prompt(question, retrieved):
     ctx = "\n\n---\n\n".join([c for c, _ in retrieved])
     return f"""
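LANG_CODES is referenced here but defined outside the hunks shown. Since NLLB-200 targets are FLORES-200 language codes, a plausible reconstruction (an assumption, not the committed code) would be:

LANG_CODES = {  # hypothetical mapping; the actual dict lives elsewhere in app.py
    "Hindi": "hin_Deva", "Telugu": "tel_Telu", "Tamil": "tam_Taml",
    "Kannada": "kan_Knda", "Malayalam": "mal_Mlym", "Bengali": "ben_Beng",
    "Marathi": "mar_Deva", "Gujarati": "guj_Gujr", "Odia": "ory_Orya",
    "Punjabi": "pan_Guru", "Assamese": "asm_Beng",
}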
@@ -351,8 +341,8 @@ Follow these rules:
 1. Use ONLY the context. Do not add external knowledge.
 2. If the context does not contain the answer, say:
    "I don't know based on this document."
-3. When possible, give the answer in short, clear points.
-4. Always answer in English.
+3. When possible, structure your answer into short, clear points.
+4. Keep the answer concise, factual, and in English.
 
 CONTEXT:
 {ctx}
@@ -360,13 +350,13 @@ CONTEXT:
 QUESTION:
 {question}
 
-Write your answer below:
+Write your answer below (in English):
 """.strip()
 
 
-# =========================================================
+# =========================
 # SOURCE DISPLAY
-# =========================================================
+# =========================
 def highlight_sources(retrieved):
     html = "<h4>📚 Source Passages</h4>"
     for i, (chunk, score) in enumerate(retrieved):
@@ -379,9 +369,9 @@ def highlight_sources(retrieved):
     return html
 
 
-# =========================================================
+# =========================
 # ANSWER FUNCTION
-# =========================================================
+# =========================
 def answer_question(q, lang):
     global vs
     if vs is None:
@@ -396,9 +386,9 @@ def answer_question(q, lang):
     return final, highlight_sources(retrieved)
 
 
-# =========================================================
+# =========================
 # LOADERS
-# =========================================================
+# =========================
 def load_pdf_ui(file, lang):
     global vs
     if not file:
@@ -419,7 +409,7 @@ def load_url_ui(url, lang):
         return "Enter a URL."
 
     try:
-        head = requests.head(url, allow_redirects=True, timeout=20)
+        head = requests.head(url, allow_redirects=True, timeout=10)
         ftype = detect_filetype(url, head.headers)
 
         if ftype == "pdf":
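detect_filetype decides from the URL suffix and the HEAD response's Content-Type; the shorter 10-second timeout now applies here as well. A minimal sketch with a placeholder URL:

import requests

head = requests.head("https://example.com/paper.pdf", allow_redirects=True, timeout=10)
print(head.headers.get("Content-Type", ""))  # e.g. "application/pdf"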
@@ -443,20 +433,17 @@ def load_url_ui(url, lang):
     return f"URL loaded with {len(chunks)} chunks."
 
 
-# =========================================================
+# =========================
 # UI
-# =========================================================
+# =========================
 def create_app():
     with gr.Blocks() as demo:
 
         gr.Markdown("<h1>📘 Multilingual Chat with PDF / URL</h1>")
 
         lang = gr.Dropdown(
-            [
-                "auto", "English", "Hindi", "Telugu", "Tamil", "Kannada",
-                "Malayalam", "Bengali", "Marathi", "Gujarati",
-                "Odia", "Punjabi", "Assamese",
-            ],
+            ["auto", "English", "Hindi", "Telugu", "Tamil", "Kannada", "Malayalam",
+             "Bengali", "Marathi", "Gujarati", "Odia", "Punjabi", "Assamese"],
             value="auto",
             label="Answer Language"
         )
@@ -477,13 +464,6 @@ def create_app():
 
         gr.Button("Ask").click(answer_question, [q, lang], [a, cits])
 
-        gr.Markdown("### ✨ Example Questions")
-        with gr.Row():
-            gr.Button("Summarize the document").click(lambda: "Summarize the document", None, q)
-            gr.Button("What are the key findings?").click(lambda: "What are the key findings?", None, q)
-            gr.Button("Explain the methodology").click(lambda: "Explain the methodology used", None, q)
-            gr.Button("What conclusions are given?").click(lambda: "What are the conclusions?", None, q)
-
     return demo
 
 
469