korayaggul committed on
Commit
7be7203
·
verified ·
1 Parent(s): d6058e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -81
app.py CHANGED
@@ -17,12 +17,12 @@ SAMPLE_JSON_MIN = """[
17
  },
18
  {
19
  "id": "ex-002",
20
- "question": "question",
21
  "answer": "answer"
22
  },
23
  {
24
  "id": "ex-003",
25
- "question": "question",
26
  "answer": "answer"
27
  }
28
  ]"""
@@ -33,6 +33,35 @@ def download_minimal_sample_json():
33
  tmp.flush()
34
  return tmp.name
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  # ---------------------------
37
  # Model: reward/quality (Transformers-compatible)
38
  # ---------------------------
@@ -43,7 +72,7 @@ try:
43
  task="text-classification",
44
  model=MODEL_ID,
45
  tokenizer=MODEL_ID,
46
- function_to_apply="none" # regresyon skoru
47
  )
48
  MODEL_READY = True
49
  LOAD_ERR = ""
@@ -55,7 +84,7 @@ except Exception as e:
55
  # Scoring & labeling
56
  # ---------------------------
57
  def score_pair(question: str, answer: str) -> float:
58
- """Reward skoru (yüksek = daha kaliteli). Model yoksa yumuşak heuristik."""
59
  if not MODEL_READY:
60
  base = 0.3
61
  if question.strip().endswith("?"):
@@ -71,10 +100,10 @@ def score_pair(question: str, answer: str) -> float:
71
 
72
  def label_mapper_from_distribution(scores: List[float]):
73
  """
74
- Skorlar negatif de olabilir; sabit eşikler yerine dağılım tabanlı eşik:
75
- - low : < 33. persentil
76
- - medium: 33–66
77
- - high : >= 66
78
  """
79
  if not scores:
80
  return lambda s: "medium"
@@ -100,8 +129,7 @@ def label_mapper_from_distribution(scores: List[float]):
100
  # ---------------------------
101
  def _extract_module(item: Dict[str, Any], q_text: str) -> str | None:
102
  """
103
- 'CUSTOMERS' gibi modül adını soru+context içinden yakalar.
104
- Büyük harfli 3+ harfli kelimeleri modül adayı sayıyoruz.
105
  """
106
  ctx = f"{item.get('context','')} {q_text}"
107
  m = re.search(r"\b([A-Z]{3,})\b", ctx)
@@ -109,10 +137,9 @@ def _extract_module(item: Dict[str, Any], q_text: str) -> str | None:
109
 
110
  def _roles_from_answer(ans: str) -> List[str]:
111
  """
112
- Cevaptan rol listesini çıkarır; Title Case yapar; boşları atar.
113
- Örn: "Sales representatives, customer consultants and sales managers"
114
  """
115
- parts = re.split(r",| and ", ans, flags=re.IGNORECASE)
116
  roles = []
117
  for p in parts:
118
  t = p.strip(" .")
@@ -122,33 +149,31 @@ def _roles_from_answer(ans: str) -> List[str]:
122
  roles.append(t)
123
  return [r for r in roles if r]
124
 
125
- def _std_sentence(s: str) -> str:
126
- s = s.strip()
127
- if not s:
128
- return s
129
- if s[0].islower():
130
- s = s[0].upper() + s[1:]
131
- if s[-1] not in ".!?":
132
- s += "."
133
- return s
134
-
135
  def improve_smart(item: Dict[str, Any]) -> Dict[str, Any]:
136
  """
137
- TOPLAM DÜZENLEYİCİ:
138
- - WHO tipi ve modül adı varsa: "Which roles are authorized to access the {MODULE} module in DealerTIQ?"
139
- Cevap: "Authorized roles include A, B, and C."
140
- - Aksi halde genel temizlik (soru işareti, büyük harf, noktalama, çok kısa cevabı kısaca netleştirme).
 
 
141
  """
142
- q = (item.get("question") or "").strip()
143
- a = (item.get("answer") or "").strip()
144
  meta = item.get("metadata") or {}
145
  qtype = (meta.get("question_type") or "").lower()
 
 
 
146
 
147
- module = _extract_module(item, q)
148
- roles = _roles_from_answer(a)
149
 
150
- # WHO tipi + modül saptandıysa güçlü şablon
151
  if qtype == "who" and module:
 
152
  new_q = f"Which roles are authorized to access the {module} module in DealerTIQ?"
153
  if roles:
154
  if len(roles) == 1:
@@ -159,48 +184,81 @@ def improve_smart(item: Dict[str, Any]) -> Dict[str, Any]:
159
  roles_str = ", ".join(roles[:-1]) + f", and {roles[-1]}"
160
  new_a = f"Authorized roles include {roles_str}."
161
  else:
162
- # Rol çıkarılamadıysa mevcut cevabı sadece standardize et
163
- new_a = _std_sentence(a) if a else a
164
 
165
- # context'i de standardize et (varsa)
166
  if item.get("context"):
167
  item["context"] = f"DealerTIQ — {module} module"
168
 
169
- item["question"] = new_q
170
- item["answer"] = new_a
171
  return item
172
 
173
- # Genel temizlik (LLM yok; güvenli kurallar)
174
- if q:
175
- q2 = q
176
- # "permission to use X" → "authorized to access the X module"
177
- m = re.search(r"permission to use\s+([A-Z]{3,})", q2, flags=re.IGNORECASE)
178
- if m:
179
- mod = m.group(1)
180
- q2 = f"Which roles are authorized to access the {mod} module in DealerTIQ"
181
- if not q2.endswith("?"):
182
- q2 += "?"
183
- if q2[0].islower():
184
- q2 = q2[0].upper() + q2[1:]
185
- q = q2
186
-
187
- if a:
188
- a2 = a.strip()
189
- if roles:
190
- if len(roles) == 1:
191
- roles_str = roles[0]
192
- elif len(roles) == 2:
193
- roles_str = " and ".join(roles)
 
 
 
 
 
 
 
194
  else:
195
- roles_str = ", ".join(roles[:-1]) + f", and {roles[-1]}"
196
- a2 = f"Authorized roles include {roles_str}."
197
- a = _std_sentence(a2)
 
 
 
 
 
 
 
 
 
 
198
 
199
- if a and len(a.split()) < 6:
200
- a = a + " This answer has been clarified for brevity and precision."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- item["question"] = q
203
- item["answer"] = a
 
 
 
204
  return item
205
 
206
  # ---------------------------
@@ -208,22 +266,22 @@ def improve_smart(item: Dict[str, Any]) -> Dict[str, Any]:
208
  # ---------------------------
209
  def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
210
  """
211
- Girdi: JSON (list veya tek obje)
212
- Adımlar:
213
- 1) İlk skor turu (tüm örnekler)
214
- 2) Dağılım tabanlı label (low/medium/high)
215
- 3) LOW olanları otomatik rewrite (improve_smart)
216
- 4) Yeniden skorla, quality_before / quality_after alanlarını yaz
217
- Çıktı:
218
  - Summary (Dataframe)
219
- - Preview JSON (ilk 50)
220
  - Download JSON path
221
  - Warn/Info
222
  """
223
  data = json.load(open(file.name))
224
  items: List[Dict[str, Any]] = data if isinstance(data, list) else [data]
225
 
226
- # 1) İlk skor turu
227
  first = []
228
  scores = []
229
  for raw in items:
@@ -233,10 +291,10 @@ def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
233
  first.append(it)
234
  scores.append(s)
235
 
236
- # 2) Dinamik label fonksiyonu
237
  to_label = label_mapper_from_distribution(scores)
238
 
239
- # 3) Label ata, low ise otomatik rewrite
240
  processed = []
241
  for it in first:
242
  base_label = to_label(it["quality_before"]["score"])
@@ -244,7 +302,6 @@ def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
244
 
245
  if base_label == "low":
246
  it = improve_smart(it)
247
- # 4) yeniden skorla
248
  s2 = score_pair(it.get("question",""), it.get("answer",""))
249
  it["quality_after"] = {
250
  "score": round(s2, 3),
@@ -252,7 +309,7 @@ def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
252
  }
253
  processed.append(it)
254
 
255
- # Summary tablo
256
  summary = []
257
  for idx, it in enumerate(processed):
258
  qb = it.get("quality_before", {})
@@ -266,12 +323,12 @@ def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
266
  "question_preview": (it.get("question") or "")[:120]
267
  })
268
 
269
- # İndirilebilir JSON
270
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w", encoding="utf-8")
271
  json.dump(processed, tmp, indent=2, ensure_ascii=False)
272
  tmp.flush(); tmp.close()
273
 
274
- # Önizleme
275
  preview = json.dumps(processed[:50], indent=2, ensure_ascii=False)
276
  if len(processed) > 50:
277
  preview += "\n\n// NOTE: Showing first 50 items. Download full file below."
@@ -288,7 +345,7 @@ def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
288
  with gr.Blocks(title="Q&A Quality Upgrader", theme=gr.themes.Soft()) as demo:
289
  gr.Markdown("## Q&A Quality Upgrader\nUpload your JSON. Low-quality items will be auto-rewritten and rescored.")
290
 
291
- # ---- Minimal sample accordion (shows + download) ----
292
  with gr.Accordion("Minimal sample JSON (only id, question, answer)", open=False):
293
  gr.Markdown("Upload a JSON **array of objects** with the following schema:")
294
  gr.Code(value=SAMPLE_JSON_MIN, language="json", lines=18, label="Minimal JSON example")
 
17
  },
18
  {
19
  "id": "ex-002",
20
+ "question": "question",
21
  "answer": "answer"
22
  },
23
  {
24
  "id": "ex-003",
25
+ "question": "question",
26
  "answer": "answer"
27
  }
28
  ]"""
 
33
  tmp.flush()
34
  return tmp.name
35
 
36
+ # ---------------------------
37
+ # Helpers (formatting & menu-path)
38
+ # ---------------------------
39
def _normalize_ws(s: str) -> str:
    """Trim *s* and squeeze each internal whitespace run down to one space.

    Any falsy input (``None``, ``""``) is treated as the empty string, so
    callers can pass the result of ``dict.get`` straight through.
    """
    if not s:
        return ""
    trimmed = s.strip()
    return re.sub(r"\s+", " ", trimmed)
41
+
42
def _sentence_case(s: str) -> str:
    """Return *s* tidied up as a single sentence.

    The text is whitespace-normalized with ``_normalize_ws``, the standalone
    pronoun "i" is capitalized, a lowercase first character is upper-cased,
    and a period is appended unless the text already ends in ``.``, ``!`` or
    ``?``. Empty (or whitespace-only / ``None``) input yields ``""``.
    """
    s = _normalize_ws(s)
    if not s:
        return s
    # Capitalize only the pronoun "i". A plain r"\bi\b" also rewrites the
    # "i" in abbreviations and compounds ("i.e." -> "I.e.", "i-th", "i/o"),
    # so skip an "i" that touches '.', '/' or '-' on the left, '/' or '-' on
    # the right, or that is followed by ".<letter>". A sentence-final "i."
    # and contractions such as "i'm" are still capitalized.
    s = re.sub(r"(?<![\w./-])i(?![\w/-]|\.\w)", "I", s)
    if s[0].islower():
        s = s[0].upper() + s[1:]
    if s[-1] not in ".!?":
        s += "."
    return s
53
+
54
+ def _join_path(section: str | None, option: str | None) -> str | None:
55
+ section = (section or "").strip()
56
+ option = (option or "").strip()
57
+ if section and option:
58
+ return f"{section} > {option}"
59
+ if option:
60
+ return option
61
+ if section:
62
+ return section
63
+ return None
64
+
65
  # ---------------------------
66
  # Model: reward/quality (Transformers-compatible)
67
  # ---------------------------
 
72
  task="text-classification",
73
  model=MODEL_ID,
74
  tokenizer=MODEL_ID,
75
+ function_to_apply="none" # regression score
76
  )
77
  MODEL_READY = True
78
  LOAD_ERR = ""
 
84
  # Scoring & labeling
85
  # ---------------------------
86
  def score_pair(question: str, answer: str) -> float:
87
+ """Reward score (higher = better). If model not ready, use light heuristic."""
88
  if not MODEL_READY:
89
  base = 0.3
90
  if question.strip().endswith("?"):
 
100
 
101
  def label_mapper_from_distribution(scores: List[float]):
102
  """
103
+ Scores may be negative; use distribution-based thresholds:
104
+ low : < 33rd percentile
105
+ medium: 33–66
106
+ high : >= 66
107
  """
108
  if not scores:
109
  return lambda s: "medium"
 
129
  # ---------------------------
130
  def _extract_module(item: Dict[str, Any], q_text: str) -> str | None:
131
  """
132
+ Extract uppercase 3+ letter 'module-like' token from question/context, e.g., CUSTOMERS.
 
133
  """
134
  ctx = f"{item.get('context','')} {q_text}"
135
  m = re.search(r"\b([A-Z]{3,})\b", ctx)
 
137
 
138
  def _roles_from_answer(ans: str) -> List[str]:
139
  """
140
+ Pull a role list from the answer; Title Case; drop empties.
 
141
  """
142
+ parts = re.split(r",| and ", ans or "", flags=re.IGNORECASE)
143
  roles = []
144
  for p in parts:
145
  t = p.strip(" .")
 
149
  roles.append(t)
150
  return [r for r in roles if r]
151
 
152
+ # ---------------------------
153
+ # Rewriter (WHO/WHERE/WHAT/HOW aware)
154
+ # ---------------------------
 
 
 
 
 
 
 
155
  def improve_smart(item: Dict[str, Any]) -> Dict[str, Any]:
156
  """
157
+ LLM-free safe rewrite:
158
+ - WHO + MODULE: use roles pattern. (Only here!)
159
+ - WHERE: produce menu-path sentence (e.g., Settings > Inventory Parameters).
160
+ - WHAT: definition/purpose or 'allows searching by …' sentence.
161
+ - HOW: short procedural sentence.
162
+ - Else: grammar/format normalization.
163
  """
164
+ q = _normalize_ws(item.get("question") or "")
165
+ a = _normalize_ws(item.get("answer") or "")
166
  meta = item.get("metadata") or {}
167
  qtype = (meta.get("question_type") or "").lower()
168
+ orig_q = _normalize_ws(item.get("original_question") or "")
169
+ orig_a = _normalize_ws(item.get("original_answer") or "")
170
+ base = orig_a or a # prefer original to keep semantics
171
 
172
+ module = _extract_module(item, q) # e.g., CUSTOMERS
 
173
 
174
+ # -- WHO: ONLY here we use roles pattern
175
  if qtype == "who" and module:
176
+ roles = _roles_from_answer(base)
177
  new_q = f"Which roles are authorized to access the {module} module in DealerTIQ?"
178
  if roles:
179
  if len(roles) == 1:
 
184
  roles_str = ", ".join(roles[:-1]) + f", and {roles[-1]}"
185
  new_a = f"Authorized roles include {roles_str}."
186
  else:
187
+ new_a = _sentence_case(base) if base else _sentence_case(a)
 
188
 
 
189
  if item.get("context"):
190
  item["context"] = f"DealerTIQ — {module} module"
191
 
192
+ item["question"] = _sentence_case(new_q[:-1] + "?")
193
+ item["answer"] = _sentence_case(new_a)
194
  return item
195
 
196
+ # -- WHERE: menu path sentence
197
+ if qtype == "where":
198
+ text = base or a
199
+ # "under the Settings section"
200
+ m_sec = re.search(r"under the\s+([A-Za-z ]+?)\s+section", text or "", flags=re.IGNORECASE)
201
+ section = m_sec.group(1).strip().title() if m_sec else None
202
+ # quoted option: "Inventory Parameters"
203
+ quotes = re.findall(r'"([^"]+)"', text or "")
204
+ option = quotes[0].strip() if quotes else None
205
+
206
+ path = _join_path(section, option)
207
+ target = option or (module and f"{module} module") or "page"
208
+ new_q = f"Where is the {target} located in DealerTIQ?"
209
+ new_a = f"It is located under {path} in the left navigation menu." if path else (text or "It is available in the left navigation menu.")
210
+
211
+ item["question"] = _sentence_case(new_q[:-1] + "?")
212
+ item["answer"] = _sentence_case(new_a)
213
+ return item
214
+
215
+ # -- WHAT: definition/purpose or 'allows searching by …'
216
+ if qtype == "what":
217
+ text = base or a
218
+ if re.search(r"allows\s+search(ing)?\s+by", text or "", flags=re.IGNORECASE):
219
+ m = re.search(r"such as\s+(.+)", text or "", flags=re.IGNORECASE)
220
+ if m:
221
+ feats = m.group(1).strip().rstrip(".")
222
+ new_q = f"What can you search for in {module or 'this module'}?"
223
+ new_a = f"It allows searching by criteria such as {feats}."
224
  else:
225
+ new_q = orig_q or q or "What can you search for in this module?"
226
+ new_a = text or "It allows searching by multiple criteria."
227
+ else:
228
+ if re.search(r"\b(configure|configuration|settings)\b", text or "", flags=re.IGNORECASE):
229
+ target = module or "Inventory Parameters"
230
+ new_q = f"What is configured in the {target}?"
231
+ new_a = text or "It configures related settings and rules."
232
+ else:
233
+ new_q = orig_q or q or "What is the purpose of this module?"
234
+ new_a = text or "It provides the core functionality for this area."
235
+ item["question"] = _sentence_case(new_q[:-1] + "?")
236
+ item["answer"] = _sentence_case(new_a)
237
+ return item
238
 
239
+ # -- HOW: short procedure
240
+ if qtype == "how":
241
+ text = base or a
242
+ quotes = re.findall(r'"([^"]+)"', text or "") # "Add Channel", "Inventory Parameters"
243
+ m_sec = re.search(r"under the\s+([A-Za-z ]+?)\s+section", text or "", flags=re.IGNORECASE)
244
+ section = m_sec.group(1).strip().title() if m_sec else None
245
+ path = _join_path(section, quotes[0] if quotes else None)
246
+
247
+ new_q = orig_q or q or f"How do I perform this action in {module or 'the module'}?"
248
+ steps = _normalize_ws(text or "")
249
+ steps = re.sub(r"\bclick on\b", "select", steps, flags=re.IGNORECASE)
250
+ if path and "left navigation" not in steps.lower():
251
+ steps = f"Go to {path} in the left navigation menu, then {steps[0].lower() + steps[1:]}" if steps else f"Go to {path} in the left navigation menu."
252
+
253
+ item["question"] = _sentence_case(new_q[:-1] + "?") if not new_q.endswith("?") else _sentence_case(new_q)
254
+ item["answer"] = _sentence_case(steps or "Follow the on-screen instructions to complete the action.")
255
+ return item
256
 
257
+ # -- Fallback: grammar/format normalization only
258
+ if q and not q.endswith("?"):
259
+ q += "?"
260
+ item["question"] = _sentence_case(q)
261
+ item["answer"] = _sentence_case(base or a)
262
  return item
263
 
264
  # ---------------------------
 
266
  # ---------------------------
267
  def process_json(file) -> Tuple[List[Dict[str, Any]], str, str, str]:
268
  """
269
+ Input: JSON (list or single object)
270
+ Steps:
271
+ 1) First scoring pass for all items
272
+ 2) Label via distribution thresholds (low/medium/high)
273
+ 3) Auto-rewrite items labeled 'low' (improve_smart)
274
+ 4) Rescore & write quality_before / quality_after
275
+ Output:
276
  - Summary (Dataframe)
277
+ - Preview JSON (first 50)
278
  - Download JSON path
279
  - Warn/Info
280
  """
281
  data = json.load(open(file.name))
282
  items: List[Dict[str, Any]] = data if isinstance(data, list) else [data]
283
 
284
+ # 1) First scoring pass
285
  first = []
286
  scores = []
287
  for raw in items:
 
291
  first.append(it)
292
  scores.append(s)
293
 
294
+ # 2) Dynamic label function
295
  to_label = label_mapper_from_distribution(scores)
296
 
297
+ # 3) Label, rewrite if 'low', then rescore
298
  processed = []
299
  for it in first:
300
  base_label = to_label(it["quality_before"]["score"])
 
302
 
303
  if base_label == "low":
304
  it = improve_smart(it)
 
305
  s2 = score_pair(it.get("question",""), it.get("answer",""))
306
  it["quality_after"] = {
307
  "score": round(s2, 3),
 
309
  }
310
  processed.append(it)
311
 
312
+ # Summary table
313
  summary = []
314
  for idx, it in enumerate(processed):
315
  qb = it.get("quality_before", {})
 
323
  "question_preview": (it.get("question") or "")[:120]
324
  })
325
 
326
+ # Downloadable JSON
327
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w", encoding="utf-8")
328
  json.dump(processed, tmp, indent=2, ensure_ascii=False)
329
  tmp.flush(); tmp.close()
330
 
331
+ # Preview
332
  preview = json.dumps(processed[:50], indent=2, ensure_ascii=False)
333
  if len(processed) > 50:
334
  preview += "\n\n// NOTE: Showing first 50 items. Download full file below."
 
345
  with gr.Blocks(title="Q&A Quality Upgrader", theme=gr.themes.Soft()) as demo:
346
  gr.Markdown("## Q&A Quality Upgrader\nUpload your JSON. Low-quality items will be auto-rewritten and rescored.")
347
 
348
+ # ---- Minimal sample accordion (show + download) ----
349
  with gr.Accordion("Minimal sample JSON (only id, question, answer)", open=False):
350
  gr.Markdown("Upload a JSON **array of objects** with the following schema:")
351
  gr.Code(value=SAMPLE_JSON_MIN, language="json", lines=18, label="Minimal JSON example")