prithivMLmods committed
Commit c4905cb · verified · 1 Parent(s): 7492c88

Update app.py

Files changed (1)
app.py +159 -244
app.py CHANGED
@@ -1,100 +1,57 @@
- import os
  import re
- import json
  import numpy as np
  import torch
- import spaces
- import gradio as gr
  from PIL import Image, ImageDraw, ImageFont
- from typing import Tuple, Optional, List, Dict, Any

- # Transformers & Qwen Utils
  from transformers import (
      Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
  )
  from qwen_vl_utils import process_vision_info

  # -----------------------------------------------------------------------------
- # 1. PROMPTS (from prompt.py)
  # -----------------------------------------------------------------------------

  OS_ACTIONS = """
- def final_answer(answer: any) -> any:
-     \"\"\"
-     Provides a final answer to the given problem.
-     Args:
-         answer: The final answer to the problem
-     \"\"\"
-
- def move_mouse(self, x: float, y: float) -> str:
-     \"\"\"
-     Moves the mouse cursor to the specified coordinates
-     Args:
-         x: The x coordinate (horizontal position)
-         y: The y coordinate (vertical position)
      \"\"\"
-
- def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
-     \"\"\"
-     Performs a left-click at the specified normalized coordinates
      Args:
-         x: The x coordinate (horizontal position)
-         y: The y coordinate (vertical position)
      \"\"\"

- def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
      \"\"\"
-     Performs a double-click at the specified normalized coordinates
      Args:
-         x: The x coordinate (horizontal position)
-         y: The y coordinate (vertical position)
      \"\"\"

  def type(text: str) -> str:
      \"\"\"
-     Types the specified text at the current cursor position.
      Args:
-         text: The text to type
-     \"\"\"
-
- def press(keys: str | list[str]) -> str:
-     \"\"\"
-     Presses a keyboard key
-     Args:
-         keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
-     \"\"\"
-
- def navigate_back() -> str:
-     \"\"\"
-     Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
      \"\"\"

  def drag(from_coord: list[float], to_coord: list[float]) -> str:
      \"\"\"
-     Clicks [x1, y1], drags mouse to [x2, y2], then release click.
-     Args:
-         x1: origin x coordinate
-         y1: origin y coordinate
-         x2: end x coordinate
-         y2: end y coordinate
-     \"\"\"
-
- def scroll(direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
-     \"\"\"
-     Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
      Args:
-         x: The x coordinate (horizontal position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
-         y: The y coordinate (vertical position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
-         direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
-         amount: The amount to scroll. A good amount is 1 or 2.
-     \"\"\"
-
- def wait(seconds: float) -> str:
-     \"\"\"
-     Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
-     Args:
-         seconds: Number of seconds to wait, generally 2 is enough.
      \"\"\"
  """
@@ -102,73 +59,71 @@ OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and

  For each step:
  • First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
- • Then, use <code></code> to perform the action. it will be executed in a stateful environment.

  The following functions are exposed to the Python interpreter:
  <code>
  {OS_ACTIONS}
  </code>

- The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
  """

  # -----------------------------------------------------------------------------
- # 2. MODEL WRAPPER (Modified for Fara/QwenVL)
  # -----------------------------------------------------------------------------

- class TransformersModel:
-     def __init__(self, model_id: str, to_device: str = "cuda"):
-         print(f"Loading model: {model_id}...")
          self.model_id = model_id

-         # Load Processor
          try:
              self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-         except Exception as e:
-             print(f"Error loading processor: {e}")
-             raise e
-
-         # Load Model
-         try:
              self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                  model_id,
                  trust_remote_code=True,
-                 torch_dtype=torch.bfloat16,
                  device_map="auto" if to_device == "cuda" else None,
              )
              if to_device == "cpu":
                  self.model.to("cpu")
-
-             print("Model loaded successfully.")
          except Exception as e:
-             print(f"Error loading Fara/Qwen model: {e}. Ensure you have access/internet.")
-             raise e

-     def generate(self, messages: list[dict], **kwargs):
-         # 1. Prepare text prompt using chat template
          text = self.processor.apply_chat_template(
              messages, tokenize=False, add_generation_prompt=True
          )

-         # 2. Process images/videos
-         image_inputs, video_inputs = process_vision_info(messages)
-
-         # 3. Create model inputs
          inputs = self.processor(
              text=[text],
              images=image_inputs,
-             videos=video_inputs,
              padding=True,
              return_tensors="pt",
-         )
-         inputs = inputs.to(self.model.device)

-         # 4. Generate
-         generated_ids = self.model.generate(**inputs, **kwargs)

-         # 5. Decode (trimming input tokens)
          generated_ids_trimmed = [
-             out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
          ]

          output_text = self.processor.batch_decode(
@@ -177,17 +132,20 @@ class TransformersModel:

          return output_text

  # -----------------------------------------------------------------------------
- # 3. HELPER FUNCTIONS
  # -----------------------------------------------------------------------------

- def array_to_image(image_array: np.ndarray) -> Image.Image:
-     if image_array is None:
-         raise ValueError("No image provided. Please upload an image before submitting.")
-     return Image.fromarray(np.uint8(image_array))
-
- def get_navigation_prompt(task, image):
-     """Constructs the prompt messages for the model"""
      return [
          {
              "role": "system",
@@ -197,40 +155,30 @@ def get_navigation_prompt(task, image):
              "role": "user",
              "content": [
                  {"type": "image", "image": image},
-                 {"type": "text", "text": f"Instruction: {task}\n\nPrevious actions:\nNone"},
              ],
          },
      ]

  def parse_actions_from_response(response: str) -> list[str]:
-     """Parse actions from model response using regex pattern."""
-     # Look for code block
      pattern = r"<code>\s*(.*?)\s*</code>"
      matches = re.findall(pattern, response, re.DOTALL)
-
-     # If no code block, try to find raw function calls if the model forgot tags
-     if not matches:
-         # Fallback: look for lines starting with known functions
-         funcs = ["click", "type", "press", "drag", "scroll", "wait"]
-         lines = response.split('\n')
-         found = []
-         for line in lines:
-             line = line.strip()
-             if any(line.startswith(f) for f in funcs):
-                 found.append(line)
-         return found
-
      return matches

  def extract_coordinates_from_action(action_code: str) -> list[dict]:
-     """Extract coordinates from action code for localization actions."""
      localization_actions = []

-     # Patterns for different action types
      patterns = {
          'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
          'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
-         'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
          'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
      }
@@ -238,35 +186,17 @@ def extract_coordinates_from_action(action_code: str) -> list[dict]:
          matches = re.finditer(pattern, action_code)
          for match in matches:
              if action_type == 'drag':
-                 # Drag has from and to coordinates
-                 from_x, from_y, to_x, to_y = match.groups()
-                 localization_actions.append({
-                     'type': 'drag_from', 'x': float(from_x), 'y': float(from_y), 'action': action_type
-                 })
-                 localization_actions.append({
-                     'type': 'drag_to', 'x': float(to_x), 'y': float(to_y), 'action': action_type
-                 })
              else:
-                 # Single coordinate actions
-                 if match.groups()[0]:
-                     x_val = match.group(1)
-                     y_val = match.group(2) if match.group(2) else x_val
-
-                     # Convert pixel coords to normalized if they look like pixels (assuming > 1000 width usually)
-                     # Note: The prompt implies normalized (0.0-1.0), but if model outputs 500, we handle it visually later
-
-                     if x_val and y_val:
-                         localization_actions.append({
-                             'type': action_type,
-                             'x': float(x_val),
-                             'y': float(y_val),
-                             'action': action_type
-                         })

      return localization_actions

  def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
-     """Create an image with localization markers drawn on it."""
      if not coordinates:
          return None
@@ -275,142 +205,127 @@ def create_localized_image(original_image: Image.Image, coordinates: list[dict])
      width, height = img_copy.size

      try:
          font = ImageFont.load_default()
-     except:
-         font = None

-     colors = {
-         'click': 'red', 'double_click': 'blue', 'move_mouse': 'green',
-         'drag_from': 'orange', 'drag_to': 'purple'
-     }

      for i, coord in enumerate(coordinates):
-         # Handle normalized vs pixel coordinates
-         x, y = coord['x'], coord['y']
-
-         if x <= 1.0 and y <= 1.0:
-             pixel_x = int(x * width)
-             pixel_y = int(y * height)
-         else:
-             pixel_x = int(x)
-             pixel_y = int(y)
-
          color = colors.get(coord['type'], 'red')

-         # Draw Circle
-         r = 8
-         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
-                      fill=color, outline='white', width=2)

-         # Draw Label
-         label = f"{coord['type']}"
-         text_pos = (pixel_x + 10, pixel_y - 10)
-         if font:
-             draw.text(text_pos, label, fill=color, font=font)
-         else:
-             draw.text(text_pos, label, fill=color)

-         # Draw Arrow for Drag
          if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
              next_coord = coordinates[i + 1]
-             nx, ny = next_coord['x'], next_coord['y']
-
-             if nx <= 1.0 and ny <= 1.0:
-                 end_x, end_y = int(nx * width), int(ny * height)
-             else:
-                 end_x, end_y = int(nx), int(ny)
-
              draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

      return img_copy

  # -----------------------------------------------------------------------------
- # 4. INITIALIZATION
- # -----------------------------------------------------------------------------
-
- # Using Fara-7B (or fallback)
- MODEL_ID = "microsoft/Fara-7B"
-
- print(f"Initializing {MODEL_ID}...")
- # Global model instance
- # Note: We initialize this lazily or globally depending on environment.
- # For Gradio Spaces, global init is standard.
- try:
-     model = TransformersModel(model_id=MODEL_ID, to_device="cuda" if torch.cuda.is_available() else "cpu")
- except Exception as e:
-     print(f"Failed to load Fara. Trying fallback Qwen...")
-     model = TransformersModel(model_id="Qwen/Qwen2.5-VL-7B-Instruct", to_device="cuda" if torch.cuda.is_available() else "cpu")
-
- # -----------------------------------------------------------------------------
- # 5. GRADIO APP
  # -----------------------------------------------------------------------------

- @spaces.GPU
- def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
-     if input_numpy_image is None:
-         return "Please upload an image.", None

      input_pil_image = array_to_image(input_numpy_image)

-     # Generate Prompt
-     prompt_msgs = get_navigation_prompt(task, input_pil_image)

-     # Generate Response
-     print("Generating response...")
-     response_str = model.generate(prompt_msgs, max_new_tokens=500)
-     print(f"Model Response: {response_str}")

-     # Parse
-     actions = parse_actions_from_response(response_str)
-
-     # Extract Coordinates
      all_coordinates = []
-     for action_code in actions:
-         coords = extract_coordinates_from_action(action_code)
-         all_coordinates.extend(coords)

-     # Visualize
-     localized_image = input_pil_image
      if all_coordinates:
-         localized_image = create_localized_image(input_pil_image, all_coordinates)

-     return response_str, localized_image

- title = "Fara-7B GUI Operator 🤖"
  description = """
- ### Fara GUI Agent Demo
- Upload a screenshot and give an instruction. The model will analyze the UI and output the Python code to execute the action.
- This demo visualizes where the model wants to click or drag.
  """

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
      gr.Markdown(description)

-     with gr.Row():
-         input_image = gr.Image(label="Upload Screenshot", height=500, type="numpy")
-
      with gr.Row():
          with gr.Column(scale=1):
-             task_input = gr.Textbox(
-                 label="Instruction",
-                 placeholder="e.g. Click on the Search button...",
-                 lines=2
              )
-             submit_btn = gr.Button("Generate Action", variant="primary")

          with gr.Column(scale=1):
-             output_code = gr.Textbox(label="Generated Python Code", lines=10)
-
-     # Output image gets updated with markers
-     submit_btn.click(
-         fn=navigate,
-         inputs=[input_image, task_input],
-         outputs=[output_code, input_image]
      )

-     # Optional: Examples
-     # gr.Examples(...)

  if __name__ == "__main__":
-     demo.launch()
+ import spaces
  import re
+ from typing import Tuple, Optional, List, Dict, Any
+
+ import gradio as gr
  import numpy as np
  import torch
  from PIL import Image, ImageDraw, ImageFont

+ # Transformers imports for Fara Model
  from transformers import (
      Qwen2_5_VLForConditionalGeneration,
      AutoProcessor,
  )
  from qwen_vl_utils import process_vision_info

+ # --- Configuration ---
+ MODEL_ID = "microsoft/Fara-7B"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
  # -----------------------------------------------------------------------------
+ # PROMPT DEFINITIONS (from prompt.py)
  # -----------------------------------------------------------------------------

  OS_ACTIONS = """
+ def click(x: float, y: float) -> str:
      \"\"\"
+     Performs a left-click at the specified normalized coordinates.
      Args:
+         x: The x coordinate (0.0 to 1.0).
+         y: The y coordinate (0.0 to 1.0).
      \"\"\"

+ def double_click(x: float, y: float) -> str:
      \"\"\"
+     Performs a double-click at the specified normalized coordinates.
      Args:
+         x: The x coordinate (0.0 to 1.0).
+         y: The y coordinate (0.0 to 1.0).
      \"\"\"

  def type(text: str) -> str:
      \"\"\"
+     Types the specified text.
      Args:
+         text: The text to type.
      \"\"\"

  def drag(from_coord: list[float], to_coord: list[float]) -> str:
      \"\"\"
+     Drags from [x1, y1] to [x2, y2].
      Args:
+         from_coord: The starting normalized coordinates [x1, y1].
+         to_coord: The ending normalized coordinates [x2, y2].
      \"\"\"
  """

  For each step:
  • First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
+ • Then, use <code></code> to perform the action. It will be executed in a stateful environment.

  The following functions are exposed to the Python interpreter:
  <code>
  {OS_ACTIONS}
  </code>

+ The state persists between code executions.
  """

  # -----------------------------------------------------------------------------
+ # FARA MODEL WRAPPER (adapted from smolvlm_inference.py)
  # -----------------------------------------------------------------------------

+ class FaraModelWrapper:
+     def __init__(self, model_id: str, to_device: str):
+         print(f"Loading {model_id} on {to_device}...")
          self.model_id = model_id

          try:
              self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
              self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                  model_id,
                  trust_remote_code=True,
+                 torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
                  device_map="auto" if to_device == "cuda" else None,
              )
              if to_device == "cpu":
                  self.model.to("cpu")
+             self.model.eval()
+             print("Fara Model loaded successfully.")
          except Exception as e:
+             print(f"Failed to load Fara, falling back to Qwen2.5-VL-7B. Error: {e}")
+             fallback_id = "Qwen/Qwen2.5-VL-7B-Instruct"
+             self.processor = AutoProcessor.from_pretrained(fallback_id, trust_remote_code=True)
+             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                 fallback_id,
+                 trust_remote_code=True,
+                 torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
+                 device_map="auto",
+             )
+             print("Fallback model loaded.")

+     def generate(self, messages: list[dict], max_new_tokens=512, **kwargs):
+         """
+         Generate a response from the Fara/QwenVL model.
+         """
          text = self.processor.apply_chat_template(
              messages, tokenize=False, add_generation_prompt=True
          )
+         image_inputs, _ = process_vision_info(messages)

          inputs = self.processor(
              text=[text],
              images=image_inputs,
              padding=True,
              return_tensors="pt",
+         ).to(self.model.device)

+         with torch.no_grad():
+             generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)

+         # Trim input tokens to get only the generated part
          generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
          ]

          output_text = self.processor.batch_decode(

          return output_text

+ # --- Initialize Global Model ---
+ model = FaraModelWrapper(
+     model_id=MODEL_ID,
+     to_device=DEVICE,
+ )
+
  # -----------------------------------------------------------------------------
+ # HELPER FUNCTIONS (from app.py logic)
  # -----------------------------------------------------------------------------

+ def get_navigation_prompt(task, image, previous_actions="None"):
+     """
+     Constructs the prompt for the model.
+     """
      return [
          {
              "role": "system",

              "role": "user",
              "content": [
                  {"type": "image", "image": image},
+                 {"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\n{previous_actions}"},
              ],
          },
      ]

+ def array_to_image(image_array: np.ndarray) -> Image.Image:
+     if image_array is None:
+         raise ValueError("No image provided.")
+     return Image.fromarray(np.uint8(image_array))
+
  def parse_actions_from_response(response: str) -> list[str]:
+     """Parse actions from model response using <code>...</code> pattern."""
      pattern = r"<code>\s*(.*?)\s*</code>"
      matches = re.findall(pattern, response, re.DOTALL)
      return matches

  def extract_coordinates_from_action(action_code: str) -> list[dict]:
+     """Extract normalized (0-1) coordinates from action code for visualization."""
      localization_actions = []

+     # Patterns for different action types expecting normalized floats
      patterns = {
          'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
          'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
          'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
      }

          matches = re.finditer(pattern, action_code)
          for match in matches:
              if action_type == 'drag':
+                 from_x, from_y, to_x, to_y = map(float, match.groups())
+                 localization_actions.append({'type': 'drag_from', 'x': from_x, 'y': from_y, 'action': action_type})
+                 localization_actions.append({'type': 'drag_to', 'x': to_x, 'y': to_y, 'action': action_type})
              else:
+                 x_val, y_val = map(float, match.groups())
+                 localization_actions.append({'type': action_type, 'x': x_val, 'y': y_val, 'action': action_type})

      return localization_actions

  def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
+     """Draw markers on the image to visualize the predicted action."""
      if not coordinates:
          return None

      width, height = img_copy.size

      try:
+         font = ImageFont.truetype("Arial.ttf", 15)
+     except IOError:
          font = ImageFont.load_default()

+     colors = {'click': 'red', 'double_click': 'blue', 'drag_from': 'orange', 'drag_to': 'purple'}

      for i, coord in enumerate(coordinates):
+         pixel_x = int(coord['x'] * width)
+         pixel_y = int(coord['y'] * height)
          color = colors.get(coord['type'], 'red')

+         radius = 8
+         draw.ellipse([pixel_x - radius, pixel_y - radius, pixel_x + radius, pixel_y + radius], fill=color, outline='white', width=2)

+         label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
+         draw.text((pixel_x + 12, pixel_y - 12), label, fill=color, font=font, stroke_width=1, stroke_fill="white")

          if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
              next_coord = coordinates[i + 1]
+             end_x = int(next_coord['x'] * width)
+             end_y = int(next_coord['y'] * height)
              draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

      return img_copy
  # -----------------------------------------------------------------------------
+ # GRADIO CORE FUNCTION
  # -----------------------------------------------------------------------------

+ @spaces.GPU(duration=60)
+ def predict_action(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
+     """
+     Main Gradio function: takes image and task, returns model output and visualized image.
+     """
+     if model is None:
+         raise ValueError("Model not loaded")

      input_pil_image = array_to_image(input_numpy_image)

+     # Generate prompt and get model prediction
+     prompt = get_navigation_prompt(task, input_pil_image)
+     model_response = model.generate(prompt, max_new_tokens=500)
+     print(f"Model Response: {model_response}")

+     # Parse the response to find action code
+     action_codes = parse_actions_from_response(model_response)

+     # Extract coordinates from all found actions for visualization
      all_coordinates = []
+     for code in action_codes:
+         coordinates = extract_coordinates_from_action(code)
+         all_coordinates.extend(coordinates)

+     # Create the visualized image if coordinates were found
+     visualized_image = None
      if all_coordinates:
+         visualized_image = create_localized_image(input_pil_image, all_coordinates)
+         print(f"Found {len(all_coordinates)} localization actions. Visualizing.")
+     else:
+         print("No localization actions found in the response.")

+     # Return the raw model response and the (possibly updated) image
+     return model_response, visualized_image if visualized_image else input_pil_image

+ # -----------------------------------------------------------------------------
+ # GRADIO UI LAYOUT
+ # -----------------------------------------------------------------------------
+
+ title = "Fara GUI Operator"
  description = """
+ This is a demo of the **Fara Model** acting as a GUI Operator.
+ Provide a screenshot of a user interface and a task you want to perform. The model will output the thought process and the corresponding action code, visualizing clicks and drags directly on the image.
+ This version does not execute the actions; it only predicts and visualizes them.
  """

+ # Load Example Data
+ try:
+     example_1_image = Image.open("./assets/google.png")
+     example_1_task = "Search for the name of the current UK Prime Minister."
+     example_2_image = Image.open("./assets/huggingface.png")
+     example_2_task = "Find the most trending model."
+     examples = [[example_1_image, example_1_task], [example_2_image, example_2_task]]
+ except FileNotFoundError:
+     print("Warning: Example assets not found. The demo will run without examples.")
+     examples = []
+
+
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
      gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
      gr.Markdown(description)

      with gr.Row():
          with gr.Column(scale=1):
+             input_image_component = gr.Image(label="UI Screenshot", type="numpy", height=500)
+             task_component = gr.Textbox(
+                 label="Task",
+                 placeholder="e.g., Search for 'Fara Model'",
+                 info="Type the task you want the model to perform on this UI.",
              )
+             submit_button = gr.Button("Predict Action", variant="primary")

          with gr.Column(scale=1):
+             output_text_component = gr.Textbox(label="Model Full Output", lines=10, interactive=False)
+             # The input image component will be updated with the visualized output
+             gr.Markdown("### Visualized Action")
+             gr.Markdown("The image on the left will update with markers for clicks/drags.")
+
+     submit_button.click(
+         predict_action,
+         [input_image_component, task_component],
+         [output_text_component, input_image_component]
      )

+     if examples:
+         gr.Examples(
+             examples=examples,
+             inputs=[input_image_component, task_component],
+             outputs=[output_text_component, input_image_component],
+             fn=predict_action,
+             cache_examples=True,
+         )

  if __name__ == "__main__":
+     demo.queue().launch(debug=True, share=True)
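
For quick reference, the standalone sketch below mirrors the parsing and localization logic introduced by this commit (parse_actions_from_response, extract_coordinates_from_action, and the coordinate scaling done in create_localized_image) so the behavior can be checked without loading the model. The regex patterns are copied from the new app.py; the SAMPLE_RESPONSE string and the 1280x720 screenshot size are illustrative assumptions, not part of the commit.

# Minimal, self-contained sketch (not part of the commit): exercises the parsing
# helpers added in this revision of app.py on a hypothetical model response.
import re

SAMPLE_RESPONSE = """<think>The search button sits near the top-right corner.</think>
<code>
click(0.91, 0.08)
drag([0.20, 0.40], [0.60, 0.40])
</code>"""

def parse_actions(response: str) -> list[str]:
    # Same <code>...</code> extraction used by parse_actions_from_response.
    return re.findall(r"<code>\s*(.*?)\s*</code>", response, re.DOTALL)

def extract_coordinates(action_code: str) -> list[dict]:
    # Same normalized-coordinate patterns used by extract_coordinates_from_action.
    patterns = {
        'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)',
    }
    found = []
    for action_type, pattern in patterns.items():
        for match in re.finditer(pattern, action_code):
            if action_type == 'drag':
                fx, fy, tx, ty = map(float, match.groups())
                found.append({'type': 'drag_from', 'x': fx, 'y': fy})
                found.append({'type': 'drag_to', 'x': tx, 'y': ty})
            else:
                x, y = map(float, match.groups())
                found.append({'type': action_type, 'x': x, 'y': y})
    return found

if __name__ == "__main__":
    width, height = 1280, 720  # assumed screenshot size
    for code in parse_actions(SAMPLE_RESPONSE):
        for coord in extract_coordinates(code):
            # create_localized_image scales normalized coordinates by the image size.
            px, py = int(coord['x'] * width), int(coord['y'] * height)
            print(f"{coord['type']:10s} normalized=({coord['x']}, {coord['y']}) pixel=({px}, {py})")

Running this prints each action type with its normalized and pixel coordinates, which is the same mapping the Space uses when drawing click and drag markers on the uploaded screenshot.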