
Commit 5171b09

support local data logging
1 parent 0e03689 commit 5171b09

File tree (3 files changed: +45, -28 lines):

- README.md
- omnitool/gradio/agent/vlm_agent_with_orchestrator.py
- omnitool/gradio/app_new.py


README.md

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,7 @@
 <p align="center">
   <img src="imgs/logo.png" alt="Logo">
 </p>
+<!-- <a href="https://trendshift.io/repositories/12975" target="_blank"><img src="https://trendshift.io/api/badge/repositories/12975" alt="microsoft%2FOmniParser | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> -->

 [![arXiv](https://img.shields.io/badge/Paper-green)](https://arxiv.org/abs/2408.00203)
 [![License](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -12,6 +13,7 @@
 **OmniParser** is a comprehensive method for parsing user interface screenshots into structured and easy-to-understand elements, which significantly enhances the ability of GPT-4V to generate actions that can be accurately grounded in the corresponding regions of the interface.

 ## News
+- [2025/3] We support local logging of trajectories so that you can use OmniParser+OmniTool to build a training data pipeline for your favorite agent in your domain. [Documentation WIP]
 - [2025/3] We are gradually adding multi-agent orchestration and improving the user interface in OmniTool for a better experience.
 - [2025/2] We release OmniParser V2 [checkpoints](https://huggingface.co/microsoft/OmniParser-v2.0). [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EWXbVESKWo9Buu6OYCwg06wBeoM97C6EOTG6RjvWLEN1Qg?e=alnHGC)
 - [2025/2] We introduce OmniTool: Control a Windows 11 VM with OmniParser + your vision model of choice. OmniTool supports out of the box the following large language models - OpenAI (4o/o1/o3-mini), DeepSeek (R1), Qwen (2.5VL) or Anthropic Computer Use. [Watch Video](https://1drv.ms/v/c/650b027c18d5a573/EehZ7RzY69ZHn-MeQHrnnR4BCj3by-cLLpUVlxMjF4O65Q?e=8LxMgX)
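For reference, each agent step recorded by the new logging is a small dictionary appended to trajectory.json. The field names come from the change to vlm_agent_with_orchestrator.py below; the concrete values here are only illustrative:

    # Illustrative step record; field names are taken from the agent diff below, values are made up.
    step_record = {
        "screenshot_path": "./tmp/outputs/20250301_1200/screenshot_1.png",
        "som_screenshot_path": "./tmp/outputs/20250301_1200/som_screenshot_1.png",
        "screen_info": "...",            # OmniParser's parsed screen elements as a string
        "latency_omniparser": 1.42,      # seconds spent in OmniParser
        "latency_vlm": 2.87,             # seconds spent in the LLM call
        "vlm_response_json": {},         # the model's structured action for this step
        "ledger": None,                  # orchestrator ledger (None on the first step)
    }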

omnitool/gradio/agent/vlm_agent_with_orchestrator.py

Lines changed: 31 additions & 15 deletions
@@ -17,7 +17,7 @@
 from agent.llm_utils.utils import is_image_path
 import time
 import re
-
+import os
 OUTPUT_DIR = "./tmp/outputs"
 ORCHESTRATOR_LEDGER_PROMPT = """
 Recall we are working on the following request:
@@ -73,7 +73,7 @@ def __init__(
         max_tokens: int = 4096,
         only_n_most_recent_images: int | None = None,
         print_usage: bool = True,
-        save_folder: str = "./uploads",
+        save_folder: str = None,
     ):
         if model == "omniparser + gpt-4o" or model == "omniparser + gpt-4o-orchestrated":
             self.model = "gpt-4o-2024-11-20"
@@ -95,22 +95,20 @@ def __init__(
         self.max_tokens = max_tokens
         self.only_n_most_recent_images = only_n_most_recent_images
         self.output_callback = output_callback
-        self.save_folder = Path(save_folder).absolute()
+        self.save_folder = save_folder

-        # Create save folder if it doesn't exist
-        self.save_folder.mkdir(parents=True, exist_ok=True)
-
         self.print_usage = print_usage
         self.total_token_usage = 0
         self.total_cost = 0
         self.step_count = 0
+        self.plan, self.ledger = None, None

         self.system = ''

     def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
         if self.step_count == 0:
             plan = self._initialize_task(messages)
-            self.output_callback(f'-- Plan: {plan} --', sender="bot")
+            self.output_callback(f'-- Plan: {plan} --', )
             # update messages with the plan
             messages.append({"role": "assistant", "content": plan})
         else:
@@ -122,13 +120,18 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
                 f'    <pre>{updated_ledger}</pre>'
                 f'  </div>'
                 f'</details>',
-                sender="bot"
             )
             # update messages with the ledger
             messages.append({"role": "assistant", "content": updated_ledger})
+            self.ledger = updated_ledger

         self.step_count += 1
-        image_base64 = parsed_screen['original_screenshot_base64']
+        # save the image to the output folder
+        with open(f"{self.save_folder}/screenshot_{self.step_count}.png", "wb") as f:
+            f.write(base64.b64decode(parsed_screen['original_screenshot_base64']))
+        with open(f"{self.save_folder}/som_screenshot_{self.step_count}.png", "wb") as f:
+            f.write(base64.b64decode(parsed_screen['som_image_base64']))
+
         latency_omniparser = parsed_screen['latency']
         screen_info = str(parsed_screen['screen_info'])
         screenshot_uuid = parsed_screen['screenshot_uuid']
@@ -196,7 +199,7 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
         latency_vlm = time.time() - start

         # Update step counter with both latencies
-        self.output_callback(f'<i>Step {self.step_count} | OmniParser: {latency_omniparser:.2f}s | LLM: {latency_vlm:.2f}s</i>', sender="bot")
+        self.output_callback(f'<i>Step {self.step_count} | OmniParser: {latency_omniparser:.2f}s | LLM: {latency_vlm:.2f}s</i>', )

         print(f"{vlm_response}")

@@ -226,7 +229,7 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
         except:
             print(f"Error parsing: {vlm_response_json}")
             pass
-        self.output_callback(f'<img src="data:image/png;base64,{img_to_show_base64}">', sender="bot")
+        self.output_callback(f'<img src="data:image/png;base64,{img_to_show_base64}">', )

         # Display screen info in a collapsible dropdown
         self.output_callback(
@@ -236,7 +239,6 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
             f'    <pre>{screen_info}</pre>'
             f'  </div>'
             f'</details>',
-            sender="bot"
         )

         vlm_plan_str = ""
@@ -267,6 +269,21 @@ def __call__(self, messages: list, parsed_screen: list[str, list, dict]):
                                          name='computer', type='tool_use')
         response_content.append(sim_content_block)
         response_message = BetaMessage(id=f'toolu_{uuid.uuid4()}', content=response_content, model='', role='assistant', type='message', stop_reason='tool_use', usage=BetaUsage(input_tokens=0, output_tokens=0))
+
+        # save the intermediate step trajectory to the save folder
+        step_trajectory = {
+            "screenshot_path": f"{self.save_folder}/screenshot_{self.step_count}.png",
+            "som_screenshot_path": f"{self.save_folder}/som_screenshot_{self.step_count}.png",
+            "screen_info": screen_info,
+            "latency_omniparser": latency_omniparser,
+            "latency_vlm": latency_vlm,
+            "vlm_response_json": vlm_response_json,
+            'ledger': self.ledger,
+        }
+        with open(f"{self.save_folder}/trajectory.json", "a") as f:
+            f.write(json.dumps(step_trajectory))
+            f.write("\n")
+
         return response_message, vlm_response_json

     def _api_response_callback(self, response: APIResponse):
@@ -376,9 +393,8 @@ def _initialize_task(self, messages: list):
         plan = extract_data(vlm_response, "json")

         # Create a filename with timestamp
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        plan_filename = f"plan_{timestamp}.json"
-        plan_path = self.save_folder / plan_filename
+        plan_filename = f"plan.json"
+        plan_path = os.path.join(self.save_folder, plan_filename)

         # Save the plan to a file
         try:
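Because each step record is written with json.dumps followed by a newline in append mode, trajectory.json is effectively a JSON Lines file. A minimal sketch of reading a logged run back, e.g. to feed a training-data pipeline (the run-folder path and timestamp are hypothetical; use the folder app_new.py created for your run):

    import json
    from pathlib import Path

    # Hypothetical run folder created by app_new.py under --run_folder (default ./tmp/outputs)
    run_folder = Path("./tmp/outputs/20250301_1200")

    steps = []
    with open(run_folder / "trajectory.json") as f:
        for line in f:                      # one JSON object per agent step
            if line.strip():
                steps.append(json.loads(line))

    for step in steps:
        # paths to the raw and set-of-mark (SOM) screenshots saved for this step
        print(step["screenshot_path"], step["som_screenshot_path"])
        # OmniParser parse latency and LLM latency, both in seconds
        print(step["latency_omniparser"], step["latency_vlm"])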

omnitool/gradio/app_new.py

Lines changed: 12 additions & 13 deletions
@@ -1,4 +1,7 @@
 """
+The app contains:
+- a new UI for the OmniParser AI Agent.
+-
 python app_new.py --windows_host_url localhost:8006 --omniparser_server_url localhost:8000
 """

@@ -28,10 +31,6 @@

 CONFIG_DIR = Path("~/.anthropic").expanduser()
 API_KEY_FILE = CONFIG_DIR / "api_key"
-UPLOAD_FOLDER = Path("./uploads").absolute()
-
-# Create uploads directory if it doesn't exist
-UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)

 INTRO_TEXT = '''
 <div style="text-align: center; margin-bottom: 10px;">
@@ -46,13 +45,13 @@ def parse_arguments():
     parser = argparse.ArgumentParser(description="Gradio App")
     parser.add_argument("--windows_host_url", type=str, default='localhost:8006')
     parser.add_argument("--omniparser_server_url", type=str, default="localhost:8000")
-    parser.add_argument("--upload_folder", type=str, default="./uploads")
+    parser.add_argument("--run_folder", type=str, default="./tmp/outputs")
     return parser.parse_args()
 args = parse_arguments()

 # Update upload folder from args if provided
-UPLOAD_FOLDER = Path(args.upload_folder).absolute()
-UPLOAD_FOLDER.mkdir(parents=True, exist_ok=True)
+RUN_FOLDER = Path(os.path.join(args.run_folder, datetime.now().strftime('%Y%m%d_%H%M')))
+RUN_FOLDER.mkdir(parents=True, exist_ok=True)

 class Sender(StrEnum):
     USER = "user"
@@ -63,8 +62,8 @@ class Sender(StrEnum):

 def load_existing_files():
     """Load all existing files from the uploads folder"""
     files = []
-    if UPLOAD_FOLDER.exists():
-        for file_path in UPLOAD_FOLDER.iterdir():
+    if RUN_FOLDER.exists():
+        for file_path in RUN_FOLDER.iterdir():
             if file_path.is_file():
                 files.append(str(file_path))
     return files
@@ -277,7 +276,7 @@ def process_input(user_input, state):
             only_n_most_recent_images=state["only_n_most_recent_images"],
             max_tokens=16384,
             omniparser_url=args.omniparser_server_url,
-            save_folder=str(UPLOAD_FOLDER)
+            save_folder=str(RUN_FOLDER)
     ):
         if loop_msg is None or state.get("stop"):
             # Detect and add new files to the state
@@ -434,7 +433,7 @@ def handle_file_upload(files, state):
     for file in files:
         # Get the file name and create a path in the upload directory
         file_name = Path(file.name).name
-        file_path = UPLOAD_FOLDER / file_name
+        file_path = RUN_FOLDER / file_name

         # Save the file
         shutil.copy(file.name, file_path)
@@ -471,9 +470,9 @@ def toggle_view(view_mode, file_path=None, state=None):
 def detect_new_files(state):
     """Detect new files in the uploads folder and add them to the state"""
     new_files_count = 0
-    if UPLOAD_FOLDER.exists():
+    if RUN_FOLDER.exists():
         current_files = set(state['uploaded_files'])
-        for file_path in UPLOAD_FOLDER.iterdir():
+        for file_path in RUN_FOLDER.iterdir():
             if file_path.is_file():
                 file_path_str = str(file_path)
                 if file_path_str not in current_files:
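With this change, each launch of app_new.py creates a fresh timestamped subfolder under --run_folder (default ./tmp/outputs) and hands it to the agent as save_folder, so screenshots, plan.json, and trajectory.json from different runs no longer overwrite a shared uploads directory. A small sketch, assuming the default layout, for listing the runs logged so far:

    from pathlib import Path

    runs_root = Path("./tmp/outputs")       # the default --run_folder
    for run_dir in sorted(p for p in runs_root.iterdir() if p.is_dir()):
        trajectory = run_dir / "trajectory.json"
        if trajectory.exists():
            # count the JSON Lines records, i.e. the number of logged agent steps
            n_steps = sum(1 for line in trajectory.open() if line.strip())
            print(f"{run_dir.name}: {n_steps} steps")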

0 commit comments
