diff --git a/openadapt/prompts/generate_action_event_with_dot.j2 b/openadapt/prompts/generate_action_event_with_dot.j2
new file mode 100644
index 000000000..4adb0657e
--- /dev/null
+++ b/openadapt/prompts/generate_action_event_with_dot.j2
@@ -0,0 +1,21 @@
+You are tasked with generating the next action event in a sequence. Please analyze the following information:
+
+Current Window Information:
+{{ current_window | pprint }}
+
+Recorded Actions:
+{% for action in recorded_actions %}
+{{ action | pprint }}
+{% endfor %}
+
+Replayed Actions So Far:
+{% for action in replayed_actions %}
+{{ action | pprint }}
+{% endfor %}
+
+Replay Instructions:
+{{ replay_instructions }}
+
+Based on this information and the provided screenshot, please generate the next action event. Remember to specify the x and y coordinates for where you would like a red dot to be painted, representing the target location for this action.
+
+Provide your response in the JSON format specified in the system prompt. Be as precise as possible with your coordinates and action details.
diff --git a/openadapt/prompts/self_correct_with_dot.j2 b/openadapt/prompts/self_correct_with_dot.j2
new file mode 100644
index 000000000..5d3601d7b
--- /dev/null
+++ b/openadapt/prompts/self_correct_with_dot.j2
@@ -0,0 +1,30 @@
+You have previously suggested the following action:
+
+{{ initial_action | pprint }}
+
+A red dot has been painted on the screenshot at the x and y coordinates you specified. Please carefully examine the new screenshot with the red dot.
+
+Current Window Information:
+{{ current_window | pprint }}
+
+Recorded Actions:
+{% for action in recorded_actions %}
+{{ action | pprint }}
+{% endfor %}
+
+Replayed Actions So Far:
+{% for action in replayed_actions %}
+{{ action | pprint }}
+{% endfor %}
+
+Replay Instructions:
+{{ replay_instructions }}
+
+After examining the screenshot with the red dot, please confirm or correct your initial action suggestion. If you believe your initial suggestion was correct, you may simply state "Confirmed". If you believe a correction is necessary, please provide a completely new JSON object with the corrected information, using the format specified in the system prompt.
+
+Remember to consider:
+1. Is the red dot accurately placed for the intended action?
+2. Does the placement align with the visible elements in the interface?
+3. Is the action type (mouse/keyboard) and specifics (button/key, action type) still appropriate given the dot placement?
+
+Provide your final decision below:
diff --git a/openadapt/prompts/system_with_dot.j2 b/openadapt/prompts/system_with_dot.j2
new file mode 100644
index 000000000..09bc5486a
--- /dev/null
+++ b/openadapt/prompts/system_with_dot.j2
@@ -0,0 +1,22 @@
+You are an AI assistant capable of analyzing screenshots and generating precise mouse and keyboard actions to interact with computer interfaces. Your task is to generate the next action in a sequence, considering the current screenshot, window information, and previous actions.
+
+In addition to your regular capabilities, you now have the ability to suggest a location for a red dot to be painted on the screenshot. This dot represents the target location for the next action. After suggesting the location, you will be shown the screenshot with the dot painted on it, and you'll have the opportunity to confirm or correct your initial suggestion.
+
+Your responses should be in the following JSON format:
+{
+  "type": "mouse" | "keyboard",
+  "x": int, // x-coordinate for mouse actions
+  "y": int, // y-coordinate for mouse actions
+  "button": "left" | "right" | "middle", // for mouse actions
+  "key": string, // for keyboard actions
+  "action": "press" | "release" | "click" | "doubleclick" | "move"
+}
+
+Remember:
+1. Analyze the screenshot carefully before suggesting an action.
+2. Consider the context provided by previous actions and window information.
+3. Be precise with your x and y coordinates for the red dot placement.
+4. When shown the screenshot with the dot, carefully evaluate if your initial suggestion was correct or if it needs adjustment.
+5. If you need to correct your initial suggestion, provide a completely new JSON object with the corrected information.
+
+Your goal is to accurately replicate or modify the recorded sequence of actions based on the given instructions and current state of the interface.
\ No newline at end of file
diff --git a/openadapt/strategies/__init__.py b/openadapt/strategies/__init__.py
index fecc7c056..180cf82df 100644
--- a/openadapt/strategies/__init__.py
+++ b/openadapt/strategies/__init__.py
@@ -13,5 +13,6 @@
 from openadapt.strategies.stateful import StatefulReplayStrategy
 from openadapt.strategies.vanilla import VanillaReplayStrategy
 from openadapt.strategies.visual import VisualReplayStrategy
+from openadapt.strategies.cursor import CursorReplayStrategy
 
 # add more strategies here
diff --git a/openadapt/strategies/cursor.py b/openadapt/strategies/cursor.py
new file mode 100644
index 000000000..088aa691b
--- /dev/null
+++ b/openadapt/strategies/cursor.py
@@ -0,0 +1,167 @@
+"""Cursor replay strategy.
+
+Lets the model propose the next action, paints a red dot on the screenshot at
+the proposed (x, y) target, then shows the dotted screenshot back to the model
+so it can confirm or correct its suggestion before the action is replayed.
+"""
+
+from PIL import ImageDraw
+
+from openadapt import adapters, models, utils
+from openadapt.strategies.vanilla import VanillaReplayStrategy
+
+
+class CursorReplayStrategy(VanillaReplayStrategy):
+    """Replay strategy that paints a red dot and lets the model self-correct."""
+
+    def __init__(
+        self,
+        recording: models.Recording,
+        replay_instructions: str = "",
+        process_events: bool = True,
+        dot_radius: int = 5,
+        dot_color: str = "red",
+    ) -> None:
+        """Initialize the CursorReplayStrategy.
+
+        Args:
+            recording (models.Recording): The recording object.
+            replay_instructions (str): Natural language instructions
+                for how recording should be replayed.
+            process_events (bool): Flag indicating whether to process the events.
+            dot_radius (int): Radius (in pixels) of the dot to be painted.
+            dot_color (str): Color of the dot to be painted.
+        """
+        super().__init__(recording, replay_instructions, process_events)
+        self.dot_radius = dot_radius
+        self.dot_color = dot_color
+
+    def get_next_action_event(
+        self,
+        screenshot: models.Screenshot,
+        window_event: models.WindowEvent,
+    ) -> models.ActionEvent | None:
+        """Get the next ActionEvent for replay with self-correction.
+
+        Args:
+            screenshot (models.Screenshot): The screenshot object.
+            window_event (models.WindowEvent): The window event object.
+
+        Returns:
+            models.ActionEvent or None: The next ActionEvent for replay.
+
+        Raises:
+            StopIteration: If the model produces no further action.
+        """
+        if self.process_events:
+            recorded_actions = self.recording.processed_action_events
+        else:
+            recorded_actions = self.recording.action_events
+        action_event = generate_action_event_with_dot(
+            screenshot,
+            window_event,
+            recorded_actions,
+            self.action_history,
+            self.replay_instructions,
+            self.dot_radius,
+            self.dot_color,
+        )
+
+        if not action_event:
+            raise StopIteration()
+
+        self.action_history.append(action_event)
+        return action_event
+
+
+def generate_action_event_with_dot(
+    current_screenshot: models.Screenshot,
+    current_window_event: models.WindowEvent,
+    recorded_actions: list[models.ActionEvent],
+    replayed_actions: list[models.ActionEvent],
+    replay_instructions: str,
+    dot_radius: int,
+    dot_color: str,
+) -> models.ActionEvent | None:
+    """Generate the next action event, painting a dot to enable self-correction.
+
+    Args:
+        current_screenshot (models.Screenshot): current state screenshot
+        current_window_event (models.WindowEvent): current state window data
+        recorded_actions (list[models.ActionEvent]): list of action events from the recording
+        replayed_actions (list[models.ActionEvent]): list of actions produced during current replay
+        replay_instructions (str): proposed modifications in natural language instructions
+        dot_radius (int): radius of the dot to be painted
+        dot_color (str): color of the dot to be painted
+
+    Returns:
+        (models.ActionEvent | None) the next action event to be played, or None
+        if the model's response could not be parsed.
+    """
+    current_image = current_screenshot.image
+    current_window_dict = current_window_event.to_prompt_dict()
+    recorded_action_dicts = [action.to_prompt_dict() for action in recorded_actions]
+    replayed_action_dicts = [action.to_prompt_dict() for action in replayed_actions]
+
+    system_prompt = utils.render_template_from_file(
+        "prompts/system_with_dot.j2",
+    )
+    prompt = utils.render_template_from_file(
+        "prompts/generate_action_event_with_dot.j2",
+        current_window=current_window_dict,
+        recorded_actions=recorded_action_dicts,
+        replayed_actions=replayed_action_dicts,
+        replay_instructions=replay_instructions,
+    )
+    prompt_adapter = adapters.get_default_prompt_adapter()
+
+    # First pass: generate an action and the (x, y) target for the dot.
+    content = prompt_adapter.prompt(
+        prompt,
+        system_prompt,
+        [current_image],
+    )
+    action_dict = utils.parse_code_snippet(content)
+
+    if not action_dict:
+        return None
+
+    # Keyboard actions carry no coordinates, so there is nothing to paint
+    # (and indexing "x"/"y" unconditionally would raise KeyError).
+    x = action_dict.get("x")
+    y = action_dict.get("y")
+    if x is None or y is None:
+        return models.ActionEvent.from_dict(action_dict)
+
+    # Paint the dot on a copy of the screenshot. Screenshot.image is the same
+    # PIL image passed to the prompt adapter above, so draw on a copy instead
+    # of round-tripping through bytes.
+    img_with_dot = current_image.copy()
+    draw = ImageDraw.Draw(img_with_dot)
+    draw.ellipse(
+        [(x - dot_radius, y - dot_radius), (x + dot_radius, y + dot_radius)],
+        fill=dot_color,
+        outline=dot_color,
+    )
+
+    # Second pass: show the dotted screenshot and allow self-correction.
+    prompt_with_dot = utils.render_template_from_file(
+        "prompts/self_correct_with_dot.j2",
+        current_window=current_window_dict,
+        recorded_actions=recorded_action_dicts,
+        replayed_actions=replayed_action_dicts,
+        replay_instructions=replay_instructions,
+        initial_action=action_dict,
+    )
+    content_corrected = prompt_adapter.prompt(
+        prompt_with_dot,
+        system_prompt,
+        [img_with_dot],
+    )
+    action_dict_corrected = utils.parse_code_snippet(content_corrected)
+
+    if not action_dict_corrected:
+        # The model confirmed (or replied unparseably); keep the original.
+        action_dict_corrected = action_dict
+
+    return models.ActionEvent.from_dict(action_dict_corrected)