microsoft
/

GUI-Actor-7B-Qwen2-VL

Image-Text-to-Text

text-generation-inference

Model card Files Files and versions

qianhuiwu committed on Jun 3

Commit

ae7947f

·

verified ·

1 Parent(s): 34e15c4

Update example code.

Files changed (1) hide show

README.md +7 -4

README.md CHANGED Viewed

@@ -60,7 +60,7 @@ from qwen_vl_utils import process_vision_info
 from datasets import load_dataset
 from transformers import Qwen2VLProcessor
 from gui_actor.constants import chat_template
-from gui_actor.modeling import Qwen2VLForConditionalGenerationWithActionHead
 from gui_actor.inference import inference
@@ -68,7 +68,7 @@ from gui_actor.inference import inference
 model_name_or_path = "microsoft/GUI-Actor-7B-Qwen2-VL"
 data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
 tokenizer = data_processor.tokenizer
-model = Qwen2VLForConditionalGenerationWithActionHead.from_pretrained(
     model_name_or_path,
     torch_dtype=torch.bfloat16,
     device_map="cuda:0",
@@ -78,6 +78,9 @@ model = Qwen2VLForConditionalGenerationWithActionHead.from_pretrained(
 # prepare example
 dataset = load_dataset("rootsautomation/ScreenSpot")["test"]
 example = dataset[0]
 conversation = [
     {
         "role": "system",
@@ -105,9 +108,9 @@ conversation = [
 ]
 # inference
-pred = inference(conversation, model, tokenizer, data_processor, logits_processor=logits_processor_actor, use_placeholder=True, topk=3)
 px, py = pred["topk_points"][0]
-print(f"Click point: [{px}, {py}]")
 ```
 ## Citation

 from datasets import load_dataset
 from transformers import Qwen2VLProcessor
 from gui_actor.constants import chat_template
+from gui_actor.modeling import Qwen2VLForConditionalGenerationWithPointer
 from gui_actor.inference import inference
 model_name_or_path = "microsoft/GUI-Actor-7B-Qwen2-VL"
 data_processor = Qwen2VLProcessor.from_pretrained(model_name_or_path)
 tokenizer = data_processor.tokenizer
+model = Qwen2VLForConditionalGenerationWithPointer.from_pretrained(
     model_name_or_path,
     torch_dtype=torch.bfloat16,
     device_map="cuda:0",
 # prepare example
 dataset = load_dataset("rootsautomation/ScreenSpot")["test"]
 example = dataset[0]
+print(f"Instruction: {example['instruction']}")
+print(f"ground-truth action region (x1, y1, x2, y2): {[round(i, 2) for i in example['bbox']]}")
 conversation = [
     {
         "role": "system",
 ]
 # inference
+pred = inference(conversation, model, tokenizer, data_processor, use_placeholder=True, topk=3)
 px, py = pred["topk_points"][0]
+print(f"Predicted click point: [{round(px, 2)}, {round(py, 2)}]")
 ```
 ## Citation