-
Notifications
You must be signed in to change notification settings - Fork 0
/
vh_to_html.py
214 lines (181 loc) · 7.13 KB
/
vh_to_html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#!/usr/bin/python3
# Copyright 2023 SJTU X-Lance Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Created by Danyang Zhang @X-Lance.
import lxml.etree
import lxml.html
from typing import Dict, List, Tuple, Pattern
from android_env.wrappers.vh_io_wrapper import filter_elements
import re
def convert_node(node: lxml.etree.Element) -> lxml.html.Element:
    #  function convert_node {{{ # 
    """
    Converts one leaf node of an android view hierarchy to an html element.
    The class, text, resource-id, and content-desc attributes are converted;
    an attribute that is absent from the node is handled like an empty one.

    The html tag is chosen from the suffix of the node's class attribute:
    "TextView" gives p, "Button"/"MenuItemView" give button,
    "ImageView"/"IconView"/"Image" give img, "EditText" gives input, and
    anything else gives div.

    Args:
        node (lxml.etree.Element): leaf node from an android view hierarchy

    Returns:
        lxml.html.Element: the converted html element. usually is p, button,
          img, input, or div.
    """

    attribute_dict: Dict[str, str] = {}

    # `get` returns None for an attribute that is not present; normalise that
    # to "" so the emptiness checks below cannot fail on None.
    resource_id: str = node.get("resource-id") or ""
    content_desc: str = node.get("content-desc") or ""
    text: str = node.get("text") or ""
    vh_class_name: str = node.get("class") or ""

    # convert resource-id: keep only the part after the last "/" and turn its
    # "_"-separated words into space-separated html class names
    if resource_id:
        resource_name: str = resource_id.rsplit("/", maxsplit=1)[-1]
        attribute_dict["class"] = " ".join(resource_name.split("_"))

    # convert content-desc
    if content_desc:
        attribute_dict["alt"] = content_desc

    # convert class and text; the attribute dict has to be complete before
    # the element is created from it
    text_is_content: bool = True
    if vh_class_name.endswith("TextView"):
        tag = "p"
    elif vh_class_name.endswith(("Button", "MenuItemView")):
        tag = "button"
    elif vh_class_name.endswith(("ImageView", "IconView", "Image")):
        tag = "img"
        # an image carries its text in `alt`, appended to the content-desc
        # if there is one
        text_is_content = False
        if text:
            if "alt" in attribute_dict:
                attribute_dict["alt"] += ": " + text
            else:
                attribute_dict["alt"] = text
    elif vh_class_name.endswith("EditText"):
        tag = "input"
        # an input carries its text in `value`
        text_is_content = False
        if text:
            attribute_dict["value"] = text
        attribute_dict["type"] = "text"
    else:
        tag = "div"

    html_element = lxml.html.Element(tag, attribute_dict)
    if text_is_content and text:
        html_element.text = text

    return html_element
    #  }}} function convert_node # 
# Matches a string of the form "[a,b][c,d]" and captures the four
# non-negative integers as groups 1-4.
# NOTE(review): presumably the format of the android view hierarchy `bounds`
# attribute (two corner points) - confirm. Not referenced by the code visible
# in this file.
bounds_pattern: Pattern[str] = re.compile(r"\[(\d+),(\d+)\]\[(\d+),(\d+)\]")
def convert_tree(node: lxml.etree.Element) ->\
        Tuple[ List[lxml.html.Element]
             , List[List[int]]
             ]:
    #  function convert_tree {{{ # 
    """
    Converts a view hierarchy tree to a list of html elements, one for each
    node selected by `filter_elements`.

    Each html element is built by `convert_node` and additionally receives an
    `id` attribute (its index in the returned list) and a `clickable`
    attribute copied from the source node.

    Args:
        node (lxml.etree.Element): root node of the given view hierarchy tree

    Returns:
        List[lxml.html.Element]: the converted html elements, in the order
          in which `filter_elements` returned the source nodes
        List[List[int]]: the second value returned by `filter_elements`,
          passed through unchanged; presumably one 4-int bounding box per
          element - confirm against `filter_elements`
    """

    leaves, boxes = filter_elements(node)

    def _to_indexed_html(index: int, leaf: lxml.etree.Element) -> lxml.html.Element:
        # convert one source node and stamp it with its list position
        converted = convert_node(leaf)
        converted.set("id", str(index))
        converted.set("clickable", leaf.get("clickable"))
        return converted

    html_leaves = [ _to_indexed_html(index, leaf)
                    for index, leaf in enumerate(leaves)
                  ]
    return html_leaves, boxes
    #  }}} function convert_tree # 
def convert_simple_page(page: str) -> List[str]:
    """
    Splits a simplified page observation into its parts.

    Args:
        page (str): page observation whose parts are joined by " [SEP] "

    Returns:
        List[str]: the parts of the page observation, divided at every
          " [SEP] "
    """

    separator = " [SEP] "
    parts: List[str] = page.split(separator)
    return parts
def simplify_html(page: str, with_eid: bool = False) -> List[str]:
    #  function simplify_html {{{ # 
    """
    Reduces an html page to the serialized form of its leaf elements.

    `<br>` and `<br/>` are replaced by a space before parsing. Every element
    below the body that has no child elements is stripped of its `href`,
    `data-url`, and `src` attributes and serialized on a single line.
    Comments are skipped.

    Args:
        page (str): full html page observation. If the parsed root has no
          direct `body` child (e.g. `page` is only a fragment), the parsed
          root itself is searched for leaf elements.
        with_eid (bool): if an auxiliary `eid` (element id) should be added to
          the returned elements; ids count up from 0 in iteration order

    Returns:
        List[str]: only leaf nodes of the html
    """

    page = page.replace("<br>", " ")\
               .replace("<br/>", " ")
    html_root: lxml.html.Element = lxml.html.fromstring(page)

    # `find` only looks at direct children. Fall back to the root so that
    # `body_root` is always bound, even when there is no <body> child.
    body_root: lxml.html.Element = html_root.find("body")
    if body_root is None:
        body_root = html_root

    result_list: List[str] = []
    id_counter = 0
    for n in body_root.iter():
        if isinstance(n, lxml.html.HtmlComment):
            continue
        if len(n) > 0:
            # has child elements, thus not a leaf
            continue

        if with_eid:
            n.set("eid", str(id_counter))
            id_counter += 1

        # drop the link-like attributes from the serialized leaf
        for attribute_name in ("href", "data-url", "src"):
            if attribute_name in n.attrib:
                del n.attrib[attribute_name]

        # NOTE(review): `tostring` is called with its default options, which
        # presumably also serializes the text following the element (its
        # tail) - confirm that this is intended.
        result_list.append( lxml.html.tostring( n
                                              , pretty_print=True
                                              , encoding="unicode"
                                              ).strip()\
                                               .replace("\n", " ")\
                                               .replace("\r", " ")
                          )
    return result_list
    #  }}} function simplify_html # 
if __name__ == "__main__":
    # Command line usage: converts the view hierarchy xml given as the first
    # argument and writes one line per converted element, the serialized
    # html element followed by its bounding box, to the file given as the
    # second argument.
    import sys

    if len(sys.argv) < 3:
        sys.exit("usage: {:} <input_file> <output_file>".format(sys.argv[0]))

    input_file: str = sys.argv[1]
    output_file: str = sys.argv[2]

    html_elements: List[lxml.html.Element]
    node_bboxes: List[List[int]]

    vh_tree: lxml.etree.ElementTree = lxml.etree.parse(input_file)
    html_elements, node_bboxes = convert_tree(vh_tree.getroot())

    # The elements are serialized to `str` and may contain non-ASCII text, so
    # the encoding is fixed instead of being left to the locale default.
    with open(output_file, "w", encoding="utf-8") as f:
        for html_elm, n_bb in zip(html_elements, node_bboxes):
            f.write( lxml.html.tostring( html_elm
                                       , pretty_print=True
                                       , encoding="unicode"
                                       ).strip()\
                   + " "
                   + str(n_bb)
                   + "\n"
                   )