gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / pynliner / soupselect.py @ 475
History | View | Annotate | Download (8.33 KB)
"""
# Included with pynliner since it isn't on PyPI #

soupselect.py

CSS selector support for BeautifulSoup.

soup = BeautifulSoup('<html>...')
select(soup, 'div')
- returns a list of div elements

select(soup, 'div#main ul a')
- returns a list of links inside a ul inside div#main

Patched to support multiple class selectors; see
http://code.google.com/p/soupselect/issues/detail?id=4#c0
"""
17 |
import re |
18 |
import BeautifulSoup |
19 |
|
20 |
# Matches one attribute selector, e.g. [href], [href="x"], [href='x'],
# [lang|=en] or [data-id=5]; the groups are (attribute, operator, value).
# - The attribute name may contain hyphens (data-*, aria-*, ...).
# - The value is matched lazily so that an optional closing quote is consumed
#   by the quote part of the pattern instead of ending up inside the value
#   (a greedy value turned [href='x'] into the value "x'").
attribute_regex = re.compile(
    r'\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)=?'
    r'["\']?(?P<value>[^\]"]*?)["\']?\]'
)

# Matches a pseudo-class such as ":first-child"; group 1 is the name without
# the leading colon, including an optional parenthesised argument.
pseudo_class_regex = re.compile(r':(([^:.#(*\[]|\([^)]+\))+)')
|
22 |
|
23 |
def get_attribute_checker(operator, attribute, value=''):
    """
    Takes an operator, attribute and optional value; returns a function that
    will return True for elements that match that combination.

    operator  -- the character found in front of '=' in the attribute
                 selector ('=', '~', '^', '$', '*' or '|'). Any other value
                 (normally the empty string) selects a plain presence test,
                 as used for [attribute].
    attribute -- name of the attribute to inspect.
    value     -- text the attribute is compared against (default '').

    The returned function takes one element, which only needs to offer a
    dict-style get(name, default) method.
    """
    # Unique sentinel: lets the presence test go through get() like every
    # other checker here, instead of has_key(), which not every mapping-like
    # element provides (Python 3 dicts, for instance, do not).
    missing = object()

    def current(el):
        # Attribute text, or '' when the element lacks the attribute.
        return el.get(attribute, '')

    def is_present(el):
        return el.get(attribute, missing) is not missing

    checkers = {
        # attribute is exactly value
        '=': lambda el: el.get(attribute) == value,
        # attribute includes value as one of a set of space separated tokens
        '~': lambda el: value in current(el).split(),
        # attribute starts with value
        '^': lambda el: current(el).startswith(value),
        # attribute ends with value
        '$': lambda el: current(el).endswith(value),
        # attribute contains value
        '*': lambda el: value in current(el),
        # attribute is either exactly value or starts with value-
        '|': lambda el: (current(el) == value
                         or current(el).startswith('%s-' % value)),
    }
    return checkers.get(operator, is_present)
|
42 |
|
43 |
def is_white_space(el):
    """
    Tell whether a node carries no visible content.

    Returns True for a NavigableString whose text is empty once stripped and
    for any Comment node; returns False for everything else.
    """
    blank_text = (isinstance(el, BeautifulSoup.NavigableString)
                  and str(el).strip() == '')
    return bool(blank_text or isinstance(el, BeautifulSoup.Comment))
49 |
|
50 |
def is_last_content_node(el):
    """
    Tell whether nothing but ignorable nodes follows, starting at el.

    Walks along nextSibling from el. Returns True when the end of the
    sibling chain (None) is reached having seen only nodes for which
    is_white_space() is true, and False as soon as any other node is met.
    """
    node = el
    while node is not None:
        if not is_white_space(node):
            return False
        node = node.nextSibling
    return True
|
57 |
|
58 |
def is_first_content_node(el):
    """
    Tell whether nothing but ignorable nodes precedes, starting at el.

    Walks along previousSibling from el. Returns True when the start of the
    sibling chain (None) is reached having seen only nodes for which
    is_white_space() is true, and False as soon as any other node is met.

    Mirrors is_last_content_node(): None is answered directly instead of
    also being handed to is_white_space().
    """
    node = el
    while node is not None:
        if not is_white_space(node):
            return False
        node = node.previousSibling
    return True
|
65 |
|
66 |
def get_pseudo_class_checker(psuedo_class):
    """
    Return the predicate that implements the given pseudo class.

    psuedo_class is the name without the colon. "first-child" and
    "last-child" are supported; for any other name the returned predicate
    rejects every element.
    """
    def first_child(el):
        # Only ignorable nodes may come before the element.
        return is_first_content_node(getattr(el, 'previousSibling', None))

    def last_child(el):
        # Only ignorable nodes may come after the element.
        return is_last_content_node(getattr(el, 'nextSibling', None))

    def unsupported(el):
        return False

    supported = {
        'first-child': first_child,
        'last-child': last_child,
    }
    return supported.get(psuedo_class, unsupported)
76 |
|
77 |
def get_checker(functions):
    """
    Combine several element predicates into a single checker.

    functions -- iterable of one-argument predicates.

    The returned checker(el) gives back el itself when every predicate
    accepts it, and False otherwise. A missing element (None) is always
    rejected with False without running the predicates, so the result of a
    failed lookup can be passed in directly even when the predicates would
    call methods on the element.
    """
    def checker(el):
        if el is None:
            return False
        for func in functions:
            if not func(el):
                return False
        return el
    return checker
|
84 |
|
85 |
|
86 |
def select(soup, selector):
    """
    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.

    The selector is consumed from right to left: the rightmost compound
    selector picks the candidate elements, then every combinator/compound
    pair to its left filters those candidates by looking at their ancestors
    or previous siblings.

    Returns the list of candidate elements that survived every step. An
    empty selector returns [soup].

    Raises Exception when the remaining selector text cannot be tokenized or
    a compound selector holds more than one id, and NotImplementedError for
    the '~' combinator.
    """
    handle_token = True
    # Each entry is (candidate, frontier): the candidate is the element that
    # will be returned, the frontier holds the elements the next combinator
    # is evaluated against.
    current_context = [(soup, [])]
    operator = None
    while selector:
        if handle_token:
            # Get the rightmost token
            handle_token = False
            match = re.search(r'([_0-9a-zA-Z-#.:*"\'\[\]=]+)$', selector)
            if not match:
                raise Exception("No match was found. We're done or something is broken")
            token = match.group(1)

            # remove this token (and the whitespace before it) from the selector
            selector = selector[:match.start()].rstrip()

            # Tag, id, class and pseudo-class parts are read from the token
            # with its attribute selectors removed, so that characters inside
            # an attribute value ([href="x.html"], [href="#top"]) are not
            # mistaken for class, id or pseudo-class selectors.
            simple = attribute_regex.sub('', token)

            checker_functions = [
                get_attribute_checker(op, name, value)
                for name, op, value in attribute_regex.findall(token)
            ]
            for pseudo in pseudo_class_regex.finditer(simple):
                checker_functions.append(get_pseudo_class_checker(pseudo.group(1)))
            checker = get_checker(checker_functions)

            # Tag name; True makes the find* methods accept any tag.
            tag_match = re.match(r'[a-zA-Z0-9]+', simple)
            if tag_match:
                tag = tag_match.group(0)
            else:
                tag = True

            ids = re.findall(r'#([a-zA-Z0-9_-]+)', simple)
            if len(ids) > 1:
                raise Exception("Only single # OK")

            classes = re.findall(r'\.([a-zA-Z0-9_-]+)', simple)

            #
            # Search contexts for matches
            #
            found = []
            find_dict = {}
            if ids:
                find_dict['id'] = ids
            if classes:
                required = set(classes)
                # every requested class must be among the element's classes
                find_dict['class'] = lambda attr: attr and required.issubset(attr.split())

            if operator is None:
                # This is the first token: simply find all matches
                for context in current_context:
                    for el in context[0].findAll(tag, find_dict):
                        if checker(el):
                            found.append((el, [el]))
            elif operator == ' ':
                # "descendant" combinator: every matching ancestor of a
                # frontier element becomes part of the new frontier, so the
                # next combinator is evaluated relative to those ancestors.
                for candidate, frontier in current_context:
                    ancestors = []
                    seen = set()
                    for el in frontier:
                        for parent in el.findParents(tag, find_dict):
                            if id(parent) in seen:
                                continue
                            seen.add(id(parent))
                            if checker(parent):
                                ancestors.append(parent)
                    if ancestors:
                        found.append((candidate, ancestors))
            elif operator == '>':
                # "child" combinator: the direct parent itself has to match.
                for candidate, frontier in current_context:
                    parents = []
                    for el in frontier:
                        nearest = el.findParent(tag, find_dict)
                        # nearest is None when no ancestor matches at all
                        if (nearest is not None and nearest is el.parent
                                and checker(nearest)):
                            parents.append(nearest)
                    if parents:
                        found.append((candidate, parents))
            elif operator == '~':
                raise NotImplementedError("~ operator is not implemented. Sad face :(")
            elif operator == '+':
                # "adjacent sibling" combinator: the node directly before the
                # element has to match.
                # NOTE(review): previousSibling may be a text node, so
                # whitespace between two tags presumably prevents a match
                # here - confirm whether that is intended.
                for candidate, frontier in current_context:
                    siblings = []
                    for el in frontier:
                        nearest = el.findPreviousSibling(tag, find_dict)
                        if (nearest is not None and nearest is el.previousSibling
                                and checker(nearest)):
                            siblings.append(nearest)
                    if siblings:
                        found.append((candidate, siblings))
            # Any other operator text leaves found empty, i.e. no results.
            current_context = found
        else:
            # Get the next operator (whitespace, >, ~, +)
            handle_token = True
            match = re.search(r'([>~+]+)$', selector)
            if match:
                operator = match.group(1)
                selector = selector[:match.start()].rstrip()
            else:
                # Descendant combinator. Its whitespace was already stripped
                # together with the previous token, so nothing may be cut
                # from the selector here; splitting on ' ' again would throw
                # away the next compound selector ('div ul a' lost 'ul').
                operator = ' '
    return [entry[0] for entry in current_context]
216 |
|
217 |
def monkeypatch(BeautifulSoupClass=None):
    """
    Attach select() to a soup class under the name findSelect.

    When no class is passed, the BeautifulSoup class from the BeautifulSoup
    module (the most common import location) is patched.
    """
    if not BeautifulSoupClass:
        import BeautifulSoup as soup_module
        BeautifulSoupClass = soup_module.BeautifulSoup
    setattr(BeautifulSoupClass, 'findSelect', select)
225 |
|
226 |
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the findSelect attribute from a soup class.

    When no class is passed, the BeautifulSoup class from the BeautifulSoup
    module is used. Raises AttributeError if the class itself carries no
    findSelect attribute.
    """
    if not BeautifulSoupClass:
        import BeautifulSoup as soup_module
        BeautifulSoupClass = soup_module.BeautifulSoup
    del BeautifulSoupClass.findSelect