Statistics
| Revision:

gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / pynliner / soupselect.py @ 475

History | View | Annotate | Download (8.33 KB)

1
"""
2
# Included with pynliner since it isn't on PyPI #
3

4
soupselect.py
5

6
CSS selector support for BeautifulSoup.
7

8
soup = BeautifulSoup('<html>...')
9
select(soup, 'div')
10
    - returns a list of div elements
11

12
select(soup, 'div#main ul a')
13
    - returns a list of links inside a ul inside div#main
14

15
patched to support multiple class selectors here http://code.google.com/p/soupselect/issues/detail?id=4#c0
16
"""
17
import re
18
import BeautifulSoup
19

    
20
# Matches one bracketed attribute selector such as [href], [rel=nofollow] or
# [lang|="en"].  Groups, in order: attribute name, operator character (one of
# = ~ | ^ $ * or empty), and value (empty when the selector has no value).
# The value group is lazy so that an optional closing quote of either kind is
# left out of the captured value; a greedy group only stopped at '"' and
# therefore kept the trailing "'" of single-quoted values.
attribute_regex = re.compile(r'\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)=?["\']?(?P<value>[^\]"]*?)["\']?\]')
# Matches a pseudo-class such as ":first-child"; group 1 is the text after
# the colon (a parenthesised argument, if any, is part of that text).
# Written with doubled backslashes instead of the ``ur`` prefix, which is a
# syntax error on Python 3; the compiled pattern is unchanged.
pseudo_class_regex = re.compile(u':(([^:.#(*\\[]|\\([^)]+\\))+)')
22

    
23
def get_attribute_checker(operator, attribute, value=''):
    """
    Build a predicate for one CSS attribute selector.

    Takes an operator, attribute and optional value; returns a function that
    takes an element and returns True when the element matches that
    combination.  The element only needs a dict-like ``get(name, default)``
    method.

    Recognised operators:
        '='  attribute equals value
        '~'  value is one of the space separated tokens of the attribute
        '^'  attribute starts with value
        '$'  attribute ends with value
        '*'  attribute contains value
        '|'  attribute is exactly value or starts with "value-"
    Any other operator (including '') only checks that the attribute exists.
    """
    # Unique sentinel: distinguishes "attribute absent" from any stored
    # value without relying on has_key(), which Python 3 mappings lack.
    missing = object()

    def has_attribute(el):
        return el.get(attribute, missing) is not missing

    def equals(el):
        return el.get(attribute) == value

    def includes_token(el):
        return value in el.get(attribute, '').split()

    def starts_with(el):
        return el.get(attribute, '').startswith(value)

    def ends_with(el):
        return el.get(attribute, '').endswith(value)

    def contains(el):
        return value in el.get(attribute, '')

    def equals_or_dash_prefixed(el):
        actual = el.get(attribute, '')
        return actual == value or actual.startswith('%s-' % value)

    checkers = {
        '=': equals,
        '~': includes_token,
        '^': starts_with,
        '$': ends_with,
        '*': contains,
        '|': equals_or_dash_prefixed,
    }
    return checkers.get(operator, has_attribute)
42

    
43
def is_white_space(el):
    """
    Return True when *el* carries no visible content.

    That is the case for a BeautifulSoup Comment and for a NavigableString
    whose string form is empty after stripping whitespace.  Anything else
    (including None) yields False.
    """
    if isinstance(el, BeautifulSoup.Comment):
        return True
    # str(el) rather than el.strip(): the check is made on the node's
    # rendered form, exactly as the emptiness test is defined above.
    return isinstance(el, BeautifulSoup.NavigableString) and str(el).strip() == ''
49

    
50
def is_last_content_node(el):
    """
    Return True when nothing but white space follows, starting at *el*.

    *el* is the node to start from (typically an element's nextSibling).
    The chain of nextSibling links is walked while the nodes are white space
    (see is_white_space); reaching the end (None) means True, meeting any
    other node means False.

    Implemented as a loop so that a long run of whitespace/comment siblings
    cannot exhaust the recursion limit.
    """
    while el is not None:
        if not is_white_space(el):
            return False
        el = el.nextSibling
    return True
57

    
58
def is_first_content_node(el):
    """
    Return True when nothing but white space precedes, starting at *el*.

    *el* is the node to start from (typically an element's previousSibling).
    The chain of previousSibling links is walked while the nodes are white
    space (see is_white_space); reaching the start (None) means True, meeting
    any other node means False.

    Implemented as a loop so that a long run of whitespace/comment siblings
    cannot exhaust the recursion limit.
    """
    while el is not None:
        if not is_white_space(el):
            return False
        el = el.previousSibling
    return True
65

    
66
def get_pseudo_class_checker(psuedo_class):
    """
    Build a predicate for a CSS pseudo-class.

    Takes a pseudo-class name, like "first-child" or "last-child", and
    returns a function that checks whether an element satisfies it.  Only
    those two names are supported; for any other name the returned function
    always answers False.

    (The parameter keeps its historical spelling so keyword callers keep
    working.)
    """
    def first_child(el):
        # getattr with a default: objects without sibling links are treated
        # as having no preceding sibling.
        return is_first_content_node(getattr(el, 'previousSibling', None))

    def last_child(el):
        return is_last_content_node(getattr(el, 'nextSibling', None))

    def unsupported(el):
        return False

    checkers = {
        'first-child': first_child,
        'last-child': last_child,
    }
    return checkers.get(psuedo_class, unsupported)
76

    
77
def get_checker(functions):
    """
    Combine several element predicates into one.

    *functions* is an iterable of one-argument callables.  The returned
    checker calls them in order on an element and returns False as soon as
    one of them is falsy; when all pass (or there are none) it returns the
    element itself, so the result can be used both as a truth value and as
    the matched element.

    A missing element (None) never matches: the checker returns False
    without calling the predicates, which would otherwise fail on None.
    """
    def checker(el):
        if el is None:
            return False
        for func in functions:
            if not func(el):
                return False
        return el
    return checker
84

    
85

    
86
def select(soup, selector):
    """
    Return the elements below *soup* that match the CSS *selector*.

    soup should be a BeautifulSoup instance; selector is a CSS selector
    specifying the elements you want to retrieve.

    The selector is consumed from right to left, alternating between a
    compound selector ("token", e.g. ``div#main.wide[lang=en]:first-child``)
    and the combinator to its left (whitespace, ``>`` or ``+``).  The
    rightmost token picks the candidate elements; every further token must be
    satisfied by an ancestor, the parent, or the preceding sibling of the
    node matched in the previous step, depending on the combinator.

    Returns a list of the matching elements.  An empty selector returns
    ``[soup]``.

    Raises Exception when no token can be read from the selector or a token
    contains more than one ``#id``, and NotImplementedError for the ``~``
    combinator.
    """
    # NOTE: the token character class has no ~ | ^ $ ( ), so attribute
    # operators and pseudo-class arguments using them are not tokenised.
    token_pattern = '([_0-9a-zA-Z-#.:*"\'\\[\\]=]+)$'

    handle_token = True
    # Each entry is (candidate, frontier): the candidate element that will be
    # returned, and the nodes that the next token to the left must relate to.
    current_context = [(soup, [])]
    operator = None
    while selector:
        if handle_token:
            # Get the rightmost token and remove it from the selector.
            handle_token = False
            match = re.search(token_pattern, selector)
            if not match:
                raise Exception("No match was found. We're done or something is broken")
            token = match.group(1)
            selector = selector[:match.start()].rstrip()

            # Attribute selectors become predicates ...
            checker_functions = []
            for attr_name, attr_operator, attr_value in attribute_regex.findall(token):
                checker_functions.append(
                    get_attribute_checker(attr_operator, attr_name, attr_value))

            # ... and are then cut out, so that '.', '#' and ':' inside an
            # attribute value are not mistaken for classes, ids or
            # pseudo-classes below.
            bare_token = attribute_regex.sub('', token)

            for pseudo in pseudo_class_regex.finditer(bare_token):
                checker_functions.append(get_pseudo_class_checker(pseudo.group(1)))

            checker = get_checker(checker_functions)

            # Tag name: True means "any tag" to the BeautifulSoup find calls.
            tag_match = re.match('([a-zA-Z0-9]+)', bare_token)
            tag = tag_match.group(1) if tag_match else True

            ids = re.findall('#([a-zA-Z0-9_-]+)', bare_token)
            if len(ids) > 1:
                raise Exception("Only single # OK")

            classes = re.findall('\\.([a-zA-Z0-9_-]+)', bare_token)

            find_dict = {}
            if ids:
                find_dict['id'] = ids
            if classes:
                wanted = set(classes)
                # every requested class must be among the element's classes
                find_dict['class'] = lambda attr: attr and wanted.issubset(attr.split())

            found = []
            if operator is None:
                # This is the first token: simply find all matches.
                for root, _ in current_context:
                    for el in root.findAll(tag, find_dict):
                        if checker(el):
                            found.append((el, [el]))
            elif operator == '~':
                raise NotImplementedError("~ operator is not implemented. Sad face :(")
            elif operator in (' ', '>', '+'):
                for candidate, frontier in current_context:
                    context_matches = []
                    seen = set()  # ids of nodes already collected
                    for el in frontier:
                        if operator == ' ':
                            # descendant: any matching ancestor may anchor
                            # the rest of the selector
                            related = [ancestor
                                       for ancestor in el.findParents(tag, find_dict)
                                       if checker(ancestor)]
                        elif operator == '>':
                            # child: the direct parent itself must match
                            parent = el.parent
                            related = []
                            if (parent is not None
                                    and el.findParent(tag, find_dict) is parent
                                    and checker(parent)):
                                related = [parent]
                        else:
                            # adjacent sibling: the closest preceding node
                            # that is not white space must match
                            previous = el.previousSibling
                            while previous is not None and is_white_space(previous):
                                previous = previous.previousSibling
                            related = []
                            if (previous is not None
                                    and el.findPreviousSibling(tag, find_dict) is previous
                                    and checker(previous)):
                                related = [previous]
                        for node in related:
                            if id(node) not in seen:
                                seen.add(id(node))
                                context_matches.append(node)
                    if context_matches:
                        found.append((candidate, context_matches))
            # any other combinator text (e.g. '>>') matches nothing
            current_context = found
        else:
            # Get the next operator (whitespace, >, ~, +).
            handle_token = True
            match = re.search('([>~+]+)$', selector)
            if match:
                operator = match.group(1)
                selector = selector[:match.start()].rstrip()
            else:
                # Descendant combinator: the separating whitespace was
                # already stripped with the previous token, so nothing is
                # left to consume here.
                operator = ' '
    return [entry[0] for entry in current_context]
216

    
217
def monkeypatch(BeautifulSoupClass=None):
    """
    Install select() on a class as the method ``findSelect``.

    If you don't explicitly state the class to patch, defaults to the most
    common import location for BeautifulSoup.
    """
    # Explicit None test: the default must not be triggered by a class that
    # merely evaluates as false.
    if BeautifulSoupClass is None:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    BeautifulSoupClass.findSelect = select
225

    
226
def unmonkeypatch(BeautifulSoupClass=None):
    """
    Remove the ``findSelect`` attribute from a class.

    If you don't explicitly state the class, defaults to the most common
    import location for BeautifulSoup.  Raises AttributeError when the class
    has no ``findSelect`` attribute to delete.
    """
    # Explicit None test: the default must not be triggered by a class that
    # merely evaluates as false.
    if BeautifulSoupClass is None:
        from BeautifulSoup import BeautifulSoup as BeautifulSoupClass
    delattr(BeautifulSoupClass, 'findSelect')