diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f1f552b --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.DS_Store +python-review.code-workspace +py4e_codes \ No newline at end of file diff --git a/HelloWorld.py b/HelloWorld.py new file mode 100644 index 0000000..f301245 --- /dev/null +++ b/HelloWorld.py @@ -0,0 +1 @@ +print("Hello World!") diff --git a/bs4/__init__.py b/bs4/__init__.py new file mode 100644 index 0000000..f6fdfd5 --- /dev/null +++ b/bs4/__init__.py @@ -0,0 +1,468 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides provides methods and Pythonic idioms that make it easy to +navigate, search, and modify the parse tree. + +Beautiful Soup works with Python 2.6 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ +""" + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.4.1" +__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import os +import re +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. + + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's
<br>
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = '[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if 'convertEntities' in kwargs: + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if len(kwargs) > 0: + arg = list(kwargs.keys()).pop() + raise TypeError( + "__init__() got an unexpected keyword argument '%s'" % arg) + + if builder is None: + original_features = features + if isinstance(features, str): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. Do you need to install a parser library?" 
+ % ",".join(features)) + builder = builder_class() + if not (original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + parser=builder.NAME, + markup_type=markup_type)) + + self.builder = builder + self.is_xml = builder.is_xml + self.builder.soup = self + + self.parse_only = parse_only + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + elif len(markup) <= 256: + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. + if (isinstance(markup, str) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: + possible_filename = markup + is_file = False + try: + is_file = os.path.exists(possible_filename) + except Exception as e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. + pass + if is_file: + if isinstance(markup, str): + markup = markup.encode("utf8") + warnings.warn( + '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup) + if markup[:5] == "http:" or markup[:6] == "https:": + # TODO: This is ugly but I couldn't get it to work in + # Python 3 otherwise. + if ((isinstance(markup, bytes) and not b' ' in markup) + or (isinstance(markup, str) and not ' ' in markup)): + if isinstance(markup, str): + markup = markup.encode("utf8") + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) + + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def __copy__(self): + return type(self)(self.encode(), builder=self.builder) + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + del d['builder'] + return d + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. 
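+        # (endData() flushes any buffered text into a NavigableString;
+        # the popTag() loop below unwinds the tag stack back to the root.)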
+ self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.current_data = [] + self.currentTag = None + self.tagStack = [] + self.preserve_whitespace_tag_stack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + """Create a new tag associated with this soup.""" + return Tag(None, self.builder, name, namespace, nsprefix, attrs) + + def new_string(self, s, subclass=NavigableString): + """Create a new NavigableString associated with this soup.""" + return subclass(s) + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + + def endData(self, containerClass=NavigableString): + if self.current_data: + current_data = ''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Add an object to the parse tree.""" + parent = parent or self.currentTag + previous_element = most_recent_element or self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if not previous_element: + previous_element = o.previous_element + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + if parent.next_sibling: + # This node is being inserted into an element that has + # already been parsed. Deal with any dangling references. 
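+            # The parse tree is threaded as two doubly-linked lists (one of
+            # elements in document order, one of siblings), so all four
+            # pointers are recomputed below from o's actual position in
+            # parent.contents.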
+ index = parent.contents.index(o) + if index == 0: + previous_element = parent + previous_sibling = None + else: + previous_element = previous_sibling = parent.contents[index-1] + if index == len(parent.contents)-1: + next_element = parent.next_sibling + next_sibling = None + else: + next_element = next_sibling = parent.contents[index+1] + + o.previous_element = previous_element + if previous_element: + previous_element.next_element = o + o.next_element = next_element + if next_element: + next_element.previous_element = o + o.next_sibling = next_sibling + if next_sibling: + next_sibling.previous_sibling = o + o.previous_sibling = previous_sibling + if previous_sibling: + previous_sibling.next_sibling = o + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instqance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + # The BeautifulSoup object itself can never be popped. + return + + most_recently_popped = None + + stack_size = len(self.tagStack) + for i in range(stack_size - 1, 0, -1): + t = self.tagStack[i] + if (name == t.name and nsprefix == t.prefix): + if inclusivePop: + most_recently_popped = self.popTag() + break + most_recently_popped = self.popTag() + + return most_recently_popped + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occured + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. + """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self._most_recent_element) + if tag is None: + return tag + if self._most_recent_element: + self._most_recent_element.next_element = tag + self._most_recent_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.current_data.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document. + To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = '\n' % encoding_part + else: + prefix = '' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. 
Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print(soup.prettify()) diff --git a/bs4/__pycache__/__init__.cpython-37.pyc b/bs4/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..fd2e368 Binary files /dev/null and b/bs4/__pycache__/__init__.cpython-37.pyc differ diff --git a/bs4/__pycache__/dammit.cpython-37.pyc b/bs4/__pycache__/dammit.cpython-37.pyc new file mode 100644 index 0000000..b3b8557 Binary files /dev/null and b/bs4/__pycache__/dammit.cpython-37.pyc differ diff --git a/bs4/__pycache__/element.cpython-37.pyc b/bs4/__pycache__/element.cpython-37.pyc new file mode 100644 index 0000000..12f9c24 Binary files /dev/null and b/bs4/__pycache__/element.cpython-37.pyc differ diff --git a/bs4/builder/__init__.py b/bs4/builder/__init__.py new file mode 100644 index 0000000..6ccd4d2 --- /dev/null +++ b/bs4/builder/__init__.py @@ -0,0 +1,324 @@ +from collections import defaultdict +import itertools +import sys +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + whitespace_re + ) + +__all__ = [ + 'HTMLTreeBuilder', + 'SAXTreeBuilder', + 'TreeBuilder', + 'TreeBuilderRegistry', + ] + +# Some useful features for a TreeBuilder to have. +FAST = 'fast' +PERMISSIVE = 'permissive' +STRICT = 'strict' +XML = 'xml' +HTML = 'html' +HTML_5 = 'html5' + + +class TreeBuilderRegistry(object): + + def __init__(self): + self.builders_for_feature = defaultdict(list) + self.builders = [] + + def register(self, treebuilder_class): + """Register a treebuilder based on its advertised features.""" + for feature in treebuilder_class.features: + self.builders_for_feature[feature].insert(0, treebuilder_class) + self.builders.insert(0, treebuilder_class) + + def lookup(self, *features): + if len(self.builders) == 0: + # There are no builders at all. + return None + + if len(features) == 0: + # They didn't ask for any features. Give them the most + # recently registered builder. + return self.builders[0] + + # Go down the list of features in order, and eliminate any builders + # that don't match every feature. + features = list(features) + features.reverse() + candidates = None + candidate_set = None + while len(features) > 0: + feature = features.pop() + we_have_the_feature = self.builders_for_feature.get(feature, []) + if len(we_have_the_feature) > 0: + if candidates is None: + candidates = we_have_the_feature + candidate_set = set(candidates) + else: + # Eliminate any candidates that don't have this feature. + candidate_set = candidate_set.intersection( + set(we_have_the_feature)) + + # The only valid candidates are the ones in candidate_set. + # Go through the original list of candidates and pick the first one + # that's in candidate_set. + if candidate_set is None: + return None + for candidate in candidates: + if candidate in candidate_set: + return candidate + return None + +# The BeautifulSoup class will take feature lists from developers and use them +# to look up builders in this registry. 
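+# A minimal sketch of how the registry is consulted (assuming lxml is
+# installed, this returns its HTML tree builder):
+#
+#     builder_class = builder_registry.lookup('html', 'fast')
+#     builder = builder_class()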
+builder_registry = TreeBuilderRegistry() + +class TreeBuilder(object): + """Turn a document into a Beautiful Soup object tree.""" + + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] + features = [] + + is_xml = False + picklable = False + preserve_whitespace_tags = set() + empty_element_tags = None # A tag will be considered an empty-element + # tag when and only when it has no contents. + + # A value for these tag/attribute combinations is a space- or + # comma-separated list of CDATA, rather than a single CDATA. + cdata_list_attributes = {} + + + def __init__(self): + self.soup = None + + def reset(self): + pass + + def can_be_empty_element(self, tag_name): + """Might a tag with this name be an empty-element tag? + + The final markup may or may not actually present this tag as + self-closing. + + For instance: an HTMLBuilder does not consider a
<p> tag to be + an empty-element tag (it's not in + HTMLBuilder.empty_element_tags). This means an empty <p> tag + will be presented as "<p></p>", not "<p />
". + + The default implementation has no opinion about which tags are + empty-element tags, so a tag will be presented as an + empty-element tag if and only if it has no contents. + "" will become "", and "bar" will + be left alone. + """ + if self.empty_element_tags is None: + return True + return tag_name in self.empty_element_tags + + def feed(self, markup): + raise NotImplementedError() + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None): + return markup, None, None, False + + def test_fragment_to_document(self, fragment): + """Wrap an HTML fragment to make it look like a document. + + Different parsers do this differently. For instance, lxml + introduces an empty tag, and html5lib + doesn't. Abstracting this away lets us write simple tests + which run HTML fragments through the parser and compare the + results against other HTML fragments. + + This method should not be used outside of tests. + """ + return fragment + + def set_up_substitutions(self, tag): + return False + + def _replace_cdata_list_attribute_values(self, tag_name, attrs): + """Replaces class="foo bar" with class=["foo", "bar"] + + Modifies its input in place. + """ + if not attrs: + return attrs + if self.cdata_list_attributes: + universal = self.cdata_list_attributes.get('*', []) + tag_specific = self.cdata_list_attributes.get( + tag_name.lower(), None) + for attr in list(attrs.keys()): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] + if isinstance(value, str): + values = whitespace_re.split(value) + else: + # html5lib sometimes calls setAttributes twice + # for the same tag when rearranging the parse + # tree. On the second call the attribute value + # here is already a list. If this happens, + # leave the value alone rather than trying to + # split it again. + values = value + attrs[attr] = values + return attrs + +class SAXTreeBuilder(TreeBuilder): + """A Beautiful Soup treebuilder that listens for SAX events.""" + + def feed(self, markup): + raise NotImplementedError() + + def close(self): + pass + + def startElement(self, name, attrs): + attrs = dict((key[1], value) for key, value in list(attrs.items())) + #print "Start %s, %r" % (name, attrs) + self.soup.handle_starttag(name, attrs) + + def endElement(self, name): + #print "End %s" % name + self.soup.handle_endtag(name) + + def startElementNS(self, nsTuple, nodeName, attrs): + # Throw away (ns, nodeName) for now. + self.startElement(nodeName, attrs) + + def endElementNS(self, nsTuple, nodeName): + # Throw away (ns, nodeName) for now. + self.endElement(nodeName) + #handler.endElementNS((ns, node.nodeName), node.nodeName) + + def startPrefixMapping(self, prefix, nodeValue): + # Ignore the prefix for now. + pass + + def endPrefixMapping(self, prefix): + # Ignore the prefix for now. + # handler.endPrefixMapping(prefix) + pass + + def characters(self, content): + self.soup.handle_data(content) + + def startDocument(self): + pass + + def endDocument(self): + pass + + +class HTMLTreeBuilder(TreeBuilder): + """This TreeBuilder knows facts about HTML. + + Such as which tags are empty-element tags. 
+ """ + + preserve_whitespace_tags = set(['pre', 'textarea']) + empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', + 'spacer', 'link', 'frame', 'base']) + + # The HTML standard defines these attributes as containing a + # space-separated list of values, not a single value. That is, + # class="foo bar" means that the 'class' attribute has two values, + # 'foo' and 'bar', not the single value 'foo bar'. When we + # encounter one of these attributes, we will parse its value into + # a list of values if possible. Upon output, the list will be + # converted back into a string. + cdata_list_attributes = { + "*" : ['class', 'accesskey', 'dropzone'], + "a" : ['rel', 'rev'], + "link" : ['rel', 'rev'], + "td" : ["headers"], + "th" : ["headers"], + "td" : ["headers"], + "form" : ["accept-charset"], + "object" : ["archive"], + + # These are HTML5 specific, as are *.accesskey and *.dropzone above. + "area" : ["rel"], + "icon" : ["sizes"], + "iframe" : ["sandbox"], + "output" : ["for"], + } + + def set_up_substitutions(self, tag): + # We are only interested in tags + if tag.name != 'meta': + return False + + http_equiv = tag.get('http-equiv') + content = tag.get('content') + charset = tag.get('charset') + + # We are interested in tags that say what encoding the + # document was originally in. This means HTML 5-style + # tags that provide the "charset" attribute. It also means + # HTML 4-style tags that provide the "content" + # attribute and have "http-equiv" set to "content-type". + # + # In both cases we will replace the value of the appropriate + # attribute with a standin object that can take on any + # encoding. + meta_encoding = None + if charset is not None: + # HTML 5 style: + # + meta_encoding = charset + tag['charset'] = CharsetMetaAttributeValue(charset) + + elif (content is not None and http_equiv is not None + and http_equiv.lower() == 'content-type'): + # HTML 4 style: + # + tag['content'] = ContentMetaAttributeValue(content) + + return (meta_encoding is not None) + +def register_treebuilders_from(module): + """Copy TreeBuilders from the given module into this module.""" + # I'm fairly sure this is not the best way to do this. + this_module = sys.modules['bs4.builder'] + for name in module.__all__: + obj = getattr(module, name) + + if issubclass(obj, TreeBuilder): + setattr(this_module, name, obj) + this_module.__all__.append(name) + # Register the builder while we're at it. + this_module.builder_registry.register(obj) + +class ParserRejectedMarkup(Exception): + pass + +# Builders are registered in reverse order of priority, so that custom +# builder registrations will take precedence. In general, we want lxml +# to take precedence over html5lib, because it's faster. And we only +# want to use HTMLParser as a last result. +from . import _htmlparser +register_treebuilders_from(_htmlparser) +try: + from . import _html5lib + register_treebuilders_from(_html5lib) +except ImportError: + # They don't have html5lib installed. + pass +try: + from . import _lxml + register_treebuilders_from(_lxml) +except ImportError: + # They don't have lxml installed. 
+ pass diff --git a/bs4/builder/__pycache__/__init__.cpython-37.pyc b/bs4/builder/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000..ed53c46 Binary files /dev/null and b/bs4/builder/__pycache__/__init__.cpython-37.pyc differ diff --git a/bs4/builder/__pycache__/_html5lib.cpython-37.pyc b/bs4/builder/__pycache__/_html5lib.cpython-37.pyc new file mode 100644 index 0000000..b670adc Binary files /dev/null and b/bs4/builder/__pycache__/_html5lib.cpython-37.pyc differ diff --git a/bs4/builder/__pycache__/_htmlparser.cpython-37.pyc b/bs4/builder/__pycache__/_htmlparser.cpython-37.pyc new file mode 100644 index 0000000..f613f56 Binary files /dev/null and b/bs4/builder/__pycache__/_htmlparser.cpython-37.pyc differ diff --git a/bs4/builder/__pycache__/_lxml.cpython-37.pyc b/bs4/builder/__pycache__/_lxml.cpython-37.pyc new file mode 100644 index 0000000..538712c Binary files /dev/null and b/bs4/builder/__pycache__/_lxml.cpython-37.pyc differ diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py new file mode 100644 index 0000000..f0e5924 --- /dev/null +++ b/bs4/builder/_html5lib.py @@ -0,0 +1,332 @@ +__all__ = [ + 'HTML5TreeBuilder', + ] + +from pdb import set_trace +import warnings +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) +import html5lib +from html5lib.constants import namespaces +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + doc = parser.parse(markup, encoding=self.user_specified_encoding) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, str): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. 
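+            # Unicode input never went through a decode step, so there is
+            # no original encoding to report.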
+ doc.original_encoding = None + else: + doc.original_encoding = parser.tokenizer.stream.charEncoding[0] + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '%s' % fragment + + +class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = whitespace_re.split(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(html5lib.treebuilders._base.Node): + def __init__(self, element, soup, namespace): + html5lib.treebuilders._base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, str): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. + child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + else: + child = node.element + + if not isinstance(child, str) and child.parent is not None: + node.element.extract() + + if (string_child and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. 
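+            # Coalesce the two strings into a single NavigableString, as if
+            # the parser had emitted one text node.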
+ # TODO This has O(n^2) performance, for input like + # "aaa..." + old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, str): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in list(attributes.items()): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. + self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + if insertBefore: + text = TextNode(self.soup.new_string(data), self.soup) + self.insertBefore(data, insertBefore) + else: + self.appendChild(data) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. 
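+            # The moved children will be its first children, so their
+            # predecessor in document order is the new parent itself.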
+ new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + append_after = new_parent_element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child: + new_parents_last_child.next_sibling = first_child + + # Fix the last child's next_element and next_sibling + last_child = to_append[-1] + last_child.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element: + new_parents_last_descendant_next_element.previous_element = last_child + last_child.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. + element.contents = [] + element.next_element = final_next_element + + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + html5lib.treebuilders._base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/bs4/builder/_htmlparser.py b/bs4/builder/_htmlparser.py new file mode 100644 index 0000000..bb0a63f --- /dev/null +++ b/bs4/builder/_htmlparser.py @@ -0,0 +1,262 @@ +"""Use the HTMLParser library to parse HTML files that aren't too bad.""" + +__all__ = [ + 'HTMLParserTreeBuilder', + ] + +from html.parser import HTMLParser + +try: + from html.parser import HTMLParseError +except ImportError as e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + +import sys +import warnings + +# Starting in Python 3.2, the HTMLParser constructor takes a 'strict' +# argument, which we'd like to set to False. Unfortunately, +# http://bugs.python.org/issue13273 makes strict=True a better bet +# before Python 3.2.3. +# +# At the end of this file, we monkeypatch HTMLParser so that +# strict=True works well on Python 3.2.2. 
+major, minor, release = sys.version_info[:3] +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + + +from bs4.element import ( + CData, + Comment, + Declaration, + Doctype, + ProcessingInstruction, + ) +from bs4.dammit import EntitySubstitution, UnicodeDammit + +from bs4.builder import ( + HTML, + HTMLTreeBuilder, + STRICT, + ) + + +HTMLPARSER = 'html.parser' + +class BeautifulSoupHTMLParser(HTMLParser): + def handle_starttag(self, name, attrs): + # XXX namespace + attr_dict = {} + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. + if value is None: + value = '' + attr_dict[key] = value + attrvalue = '""' + self.soup.handle_starttag(name, None, None, attr_dict) + + def handle_endtag(self, name): + self.soup.handle_endtag(name) + + def handle_data(self, data): + self.soup.handle_data(data) + + def handle_charref(self, name): + # XXX workaround for a bug in HTMLParser. Remove this once + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 + if name.startswith('x'): + real_name = int(name.lstrip('x'), 16) + elif name.startswith('X'): + real_name = int(name.lstrip('X'), 16) + else: + real_name = int(name) + + try: + data = chr(real_name) + except (ValueError, OverflowError) as e: + data = "\N{REPLACEMENT CHARACTER}" + + self.handle_data(data) + + def handle_entityref(self, name): + character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name) + if character is not None: + data = character + else: + data = "&%s;" % name + self.handle_data(data) + + def handle_comment(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(Comment) + + def handle_decl(self, data): + self.soup.endData() + if data.startswith("DOCTYPE "): + data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. "" + data = '' + self.soup.handle_data(data) + self.soup.endData(Doctype) + + def unknown_decl(self, data): + if data.upper().startswith('CDATA['): + cls = CData + data = data[len('CDATA['):] + else: + cls = Declaration + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(cls) + + def handle_pi(self, data): + self.soup.endData() + self.soup.handle_data(data) + self.soup.endData(ProcessingInstruction) + + +class HTMLParserTreeBuilder(HTMLTreeBuilder): + + is_xml = False + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] + + def __init__(self, *args, **kwargs): + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: + kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + kwargs['convert_charrefs'] = False + self.parser_args = (args, kwargs) + + def prepare_markup(self, markup, user_specified_encoding=None, + document_declared_encoding=None, exclude_encodings=None): + """ + :return: A 4-tuple (markup, original encoding, encoding + declared within markup, whether any characters had to be + replaced with REPLACEMENT CHARACTER). 
+ """ + if isinstance(markup, str): + yield (markup, None, None, False) + return + + try_encodings = [user_specified_encoding, document_declared_encoding] + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) + + def feed(self, markup): + args, kwargs = self.parser_args + parser = BeautifulSoupHTMLParser(*args, **kwargs) + parser.soup = self.soup + try: + parser.feed(markup) + except HTMLParseError as e: + warnings.warn(RuntimeWarning( + "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) + raise e + +# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some +# 3.2.3 code. This ensures they don't treat markup like
<p>
as a +# string. +# +# XXX This code can be removed once most Python 3 users are on 3.2.3. +if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: + import re + attrfind_tolerant = re.compile( + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant + + locatestarttagend = re.compile(r""" + <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name + (?:\s+ # whitespace before attribute name + (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name + (?:\s*=\s* # value indicator + (?:'[^']*' # LITA-enclosed value + |\"[^\"]*\" # LIT-enclosed value + |[^'\">\s]+ # bare value + ) + )? + ) + )* + \s* # trailing whitespace +""", re.VERBOSE) + BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend + + from html.parser import tagfind, attrfind + + def parse_starttag(self, i): + self.__starttag_text = None + endpos = self.check_for_whole_start_tag(i) + if endpos < 0: + return endpos + rawdata = self.rawdata + self.__starttag_text = rawdata[i:endpos] + + # Now parse the data between i+1 and j into a tag and attrs + attrs = [] + match = tagfind.match(rawdata, i+1) + assert match, 'unexpected call to parse_starttag()' + k = match.end() + self.lasttag = tag = rawdata[i+1:k].lower() + while k < endpos: + if self.strict: + m = attrfind.match(rawdata, k) + else: + m = attrfind_tolerant.match(rawdata, k) + if not m: + break + attrname, rest, attrvalue = m.group(1, 2, 3) + if not rest: + attrvalue = None + elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ + attrvalue[:1] == '"' == attrvalue[-1:]: + attrvalue = attrvalue[1:-1] + if attrvalue: + attrvalue = self.unescape(attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = m.end() + + end = rawdata[k:endpos].strip() + if end not in (">", "/>"): + lineno, offset = self.getpos() + if "\n" in self.__starttag_text: + lineno = lineno + self.__starttag_text.count("\n") + offset = len(self.__starttag_text) \ + - self.__starttag_text.rfind("\n") + else: + offset = offset + len(self.__starttag_text) + if self.strict: + self.error("junk characters in start tag: %r" + % (rawdata[k:endpos][:20],)) + self.handle_data(rawdata[i:endpos]) + return endpos + if end.endswith('/>'): + # XHTML-style empty tag: + self.handle_startendtag(tag, attrs) + else: + self.handle_starttag(tag, attrs) + if tag in self.CDATA_CONTENT_ELEMENTS: + self.set_cdata_mode(tag) + return endpos + + def set_cdata_mode(self, elem): + self.cdata_elem = elem.lower() + self.interesting = re.compile(r'' % self.cdata_elem, re.I) + + BeautifulSoupHTMLParser.parse_starttag = parse_starttag + BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode + + CONSTRUCTOR_TAKES_STRICT = True diff --git a/bs4/builder/_lxml.py b/bs4/builder/_lxml.py new file mode 100644 index 0000000..9c6c14e --- /dev/null +++ b/bs4/builder/_lxml.py @@ -0,0 +1,248 @@ +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +from io import BytesIO +from io import StringIO +import collections +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. 
+ features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + + def default_parser(self, encoding): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser + + def __init__(self, parser=None, empty_element_tags=None): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS] + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """ + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. + """ + if isinstance(markup, str): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, str): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, str): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(str(e)) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. 
+ attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in list(nsmap.items()): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. + new_attrs = {} + for attr, value in list(attrs.items()): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(ProcessingInstruction) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." 
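+        # lxml hands over the comment text without the <!-- --> delimiters;
+        # flush pending string data, then wrap the text in a Comment node.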
+ self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: + raise ParserRejectedMarkup(str(e)) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return '%s' % fragment diff --git a/bs4/dammit.py b/bs4/dammit.py new file mode 100644 index 0000000..68d419f --- /dev/null +++ b/bs4/dammit.py @@ -0,0 +1,840 @@ +# -*- coding: utf-8 -*- +"""Beautiful Soup bonus library: Unicode, Dammit + +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job. +""" +__license__ = "MIT" + +from pdb import set_trace +import codecs +from html.entities import codepoint2name +import re +import logging +import string + +# Import a library to autodetect character encodings. +chardet_type = None +try: + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] +except ImportError: + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None + +# Available from http://cjkpython.i18n.org/. +try: + import iconv_codec +except ImportError: + pass + +xml_encoding_re = re.compile( + '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I) +html_meta_re = re.compile( + '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) + +class EntitySubstitution(object): + + """Substitute XML or HTML entities for the corresponding characters.""" + + def _populate_class_variables(): + lookup = {} + reverse_lookup = {} + characters_for_re = [] + for codepoint, name in list(codepoint2name.items()): + character = chr(codepoint) + if codepoint != 34: + # There's no point in turning the quotation mark into + # ", unless it happens within an attribute value, which + # is handled elsewhere. + characters_for_re.append(character) + lookup[character] = name + # But we do want to turn " into the quotation mark. 
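+            # i.e. the entity-to-character map is total, while the
+            # character-to-entity map deliberately omits the quotation mark.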
+ reverse_lookup[name] = character + re_definition = "[%s]" % "".join(characters_for_re) + return lookup, reverse_lookup, re.compile(re_definition) + (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, + CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() + + CHARACTER_TO_XML_ENTITY = { + "'": "apos", + '"': "quot", + "&": "amp", + "<": "lt", + ">": "gt", + } + + BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" + ")") + + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + + @classmethod + def _substitute_html_entity(cls, matchobj): + entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) + return "&%s;" % entity + + @classmethod + def _substitute_xml_entity(cls, matchobj): + """Used with a regular expression to substitute the + appropriate XML entity for an XML special character.""" + entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] + return "&%s;" % entity + + @classmethod + def quoted_attribute_value(self, value): + """Make a value into a quoted XML attribute, possibly escaping it. + + Most strings will be quoted using double quotes. + + Bob's Bar -> "Bob's Bar" + + If a string contains double quotes, it will be quoted using + single quotes. + + Welcome to "my bar" -> 'Welcome to "my bar"' + + If a string contains both single and double quotes, the + double quotes will be escaped, and the string will be quoted + using double quotes. + + Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" + """ + quote_with = '"' + if '"' in value: + if "'" in value: + # The string contains both single and double + # quotes. Turn the double quotes into + # entities. We quote the double quotes rather than + # the single quotes because the entity name is + # """ whether this is HTML or XML. If we + # quoted the single quotes, we'd have to decide + # between ' and &squot;. + replace_with = """ + value = value.replace('"', replace_with) + else: + # There are double quotes but no single quotes. + # We can use single quotes to quote the attribute. + quote_with = "'" + return quote_with + value + quote_with + + @classmethod + def substitute_xml(cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + + :param value: A string to be substituted. The less-than sign will + become <, the greater-than sign will become >, and any + ampersands that are not part of an entity defition will + become &. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets, and ampersands that aren't part of + # entities. 
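+        # For example (illustrative):
+        #   substitute_xml_containing_entities('AT&T &amp; <friends>')
+        #   -> 'AT&amp;T &amp; &lt;friends&gt;'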
+ value = cls.BARE_AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_html(cls, s): + """Replace certain Unicode characters with named HTML entities. + + This differs from data.encode(encoding, 'xmlcharrefreplace') + in that the goal is to make the result more readable (to those + with ASCII displays) rather than to recover from + errors. There's absolutely nothing wrong with a UTF-8 string + containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that + character with "é" will make it more readable to some + people. + """ + return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( + cls._substitute_html_entity, s) + + +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a tag (if the bytestring is to be + interpreted as an HTML document.) + + 3. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 4. UTF-8. + + 5. Windows-1252. + """ + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): + self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + if encoding is not None: + encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding + + # Use third-party character set detection to guess at the + # encoding. + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + if self._usable(e, tried): + yield e + + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if isinstance(data, str): + # Unicode data cannot have a byte-order mark. 
+ return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a tag, hopefully near the + beginning of the document. + """ + if search_entire_document: + xml_endpos = html_endpos = len(markup) + else: + xml_endpos = 1024 + html_endpos = max(2048, int(len(markup) * 0.05)) + + declared_encoding = None + declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii', 'replace') + if declared_encoding: + return declared_encoding.lower() + return None + +class UnicodeDammit: + """A class for detecting the encoding of a *ML document and + converting it to a Unicode string. If the source encoding is + windows-1252, can replace MS smart quotes with their HTML or XML + equivalents.""" + + # This dictionary maps commonly seen values for "charset" in HTML + # meta tags to the corresponding Python codec names. It only covers + # values that aren't in Python's aliases and can't be determined + # by the heuristics in find_codec. + CHARSET_ALIASES = {"macintosh": "mac-roman", + "x-sjis": "shift-jis"} + + ENCODINGS_WITH_SMART_QUOTES = [ + "windows-1252", + "iso-8859-1", + "iso-8859-2", + ] + + def __init__(self, markup, override_encodings=[], + smart_quotes_to=None, is_html=False, exclude_encodings=[]): + self.smart_quotes_to = smart_quotes_to + self.tried_encodings = [] + self.contains_replacement_characters = False + self.is_html = is_html + + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) + + # Short-circuit if the data is in Unicode to begin with. + if isinstance(markup, str) or markup == '': + self.markup = markup + self.unicode_markup = str(markup) + self.original_encoding = None + return + + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. + self.markup = self.detector.markup + + u = None + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break + + if not u: + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. + + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") + if u is not None: + logging.warning( + "Some characters could not be decoded, and were " + "replaced with REPLACEMENT CHARACTER.") + self.contains_replacement_characters = True + break + + # If none of that worked, we could at this point force it to + # ASCII, but that would destroy so much data that I think + # giving up is better. 
+ self.unicode_markup = u + if not u: + self.original_encoding = None + + def _sub_ms_char(self, match): + """Changes a MS smart quote character to an XML or HTML + entity, or an ASCII character.""" + orig = match.group(1) + if self.smart_quotes_to == 'ascii': + sub = self.MS_CHARS_TO_ASCII.get(orig).encode() + else: + sub = self.MS_CHARS.get(orig) + if type(sub) == tuple: + if self.smart_quotes_to == 'xml': + sub = '&#x'.encode() + sub[1].encode() + ';'.encode() + else: + sub = '&'.encode() + sub[0].encode() + ';'.encode() + else: + sub = sub.encode() + return sub + + def _convert_from(self, proposed, errors="strict"): + proposed = self.find_codec(proposed) + if not proposed or (proposed, errors) in self.tried_encodings: + return None + self.tried_encodings.append((proposed, errors)) + markup = self.markup + # Convert smart quotes to HTML if coming from an encoding + # that might have them. + if (self.smart_quotes_to is not None + and proposed in self.ENCODINGS_WITH_SMART_QUOTES): + smart_quotes_re = b"([\x80-\x9f])" + smart_quotes_compiled = re.compile(smart_quotes_re) + markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) + + try: + #print "Trying to convert document to %s (errors=%s)" % ( + # proposed, errors) + u = self._to_unicode(markup, proposed, errors) + self.markup = u + self.original_encoding = proposed + except Exception as e: + #print "That didn't work!" + #print e + return None + #print "Correct encoding: %s" % proposed + return self.markup + + def _to_unicode(self, data, encoding, errors="strict"): + '''Given a string and its encoding, decodes the string into Unicode. + %encoding is a string recognized by encodings.aliases''' + return str(data, encoding, errors) + + @property + def declared_html_encoding(self): + if not self.is_html: + return None + return self.detector.declared_encoding + + def find_codec(self, charset): + value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) + or (charset and self._codec(charset.replace("-", ""))) + or (charset and self._codec(charset.replace("-", "_"))) + or (charset and charset.lower()) + or charset + ) + if value: + return value.lower() + return None + + def _codec(self, charset): + if not charset: + return charset + codec = None + try: + codecs.lookup(charset) + codec = charset + except (LookupError, ValueError): + pass + return codec + + + # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. + MS_CHARS = {b'\x80': ('euro', '20AC'), + b'\x81': ' ', + b'\x82': ('sbquo', '201A'), + b'\x83': ('fnof', '192'), + b'\x84': ('bdquo', '201E'), + b'\x85': ('hellip', '2026'), + b'\x86': ('dagger', '2020'), + b'\x87': ('Dagger', '2021'), + b'\x88': ('circ', '2C6'), + b'\x89': ('permil', '2030'), + b'\x8A': ('Scaron', '160'), + b'\x8B': ('lsaquo', '2039'), + b'\x8C': ('OElig', '152'), + b'\x8D': '?', + b'\x8E': ('#x17D', '17D'), + b'\x8F': '?', + b'\x90': '?', + b'\x91': ('lsquo', '2018'), + b'\x92': ('rsquo', '2019'), + b'\x93': ('ldquo', '201C'), + b'\x94': ('rdquo', '201D'), + b'\x95': ('bull', '2022'), + b'\x96': ('ndash', '2013'), + b'\x97': ('mdash', '2014'), + b'\x98': ('tilde', '2DC'), + b'\x99': ('trade', '2122'), + b'\x9a': ('scaron', '161'), + b'\x9b': ('rsaquo', '203A'), + b'\x9c': ('oelig', '153'), + b'\x9d': '?', + b'\x9e': ('#x17E', '17E'), + b'\x9f': ('Yuml', ''),} + + # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains + # horrors like stripping diacritical marks to turn á into a, but also + # contains non-horrors like turning “ into ". 
+ MS_CHARS_TO_ASCII = { + b'\x80' : 'EUR', + b'\x81' : ' ', + b'\x82' : ',', + b'\x83' : 'f', + b'\x84' : ',,', + b'\x85' : '...', + b'\x86' : '+', + b'\x87' : '++', + b'\x88' : '^', + b'\x89' : '%', + b'\x8a' : 'S', + b'\x8b' : '<', + b'\x8c' : 'OE', + b'\x8d' : '?', + b'\x8e' : 'Z', + b'\x8f' : '?', + b'\x90' : '?', + b'\x91' : "'", + b'\x92' : "'", + b'\x93' : '"', + b'\x94' : '"', + b'\x95' : '*', + b'\x96' : '-', + b'\x97' : '--', + b'\x98' : '~', + b'\x99' : '(TM)', + b'\x9a' : 's', + b'\x9b' : '>', + b'\x9c' : 'oe', + b'\x9d' : '?', + b'\x9e' : 'z', + b'\x9f' : 'Y', + b'\xa0' : ' ', + b'\xa1' : '!', + b'\xa2' : 'c', + b'\xa3' : 'GBP', + b'\xa4' : '$', #This approximation is especially parochial--this is the + #generic currency symbol. + b'\xa5' : 'YEN', + b'\xa6' : '|', + b'\xa7' : 'S', + b'\xa8' : '..', + b'\xa9' : '', + b'\xaa' : '(th)', + b'\xab' : '<<', + b'\xac' : '!', + b'\xad' : ' ', + b'\xae' : '(R)', + b'\xaf' : '-', + b'\xb0' : 'o', + b'\xb1' : '+-', + b'\xb2' : '2', + b'\xb3' : '3', + b'\xb4' : ("'", 'acute'), + b'\xb5' : 'u', + b'\xb6' : 'P', + b'\xb7' : '*', + b'\xb8' : ',', + b'\xb9' : '1', + b'\xba' : '(th)', + b'\xbb' : '>>', + b'\xbc' : '1/4', + b'\xbd' : '1/2', + b'\xbe' : '3/4', + b'\xbf' : '?', + b'\xc0' : 'A', + b'\xc1' : 'A', + b'\xc2' : 'A', + b'\xc3' : 'A', + b'\xc4' : 'A', + b'\xc5' : 'A', + b'\xc6' : 'AE', + b'\xc7' : 'C', + b'\xc8' : 'E', + b'\xc9' : 'E', + b'\xca' : 'E', + b'\xcb' : 'E', + b'\xcc' : 'I', + b'\xcd' : 'I', + b'\xce' : 'I', + b'\xcf' : 'I', + b'\xd0' : 'D', + b'\xd1' : 'N', + b'\xd2' : 'O', + b'\xd3' : 'O', + b'\xd4' : 'O', + b'\xd5' : 'O', + b'\xd6' : 'O', + b'\xd7' : '*', + b'\xd8' : 'O', + b'\xd9' : 'U', + b'\xda' : 'U', + b'\xdb' : 'U', + b'\xdc' : 'U', + b'\xdd' : 'Y', + b'\xde' : 'b', + b'\xdf' : 'B', + b'\xe0' : 'a', + b'\xe1' : 'a', + b'\xe2' : 'a', + b'\xe3' : 'a', + b'\xe4' : 'a', + b'\xe5' : 'a', + b'\xe6' : 'ae', + b'\xe7' : 'c', + b'\xe8' : 'e', + b'\xe9' : 'e', + b'\xea' : 'e', + b'\xeb' : 'e', + b'\xec' : 'i', + b'\xed' : 'i', + b'\xee' : 'i', + b'\xef' : 'i', + b'\xf0' : 'o', + b'\xf1' : 'n', + b'\xf2' : 'o', + b'\xf3' : 'o', + b'\xf4' : 'o', + b'\xf5' : 'o', + b'\xf6' : 'o', + b'\xf7' : '/', + b'\xf8' : 'o', + b'\xf9' : 'u', + b'\xfa' : 'u', + b'\xfb' : 'u', + b'\xfc' : 'u', + b'\xfd' : 'y', + b'\xfe' : 'b', + b'\xff' : 'y', + } + + # A map used when removing rogue Windows-1252/ISO-8859-1 + # characters in otherwise UTF-8 documents. + # + # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in + # Windows-1252. 
WINDOWS_1252_TO_UTF8 = {
+        0x80 : b'\xe2\x82\xac', # €
+        0x82 : b'\xe2\x80\x9a', # ‚
+        0x83 : b'\xc6\x92',     # ƒ
+        0x84 : b'\xe2\x80\x9e', # „
+        0x85 : b'\xe2\x80\xa6', # …
+        0x86 : b'\xe2\x80\xa0', # †
+        0x87 : b'\xe2\x80\xa1', # ‡
+        0x88 : b'\xcb\x86',     # ˆ
+        0x89 : b'\xe2\x80\xb0', # ‰
+        0x8a : b'\xc5\xa0',     # Š
+        0x8b : b'\xe2\x80\xb9', # ‹
+        0x8c : b'\xc5\x92',     # Œ
+        0x8e : b'\xc5\xbd',     # Ž
+        0x91 : b'\xe2\x80\x98', # ‘
+        0x92 : b'\xe2\x80\x99', # ’
+        0x93 : b'\xe2\x80\x9c', # “
+        0x94 : b'\xe2\x80\x9d', # ”
+        0x95 : b'\xe2\x80\xa2', # •
+        0x96 : b'\xe2\x80\x93', # –
+        0x97 : b'\xe2\x80\x94', # —
+        0x98 : b'\xcb\x9c',     # ˜
+        0x99 : b'\xe2\x84\xa2', # ™
+        0x9a : b'\xc5\xa1',     # š
+        0x9b : b'\xe2\x80\xba', # ›
+        0x9c : b'\xc5\x93',     # œ
+        0x9e : b'\xc5\xbe',     # ž
+        0x9f : b'\xc5\xb8',     # Ÿ
+        0xa0 : b'\xc2\xa0',     #  
+        0xa1 : b'\xc2\xa1',     # ¡
+        0xa2 : b'\xc2\xa2',     # ¢
+        0xa3 : b'\xc2\xa3',     # £
+        0xa4 : b'\xc2\xa4',     # ¤
+        0xa5 : b'\xc2\xa5',     # ¥
+        0xa6 : b'\xc2\xa6',     # ¦
+        0xa7 : b'\xc2\xa7',     # §
+        0xa8 : b'\xc2\xa8',     # ¨
+        0xa9 : b'\xc2\xa9',     # ©
+        0xaa : b'\xc2\xaa',     # ª
+        0xab : b'\xc2\xab',     # «
+        0xac : b'\xc2\xac',     # ¬
+        0xad : b'\xc2\xad',     # ­
+        0xae : b'\xc2\xae',     # ®
+        0xaf : b'\xc2\xaf',     # ¯
+        0xb0 : b'\xc2\xb0',     # °
+        0xb1 : b'\xc2\xb1',     # ±
+        0xb2 : b'\xc2\xb2',     # ²
+        0xb3 : b'\xc2\xb3',     # ³
+        0xb4 : b'\xc2\xb4',     # ´
+        0xb5 : b'\xc2\xb5',     # µ
+        0xb6 : b'\xc2\xb6',     # ¶
+        0xb7 : b'\xc2\xb7',     # ·
+        0xb8 : b'\xc2\xb8',     # ¸
+        0xb9 : b'\xc2\xb9',     # ¹
+        0xba : b'\xc2\xba',     # º
+        0xbb : b'\xc2\xbb',     # »
+        0xbc : b'\xc2\xbc',     # ¼
+        0xbd : b'\xc2\xbd',     # ½
+        0xbe : b'\xc2\xbe',     # ¾
+        0xbf : b'\xc2\xbf',     # ¿
+        0xc0 : b'\xc3\x80',     # À
+        0xc1 : b'\xc3\x81',     # Á
+        0xc2 : b'\xc3\x82',     # Â
+        0xc3 : b'\xc3\x83',     # Ã
+        0xc4 : b'\xc3\x84',     # Ä
+        0xc5 : b'\xc3\x85',     # Å
+        0xc6 : b'\xc3\x86',     # Æ
+        0xc7 : b'\xc3\x87',     # Ç
+        0xc8 : b'\xc3\x88',     # È
+        0xc9 : b'\xc3\x89',     # É
+        0xca : b'\xc3\x8a',     # Ê
+        0xcb : b'\xc3\x8b',     # Ë
+        0xcc : b'\xc3\x8c',     # Ì
+        0xcd : b'\xc3\x8d',     # Í
+        0xce : b'\xc3\x8e',     # Î
+        0xcf : b'\xc3\x8f',     # Ï
+        0xd0 : b'\xc3\x90',     # Ð
+        0xd1 : b'\xc3\x91',     # Ñ
+        0xd2 : b'\xc3\x92',     # Ò
+        0xd3 : b'\xc3\x93',     # Ó
+        0xd4 : b'\xc3\x94',     # Ô
+        0xd5 : b'\xc3\x95',     # Õ
+        0xd6 : b'\xc3\x96',     # Ö
+        0xd7 : b'\xc3\x97',     # ×
+        0xd8 : b'\xc3\x98',     # Ø
+        0xd9 : b'\xc3\x99',     # Ù
+        0xda : b'\xc3\x9a',     # Ú
+        0xdb : b'\xc3\x9b',     # Û
+        0xdc : b'\xc3\x9c',     # Ü
+        0xdd : b'\xc3\x9d',     # Ý
+        0xde : b'\xc3\x9e',     # Þ
+        0xdf : b'\xc3\x9f',     # ß
+        0xe0 : b'\xc3\xa0',     # à
+        0xe1 : b'\xc3\xa1',     # á (was b'\xa1', which is not valid UTF-8 for á)
+        0xe2 : b'\xc3\xa2',     # â
+        0xe3 : b'\xc3\xa3',     # ã
+        0xe4 : b'\xc3\xa4',     # ä
+        0xe5 : b'\xc3\xa5',     # å
+        0xe6 : b'\xc3\xa6',     # æ
+        0xe7 : b'\xc3\xa7',     # ç
+        0xe8 : b'\xc3\xa8',     # è
+        0xe9 : b'\xc3\xa9',     # é
+        0xea : b'\xc3\xaa',     # ê
+        0xeb : b'\xc3\xab',     # ë
+        0xec : b'\xc3\xac',     # ì
+        0xed : b'\xc3\xad',     # í
+        0xee : b'\xc3\xae',     # î
+        0xef : b'\xc3\xaf',     # ï
+        0xf0 : b'\xc3\xb0',     # ð
+        0xf1 : b'\xc3\xb1',     # ñ
+        0xf2 : b'\xc3\xb2',     # ò
+        0xf3 : b'\xc3\xb3',     # ó
+        0xf4 : b'\xc3\xb4',     # ô
+        0xf5 : b'\xc3\xb5',     # õ
+        0xf6 : b'\xc3\xb6',     # ö
+        0xf7 : b'\xc3\xb7',     # ÷
+        0xf8 : b'\xc3\xb8',     # ø
+        0xf9 : b'\xc3\xb9',     # ù
+        0xfa : b'\xc3\xba',     # ú
+        0xfb : b'\xc3\xbb',     # û
+        0xfc : b'\xc3\xbc',     # ü
+        0xfd : b'\xc3\xbd',     # ý
+        0xfe : b'\xc3\xbe',     # þ
+        }
+
+    MULTIBYTE_MARKERS_AND_SIZES = [
+        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
+        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
+        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
+    ]
+
+    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
+
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] + + @classmethod + def detwingle(cls, in_bytes, main_encoding="utf8", + embedded_encoding="windows-1252"): + """Fix characters from one encoding embedded in some other encoding. + + Currently the only situation supported is Windows-1252 (or its + subset ISO-8859-1), embedded in UTF-8. + + The input must be a bytestring. If you've already converted + the document to Unicode, you're too late. + + The output is a bytestring in which `embedded_encoding` + characters have been converted to their `main_encoding` + equivalents. + """ + if embedded_encoding.replace('_', '-').lower() not in ( + 'windows-1252', 'windows_1252'): + raise NotImplementedError( + "Windows-1252 and ISO-8859-1 are the only currently supported " + "embedded encodings.") + + if main_encoding.lower() not in ('utf8', 'utf-8'): + raise NotImplementedError( + "UTF-8 is the only currently supported main encoding.") + + byte_chunks = [] + + chunk_start = 0 + pos = 0 + while pos < len(in_bytes): + byte = in_bytes[pos] + if not isinstance(byte, int): + # Python 2.x + byte = ord(byte) + if (byte >= cls.FIRST_MULTIBYTE_MARKER + and byte <= cls.LAST_MULTIBYTE_MARKER): + # This is the start of a UTF-8 multibyte character. Skip + # to the end. + for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: + if byte >= start and byte <= end: + pos += size + break + elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: + # We found a Windows-1252 character! + # Save the string up to this point as a chunk. + byte_chunks.append(in_bytes[chunk_start:pos]) + + # Now translate the Windows-1252 character into UTF-8 + # and add it as another, one-byte chunk. + byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) + pos += 1 + chunk_start = pos + else: + # Go on to the next character. + pos += 1 + if chunk_start == 0: + # The string is unchanged. + return in_bytes + else: + # Store the final chunk. + byte_chunks.append(in_bytes[chunk_start:]) + return b''.join(byte_chunks) + diff --git a/bs4/diagnose.py b/bs4/diagnose.py new file mode 100644 index 0000000..083395f --- /dev/null +++ b/bs4/diagnose.py @@ -0,0 +1,216 @@ +"""Diagnostic functions, mainly for use when doing tech support.""" + +__license__ = "MIT" + +import cProfile +from io import StringIO +from html.parser import HTMLParser +import bs4 +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry + +import os +import pstats +import random +import tempfile +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems.""" + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print(( + "I noticed that %s is not installed. Installing it may help." 
% + name)) + + if 'lxml' in basic_parsers: + basic_parsers.append(["lxml", "xml"]) + try: + from lxml import etree + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError as e: + print ( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print("Found html5lib version %s" % html5lib.__version__) + except ImportError as e: + print ( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif os.path.exists(data): + print('"%s" looks like a filename. Reading data from the file.' % data) + data = open(data).read() + elif data.startswith("http:") or data.startswith("https:"): + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") + return + print() + + for parser in basic_parsers: + print("Trying to parse your markup with %s" % parser) + success = False + try: + soup = BeautifulSoup(data, parser) + success = True + except Exception as e: + print("%s could not parse the markup." % parser) + traceback.print_exc() + if success: + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) + + print("-" * 80) + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print(("%s, %4s, %s" % (event, element.tag, element.text))) + +class AnnouncingParser(HTMLParser): + """Announces HTMLParser parse events, without doing anything else.""" + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. + + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. + """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." + s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in range(length)) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. + tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. 
tag_name = random.choice(tag_names)
+            elements.append("</%s>" % tag_name)
+    return "<html>" + "\n".join(elements) + "</html>"
+
+def benchmark_parsers(num_elements=100000):
+    """Very basic head-to-head performance benchmark."""
+    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
+    data = rdoc(num_elements)
+    print("Generated a large invalid HTML document (%d bytes)." % len(data))
+
+    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
+        success = False
+        try:
+            a = time.time()
+            soup = BeautifulSoup(data, parser)
+            b = time.time()
+            success = True
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
+            traceback.print_exc()
+        if success:
+            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
+
+    from lxml import etree
+    a = time.time()
+    etree.HTML(data)
+    b = time.time()
+    print("Raw lxml parsed the markup in %.2fs." % (b-a))
+
+    import html5lib
+    parser = html5lib.HTMLParser()
+    a = time.time()
+    parser.parse(data)
+    b = time.time()
+    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
+
+def profile(num_elements=100000, parser="lxml"):
+
+    filehandle = tempfile.NamedTemporaryFile()
+    filename = filehandle.name
+
+    data = rdoc(num_elements)
+    vars = dict(bs4=bs4, data=data, parser=parser)
+    cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
+
+    stats = pstats.Stats(filename)
+    # stats.strip_dirs()
+    stats.sort_stats("cumulative")
+    stats.print_stats('_html5lib|bs4', 50)
+
+if __name__ == '__main__':
+    diagnose(sys.stdin.read())
diff --git a/bs4/element.py b/bs4/element.py
new file mode 100644
index 0000000..0e62c2e
--- /dev/null
+++ b/bs4/element.py
@@ -0,0 +1,1725 @@
+__license__ = "MIT"
+
+from pdb import set_trace
+import collections
+import re
+import sys
+import warnings
+from bs4.dammit import EntitySubstitution
+
+DEFAULT_OUTPUT_ENCODING = "utf-8"
+PY3K = (sys.version_info[0] > 2)
+
+whitespace_re = re.compile("\s+")
+
+def _alias(attr):
+    """Alias one attribute name to another for backward compatibility"""
+    @property
+    def alias(self):
+        return getattr(self, attr)
+
+    @alias.setter
+    def alias(self, value):
+        return setattr(self, attr, value)
+    return alias
+
+
+class NamespacedAttribute(str):
+
+    def __new__(cls, prefix, name, namespace=None):
+        if name is None:
+            obj = str.__new__(cls, prefix)
+        elif prefix is None:
+            # Not really namespaced.
+            obj = str.__new__(cls, name)
+        else:
+            obj = str.__new__(cls, prefix + ":" + name)
+        obj.prefix = prefix
+        obj.name = name
+        obj.namespace = namespace
+        return obj
+
+class AttributeValueWithCharsetSubstitution(str):
+    """A stand-in object for a character encoding specified in HTML."""
+
+class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'charset' attribute.
+
+    When Beautiful Soup parses the markup '<meta charset="utf8">', the
+    value of the 'charset' attribute will be one of these objects.
+    """
+
+    def __new__(cls, original_value):
+        obj = str.__new__(cls, original_value)
+        obj.original_value = original_value
+        return obj
+
+    def encode(self, encoding):
+        return encoding
+
+
+class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
+    """A generic stand-in for the value of a meta tag's 'content' attribute.
+
+    When Beautiful Soup parses the markup:
+     <meta http-equiv="Content-type" content="text/html; charset=utf8">
+
+    The value of the 'content' attribute will be one of these objects.
+ """ + + CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M) + + def __new__(cls, original_value): + match = cls.CHARSET_RE.search(original_value) + if match is None: + # No substitution necessary. + return str.__new__(str, original_value) + + obj = str.__new__(cls, original_value) + obj.original_value = original_value + return obj + + def encode(self, encoding): + def rewrite(match): + return match.group(1) + encoding + return self.CHARSET_RE.sub(rewrite, self.original_value) + +class HTMLAwareEntitySubstitution(EntitySubstitution): + + """Entity substitution rules that are aware of some HTML quirks. + + Specifically, the contents of + +Hello, world! + + +''' + soup = self.soup(html) + self.assertEqual("text/javascript", soup.find('script')['type']) + + def test_comment(self): + # Comments are represented as Comment objects. + markup = "
<p>foo<!--foobar-->baz</p>"
+        self.assertSoupEquals(markup)
+
+        soup = self.soup(markup)
+        comment = soup.find(text="foobar")
+        self.assertEqual(comment.__class__, Comment)
+
+        # The comment is properly integrated into the tree.
+        foo = soup.find(text="foo")
+        self.assertEqual(comment, foo.next_element)
+        baz = soup.find(text="baz")
+        self.assertEqual(comment, baz.previous_element)
+
+    def test_preserved_whitespace_in_pre_and_textarea(self):
+        """Whitespace must be preserved in <pre> and <textarea> tags."""
+        self.assertSoupEquals("<pre>   </pre>")
+        self.assertSoupEquals("<textarea> woo  </textarea>")
+
+    def test_nested_inline_elements(self):
+        """Inline elements can be nested indefinitely."""
+        b_tag = "<b>Inside a B tag</b>"
+        self.assertSoupEquals(b_tag)
+
+        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
+        self.assertSoupEquals(nested_b_tag)
+
+        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
+        self.assertSoupEquals(double_nested_b_tag)
+
+    def test_nested_block_level_elements(self):
+        """Block elements can be nested."""
+        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
+        blockquote = soup.blockquote
+        self.assertEqual(blockquote.p.b.string, 'Foo')
+        self.assertEqual(blockquote.b.string, 'Foo')
+
+    def test_correctly_nested_tables(self):
+        """One table can go inside another one."""
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
+
+        self.assertSoupEquals(
+            markup,
+            '<table id="1"><tr><td>Here\'s another table:'
+            '<table id="2"><tr><td>foo</td></tr></table>'
+            '</td></tr></table>')
+
+        self.assertSoupEquals(
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_deeply_nested_multivalued_attribute(self):
+        # html5lib can set the attributes of the same tag many times
+        # as it rearranges the tree. This has caused problems with
+        # multivalued attributes.
+        markup = '<table><div><div class="css"></div></div></table>'
+        soup = self.soup(markup)
+        self.assertEqual(["css"], soup.div.div['class'])
+
+    def test_multivalued_attribute_on_html(self):
+        # html5lib uses a different API to set the attributes of the
+        # <html> tag. This has caused problems with multivalued
+        # attributes.
+        markup = '<html class="a b"></html>'
+        soup = self.soup(markup)
+        self.assertEqual(["a", "b"], soup.html['class'])
+
+    def test_angle_brackets_in_attribute_values_are_escaped(self):
+        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
+
+    def test_entities_in_attributes_converted_to_unicode(self):
+        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
+        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
+
+    def test_entities_in_text_converted_to_unicode(self):
+        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
+        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
+
+    def test_quot_entity_converted_to_quotation_mark(self):
+        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
+                              '<p>I said "good day!"</p>')
+
+    def test_out_of_range_entity(self):
+        expect = "\N{REPLACEMENT CHARACTER}"
+        self.assertSoupEquals("&#10000000000000;", expect)
+        self.assertSoupEquals("&#x10000000000000;", expect)
+        self.assertSoupEquals("&#1000000000;", expect)
+
+    def test_multipart_strings(self):
+        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
+        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
+        self.assertEqual("p", soup.h2.string.next_element.name)
+        self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
") + self.assertEqual("p", soup.h2.string.next_element.name) + self.assertEqual("p", soup.p.name) + self.assertConnectedness(soup) + + def test_head_tag_between_head_and_body(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + foo + +""" + soup = self.soup(content) + self.assertNotEqual(None, soup.html.body) + self.assertConnectedness(soup) + + def test_multiple_copies_of_a_tag(self): + "Prevent recurrence of a bug in the html5lib treebuilder." + content = """ + + + + + +""" + soup = self.soup(content) + self.assertConnectedness(soup.article) + + def test_basic_namespaces(self): + """Parsers don't need to *understand* namespaces, but at the + very least they should not choke on namespaces or lose + data.""" + + markup = b'4' + soup = self.soup(markup) + self.assertEqual(markup, soup.encode()) + html = soup.html + self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) + self.assertEqual( + 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) + self.assertEqual( + 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) + + def test_multivalued_attribute_value_becomes_list(self): + markup = b'' + soup = self.soup(markup) + self.assertEqual(['foo', 'bar'], soup.a['class']) + + # + # Generally speaking, tests below this point are more tests of + # Beautiful Soup than tests of the tree builders. But parsers are + # weird, so we run these tests separately for every tree builder + # to detect any differences between them. + # + + def test_can_parse_unicode_document(self): + # A seemingly innocuous document... but it's in Unicode! And + # it contains characters that can't be represented in the + # encoding found in the declaration! The horror! + markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + soup = self.soup(markup) + self.assertEqual('Sacr\xe9 bleu!', soup.body.string) + + def test_soupstrainer(self): + """Parsers should be able to work with SoupStrainers.""" + strainer = SoupStrainer("b") + soup = self.soup("A bold statement", + parse_only=strainer) + self.assertEqual(soup.decode(), "bold") + + def test_single_quote_attribute_values_become_double_quotes(self): + self.assertSoupEquals("", + '') + + def test_attribute_values_with_nested_quotes_are_left_alone(self): + text = """a""" + self.assertSoupEquals(text) + + def test_attribute_values_with_double_nested_quotes_get_quoted(self): + text = """a""" + soup = self.soup(text) + soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' + self.assertSoupEquals( + soup.foo.decode(), + """a""") + + def test_ampersand_in_attribute_value_gets_escaped(self): + self.assertSoupEquals('', + '') + + self.assertSoupEquals( + 'foo', + 'foo') + + def test_escaped_ampersand_in_attribute_value_is_left_alone(self): + self.assertSoupEquals('') + + def test_entities_in_strings_converted_during_parsing(self): + # Both XML and HTML entities are converted to Unicode characters + # during parsing. + text = "
+
+    def test_entities_in_strings_converted_during_parsing(self):
+        # Both XML and HTML entities are converted to Unicode characters
+        # during parsing.
+        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
+        self.assertSoupEquals(text, expected)
+
+    def test_smart_quotes_converted_on_the_way_in(self):
+        # Microsoft smart quotes are converted to Unicode characters during
+        # parsing.
+        quote = b"<p>\x91Foo\x92</p>"
+        soup = self.soup(quote)
+        self.assertEqual(
+            soup.p.string,
+            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+
+    def test_non_breaking_spaces_converted_on_the_way_in(self):
+        soup = self.soup("<a>&nbsp;&nbsp;</a>")
+        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
+
+    def test_entities_converted_on_the_way_out(self):
+        text = "<p>&lt;&lt;sacr&eacute; bleu!&gt;&gt;</p>"
+        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
+        soup = self.soup(text)
+        self.assertEqual(soup.p.encode("utf-8"), expected)
+
+    def test_real_iso_latin_document(self):
+        # Smoke test of interrelated functionality, using an
+        # easy-to-understand document.
+
+        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
+        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
+
+        # That's because we're going to encode it into ISO-Latin-1, and use
+        # that to test.
+        iso_latin_html = unicode_html.encode("iso-8859-1")
+
+        # Parse the ISO-Latin-1 HTML.
+        soup = self.soup(iso_latin_html)
+        # Encode it to UTF-8.
+        result = soup.encode("utf-8")
+
+        # What do we expect the result to look like? Well, it would
+        # look like unicode_html, except that the META tag would say
+        # UTF-8 instead of ISO-Latin-1.
+        expected = unicode_html.replace("ISO-Latin-1", "utf-8")
+
+        # And, of course, it would be in UTF-8, not Unicode.
+        expected = expected.encode("utf-8")
+
+        # Ta-da!
+        self.assertEqual(result, expected)
+
+    def test_real_shift_jis_document(self):
+        # Smoke test to make sure the parser can handle a document in
+        # Shift-JIS encoding, without choking.
+        shift_jis_html = (
+            b'<html><head></head><body><pre>'
+            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
+            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
+            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
+            b'</pre></body></html>')
+        unicode_html = shift_jis_html.decode("shift-jis")
+        soup = self.soup(unicode_html)
+
+        # Make sure the parse tree is correctly encoded to various
+        # encodings.
+        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
+        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
+
+    def test_real_hebrew_document(self):
+        # A real-world test to make sure we can convert ISO-8859-8 (a
+        # Hebrew encoding) to UTF-8.
+        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
+        soup = self.soup(
+            hebrew_document, from_encoding="iso8859-8")
+        self.assertEqual(soup.original_encoding, 'iso8859-8')
+        self.assertEqual(
+            soup.encode('utf-8'),
+            hebrew_document.decode("iso8859-8").encode("utf-8"))
+
+    def test_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is seemingly unaffected.
+        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
+        content = parsed_meta['content']
+        self.assertEqual('text/html; charset=x-sjis', content)
+
+        # But that value is actually a ContentMetaAttributeValue object.
+        self.assertTrue(isinstance(content, ContentMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
+
+        # For the rest of the story, see TestSubstitutions in
+        # test_tree.py.
+
+    def test_html5_style_meta_tag_reflects_current_encoding(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta id="encoding" charset="x-sjis" />')
+
+        # Here's a document incorporating that meta tag.
+        shift_jis_html = (
+            '<html><head>\n%s\n'
+            '<meta http-equiv="Content-language" content="ja"/>'
+            '</head><body>Shift-JIS markup goes here.') % meta_tag
+        soup = self.soup(shift_jis_html)
+
+        # Parse the document, and the charset is seemingly unaffected.
+        parsed_meta = soup.find('meta', id="encoding")
+        charset = parsed_meta['charset']
+        self.assertEqual('x-sjis', charset)
+
+        # But that value is actually a CharsetMetaAttributeValue object.
+        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
+
+        # And it will take on a value that reflects its current
+        # encoding.
+        self.assertEqual('utf8', charset.encode("utf8"))
+
+    def test_tag_with_no_attributes_can_have_attributes_added(self):
+        data = self.soup("<a>text</a>")
+        data.a['foo'] = 'bar'
+        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
+
+class XMLTreeBuilderSmokeTest(object):
+
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</b></a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
+    def test_docstring_generated(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
+
+    def test_xml_declaration(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
+    def test_real_xhtml_document(self):
+        """A real XHTML document should come out *exactly* the same as it went in."""
+        markup = b"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>Hello.</title></head>
+<body>Goodbye.</body>
+</html>"""
+        soup = self.soup(markup)
+        self.assertEqual(
+            soup.encode("utf-8"), markup)
+
+    def test_formatter_processes_script_tag_for_xml_documents(self):
+        doc = """<script type="text/javascript">
+</script>"""
+        soup = BeautifulSoup(doc, "lxml-xml")
+        # lxml would have stripped this while parsing, but we can add
+        # it later.
+        soup.script.string = 'console.log("< < hey > > ");'
+        encoded = soup.encode()
+        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+    def test_can_parse_unicode_document(self):
+        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
+
+    def test_popping_namespaced_tag(self):
+        markup = '<rss xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:creator>b</dc:creator><pubDate>2012-07-02T20:33:42Z</pubDate><dc:date>c</dc:date><dc:rights>d</dc:rights></rss>'
+        soup = self.soup(markup)
+        self.assertEqual(
+            str(soup.rss), markup)
+
+    def test_docstring_includes_correct_encoding(self):
+        soup = self.soup("<root/>")
+        self.assertEqual(
+            soup.encode("latin1"),
+            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
+
+    def test_large_xml_document(self):
+        """A large XML document should come out the same as it went in."""
+        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+                  + b'0' * (2**12)
+                  + b'</root>')
+        soup = self.soup(markup)
+        self.assertEqual(soup.encode("utf-8"), markup)
+
+
+    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
+        self.assertSoupEquals("<p>", "<p/>")
+        self.assertSoupEquals("<p>foo</p>")
+
+    def test_namespaces_are_preserved(self):
+        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
+        soup = self.soup(markup)
+        root = soup.root
+        self.assertEqual("http://example.com/", root['xmlns:a'])
+        self.assertEqual("http://example.net/", root['xmlns:b'])
+
+    def test_closing_namespaced_tag(self):
+        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
+        soup = self.soup(markup)
+        self.assertEqual(str(soup.p), markup)
+
+    def test_namespaced_attributes(self):
+        markup = '<foo xsi:schemaLocation="http://www.example.com"/>'
+        soup = self.soup(markup)
+        self.assertEqual(str(soup.foo), markup)
+
+    def test_namespaced_attributes_xml_namespace(self):
+        markup = '<foo xml:lang="fr">bar</foo>'
+        soup = self.soup(markup)
+        self.assertEqual(str(soup.foo), markup)
+
+class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
+    """Smoke test for a tree builder that supports HTML5."""
+
+    def test_real_xhtml_document(self):
+        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
+        # XHTML documents in any particular way.
+        pass
+
+    def test_html_tags_have_namespace(self):
+        markup = "<a>"
+        soup = self.soup(markup)
+        self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
+
+    def test_svg_tags_have_namespace(self):
+        markup = '<svg><circle/></svg>'
+        soup = self.soup(markup)
+        namespace = "http://www.w3.org/2000/svg"
+        self.assertEqual(namespace, soup.svg.namespace)
+        self.assertEqual(namespace, soup.circle.namespace)
+
+
+    def test_mathml_tags_have_namespace(self):
+        markup = '<math><msqrt>5</msqrt></math>'
+        soup = self.soup(markup)
+        namespace = 'http://www.w3.org/1998/Math/MathML'
+        self.assertEqual(namespace, soup.math.namespace)
+        self.assertEqual(namespace, soup.msqrt.namespace)
+
+    def test_xml_declaration_becomes_comment(self):
+        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
+        soup = self.soup(markup)
+        self.assertTrue(isinstance(soup.contents[0], Comment))
+        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
+        self.assertEqual("html", soup.contents[0].next_element.name)
+
+def skipIf(condition, reason):
+    def nothing(test, *args, **kwargs):
+        return None
+
+    def decorator(test_item):
+        if condition:
+            return nothing
+        else:
+            return test_item
+
+    return decorator
diff --git a/bs4/tests/__init__.py b/bs4/tests/__init__.py
new file mode 100644
index 0000000..142c8cc
--- /dev/null
+++ b/bs4/tests/__init__.py
@@ -0,0 +1 @@
+"The beautifulsoup tests."
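
# Reviewer example (annotation, not part of the diff): the builder registry
# tested in the next file is what drives parser selection above. A minimal
# usage sketch, assuming lxml is installed:
#
#     from bs4 import BeautifulSoup
#     from bs4.builder import builder_registry
#
#     builder_class = builder_registry.lookup('html', 'fast')  # e.g. LXMLTreeBuilder
#     soup = BeautifulSoup("<p>Hello</p>", builder=builder_class())
#     print(soup.p.string)  # Hello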
diff --git a/bs4/tests/test_builder_registry.py b/bs4/tests/test_builder_registry.py new file mode 100644 index 0000000..90cad82 --- /dev/null +++ b/bs4/tests/test_builder_registry.py @@ -0,0 +1,147 @@ +"""Tests of the builder registry.""" + +import unittest +import warnings + +from bs4 import BeautifulSoup +from bs4.builder import ( + builder_registry as registry, + HTMLParserTreeBuilder, + TreeBuilderRegistry, +) + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError: + HTML5LIB_PRESENT = False + +try: + from bs4.builder import ( + LXMLTreeBuilderForXML, + LXMLTreeBuilder, + ) + LXML_PRESENT = True +except ImportError: + LXML_PRESENT = False + + +class BuiltInRegistryTest(unittest.TestCase): + """Test the built-in registry with the default builders registered.""" + + def test_combination(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('fast', 'html'), + LXMLTreeBuilder) + + if LXML_PRESENT: + self.assertEqual(registry.lookup('permissive', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('strict', 'html'), + HTMLParserTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib', 'html'), + HTML5TreeBuilder) + + def test_lookup_by_markup_type(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) + self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) + else: + self.assertEqual(registry.lookup('xml'), None) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) + else: + self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) + + def test_named_library(self): + if LXML_PRESENT: + self.assertEqual(registry.lookup('lxml', 'xml'), + LXMLTreeBuilderForXML) + self.assertEqual(registry.lookup('lxml', 'html'), + LXMLTreeBuilder) + if HTML5LIB_PRESENT: + self.assertEqual(registry.lookup('html5lib'), + HTML5TreeBuilder) + + self.assertEqual(registry.lookup('html.parser'), + HTMLParserTreeBuilder) + + def test_beautifulsoup_constructor_does_lookup(self): + + with warnings.catch_warnings(record=True) as w: + # This will create a warning about not explicitly + # specifying a parser, but we'll ignore it. + + # You can pass in a string. + BeautifulSoup("", features="html") + # Or a list of strings. + BeautifulSoup("", features=["html", "fast"]) + + # You'll get an exception if BS can't find an appropriate + # builder. + self.assertRaises(ValueError, BeautifulSoup, + "", features="no-such-feature") + +class RegistryTest(unittest.TestCase): + """Test the TreeBuilderRegistry class in general.""" + + def setUp(self): + self.registry = TreeBuilderRegistry() + + def builder_for_features(self, *feature_list): + cls = type('Builder_' + '_'.join(feature_list), + (object,), {'features' : feature_list}) + + self.registry.register(cls) + return cls + + def test_register_with_no_features(self): + builder = self.builder_for_features() + + # Since the builder advertises no features, you can't find it + # by looking up features. + self.assertEqual(self.registry.lookup('foo'), None) + + # But you can find it by doing a lookup with no features, if + # this happens to be the only registered builder. 
+ self.assertEqual(self.registry.lookup(), builder) + + def test_register_with_features_makes_lookup_succeed(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('foo'), builder) + self.assertEqual(self.registry.lookup('bar'), builder) + + def test_lookup_fails_when_no_builder_implements_feature(self): + builder = self.builder_for_features('foo', 'bar') + self.assertEqual(self.registry.lookup('baz'), None) + + def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): + builder1 = self.builder_for_features('foo') + builder2 = self.builder_for_features('bar') + self.assertEqual(self.registry.lookup(), builder2) + + def test_lookup_fails_when_no_tree_builders_registered(self): + self.assertEqual(self.registry.lookup(), None) + + def test_lookup_gets_most_recent_builder_supporting_all_features(self): + has_one = self.builder_for_features('foo') + has_the_other = self.builder_for_features('bar') + has_both_early = self.builder_for_features('foo', 'bar', 'baz') + has_both_late = self.builder_for_features('foo', 'bar', 'quux') + lacks_one = self.builder_for_features('bar') + has_the_other = self.builder_for_features('foo') + + # There are two builders featuring 'foo' and 'bar', but + # the one that also features 'quux' was registered later. + self.assertEqual(self.registry.lookup('foo', 'bar'), + has_both_late) + + # There is only one builder featuring 'foo', 'bar', and 'baz'. + self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), + has_both_early) + + def test_lookup_fails_when_cannot_reconcile_requested_features(self): + builder1 = self.builder_for_features('foo', 'bar') + builder2 = self.builder_for_features('foo', 'baz') + self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/bs4/tests/test_docs.py b/bs4/tests/test_docs.py new file mode 100644 index 0000000..5b9f677 --- /dev/null +++ b/bs4/tests/test_docs.py @@ -0,0 +1,36 @@ +"Test harness for doctests." 
+ +# pylint: disable-msg=E0611,W0142 + +__metaclass__ = type +__all__ = [ + 'additional_tests', + ] + +import atexit +import doctest +import os +#from pkg_resources import ( +# resource_filename, resource_exists, resource_listdir, cleanup_resources) +import unittest + +DOCTEST_FLAGS = ( + doctest.ELLIPSIS | + doctest.NORMALIZE_WHITESPACE | + doctest.REPORT_NDIFF) + + +# def additional_tests(): +# "Run the doc tests (README.txt and docs/*, if any exist)" +# doctest_files = [ +# os.path.abspath(resource_filename('bs4', 'README.txt'))] +# if resource_exists('bs4', 'docs'): +# for name in resource_listdir('bs4', 'docs'): +# if name.endswith('.txt'): +# doctest_files.append( +# os.path.abspath( +# resource_filename('bs4', 'docs/%s' % name))) +# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) +# atexit.register(cleanup_resources) +# return unittest.TestSuite(( +# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py new file mode 100644 index 0000000..a7494ca --- /dev/null +++ b/bs4/tests/test_html5lib.py @@ -0,0 +1,98 @@ +"""Tests to ensure that the html5lib tree builder generates good trees.""" + +import warnings + +try: + from bs4.builder import HTML5TreeBuilder + HTML5LIB_PRESENT = True +except ImportError as e: + HTML5LIB_PRESENT = False +from bs4.element import SoupStrainer +from bs4.testing import ( + HTML5TreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not HTML5LIB_PRESENT, + "html5lib seems not to be present, not testing its tree builder.") +class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): + """See ``HTML5TreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return HTML5TreeBuilder() + + def test_soupstrainer(self): + # The html5lib tree builder does not support SoupStrainers. + strainer = SoupStrainer("b") + markup = "
<p>A <b>bold</b> statement.</p>"
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(
+            soup.decode(), self.document_for(markup))
+
+        self.assertTrue(
+            "the html5lib tree builder doesn't support parse_only" in
+            str(w[0].message))
+
+    def test_correctly_nested_tables(self):
+        """html5lib inserts <tbody> tags where other parsers don't."""
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
+
+        self.assertSoupEquals(
+            markup,
+            '<table id="1"><tbody><tr><td>Here\'s another table:'
+            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+            '</td></tr></tbody></table>')
+
+        self.assertSoupEquals(
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_xml_declaration_followed_by_doctype(self):
+        markup = '''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html>
+  <head>
+  </head>
+  <body>
+   <p>foo</p>
+  </body>
+</html>'''
+        soup = self.soup(markup)
+        # Verify that we can reach the <p> tag; this means the tree is connected.
+        self.assertEqual(b"<p>foo</p>", soup.p.encode())
+
+    def test_reparented_markup(self):
+        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
+        soup = self.soup(markup)
+        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+        self.assertEqual(2, len(soup.find_all('p')))
+
+
+    def test_reparented_markup_ends_with_whitespace(self):
+        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
+        soup = self.soup(markup)
+        self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+        self.assertEqual(2, len(soup.find_all('p')))
+
+    def test_processing_instruction(self):
+        """Processing instructions become comments."""
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        assert str(soup).startswith("<!--?PITarget PIContent?-->")
+
+    def test_cloned_multivalue_node(self):
+        markup = b"""<a class="my_class"><p></a>"""
""" + soup = self.soup(markup) + a1, a2 = soup.find_all('a') + self.assertEqual(a1, a2) + assert a1 is not a2 diff --git a/bs4/tests/test_htmlparser.py b/bs4/tests/test_htmlparser.py new file mode 100644 index 0000000..b45e35f --- /dev/null +++ b/bs4/tests/test_htmlparser.py @@ -0,0 +1,32 @@ +"""Tests to ensure that the html.parser tree builder generates good +trees.""" + +from pdb import set_trace +import pickle +from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest +from bs4.builder import HTMLParserTreeBuilder + +class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + + @property + def default_builder(self): + return HTMLParserTreeBuilder() + + def test_namespaced_system_doctype(self): + # html.parser can't handle namespaced doctypes, so skip this one. + pass + + def test_namespaced_public_doctype(self): + # html.parser can't handle namespaced doctypes, so skip this one. + pass + + def test_builder_is_pickled(self): + """Unlike most tree builders, HTMLParserTreeBuilder and will + be restored after pickling. + """ + tree = self.soup("foo") + dumped = pickle.dumps(tree, 2) + loaded = pickle.loads(dumped) + self.assertTrue(isinstance(loaded.builder, type(tree.builder))) + + diff --git a/bs4/tests/test_lxml.py b/bs4/tests/test_lxml.py new file mode 100644 index 0000000..6c2a1d7 --- /dev/null +++ b/bs4/tests/test_lxml.py @@ -0,0 +1,76 @@ +"""Tests to ensure that the lxml tree builder generates good trees.""" + +import re +import warnings + +try: + import lxml.etree + LXML_PRESENT = True + LXML_VERSION = lxml.etree.LXML_VERSION +except ImportError as e: + LXML_PRESENT = False + LXML_VERSION = (0,) + +if LXML_PRESENT: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, + ) +from bs4.element import Comment, Doctype, SoupStrainer +from bs4.testing import skipIf +from bs4.tests import test_htmlparser +from bs4.testing import ( + HTMLTreeBuilderSmokeTest, + XMLTreeBuilderSmokeTest, + SoupTest, + skipIf, +) + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its tree builder.") +class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilder() + + def test_out_of_range_entity(self): + self.assertSoupEquals( + "
<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
+        self.assertSoupEquals(
+            "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
+        self.assertSoupEquals(
+            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>
") + + # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this + # test if an old version of lxml is installed. + + @skipIf( + not LXML_PRESENT or LXML_VERSION < (2,3,5,0), + "Skipping doctype test for old version of lxml to avoid segfault.") + def test_empty_doctype(self): + soup = self.soup("") + doctype = soup.contents[0] + self.assertEqual("", doctype.strip()) + + def test_beautifulstonesoup_is_xml_parser(self): + # Make sure that the deprecated BSS class uses an xml builder + # if one is installed. + with warnings.catch_warnings(record=True) as w: + soup = BeautifulStoneSoup("") + self.assertEqual("", str(soup.b)) + self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) + +@skipIf( + not LXML_PRESENT, + "lxml seems not to be present, not testing its XML tree builder.") +class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest): + """See ``HTMLTreeBuilderSmokeTest``.""" + + @property + def default_builder(self): + return LXMLTreeBuilderForXML() diff --git a/bs4/tests/test_soup.py b/bs4/tests/test_soup.py new file mode 100644 index 0000000..f87949e --- /dev/null +++ b/bs4/tests/test_soup.py @@ -0,0 +1,483 @@ +# -*- coding: utf-8 -*- +"""Tests of Beautiful Soup as a whole.""" + +from pdb import set_trace +import logging +import unittest +import sys +import tempfile + +from bs4 import ( + BeautifulSoup, + BeautifulStoneSoup, +) +from bs4.element import ( + CharsetMetaAttributeValue, + ContentMetaAttributeValue, + SoupStrainer, + NamespacedAttribute, + ) +import bs4.dammit +from bs4.dammit import ( + EntitySubstitution, + UnicodeDammit, + EncodingDetector, +) +from bs4.testing import ( + SoupTest, + skipIf, +) +import warnings + +try: + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML + LXML_PRESENT = True +except ImportError as e: + LXML_PRESENT = False + +PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) +PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) + +class TestConstructor(SoupTest): + + def test_short_unicode_input(self): + data = "
<h1>éé</h1>
" + soup = self.soup(data) + self.assertEqual("éé", soup.h1.string) + + def test_embedded_null(self): + data = "
<h1>foo\0bar</h1>
" + soup = self.soup(data) + self.assertEqual("foo\0bar", soup.h1.string) + + def test_exclude_encodings(self): + utf8_data = "Räksmörgås".encode("utf-8") + soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual("windows-1252", soup.original_encoding) + + +class TestWarnings(SoupTest): + + def _no_parser_specified(self, s, is_there=True): + v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80]) + self.assertTrue(v) + + def test_warning_if_no_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_warning_if_parser_specified_too_vague(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("", "html") + msg = str(w[0].message) + self._assert_no_parser_specified(msg) + + def test_no_warning_if_explicit_parser_specified(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("", "html.parser") + self.assertEqual([], w) + + def test_parseOnlyThese_renamed_to_parse_only(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("", parseOnlyThese=SoupStrainer("b")) + msg = str(w[0].message) + self.assertTrue("parseOnlyThese" in msg) + self.assertTrue("parse_only" in msg) + self.assertEqual(b"", soup.encode()) + + def test_fromEncoding_renamed_to_from_encoding(self): + with warnings.catch_warnings(record=True) as w: + utf8 = b"\xc3\xa9" + soup = self.soup(utf8, fromEncoding="utf8") + msg = str(w[0].message) + self.assertTrue("fromEncoding" in msg) + self.assertTrue("from_encoding" in msg) + self.assertEqual("utf8", soup.original_encoding) + + def test_unrecognized_keyword_argument(self): + self.assertRaises( + TypeError, self.soup, "", no_such_argument=True) + +class TestWarnings(SoupTest): + + def test_disk_file_warning(self): + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + try: + with warnings.catch_warnings(record=True) as w: + soup = self.soup(filename) + msg = str(w[0].message) + self.assertTrue("looks like a filename" in msg) + finally: + filehandle.close() + + # The file no longer exists, so Beautiful Soup will no longer issue the warning. + with warnings.catch_warnings(record=True) as w: + soup = self.soup(filename) + self.assertEqual(0, len(w)) + + def test_url_warning(self): + with warnings.catch_warnings(record=True) as w: + soup = self.soup("http://www.crummy.com/") + msg = str(w[0].message) + self.assertTrue("looks like a URL" in msg) + + with warnings.catch_warnings(record=True) as w: + soup = self.soup("http://www.crummy.com/ is great") + self.assertEqual(0, len(w)) + +class TestSelectiveParsing(SoupTest): + + def test_parse_with_soupstrainer(self): + markup = "NoYesNoYes Yes" + strainer = SoupStrainer("b") + soup = self.soup(markup, parse_only=strainer) + self.assertEqual(soup.encode(), b"YesYes Yes") + + +class TestEntitySubstitution(unittest.TestCase): + """Standalone tests of the EntitySubstitution class.""" + def setUp(self): + self.sub = EntitySubstitution + + def test_simple_html_substitution(self): + # Unicode characters corresponding to named HTML entites + # are substituted, and no others. + s = "foo\u2200\N{SNOWMAN}\u00f5bar" + self.assertEqual(self.sub.substitute_html(s), + "foo∀\N{SNOWMAN}õbar") + + def test_smart_quote_substitution(self): + # MS smart quotes are a common source of frustration, so we + # give them a special test. 
+
+
+class TestEntitySubstitution(unittest.TestCase):
+    """Standalone tests of the EntitySubstitution class."""
+    def setUp(self):
+        self.sub = EntitySubstitution
+
+    def test_simple_html_substitution(self):
+        # Unicode characters corresponding to named HTML entities
+        # are substituted, and no others.
+        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
+        self.assertEqual(self.sub.substitute_html(s),
+                         "foo&forall;\N{SNOWMAN}&otilde;bar")
+
+    def test_smart_quote_substitution(self):
+        # MS smart quotes are a common source of frustration, so we
+        # give them a special test.
+        quotes = b"\x91\x92foo\x93\x94"
+        dammit = UnicodeDammit(quotes)
+        self.assertEqual(self.sub.substitute_html(dammit.markup),
+                         "&lsquo;&rsquo;foo&ldquo;&rdquo;")
+
+    def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
+        s = 'Welcome to "my bar"'
+        self.assertEqual(self.sub.substitute_xml(s, False), s)
+
+    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
+        self.assertEqual(self.sub.substitute_xml("Welcome", True),
+                         '"Welcome"')
+        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
+                         '"Bob\'s Bar"')
+
+    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
+        s = 'Welcome to "my bar"'
+        self.assertEqual(self.sub.substitute_xml(s, True),
+                         "'Welcome to \"my bar\"'")
+
+    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
+        s = 'Welcome to "Bob\'s Bar"'
+        self.assertEqual(
+            self.sub.substitute_xml(s, True),
+            '"Welcome to &quot;Bob\'s Bar&quot;"')
+
+    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
+        quoted = 'Welcome to "Bob\'s Bar"'
+        self.assertEqual(self.sub.substitute_xml(quoted), quoted)
+
+    def test_xml_quoting_handles_angle_brackets(self):
+        self.assertEqual(
+            self.sub.substitute_xml("foo<bar>"),
+            "foo&lt;bar&gt;")
+
+    def test_xml_quoting_handles_ampersands(self):
+        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
+
+    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEqual(
+            self.sub.substitute_xml("&Aacute;T&T"),
+            "&amp;Aacute;T&amp;T")
+
+    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEqual(
+            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
+            "&Aacute;T&amp;T")
+
+    def test_quotes_not_html_substituted(self):
+        """There's no need to do this except inside attribute values."""
+        text = 'Bob\'s "bar"'
+        self.assertEqual(self.sub.substitute_html(text), text)
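The class under test is importable directly; a minimal sketch of the two substitution modes exercised above (illustrative, not part of the patch):

    from bs4.dammit import EntitySubstitution

    # substitute_html() emits named entity references where they exist...
    print(EntitySubstitution.substitute_html("foo\u2200bar"))  # foo&forall;bar
    # ...while substitute_xml() escapes only the XML special characters.
    print(EntitySubstitution.substitute_xml("AT&T <wow>"))     # AT&amp;T &lt;wow&gt;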
+
+
+class TestEncodingConversion(SoupTest):
+    # Test Beautiful Soup's ability to decode and encode from various
+    # encodings.
+
+    def setUp(self):
+        super(TestEncodingConversion, self).setUp()
+        self.unicode_data = '<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
+        self.utf8_data = self.unicode_data.encode("utf-8")
+        # Just so you know what it looks like.
+        self.assertEqual(
+            self.utf8_data,
+            b'<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
+
+    def test_ascii_in_unicode_out(self):
+        # ASCII input is converted to Unicode. The original_encoding
+        # attribute is set to 'utf-8', a superset of ASCII.
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+            def noop(str):
+                return None
+            # Disable chardet, which will realize that the ASCII is ASCII.
+            bs4.dammit.chardet_dammit = noop
+            ascii = b"<foo>a</foo>"
+            soup_from_ascii = self.soup(ascii)
+            unicode_output = soup_from_ascii.decode()
+            self.assertTrue(isinstance(unicode_output, str))
+            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet
+
+    def test_unicode_in_unicode_out(self):
+        # Unicode input is left alone. The original_encoding attribute
+        # is not set.
+        soup_from_unicode = self.soup(self.unicode_data)
+        self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
+        self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
+        self.assertEqual(soup_from_unicode.original_encoding, None)
+
+    def test_utf8_in_unicode_out(self):
+        # UTF-8 input is converted to Unicode. The original_encoding
+        # attribute is set.
+        soup_from_utf8 = self.soup(self.utf8_data)
+        self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
+        self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
+
+    def test_utf8_out(self):
+        # The internal data structures can be encoded as UTF-8.
+        soup_from_unicode = self.soup(self.unicode_data)
+        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
+
+    @skipIf(
+        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
+        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
+    def test_attribute_name_containing_unicode_characters(self):
+        markup = '
' + self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) + +class TestUnicodeDammit(unittest.TestCase): + """Standalone tests of UnicodeDammit.""" + + def test_unicode_input(self): + markup = "I'm already Unicode! \N{SNOWMAN}" + dammit = UnicodeDammit(markup) + self.assertEqual(dammit.unicode_markup, markup) + + def test_smart_quotes_to_unicode(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup) + self.assertEqual( + dammit.unicode_markup, "\u2018\u2019\u201c\u201d") + + def test_smart_quotes_to_xml_entities(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="xml") + self.assertEqual( + dammit.unicode_markup, "‘’“”") + + def test_smart_quotes_to_html_entities(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="html") + self.assertEqual( + dammit.unicode_markup, "‘’“”") + + def test_smart_quotes_to_ascii(self): + markup = b"\x91\x92\x93\x94" + dammit = UnicodeDammit(markup, smart_quotes_to="ascii") + self.assertEqual( + dammit.unicode_markup, """''""""") + + def test_detect_utf8(self): + utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83" + dammit = UnicodeDammit(utf8) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}') + + + def test_convert_hebrew(self): + hebrew = b"\xed\xe5\xec\xf9" + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') + self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9') + + def test_dont_see_smart_quotes_where_there_are_none(self): + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" + dammit = UnicodeDammit(utf_8) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) + + def test_ignore_inappropriate_codecs(self): + utf8_data = "Räksmörgås".encode("utf-8") + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + + def test_ignore_invalid_codecs(self): + utf8_data = "Räksmörgås".encode("utf-8") + for bad_encoding in ['.utf8', '...', 'utF---16.!']: + dammit = UnicodeDammit(utf8_data, [bad_encoding]) + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') + + def test_exclude_encodings(self): + # This is UTF-8. + utf8_data = "Räksmörgås".encode("utf-8") + + # But if we exclude UTF-8 from consideration, the guess is + # Windows-1252. + dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) + self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') + + # And if we exclude that, there is no valid guess at all. + dammit = UnicodeDammit( + utf8_data, exclude_encodings=["utf-8", "windows-1252"]) + self.assertEqual(dammit.original_encoding, None) + + def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self): + detected = EncodingDetector( + b'') + encodings = list(detected.encodings) + assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings + + def test_detect_html5_style_meta_tag(self): + + for data in ( + b'', + b"", + b"", + b""): + dammit = UnicodeDammit(data, is_html=True) + self.assertEqual( + "euc-jp", dammit.original_encoding) + + def test_last_ditch_entity_replacement(self): + # This is a UTF-8 document that contains bytestrings + # completely incompatible with UTF-8 (ie. encoded with some other + # encoding). 
+ # + # Since there is no consistent encoding for the document, + # Unicode, Dammit will eventually encode the document as UTF-8 + # and encode the incompatible characters as REPLACEMENT + # CHARACTER. + # + # If chardet is installed, it will detect that the document + # can be converted into ISO-8859-1 without errors. This happens + # to be the wrong encoding, but it is a consistent encoding, so the + # code we're testing here won't run. + # + # So we temporarily disable chardet if it's present. + doc = b"""\357\273\277 +\330\250\330\252\330\261 +\310\322\321\220\312\321\355\344""" + chardet = bs4.dammit.chardet_dammit + logging.disable(logging.WARNING) + try: + def noop(str): + return None + bs4.dammit.chardet_dammit = noop + dammit = UnicodeDammit(doc) + self.assertEqual(True, dammit.contains_replacement_characters) + self.assertTrue("\ufffd" in dammit.unicode_markup) + + soup = BeautifulSoup(doc, "html.parser") + self.assertTrue(soup.contains_replacement_characters) + finally: + logging.disable(logging.NOTSET) + bs4.dammit.chardet_dammit = chardet + + def test_byte_order_mark_removed(self): + # A document written in UTF-16LE will have its byte order marker stripped. + data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' + dammit = UnicodeDammit(data) + self.assertEqual("áé", dammit.unicode_markup) + self.assertEqual("utf-16le", dammit.original_encoding) + + def test_detwingle(self): + # Here's a UTF8 document. + utf8 = ("\N{SNOWMAN}" * 3).encode("utf8") + + # Here's a Windows-1252 document. + windows_1252 = ( + "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" + "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") + + # Through some unholy alchemy, they've been stuck together. + doc = utf8 + windows_1252 + utf8 + + # The document can't be turned into UTF-8: + self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") + + # Unicode, Dammit thinks the whole document is Windows-1252, + # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" + + # But if we run it through fix_embedded_windows_1252, it's fixed: + + fixed = UnicodeDammit.detwingle(doc) + self.assertEqual( + "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) + + def test_detwingle_ignores_multibyte_characters(self): + # Each of these characters has a UTF-8 representation ending + # in \x93. \x93 is a smart quote if interpreted as + # Windows-1252. But our code knows to skip over multibyte + # UTF-8 characters, so they'll survive the process unscathed. + for tricky_unicode_char in ( + "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' + "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' + "\xf0\x90\x90\x93", # This is a CJK character, not sure which one. + ): + input = tricky_unicode_char.encode("utf8") + self.assertTrue(input.endswith(b'\x93')) + output = UnicodeDammit.detwingle(input) + self.assertEqual(output, input) + +class TestNamedspacedAttribute(SoupTest): + + def test_name_may_be_none(self): + a = NamespacedAttribute("xmlns", None) + self.assertEqual(a, "xmlns") + + def test_attribute_is_equivalent_to_colon_separated_string(self): + a = NamespacedAttribute("a", "b") + self.assertEqual("a:b", a) + + def test_attributes_are_equivalent_if_prefix_and_name_identical(self): + a = NamespacedAttribute("a", "b", "c") + b = NamespacedAttribute("a", "b", "c") + self.assertEqual(a, b) + + # The actual namespace is not considered. + c = NamespacedAttribute("a", "b", None) + self.assertEqual(a, c) + + # But name and prefix are important. 
+        d = NamespacedAttribute("a", "z", "c")
+        self.assertNotEqual(a, d)
+
+        e = NamespacedAttribute("z", "b", "c")
+        self.assertNotEqual(a, e)
+
+
+class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
+
+    def test_charset_meta_attribute_value(self):
+        value = CharsetMetaAttributeValue("euc-jp")
+        self.assertEqual("euc-jp", value)
+        self.assertEqual("euc-jp", value.original_value)
+        self.assertEqual("utf8", value.encode("utf8"))
+
+    def test_content_meta_attribute_value(self):
+        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
+        self.assertEqual("text/html; charset=euc-jp", value)
+        self.assertEqual("text/html; charset=euc-jp", value.original_value)
+        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
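CharsetMetaAttributeValue and ContentMetaAttributeValue are what let a meta tag's declared charset track the output encoding. The observable behavior, sketched as end-user code (illustrative, not part of the patch):

    from bs4 import BeautifulSoup

    meta = '<meta content="text/html; charset=x-sjis" http-equiv="Content-type"/>'
    soup = BeautifulSoup(meta, "html.parser")
    # On encode(), the declared charset is rewritten to the real output encoding,
    # so the serialized bytes say charset=utf-8 rather than charset=x-sjis.
    print(soup.encode("utf-8"))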
diff --git a/bs4/tests/test_tree.py b/bs4/tests/test_tree.py
new file mode 100644
index 0000000..6d3e67f
--- /dev/null
+++ b/bs4/tests/test_tree.py
@@ -0,0 +1,2011 @@
+# -*- coding: utf-8 -*-
+"""Tests for Beautiful Soup's tree traversal methods.
+
+The tree traversal methods are the main advantage of using Beautiful
+Soup over just using a parser.
+
+Different parsers will build different Beautiful Soup trees given the
+same markup, but all Beautiful Soup trees can be traversed with the
+methods tested here.
+"""
+
+from pdb import set_trace
+import copy
+import pickle
+import re
+import warnings
+from bs4 import BeautifulSoup
+from bs4.builder import (
+    builder_registry,
+    HTMLParserTreeBuilder,
+)
+from bs4.element import (
+    PY3K,
+    CData,
+    Comment,
+    Declaration,
+    Doctype,
+    NavigableString,
+    SoupStrainer,
+    Tag,
+)
+from bs4.testing import (
+    SoupTest,
+    skipIf,
+)
+
+XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
+LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
+
+class TreeTest(SoupTest):
+
+    def assertSelects(self, tags, should_match):
+        """Make sure that the given tags have the correct text.
+
+        This is used in tests that define a bunch of tags, each
+        containing a single string, and then select certain strings by
+        some mechanism.
+        """
+        self.assertEqual([tag.string for tag in tags], should_match)
+
+    def assertSelectsIDs(self, tags, should_match):
+        """Make sure that the given tags have the correct IDs.
+
+        This is used in tests that define a bunch of tags, each
+        containing a single string, and then select certain strings by
+        some mechanism.
+        """
+        self.assertEqual([tag['id'] for tag in tags], should_match)
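TestFind below leans on find() being find_all() with limit=1; that equivalence in end-user terms (illustrative sketch, not part of the patch):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a>1</a><b>2</b><a>3</a>", "html.parser")
    # find() returns the first match itself; find_all() returns a ResultSet.
    assert soup.find("a") == soup.find_all("a", limit=1)[0]
    assert soup.find("table") is None  # no match -> None, not an empty list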
+
+
+class TestFind(TreeTest):
+    """Basic tests of the find() method.
+
+    find() just calls find_all() with limit=1, so it's not tested all
+    that thoroughly here.
+    """
+
+    def test_find_tag(self):
+        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
+        self.assertEqual(soup.find("b").string, "2")
+
+    def test_unicode_text_find(self):
+        soup = self.soup('<h1>Räksmörgås</h1>
')
+        self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
+
+    def test_unicode_attribute_find(self):
+        soup = self.soup('<h1 id="Räksmörgås">here it is</h1>
')
+        str(soup)
+        self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
+
+
+    def test_find_everything(self):
+        """Test an optimization that finds all tags."""
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        self.assertEqual(2, len(soup.find_all()))
+
+    def test_find_everything_with_name(self):
+        """Test an optimization that finds all tags with a given name."""
+        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
+        self.assertEqual(2, len(soup.find_all('a')))
+
+class TestFindAll(TreeTest):
+    """Basic tests of the find_all() method."""
+
+    def test_find_all_text_nodes(self):
+        """You can search the tree for text nodes."""
+        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
+        # Exact match.
+        self.assertEqual(soup.find_all(string="bar"), ["bar"])
+        self.assertEqual(soup.find_all(text="bar"), ["bar"])
+        # Match any of a number of strings.
+        self.assertEqual(
+            soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
+        # Match a regular expression.
+        self.assertEqual(soup.find_all(text=re.compile('.*')),
+                         ["Foo", "bar", '\xbb'])
+        # Match anything.
+        self.assertEqual(soup.find_all(text=True),
+                         ["Foo", "bar", '\xbb'])
+
+    def test_find_all_limit(self):
+        """You can limit the number of items returned by find_all."""
+        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
+        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
+        self.assertSelects(soup.find_all('a', limit=1), ["1"])
+        self.assertSelects(
+            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])
+
+        # A limit of 0 means no limit.
+        self.assertSelects(
+            soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])
+
+    def test_calling_a_tag_is_calling_findall(self):
+        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
+        self.assertSelects(soup('a', limit=1), ["1"])
+        self.assertSelects(soup.b(id="foo"), ["3"])
+
+    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
+        soup = self.soup("<a></a>")
+        # Create a self-referential list.
+        l = []
+        l.append(l)
+
+        # Without special code in _normalize_search_value, this would cause infinite
+        # recursion.
+        self.assertEqual([], soup.find_all(l))
+
+    def test_find_all_resultset(self):
+        """All find_all calls return a ResultSet"""
+        soup = self.soup("<a></a>")
+        result = soup.find_all("a")
+        self.assertTrue(hasattr(result, "source"))
+
+        result = soup.find_all(True)
+        self.assertTrue(hasattr(result, "source"))
+
+        result = soup.find_all(text="foo")
+        self.assertTrue(hasattr(result, "source"))
+
+
+class TestFindAllBasicNamespaces(TreeTest):
+
+    def test_find_by_namespaced_name(self):
+        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
+        self.assertEqual("4", soup.find("mathml:msqrt").string)
+        self.assertEqual("a", soup.find(attrs= { "svg:fill" : "red" }).name)
+
+
+class TestFindAllByName(TreeTest):
+    """Test ways of finding tags by tag name."""
+
+    def setUp(self):
+        super(TreeTest, self).setUp()
+        self.tree =  self.soup("""<a>First tag.</a>
+                                  <b>Second tag.</b>
+                                  <c>Third <a>Nested tag.</a> tag.</c>""")
+
+    def test_find_all_by_tag_name(self):
+        # Find all the <a> tags.
+        self.assertSelects(
+            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])
+
+    def test_find_all_by_name_and_text(self):
+        self.assertSelects(
+            self.tree.find_all('a', text='First tag.'), ['First tag.'])
+
+        self.assertSelects(
+            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])
+
+        self.assertSelects(
+            self.tree.find_all('a', text=re.compile("tag")),
+            ['First tag.', 'Nested tag.'])
+
+
+    def test_find_all_on_non_root_element(self):
+        # You can call find_all on any node, not just the root.
+ self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.']) + + def test_calling_element_invokes_find_all(self): + self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_strainer(self): + self.assertSelects( + self.tree.find_all(SoupStrainer('a')), + ['First tag.', 'Nested tag.']) + + def test_find_all_by_tag_names(self): + self.assertSelects( + self.tree.find_all(['a', 'b']), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_dict(self): + self.assertSelects( + self.tree.find_all({'a' : True, 'b' : True}), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_by_tag_re(self): + self.assertSelects( + self.tree.find_all(re.compile('^[ab]$')), + ['First tag.', 'Second tag.', 'Nested tag.']) + + def test_find_all_with_tags_matching_method(self): + # You can define an oracle method that determines whether + # a tag matches the search. + def id_matches_name(tag): + return tag.name == tag.get('id') + + tree = self.soup("""Match 1. + Does not match. + Match 2.""") + + self.assertSelects( + tree.find_all(id_matches_name), ["Match 1.", "Match 2."]) + + +class TestFindAllByAttribute(TreeTest): + + def test_find_all_by_attribute_name(self): + # You can pass in keyword arguments to find_all to search by + # attribute. + tree = self.soup(""" + Matching a. + + Non-matching Matching b.a. + """) + self.assertSelects(tree.find_all(id='first'), + ["Matching a.", "Matching b."]) + + def test_find_all_by_utf8_attribute_value(self): + peace = "םולש".encode("utf8") + data = ''.encode("utf8") + soup = self.soup(data) + self.assertEqual([soup.a], soup.find_all(title=peace)) + self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8"))) + self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"])) + + def test_find_all_by_attribute_dict(self): + # You can pass in a dictionary as the argument 'attrs'. This + # lets you search for attributes like 'name' (a fixed argument + # to find_all) and 'class' (a reserved word in Python.) + tree = self.soup(""" + Name match. + Class match. + Non-match. + A tag called 'name1'. + """) + + # This doesn't do what you want. + self.assertSelects(tree.find_all(name='name1'), + ["A tag called 'name1'."]) + # This does what you want. + self.assertSelects(tree.find_all(attrs={'name' : 'name1'}), + ["Name match."]) + + self.assertSelects(tree.find_all(attrs={'class' : 'class2'}), + ["Class match."]) + + def test_find_all_by_class(self): + tree = self.soup(""" + Class 1. + Class 2. + Class 1. + Class 3 and 4. + """) + + # Passing in the class_ keyword argument will search against + # the 'class' attribute. + self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.']) + self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.']) + + # Passing in a string to 'attrs' will also search the CSS class. 
+ self.assertSelects(tree.find_all('a', '1'), ['Class 1.']) + self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.']) + self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.']) + self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.']) + + def test_find_by_class_when_multiple_classes_present(self): + tree = self.soup("Found it") + + f = tree.find_all("gar", class_=re.compile("o")) + self.assertSelects(f, ["Found it"]) + + f = tree.find_all("gar", class_=re.compile("a")) + self.assertSelects(f, ["Found it"]) + + # Since the class is not the string "foo bar", but the two + # strings "foo" and "bar", this will not find anything. + f = tree.find_all("gar", class_=re.compile("o b")) + self.assertSelects(f, []) + + def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self): + soup = self.soup("Found it") + + self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"]) + + def big_attribute_value(value): + return len(value) > 3 + + self.assertSelects(soup.find_all("a", big_attribute_value), []) + + def small_attribute_value(value): + return len(value) <= 3 + + self.assertSelects( + soup.find_all("a", small_attribute_value), ["Found it"]) + + def test_find_all_with_string_for_attrs_finds_multiple_classes(self): + soup = self.soup('') + a, a2 = soup.find_all("a") + self.assertEqual([a, a2], soup.find_all("a", "foo")) + self.assertEqual([a], soup.find_all("a", "bar")) + + # If you specify the class as a string that contains a + # space, only that specific value will be found. + self.assertEqual([a], soup.find_all("a", class_="foo bar")) + self.assertEqual([a], soup.find_all("a", "foo bar")) + self.assertEqual([], soup.find_all("a", "bar foo")) + + def test_find_all_by_attribute_soupstrainer(self): + tree = self.soup(""" + Match. + Non-match.""") + + strainer = SoupStrainer(attrs={'id' : 'first'}) + self.assertSelects(tree.find_all(strainer), ['Match.']) + + def test_find_all_with_missing_atribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that do not have that attribute set. + tree = self.soup("""ID present. + No ID present. + ID is empty.""") + self.assertSelects(tree.find_all('a', id=None), ["No ID present."]) + + def test_find_all_with_defined_attribute(self): + # You can pass in None as the value of an attribute to find_all. + # This will match tags that have that attribute set to any value. + tree = self.soup("""ID present. + No ID present. + ID is empty.""") + self.assertSelects( + tree.find_all(id=True), ["ID present.", "ID is empty."]) + + def test_find_all_with_numeric_attribute(self): + # If you search for a number, it's treated as a string. + tree = self.soup("""Unquoted attribute. + Quoted attribute.""") + + expected = ["Unquoted attribute.", "Quoted attribute."] + self.assertSelects(tree.find_all(id=1), expected) + self.assertSelects(tree.find_all(id="1"), expected) + + def test_find_all_with_list_attribute_values(self): + # You can pass a list of attribute values instead of just one, + # and you'll get tags that match any of the values. + tree = self.soup("""1 + 2 + 3 + No ID.""") + self.assertSelects(tree.find_all(id=["1", "3", "4"]), + ["1", "3"]) + + def test_find_all_with_regular_expression_attribute_value(self): + # You can pass a regular expression as an attribute value, and + # you'll get tags whose values for that attribute match the + # regular expression. + tree = self.soup("""One a. + Two as. + Mixed as and bs. + One b. 
+ No ID.""") + + self.assertSelects(tree.find_all(id=re.compile("^a+$")), + ["One a.", "Two as."]) + + def test_find_by_name_and_containing_string(self): + soup = self.soup("foobarfoo") + a = soup.a + + self.assertEqual([a], soup.find_all("a", text="foo")) + self.assertEqual([], soup.find_all("a", text="bar")) + self.assertEqual([], soup.find_all("a", text="bar")) + + def test_find_by_name_and_containing_string_when_string_is_buried(self): + soup = self.soup("foofoo") + self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo")) + + def test_find_by_attribute_and_containing_string(self): + soup = self.soup('foofoo') + a = soup.a + + self.assertEqual([a], soup.find_all(id=2, text="foo")) + self.assertEqual([], soup.find_all(id=1, text="bar")) + + + + +class TestIndex(TreeTest): + """Test Tag.index""" + def test_index(self): + tree = self.soup("""
<div>
+                            <a>Identical</a>
+                            <b>Not identical</b>
+                            <a>Identical</a>
+
+                            <c><d>Identical with child</d></c>
+                            <b>Also not identical</b>
+                            <c><d>Identical with child</d></c>
+                            </div>""")
+        div = tree.div
+        for i, element in enumerate(div.contents):
+            self.assertEqual(i, div.index(element))
+        self.assertRaises(ValueError, tree.index, 1)
+
+
+class TestParentOperations(TreeTest):
+    """Test navigation and searching through an element's parents."""
+
+    def setUp(self):
+        super(TestParentOperations, self).setUp()
+        self.tree = self.soup('''<ul id="top">
+                                  <ul id="middle">
+                                   <ul id="bottom">
+                                    <b>Start here</b>
+                                   </ul>
+                                  </ul>
+                                 </ul>
      ''') + self.start = self.tree.b + + + def test_parent(self): + self.assertEqual(self.start.parent['id'], 'bottom') + self.assertEqual(self.start.parent.parent['id'], 'middle') + self.assertEqual(self.start.parent.parent.parent['id'], 'top') + + def test_parent_of_top_tag_is_soup_object(self): + top_tag = self.tree.contents[0] + self.assertEqual(top_tag.parent, self.tree) + + def test_soup_object_has_no_parent(self): + self.assertEqual(None, self.tree.parent) + + def test_find_parents(self): + self.assertSelectsIDs( + self.start.find_parents('ul'), ['bottom', 'middle', 'top']) + self.assertSelectsIDs( + self.start.find_parents('ul', id="middle"), ['middle']) + + def test_find_parent(self): + self.assertEqual(self.start.find_parent('ul')['id'], 'bottom') + self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top') + + def test_parent_of_text_element(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.parent.name, 'b') + + def test_text_element_find_parent(self): + text = self.tree.find(text="Start here") + self.assertEqual(text.find_parent('ul')['id'], 'bottom') + + def test_parent_generator(self): + parents = [parent['id'] for parent in self.start.parents + if parent is not None and 'id' in parent.attrs] + self.assertEqual(parents, ['bottom', 'middle', 'top']) + + +class ProximityTest(TreeTest): + + def setUp(self): + super(TreeTest, self).setUp() + self.tree = self.soup( + 'OneTwoThree') + + +class TestNextOperations(ProximityTest): + + def setUp(self): + super(TestNextOperations, self).setUp() + self.start = self.tree.b + + def test_next(self): + self.assertEqual(self.start.next_element, "One") + self.assertEqual(self.start.next_element.next_element['id'], "2") + + def test_next_of_last_item_is_none(self): + last = self.tree.find(text="Three") + self.assertEqual(last.next_element, None) + + def test_next_of_root_is_none(self): + # The document root is outside the next/previous chain. + self.assertEqual(self.tree.next_element, None) + + def test_find_all_next(self): + self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"]) + self.start.find_all_next(id=3) + self.assertSelects(self.start.find_all_next(id=3), ["Three"]) + + def test_find_next(self): + self.assertEqual(self.start.find_next('b')['id'], '2') + self.assertEqual(self.start.find_next(text="Three"), "Three") + + def test_find_next_for_text_element(self): + text = self.tree.find(text="One") + self.assertEqual(text.find_next("b").string, "Two") + self.assertSelects(text.find_all_next("b"), ["Two", "Three"]) + + def test_next_generator(self): + start = self.tree.find(text="Two") + successors = [node for node in start.next_elements] + # There are two successors: the final tag and its text contents. + tag, contents = successors + self.assertEqual(tag['id'], '3') + self.assertEqual(contents, "Three") + +class TestPreviousOperations(ProximityTest): + + def setUp(self): + super(TestPreviousOperations, self).setUp() + self.end = self.tree.find(text="Three") + + def test_previous(self): + self.assertEqual(self.end.previous_element['id'], "3") + self.assertEqual(self.end.previous_element.previous_element, "Two") + + def test_previous_of_first_item_is_none(self): + first = self.tree.find('html') + self.assertEqual(first.previous_element, None) + + def test_previous_of_root_is_none(self): + # The document root is outside the next/previous chain. + # XXX This is broken! 
+ #self.assertEqual(self.tree.previous_element, None) + pass + + def test_find_all_previous(self): + # The tag containing the "Three" node is the predecessor + # of the "Three" node itself, which is why "Three" shows up + # here. + self.assertSelects( + self.end.find_all_previous('b'), ["Three", "Two", "One"]) + self.assertSelects(self.end.find_all_previous(id=1), ["One"]) + + def test_find_previous(self): + self.assertEqual(self.end.find_previous('b')['id'], '3') + self.assertEqual(self.end.find_previous(text="One"), "One") + + def test_find_previous_for_text_element(self): + text = self.tree.find(text="Three") + self.assertEqual(text.find_previous("b").string, "Three") + self.assertSelects( + text.find_all_previous("b"), ["Three", "Two", "One"]) + + def test_previous_generator(self): + start = self.tree.find(text="One") + predecessors = [node for node in start.previous_elements] + + # There are four predecessors: the tag containing "One" + # the tag, the tag, and the tag. + b, body, head, html = predecessors + self.assertEqual(b['id'], '1') + self.assertEqual(body.name, "body") + self.assertEqual(head.name, "head") + self.assertEqual(html.name, "html") + + +class SiblingTest(TreeTest): + + def setUp(self): + super(SiblingTest, self).setUp() + markup = ''' + + + + + + + + + + + ''' + # All that whitespace looks good but makes the tests more + # difficult. Get rid of it. + markup = re.compile("\n\s*").sub("", markup) + self.tree = self.soup(markup) + + +class TestNextSibling(SiblingTest): + + def setUp(self): + super(TestNextSibling, self).setUp() + self.start = self.tree.find(id="1") + + def test_next_sibling_of_root_is_none(self): + self.assertEqual(self.tree.next_sibling, None) + + def test_next_sibling(self): + self.assertEqual(self.start.next_sibling['id'], '2') + self.assertEqual(self.start.next_sibling.next_sibling['id'], '3') + + # Note the difference between next_sibling and next_element. + self.assertEqual(self.start.next_element['id'], '1.1') + + def test_next_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.next_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.next_sibling, None) + + last_span = self.tree.find(id="4") + self.assertEqual(last_span.next_sibling, None) + + def test_find_next_sibling(self): + self.assertEqual(self.start.find_next_sibling('span')['id'], '2') + + def test_next_siblings(self): + self.assertSelectsIDs(self.start.find_next_siblings("span"), + ['2', '3', '4']) + + self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3']) + + def test_next_sibling_for_text_element(self): + soup = self.soup("Foobarbaz") + start = soup.find(text="Foo") + self.assertEqual(start.next_sibling.name, 'b') + self.assertEqual(start.next_sibling.next_sibling, 'baz') + + self.assertSelects(start.find_next_siblings('b'), ['bar']) + self.assertEqual(start.find_next_sibling(text="baz"), "baz") + self.assertEqual(start.find_next_sibling(text="nonesuch"), None) + + +class TestPreviousSibling(SiblingTest): + + def setUp(self): + super(TestPreviousSibling, self).setUp() + self.end = self.tree.find(id="4") + + def test_previous_sibling_of_root_is_none(self): + self.assertEqual(self.tree.previous_sibling, None) + + def test_previous_sibling(self): + self.assertEqual(self.end.previous_sibling['id'], '3') + self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2') + + # Note the difference between previous_sibling and previous_element. 
+ self.assertEqual(self.end.previous_element['id'], '3.1') + + def test_previous_sibling_may_not_exist(self): + self.assertEqual(self.tree.html.previous_sibling, None) + + nested_span = self.tree.find(id="1.1") + self.assertEqual(nested_span.previous_sibling, None) + + first_span = self.tree.find(id="1") + self.assertEqual(first_span.previous_sibling, None) + + def test_find_previous_sibling(self): + self.assertEqual(self.end.find_previous_sibling('span')['id'], '3') + + def test_previous_siblings(self): + self.assertSelectsIDs(self.end.find_previous_siblings("span"), + ['3', '2', '1']) + + self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1']) + + def test_previous_sibling_for_text_element(self): + soup = self.soup("Foobarbaz") + start = soup.find(text="baz") + self.assertEqual(start.previous_sibling.name, 'b') + self.assertEqual(start.previous_sibling.previous_sibling, 'Foo') + + self.assertSelects(start.find_previous_siblings('b'), ['bar']) + self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo") + self.assertEqual(start.find_previous_sibling(text="nonesuch"), None) + + +class TestTagCreation(SoupTest): + """Test the ability to create new tags.""" + def test_new_tag(self): + soup = self.soup("") + new_tag = soup.new_tag("foo", bar="baz") + self.assertTrue(isinstance(new_tag, Tag)) + self.assertEqual("foo", new_tag.name) + self.assertEqual(dict(bar="baz"), new_tag.attrs) + self.assertEqual(None, new_tag.parent) + + def test_tag_inherits_self_closing_rules_from_builder(self): + if XML_BUILDER_PRESENT: + xml_soup = BeautifulSoup("", "lxml-xml") + xml_br = xml_soup.new_tag("br") + xml_p = xml_soup.new_tag("p") + + # Both the
<br> and <p> tags are empty-element, just because
+            # they have no contents.
+            self.assertEqual(b"<br/>", xml_br.encode())
+            self.assertEqual(b"<p/>", xml_p.encode())
+
+        html_soup = BeautifulSoup("", "html.parser")
+        html_br = html_soup.new_tag("br")
+        html_p = html_soup.new_tag("p")
+
+        # The HTML builder uses HTML's rules about which tags are
+        # empty-element tags, and the new tags reflect these rules.
+        self.assertEqual(b"<br/>", html_br.encode())
+        self.assertEqual(b"<p></p>", html_p.encode())
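The empty-element distinction tested above is easy to see interactively (illustrative sketch, not part of the patch):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("", "html.parser")
    # The HTML builder knows <br> can't have contents but <p> can.
    print(soup.new_tag("br"))  # <br/>
    print(soup.new_tag("p"))   # <p></p>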
      ", html_p.encode()) + + def test_new_string_creates_navigablestring(self): + soup = self.soup("") + s = soup.new_string("foo") + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, NavigableString)) + + def test_new_string_can_create_navigablestring_subclass(self): + soup = self.soup("") + s = soup.new_string("foo", Comment) + self.assertEqual("foo", s) + self.assertTrue(isinstance(s, Comment)) + +class TestTreeModification(SoupTest): + + def test_attribute_modification(self): + soup = self.soup('') + soup.a['id'] = 2 + self.assertEqual(soup.decode(), self.document_for('')) + del(soup.a['id']) + self.assertEqual(soup.decode(), self.document_for('')) + soup.a['id2'] = 'foo' + self.assertEqual(soup.decode(), self.document_for('')) + + def test_new_tag_creation(self): + builder = builder_registry.lookup('html')() + soup = self.soup("", builder=builder) + a = Tag(soup, builder, 'a') + ol = Tag(soup, builder, 'ol') + a['href'] = 'http://foo.com/' + soup.body.insert(0, a) + soup.body.insert(1, ol) + self.assertEqual( + soup.body.encode(), + b'
')
+
+    def test_append_to_contents_moves_tag(self):
+        doc = """<p id="1">Don't leave me <b>here</b>.</p>
+                <p id="2">Don\'t leave!</p>"""
+        soup = self.soup(doc)
+        second_para = soup.find(id='2')
+        bold = soup.b
+
+        # Move the <b> tag to the end of the second paragraph.
+        soup.find(id='2').append(soup.b)
+
+        # The <b> tag is now a child of the second paragraph.
+        self.assertEqual(bold.parent, second_para)
+
+        self.assertEqual(
+            soup.decode(), self.document_for(
+                '<p id="1">Don\'t leave me .</p>\n'
+                '<p id="2">Don\'t leave!<b>here</b></p>
        ')) + + def test_replace_with_returns_thing_that_was_replaced(self): + text = "" + soup = self.soup(text) + a = soup.a + new_a = a.replace_with(soup.c) + self.assertEqual(a, new_a) + + def test_unwrap_returns_thing_that_was_replaced(self): + text = "" + soup = self.soup(text) + a = soup.a + new_a = a.unwrap() + self.assertEqual(a, new_a) + + def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self): + soup = self.soup("FooBar") + a = soup.a + a.extract() + self.assertEqual(None, a.parent) + self.assertRaises(ValueError, a.unwrap) + self.assertRaises(ValueError, a.replace_with, soup.c) + + def test_replace_tag_with_itself(self): + text = "Foo" + soup = self.soup(text) + c = soup.c + soup.c.replace_with(c) + self.assertEqual(soup.decode(), self.document_for(text)) + + def test_replace_tag_with_its_parent_raises_exception(self): + text = "" + soup = self.soup(text) + self.assertRaises(ValueError, soup.b.replace_with, soup.a) + + def test_insert_tag_into_itself_raises_exception(self): + text = "" + soup = self.soup(text) + self.assertRaises(ValueError, soup.a.insert, 0, soup.a) + + def test_replace_with_maintains_next_element_throughout(self): + soup = self.soup('
<p><a>one</a><b>three</b></p>
        ') + a = soup.a + b = a.contents[0] + # Make it so the tag has two text children. + a.insert(1, "two") + + # Now replace each one with the empty string. + left, right = a.contents + left.replaceWith('') + right.replaceWith('') + + # The tag is still connected to the tree. + self.assertEqual("three", soup.b.string) + + def test_replace_final_node(self): + soup = self.soup("Argh!") + soup.find(text="Argh!").replace_with("Hooray!") + new_text = soup.find(text="Hooray!") + b = soup.b + self.assertEqual(new_text.previous_element, b) + self.assertEqual(new_text.parent, b) + self.assertEqual(new_text.previous_element.next_element, new_text) + self.assertEqual(new_text.next_element, None) + + def test_consecutive_text_nodes(self): + # A builder should never create two consecutive text nodes, + # but if you insert one next to another, Beautiful Soup will + # handle it correctly. + soup = self.soup("Argh!") + soup.b.insert(1, "Hooray!") + + self.assertEqual( + soup.decode(), self.document_for( + "Argh!Hooray!")) + + new_text = soup.find(text="Hooray!") + self.assertEqual(new_text.previous_element, "Argh!") + self.assertEqual(new_text.previous_element.next_element, new_text) + + self.assertEqual(new_text.previous_sibling, "Argh!") + self.assertEqual(new_text.previous_sibling.next_sibling, new_text) + + self.assertEqual(new_text.next_sibling, None) + self.assertEqual(new_text.next_element, soup.c) + + def test_insert_string(self): + soup = self.soup("") + soup.a.insert(0, "bar") + soup.a.insert(0, "foo") + # The string were added to the tag. + self.assertEqual(["foo", "bar"], soup.a.contents) + # And they were converted to NavigableStrings. + self.assertEqual(soup.a.contents[0].next_element, "bar") + + def test_insert_tag(self): + builder = self.default_builder + soup = self.soup( + "Findlady!", builder=builder) + magic_tag = Tag(soup, builder, 'magictag') + magic_tag.insert(0, "the") + soup.a.insert(1, magic_tag) + + self.assertEqual( + soup.decode(), self.document_for( + "Findthelady!")) + + # Make sure all the relationships are hooked up correctly. + b_tag = soup.b + self.assertEqual(b_tag.next_sibling, magic_tag) + self.assertEqual(magic_tag.previous_sibling, b_tag) + + find = b_tag.find(text="Find") + self.assertEqual(find.next_element, magic_tag) + self.assertEqual(magic_tag.previous_element, find) + + c_tag = soup.c + self.assertEqual(magic_tag.next_sibling, c_tag) + self.assertEqual(c_tag.previous_sibling, magic_tag) + + the = magic_tag.find(text="the") + self.assertEqual(the.parent, magic_tag) + self.assertEqual(the.next_element, c_tag) + self.assertEqual(c_tag.previous_element, the) + + def test_append_child_thats_already_at_the_end(self): + data = "" + soup = self.soup(data) + soup.a.append(soup.b) + self.assertEqual(data, soup.decode()) + + def test_move_tag_to_beginning_of_parent(self): + data = "" + soup = self.soup(data) + soup.a.insert(0, soup.d) + self.assertEqual("", soup.decode()) + + def test_insert_works_on_empty_element_tag(self): + # This is a little strange, since most HTML parsers don't allow + # markup like this to come through. But in general, we don't + # know what the parser would or wouldn't have allowed, so + # I'm letting this succeed for now. + soup = self.soup("
        ") + soup.br.insert(1, "Contents") + self.assertEqual(str(soup.br), "
        Contents
        ") + + def test_insert_before(self): + soup = self.soup("foobar") + soup.b.insert_before("BAZ") + soup.a.insert_before("QUUX") + self.assertEqual( + soup.decode(), self.document_for("QUUXfooBAZbar")) + + soup.a.insert_before(soup.b) + self.assertEqual( + soup.decode(), self.document_for("QUUXbarfooBAZ")) + + def test_insert_after(self): + soup = self.soup("foobar") + soup.b.insert_after("BAZ") + soup.a.insert_after("QUUX") + self.assertEqual( + soup.decode(), self.document_for("fooQUUXbarBAZ")) + soup.b.insert_after(soup.a) + self.assertEqual( + soup.decode(), self.document_for("QUUXbarfooBAZ")) + + def test_insert_after_raises_exception_if_after_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_after, tag) + self.assertRaises(NotImplementedError, soup.insert_after, tag) + self.assertRaises(ValueError, tag.insert_after, tag) + + def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self): + soup = self.soup("") + tag = soup.new_tag("a") + string = soup.new_string("") + self.assertRaises(ValueError, string.insert_before, tag) + self.assertRaises(NotImplementedError, soup.insert_before, tag) + self.assertRaises(ValueError, tag.insert_before, tag) + + def test_replace_with(self): + soup = self.soup( + "
<p>There's <b>no</b> business like <b>show</b> business</p>")
+        no, show = soup.find_all('b')
+        show.replace_with(no)
+        self.assertEqual(
+            soup.decode(),
+            self.document_for(
+                "<p>There's  business like <b>no</b> business</p>
        ")) + + self.assertEqual(show.parent, None) + self.assertEqual(no.parent, soup.p) + self.assertEqual(no.next_element, "no") + self.assertEqual(no.next_sibling, " business") + + def test_replace_first_child(self): + data = "" + soup = self.soup(data) + soup.b.replace_with(soup.c) + self.assertEqual("", soup.decode()) + + def test_replace_last_child(self): + data = "" + soup = self.soup(data) + soup.c.replace_with(soup.b) + self.assertEqual("", soup.decode()) + + def test_nested_tag_replace_with(self): + soup = self.soup( + """Wereservetherighttorefuseservice""") + + # Replace the entire tag and its contents ("reserve the + # right") with the tag ("refuse"). + remove_tag = soup.b + move_tag = soup.f + remove_tag.replace_with(move_tag) + + self.assertEqual( + soup.decode(), self.document_for( + "Werefusetoservice")) + + # The tag is now an orphan. + self.assertEqual(remove_tag.parent, None) + self.assertEqual(remove_tag.find(text="right").next_element, None) + self.assertEqual(remove_tag.previous_element, None) + self.assertEqual(remove_tag.next_sibling, None) + self.assertEqual(remove_tag.previous_sibling, None) + + # The tag is now connected to the tag. + self.assertEqual(move_tag.parent, soup.a) + self.assertEqual(move_tag.previous_element, "We") + self.assertEqual(move_tag.next_element.next_element, soup.e) + self.assertEqual(move_tag.next_sibling, None) + + # The gap where the tag used to be has been mended, and + # the word "to" is now connected to the tag. + to_text = soup.find(text="to") + g_tag = soup.g + self.assertEqual(to_text.next_element, g_tag) + self.assertEqual(to_text.next_sibling, g_tag) + self.assertEqual(g_tag.previous_element, to_text) + self.assertEqual(g_tag.previous_sibling, to_text) + + def test_unwrap(self): + tree = self.soup(""" +
<p>Unneeded <em>formatting</em> is unneeded</p>
        + """) + tree.em.unwrap() + self.assertEqual(tree.em, None) + self.assertEqual(tree.p.text, "Unneeded formatting is unneeded") + + def test_wrap(self): + soup = self.soup("I wish I was bold.") + value = soup.string.wrap(soup.new_tag("b")) + self.assertEqual(value.decode(), "I wish I was bold.") + self.assertEqual( + soup.decode(), self.document_for("I wish I was bold.")) + + def test_wrap_extracts_tag_from_elsewhere(self): + soup = self.soup("I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual( + soup.decode(), self.document_for("I wish I was bold.")) + + def test_wrap_puts_new_contents_at_the_end(self): + soup = self.soup("I like being bold.I wish I was bold.") + soup.b.next_sibling.wrap(soup.b) + self.assertEqual(2, len(soup.b.contents)) + self.assertEqual( + soup.decode(), self.document_for( + "I like being bold.I wish I was bold.")) + + def test_extract(self): + soup = self.soup( + 'Some content. More content.') + + self.assertEqual(len(soup.body.contents), 3) + extracted = soup.find(id="nav").extract() + + self.assertEqual( + soup.decode(), "Some content. More content.") + self.assertEqual(extracted.decode(), '') + + # The extracted tag is now an orphan. + self.assertEqual(len(soup.body.contents), 2) + self.assertEqual(extracted.parent, None) + self.assertEqual(extracted.previous_element, None) + self.assertEqual(extracted.next_element.next_element, None) + + # The gap where the extracted tag used to be has been mended. + content_1 = soup.find(text="Some content. ") + content_2 = soup.find(text=" More content.") + self.assertEqual(content_1.next_element, content_2) + self.assertEqual(content_1.next_sibling, content_2) + self.assertEqual(content_2.previous_element, content_1) + self.assertEqual(content_2.previous_sibling, content_1) + + def test_extract_distinguishes_between_identical_strings(self): + soup = self.soup("
        foobar") + foo_1 = soup.a.string + bar_1 = soup.b.string + foo_2 = soup.new_string("foo") + bar_2 = soup.new_string("bar") + soup.a.append(foo_2) + soup.b.append(bar_2) + + # Now there are two identical strings in the tag, and two + # in the tag. Let's remove the first "foo" and the second + # "bar". + foo_1.extract() + bar_2.extract() + self.assertEqual(foo_2, soup.a.string) + self.assertEqual(bar_2, soup.b.string) + + def test_extract_multiples_of_same_tag(self): + soup = self.soup(""" + + + + + + + + + +""") + [soup.script.extract() for i in soup.find_all("script")] + self.assertEqual("\n\n\n", str(soup.body)) + + + def test_extract_works_when_element_is_surrounded_by_identical_strings(self): + soup = self.soup( + '\n' + 'hi\n' + '') + soup.find('body').extract() + self.assertEqual(None, soup.find('body')) + + + def test_clear(self): + """Tag.clear()""" + soup = self.soup("
<p><a>String <em>Italicized</em></a> and another</p>
        ") + # clear using extract() + a = soup.a + soup.p.clear() + self.assertEqual(len(soup.p.contents), 0) + self.assertTrue(hasattr(a, "contents")) + + # clear using decompose() + em = a.em + a.clear(decompose=True) + self.assertEqual(0, len(em.contents)) + + def test_string_set(self): + """Tag.string = 'string'""" + soup = self.soup(" ") + soup.a.string = "foo" + self.assertEqual(soup.a.contents, ["foo"]) + soup.b.string = "bar" + self.assertEqual(soup.b.contents, ["bar"]) + + def test_string_set_does_not_affect_original_string(self): + soup = self.soup("foobar") + soup.b.string = soup.c.string + self.assertEqual(soup.a.encode(), b"barbar") + + def test_set_string_preserves_class_of_string(self): + soup = self.soup("") + cdata = CData("foo") + soup.a.string = cdata + self.assertTrue(isinstance(soup.a.string, CData)) + +class TestElementObjects(SoupTest): + """Test various features of element objects.""" + + def test_len(self): + """The length of an element is its number of children.""" + soup = self.soup("123") + + # The BeautifulSoup object itself contains one element: the + # tag. + self.assertEqual(len(soup.contents), 1) + self.assertEqual(len(soup), 1) + + # The tag contains three elements: the text node "1", the + # tag, and the text node "3". + self.assertEqual(len(soup.top), 3) + self.assertEqual(len(soup.top.contents), 3) + + def test_member_access_invokes_find(self): + """Accessing a Python member .foo invokes find('foo')""" + soup = self.soup('') + self.assertEqual(soup.b, soup.find('b')) + self.assertEqual(soup.b.i, soup.find('b').find('i')) + self.assertEqual(soup.a, None) + + def test_deprecated_member_access(self): + soup = self.soup('') + with warnings.catch_warnings(record=True) as w: + tag = soup.bTag + self.assertEqual(soup.b, tag) + self.assertEqual( + '.bTag is deprecated, use .find("b") instead.', + str(w[0].message)) + + def test_has_attr(self): + """has_attr() checks for the presence of an attribute. + + Please note note: has_attr() is different from + __in__. has_attr() checks the tag's attributes and __in__ + checks the tag's chidlren. + """ + soup = self.soup("") + self.assertTrue(soup.foo.has_attr('attr')) + self.assertFalse(soup.foo.has_attr('attr2')) + + + def test_attributes_come_out_in_alphabetical_order(self): + markup = '' + self.assertSoupEquals(markup, '') + + def test_string(self): + # A tag that contains only a text node makes that node + # available as .string. + soup = self.soup("foo") + self.assertEqual(soup.b.string, 'foo') + + def test_empty_tag_has_no_string(self): + # A tag with no children has no .stirng. + soup = self.soup("") + self.assertEqual(soup.b.string, None) + + def test_tag_with_multiple_children_has_no_string(self): + # A tag with no children has no .string. + soup = self.soup("foo") + self.assertEqual(soup.b.string, None) + + soup = self.soup("foobar
        ") + self.assertEqual(soup.b.string, None) + + # Even if all the children are strings, due to trickery, + # it won't work--but this would be a good optimization. + soup = self.soup("foo
        ") + soup.a.insert(1, "bar") + self.assertEqual(soup.a.string, None) + + def test_tag_with_recursive_string_has_string(self): + # A tag with a single child which has a .string inherits that + # .string. + soup = self.soup("foo") + self.assertEqual(soup.a.string, "foo") + self.assertEqual(soup.string, "foo") + + def test_lack_of_string(self): + """Only a tag containing a single text node has a .string.""" + soup = self.soup("feo") + self.assertFalse(soup.b.string) + + soup = self.soup("") + self.assertFalse(soup.b.string) + + def test_all_text(self): + """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated""" + soup = self.soup("ar t ") + self.assertEqual(soup.a.text, "ar t ") + self.assertEqual(soup.a.get_text(strip=True), "art") + self.assertEqual(soup.a.get_text(","), "a,r, , t ") + self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t") + + def test_get_text_ignores_comments(self): + soup = self.soup("foobar") + self.assertEqual(soup.get_text(), "foobar") + + self.assertEqual( + soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar") + self.assertEqual( + soup.get_text(types=None), "fooIGNOREbar") + + def test_all_strings_ignores_comments(self): + soup = self.soup("foobar") + self.assertEqual(['foo', 'bar'], list(soup.strings)) + +class TestCDAtaListAttributes(SoupTest): + + """Testing cdata-list attributes like 'class'. + """ + def test_single_value_becomes_list(self): + soup = self.soup("") + self.assertEqual(["foo"],soup.a['class']) + + def test_multiple_values_becomes_list(self): + soup = self.soup("") + self.assertEqual(["foo", "bar"], soup.a['class']) + + def test_multiple_values_separated_by_weird_whitespace(self): + soup = self.soup("") + self.assertEqual(["foo", "bar", "baz"],soup.a['class']) + + def test_attributes_joined_into_string_on_output(self): + soup = self.soup("") + self.assertEqual(b'', soup.a.encode()) + + def test_accept_charset(self): + soup = self.soup('
        ') + self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset']) + + def test_cdata_attribute_applying_only_to_one_tag(self): + data = '' + soup = self.soup(data) + # We saw in another test that accept-charset is a cdata-list + # attribute for the tag. But it's not a cdata-list + # attribute for any other tag. + self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset']) + + def test_string_has_immutable_name_property(self): + string = self.soup("s").string + self.assertEqual(None, string.name) + def t(): + string.name = 'foo' + self.assertRaises(AttributeError, t) + +class TestPersistence(SoupTest): + "Testing features like pickle and deepcopy." + + def setUp(self): + super(TestPersistence, self).setUp() + self.page = """ + + + +Beautiful Soup: We called him Tortoise because he taught us. + + + + + + +foo +bar + +""" + self.tree = self.soup(self.page) + + def test_pickle_and_unpickle_identity(self): + # Pickling a tree, then unpickling it, yields a tree identical + # to the original. + dumped = pickle.dumps(self.tree, 2) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.__class__, BeautifulSoup) + self.assertEqual(loaded.decode(), self.tree.decode()) + + def test_deepcopy_identity(self): + # Making a deepcopy of a tree yields an identical tree. + copied = copy.deepcopy(self.tree) + self.assertEqual(copied.decode(), self.tree.decode()) + + def test_unicode_pickle(self): + # A tree containing Unicode characters can be pickled. + html = "\N{SNOWMAN}" + soup = self.soup(html) + dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) + loaded = pickle.loads(dumped) + self.assertEqual(loaded.decode(), soup.decode()) + + def test_copy_navigablestring_is_not_attached_to_tree(self): + html = "FooBar" + soup = self.soup(html) + s1 = soup.find(string="Foo") + s2 = copy.copy(s1) + self.assertEqual(s1, s2) + self.assertEqual(None, s2.parent) + self.assertEqual(None, s2.next_element) + self.assertNotEqual(None, s1.next_sibling) + self.assertEqual(None, s2.next_sibling) + self.assertEqual(None, s2.previous_element) + + def test_copy_navigablestring_subclass_has_same_type(self): + html = "" + soup = self.soup(html) + s1 = soup.string + s2 = copy.copy(s1) + self.assertEqual(s1, s2) + self.assertTrue(isinstance(s2, Comment)) + + def test_copy_entire_soup(self): + html = "
        FooBar
        end" + soup = self.soup(html) + soup_copy = copy.copy(soup) + self.assertEqual(soup, soup_copy) + + def test_copy_tag_copies_contents(self): + html = "
        FooBar
        end" + soup = self.soup(html) + div = soup.div + div_copy = copy.copy(div) + + # The two tags look the same, and evaluate to equal. + self.assertEqual(str(div), str(div_copy)) + self.assertEqual(div, div_copy) + + # But they're not the same object. + self.assertFalse(div is div_copy) + + # And they don't have the same relation to the parse tree. The + # copy is not associated with a parse tree at all. + self.assertEqual(None, div_copy.parent) + self.assertEqual(None, div_copy.previous_element) + self.assertEqual(None, div_copy.find(string='Bar').next_element) + self.assertNotEqual(None, div.find(string='Bar').next_element) + +class TestSubstitutions(SoupTest): + + def test_default_formatter_is_minimal(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( + "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_html(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="html") + self.assertEqual( + decoded, + self.document_for("<<Sacré bleu!>>")) + + def test_formatter_minimal(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter="minimal") + # The < is converted back into < but the e-with-acute is left alone. + self.assertEqual( + decoded, + self.document_for( + "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>")) + + def test_formatter_null(self): + markup = "<<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>" + soup = self.soup(markup) + decoded = soup.decode(formatter=None) + # Neither the angle brackets nor the e-with-acute are converted. + # This is not valid HTML, but it's what the user wanted. + self.assertEqual(decoded, + self.document_for("<>")) + + def test_formatter_custom(self): + markup = "<foo>bar" + soup = self.soup(markup) + decoded = soup.decode(formatter = lambda x: x.upper()) + # Instead of normal entity conversion code, the custom + # callable is called on every string. + self.assertEqual( + decoded, + self.document_for("BAR")) + + def test_formatter_is_run_on_attribute_values(self): + markup = 'e' + soup = self.soup(markup) + a = soup.a + + expect_minimal = 'e' + + self.assertEqual(expect_minimal, a.decode()) + self.assertEqual(expect_minimal, a.decode(formatter="minimal")) + + expect_html = 'e' + self.assertEqual(expect_html, a.decode(formatter="html")) + + self.assertEqual(markup, a.decode(formatter=None)) + expect_upper = 'E' + self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) + + def test_formatter_skips_script_tag_for_html_documents(self): + doc = """ + +""" + encoded = BeautifulSoup(doc, 'html.parser').encode() + self.assertTrue(b"< < hey > >" in encoded) + + def test_formatter_skips_style_tag_for_html_documents(self): + doc = """ + +""" + encoded = BeautifulSoup(doc, 'html.parser').encode() + self.assertTrue(b"< < hey > >" in encoded) + + def test_prettify_leaves_preformatted_text_alone(self): + soup = self.soup("
+
+class TestSubstitutions(SoupTest):
+
+    def test_default_formatter_is_minimal(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_html(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="html")
+        self.assertEqual(
+            decoded,
+            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))
+
+    def test_formatter_minimal(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter="minimal")
+        # The < is converted back into &lt; but the e-with-acute is left alone.
+        self.assertEqual(
+            decoded,
+            self.document_for(
+                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))
+
+    def test_formatter_null(self):
+        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter=None)
+        # Neither the angle brackets nor the e-with-acute are converted.
+        # This is not valid HTML, but it's what the user wanted.
+        self.assertEqual(decoded,
+                         self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
+
+    def test_formatter_custom(self):
+        markup = "<b>&lt;foo&gt;</b><b>bar</b>"
+        soup = self.soup(markup)
+        decoded = soup.decode(formatter = lambda x: x.upper())
+        # Instead of normal entity conversion code, the custom
+        # callable is called on every string.
+        self.assertEqual(
+            decoded,
+            self.document_for("<b><FOO></b><b>BAR</b>"))
+
+    def test_formatter_is_run_on_attribute_values(self):
+        markup = '<a href="http://a.com?a=b&c=é">e</a>'
+        soup = self.soup(markup)
+        a = soup.a
+
+        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
+
+        self.assertEqual(expect_minimal, a.decode())
+        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
+
+        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
+        self.assertEqual(expect_html, a.decode(formatter="html"))
+
+        self.assertEqual(markup, a.decode(formatter=None))
+        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
+        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
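+
+    # Editor's note: an illustrative sketch, not one of the original
+    # tests; the leading underscore keeps unittest from collecting it.
+    # It shows the formatter choices the tests above exercise.
+    def _formatter_usage_sketch(self):
+        markup = '<a href="http://a.com/?a=1&b=2">link</a>'
+        soup = self.soup(markup)
+        # "minimal" (the default) escapes only what valid HTML requires.
+        self.assertTrue('?a=1&amp;b=2' in soup.decode(formatter="minimal"))
+        # formatter=None writes strings out untouched -- fastest, but it
+        # can emit invalid HTML if the data contains markup characters.
+        self.assertTrue('?a=1&b=2' in soup.decode(formatter=None))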
+
+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        self.assertEqual(
+            '<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
+            soup.div.prettify())
+
+    def test_prettify_accepts_formatter(self):
+        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
+        pretty = soup.prettify(formatter = lambda x: x.upper())
+        self.assertTrue("FOO" in pretty)
+
+    def test_prettify_outputs_unicode_by_default(self):
+        soup = self.soup("<a></a>")
+        self.assertEqual(str, type(soup.prettify()))
+
+    def test_prettify_can_encode_data(self):
+        soup = self.soup("<a></a>")
+        self.assertEqual(bytes, type(soup.prettify("utf-8")))
+
+    def test_html_entity_substitution_off_by_default(self):
+        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
+        soup = self.soup(markup)
+        encoded = soup.b.encode("utf-8")
+        self.assertEqual(encoded, markup.encode('utf-8'))
+
+    def test_encoding_substitution(self):
+        # Here's the <meta> tag saying that a document is
+        # encoded in Shift-JIS.
+        meta_tag = ('<meta content="text/html; charset=x-sjis" '
+                    'http-equiv="Content-type"/>')
+        soup = self.soup(meta_tag)
+
+        # Parse the document, and the charset appears unchanged.
+        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')
+
+        # Encode the document into some encoding, and the encoding is
+        # substituted into the meta tag.
+        utf_8 = soup.encode("utf-8")
+        self.assertTrue(b"charset=utf-8" in utf_8)
+
+        euc_jp = soup.encode("euc_jp")
+        self.assertTrue(b"charset=euc_jp" in euc_jp)
+
+        shift_jis = soup.encode("shift-jis")
+        self.assertTrue(b"charset=shift-jis" in shift_jis)
+
+        utf_16_u = soup.encode("utf-16").decode("utf-16")
+        self.assertTrue("charset=utf-16" in utf_16_u)
+
+    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
+        markup = ('<head><meta content="text/html; charset=x-sjis" '
+                  'http-equiv="Content-type"/></head><pre>foo</pre>')
+
+        # Beautiful Soup used to try to rewrite the meta tag even if the
+        # meta tag got filtered out by the strainer. This test makes
+        # sure that doesn't happen.
+        strainer = SoupStrainer('pre')
+        soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(soup.contents[0].name, 'pre')
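+
+# Editor's note: an illustrative sketch, not part of the original bs4
+# sources. It shows the charset rewriting that test_encoding_substitution
+# verifies above, assuming the vendored bs4 package is importable.
+def _meta_charset_sketch():
+    from bs4 import BeautifulSoup
+    doc = ('<html><head><meta content="text/html; charset=x-sjis" '
+           'http-equiv="Content-type"/></head><body>foo</body></html>')
+    soup = BeautifulSoup(doc, 'html.parser')
+    # The parse tree keeps the declared charset as found in the markup...
+    assert soup.meta['content'] == 'text/html; charset=x-sjis'
+    # ...but encode() substitutes the encoding actually being written.
+    assert b'charset=utf-8' in soup.encode('utf-8')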
+
+class TestEncoding(SoupTest):
+    """Test the ability to encode objects into strings."""
+
+    def test_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(soup.b.string.encode("utf-8"),
+                         "\N{SNOWMAN}".encode("utf-8"))
+
+    def test_tag_containing_unicode_string_can_be_encoded(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            soup.b.encode("utf-8"), html.encode("utf-8"))
+
+    def test_encoding_substitutes_unrecognized_characters_by_default(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(soup.b.encode("ascii"), b"<b>&#9731;</b>")
+
+    def test_encoding_can_be_made_strict(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertRaises(
+            UnicodeEncodeError, soup.encode, "ascii", errors="strict")
+
+    def test_decode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
+
+    def test_encode_contents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            "\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
+                encoding="utf8"))
+
+    def test_deprecated_renderContents(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        self.assertEqual(
+            "\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
+
+    def test_repr(self):
+        html = "<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        if PY3K:
+            self.assertEqual(html, repr(soup))
+        else:
+            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
+
+class TestNavigableStringSubclasses(SoupTest):
+
+    def test_cdata(self):
+        # None of the current builders turn CDATA sections into CData
+        # objects, but you can create them manually.
+        soup = self.soup("")
+        cdata = CData("foo")
+        soup.insert(1, cdata)
+        self.assertEqual(str(soup), "<![CDATA[foo]]>")
+        self.assertEqual(soup.find(text="foo"), "foo")
+        self.assertEqual(soup.contents[0], "foo")
+
+    def test_cdata_is_never_formatted(self):
+        """Text inside a CData object is passed into the formatter.
+
+        But the return value is ignored.
+        """
+
+        self.count = 0
+        def increment(*args):
+            self.count += 1
+            return "BITTER FAILURE"
+
+        soup = self.soup("")
+        cdata = CData("<><><>")
+        soup.insert(1, cdata)
+        self.assertEqual(
+            b"<![CDATA[<><><>]]>", soup.encode(formatter=increment))
+        self.assertEqual(1, self.count)
+
+    def test_doctype_ends_in_newline(self):
+        # Unlike other NavigableString subclasses, a DOCTYPE always ends
+        # in a newline.
+        doctype = Doctype("foo")
+        soup = self.soup("")
+        soup.insert(1, doctype)
+        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
+
+    def test_declaration(self):
+        d = Declaration("foo")
+        self.assertEqual("<?foo?>", d.output_ready())
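+
+# Editor's note: an illustrative sketch, not part of the original bs4
+# sources. It shows the manual construction of NavigableString
+# subclasses that TestNavigableStringSubclasses covers above.
+def _navigablestring_subclass_sketch():
+    from bs4 import BeautifulSoup, CData, Doctype
+    soup = BeautifulSoup("<a></a>", "html.parser")
+    # A CData node is serialized verbatim inside a CDATA section; its
+    # contents are never entity-escaped.
+    soup.a.insert(0, CData("one < two"))
+    assert soup.a.encode() == b"<a><![CDATA[one < two]]></a>"
+    # A Doctype always serializes with a trailing newline.
+    soup.insert(0, Doctype("html"))
+    assert soup.encode().startswith(b"<!DOCTYPE html>\n")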
+
+class TestSoupSelector(TreeTest):
+
+    HTML = """
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+"http://www.w3.org/TR/html4/strict.dtd">
+<html>
+<head>
+<title>The title</title>
+<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
+</head>
+<body>
+<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
+<div id="main" class="fancy">
+<div id="inner">
+<h1 id="header1">An H1</h1>
+<p>Some text</p>
+<p class="onep" id="p1">Some more text</p>
+<h2 id="header2">An H2</h2>
+<p class="class1 class2 class3" id="pmulti">Another</p>
+<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
+<h2 id="header3">Another H2</h2>
+<a id="me" href="http://simonwillison.net/" rel="me">me</a>
+<span class="s1">
+<a href="#" id="s1a1">span1a1</a>
+<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
+<span class="span2">
+<a href="#" id="s2a1">span2a1</a>
+</span>
+<span class="span3"></span>
+<custom-dashed-tag class="dashed" id="dash2"/>
+<div data-tag="dashedvalue" id="data1"></div>
+</span>
+</div>
+</div>
+<x id="xid">
+<z id="zida"/>
+<z id="zidab"/>
+<z id="zidac"/>
+</x>
+<y id="yid">
+<z id="zidb"/>
+</y>
+<p lang="en" id="lang-en">English</p>
+<p lang="en-gb" id="lang-en-gb">English UK</p>
+<p lang="en-us" id="lang-en-us">English US</p>
+<p lang="fr" id="lang-fr">French</p>
+<div id="footer">
+</div>
+</body>
+</html>
+"""
+
+    def setUp(self):
+        self.soup = BeautifulSoup(self.HTML, 'html.parser')
+
+    def assertSelects(self, selector, expected_ids):
+        el_ids = [el['id'] for el in self.soup.select(selector)]
+        el_ids.sort()
+        expected_ids.sort()
+        self.assertEqual(expected_ids, el_ids,
+                         "Selector %s, expected [%s], got [%s]" % (
+                selector, ', '.join(expected_ids), ', '.join(el_ids)
+            )
+        )
+
+    assertSelect = assertSelects
+
+    def assertSelectMultiple(self, *tests):
+        for selector, expected_ids in tests:
+            self.assertSelect(selector, expected_ids)
+
+    def test_one_tag_one(self):
+        els = self.soup.select('title')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].name, 'title')
+        self.assertEqual(els[0].contents, ['The title'])
+
+    def test_one_tag_many(self):
+        els = self.soup.select('div')
+        self.assertEqual(len(els), 4)
+        for div in els:
+            self.assertEqual(div.name, 'div')
+
+        el = self.soup.select_one('div')
+        self.assertEqual('main', el['id'])
+
+    def test_select_one_returns_none_if_no_match(self):
+        match = self.soup.select_one('nonexistenttag')
+        self.assertEqual(None, match)
+
+    def test_tag_in_tag_one(self):
+        els = self.soup.select('div div')
+        self.assertSelects('div div', ['inner', 'data1'])
+
+    def test_tag_in_tag_many(self):
+        for selector in ('html div', 'html body div', 'body div'):
+            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
+
+    def test_tag_no_match(self):
+        self.assertEqual(len(self.soup.select('del')), 0)
+
+    def test_invalid_tag(self):
+        self.assertRaises(ValueError, self.soup.select, 'tag%t')
+
+    def test_select_dashed_tag_ids(self):
+        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
+
+    def test_select_dashed_by_id(self):
+        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
+        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
+        self.assertEqual(dashed[0]['id'], 'dash2')
+
+    def test_dashed_tag_text(self):
+        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
+
+    def test_select_dashed_matches_find_all(self):
+        self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
+
+    def test_header_tags(self):
+        self.assertSelectMultiple(
+            ('h1', ['header1']),
+            ('h2', ['header2', 'header3']),
+        )
+
+    def test_class_one(self):
+        for selector in ('.onep', 'p.onep', 'html p.onep'):
+            els = self.soup.select(selector)
+            self.assertEqual(len(els), 1)
+            self.assertEqual(els[0].name, 'p')
+            self.assertEqual(els[0]['class'], ['onep'])
+
+    def test_class_mismatched_tag(self):
+        els = self.soup.select('div.onep')
+        self.assertEqual(len(els), 0)
+
+    def test_one_id(self):
+        for selector in ('div#inner', '#inner', 'div div#inner'):
+            self.assertSelects(selector, ['inner'])
+
+    def test_bad_id(self):
+        els = self.soup.select('#doesnotexist')
+        self.assertEqual(len(els), 0)
+
+    def test_items_in_id(self):
+        els = self.soup.select('div#inner p')
+        self.assertEqual(len(els), 3)
+        for el in els:
+            self.assertEqual(el.name, 'p')
+        self.assertEqual(els[1]['class'], ['onep'])
+        self.assertFalse(els[0].has_attr('class'))
+
+    def test_a_bunch_of_emptys(self):
+        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
+            self.assertEqual(len(self.soup.select(selector)), 0)
+
+    def test_multi_class_support(self):
+        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
+                         '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
+            self.assertSelects(selector, ['pmulti'])
+
+    def test_multi_class_selection(self):
+        for selector in ('.class1.class3', '.class3.class2',
+                         '.class1.class2.class3'):
+            self.assertSelects(selector, ['pmulti'])
+
+    def test_child_selector(self):
+        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
+        self.assertSelects('.s1 > a span', ['s1a2s1'])
+
+    def test_child_selector_id(self):
+        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
+
+    def test_attribute_equals(self):
+        self.assertSelectMultiple(
+            ('p[class="onep"]', ['p1']),
+            ('p[id="p1"]', ['p1']),
+            ('[class="onep"]', ['p1']),
+            ('[id="p1"]', ['p1']),
+            ('link[rel="stylesheet"]', ['l1']),
+            ('link[type="text/css"]', ['l1']),
+            ('link[href="blah.css"]', ['l1']),
+            ('link[href="no-blah.css"]', []),
+            ('[rel="stylesheet"]', ['l1']),
+            ('[type="text/css"]', ['l1']),
+            ('[href="blah.css"]', ['l1']),
+            ('[href="no-blah.css"]', []),
+            ('p[href="no-blah.css"]', []),
+            ('[href="no-blah.css"]', []),
+        )
+
+    def test_attribute_tilde(self):
+        self.assertSelectMultiple(
+            ('p[class~="class1"]', ['pmulti']),
+            ('p[class~="class2"]', ['pmulti']),
+            ('p[class~="class3"]', ['pmulti']),
+            ('[class~="class1"]', ['pmulti']),
+            ('[class~="class2"]', ['pmulti']),
+            ('[class~="class3"]', ['pmulti']),
+            ('a[rel~="friend"]', ['bob']),
+            ('a[rel~="met"]', ['bob']),
+            ('[rel~="friend"]', ['bob']),
+            ('[rel~="met"]', ['bob']),
+        )
+
+    def test_attribute_startswith(self):
+        self.assertSelectMultiple(
+            ('[rel^="style"]', ['l1']),
+            ('link[rel^="style"]', ['l1']),
+            ('notlink[rel^="notstyle"]', []),
+            ('[rel^="notstyle"]', []),
+            ('link[rel^="notstyle"]', []),
+            ('link[href^="bla"]', ['l1']),
+            ('a[href^="http://"]', ['bob', 'me']),
+            ('[href^="http://"]', ['bob', 'me']),
+            ('[id^="p"]', ['pmulti', 'p1']),
+            ('[id^="m"]', ['me', 'main']),
+            ('div[id^="m"]', ['main']),
+            ('a[id^="m"]', ['me']),
+            ('div[data-tag^="dashed"]', ['data1'])
+        )
+
+    def test_attribute_endswith(self):
+        self.assertSelectMultiple(
+            ('[href$=".css"]', ['l1']),
+            ('link[href$=".css"]', ['l1']),
+            ('link[id$="1"]', ['l1']),
+            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id$="1"]', ['data1']),
+            ('[id$="noending"]', []),
+        )
+
+    def test_attribute_contains(self):
+        self.assertSelectMultiple(
+            # From test_attribute_startswith
+            ('[rel*="style"]', ['l1']),
+            ('link[rel*="style"]', ['l1']),
+            ('notlink[rel*="notstyle"]', []),
+            ('[rel*="notstyle"]', []),
+            ('link[rel*="notstyle"]', []),
+            ('link[href*="bla"]', ['l1']),
+            ('[href*="http://"]', ['bob', 'me']),
+            ('[id*="p"]', ['pmulti', 'p1']),
+            ('div[id*="m"]', ['main']),
+            ('a[id*="m"]', ['me']),
+            # From test_attribute_endswith
+            ('[href*=".css"]', ['l1']),
+            ('link[href*=".css"]', ['l1']),
+            ('link[id*="1"]', ['l1']),
+            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id*="1"]', ['data1']),
+            ('[id*="noending"]', []),
+            # New for this test
+            ('[href*="."]', ['bob', 'me', 'l1']),
+            ('a[href*="."]', ['bob', 'me']),
+            ('link[href*="."]', ['l1']),
+            ('div[id*="n"]', ['main', 'inner']),
+            ('div[id*="nn"]', ['inner']),
+            ('div[data-tag*="edval"]', ['data1'])
+        )
+
+    def test_attribute_exact_or_hypen(self):
+        self.assertSelectMultiple(
+            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
+            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
+            ('p[lang|="fr"]', ['lang-fr']),
+            ('p[lang|="gb"]', []),
+        )
+
+    def test_attribute_exists(self):
+        self.assertSelectMultiple(
+            ('[rel]', ['l1', 'bob', 'me']),
+            ('link[rel]', ['l1']),
+            ('a[rel]', ['bob', 'me']),
+            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
+            ('p[class]', ['p1', 'pmulti']),
+            ('[blah]', []),
+            ('p[blah]', []),
+            ('div[data-tag]', ['data1'])
+        )
+
+    def test_unsupported_pseudoclass(self):
+        self.assertRaises(
+            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
+
+        self.assertRaises(
+            NotImplementedError, self.soup.select, "a:nth-of-type(a)")
+
+    def test_nth_of_type(self):
+        # Try to select first paragraph
+        els = self.soup.select('div#inner p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, 'Some text')
+
+        # Try to select third paragraph
+        els = self.soup.select('div#inner p:nth-of-type(3)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, 'Another')
+
+        # Try to select (non-existent!) fourth paragraph
+        els = self.soup.select('div#inner p:nth-of-type(4)')
+        self.assertEqual(len(els), 0)
+
+        # Pass in an invalid value.
+        self.assertRaises(
+            ValueError, self.soup.select, 'div p:nth-of-type(0)')
+
+    def test_nth_of_type_direct_descendant(self):
+        els = self.soup.select('div#inner > p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, 'Some text')
+
+    def test_id_child_selector_nth_of_type(self):
+        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
+
+    def test_select_on_element(self):
+        # Other tests operate on the tree; this operates on an element
+        # within the tree.
+        inner = self.soup.find("div", id="main")
+        selected = inner.select("div")
+        # The <div id="inner"> tag was selected. The