diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/AUTHORS b/infra/api/package/beautifulsoup4-4.9.1.dist-info/AUTHORS deleted file mode 100644 index 1f14fe0..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/AUTHORS +++ /dev/null @@ -1,49 +0,0 @@ -Behold, mortal, the origins of Beautiful Soup... -================================================ - -Leonard Richardson is the primary maintainer. - -Aaron DeVore and Isaac Muse have made significant contributions to the -code base. - -Mark Pilgrim provided the encoding detection code that forms the base -of UnicodeDammit. - -Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful -Soup 4 working under Python 3. - -Simon Willison wrote soupselect, which was used to make Beautiful Soup -support CSS selectors. Isaac Muse wrote SoupSieve, which made it -possible to _remove_ the CSS selector code from Beautiful Soup. - -Sam Ruby helped with a lot of edge cases. - -Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his -work in solving the nestable tags conundrum. - -An incomplete list of people have contributed patches to Beautiful -Soup: - - Istvan Albert, Andrew Lin, Anthony Baxter, Oliver Beattie, Andrew -Boyko, Tony Chang, Francisco Canas, "Delong", Zephyr Fang, Fuzzy, -Roman Gaufman, Yoni Gilad, Richie Hindle, Toshihiro Kamiya, Peteris -Krumins, Kent Johnson, Marek Kapolka, Andreas Kostyrka, Roel Kramer, -Ben Last, Robert Leftwich, Stefaan Lippens, "liquider", Staffan -Malmgren, Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", -Ed Oskiewicz, Martijn Peters, Greg Phillips, Giles Radford, Stefano -Revera, Arthur Rudolph, Marko Samastur, James Salter, Jouni Seppänen, -Alexander Schmolck, Tim Shirley, Geoffrey Sneddon, Ville Skyttä, -"Vikas", Jens Svalgaard, Andy Theyers, Eric Weiser, Glyn Webster, John -Wiseman, Paul Wright, Danny Yoo - -An incomplete list of people who made suggestions or found bugs or -found ways to break Beautiful Soup: - - Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel, - Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes, - Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams, - warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison, - Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed - Summers, Dennis Sutch, Chris Smith, Aaron Swartz, Stuart - Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de - Sousa Rocha, Yichun Wei, Per Vognsen diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/COPYING.txt b/infra/api/package/beautifulsoup4-4.9.1.dist-info/COPYING.txt deleted file mode 100644 index fb6ae69..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/COPYING.txt +++ /dev/null @@ -1,27 +0,0 @@ -Beautiful Soup is made available under the MIT license: - - Copyright (c) 2004-2017 Leonard Richardson - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - -Beautiful Soup incorporates code from the html5lib library, which is -also made available under the MIT license. Copyright (c) 2006-2013 -James Graham and other contributors diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/INSTALLER b/infra/api/package/beautifulsoup4-4.9.1.dist-info/INSTALLER deleted file mode 100644 index a1b589e..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/INSTALLER +++ /dev/null @@ -1 +0,0 @@ -pip diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/LICENSE b/infra/api/package/beautifulsoup4-4.9.1.dist-info/LICENSE deleted file mode 100644 index 4c068ba..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/LICENSE +++ /dev/null @@ -1,30 +0,0 @@ -Beautiful Soup is made available under the MIT license: - - Copyright (c) 2004-2019 Leonard Richardson - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. - -Beautiful Soup incorporates code from the html5lib library, which is -also made available under the MIT license. Copyright (c) 2006-2013 -James Graham and other contributors - -Beautiful Soup depends on the soupsieve library, which is also made -available under the MIT license. Copyright (c) 2018 Isaac Muse diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/METADATA b/infra/api/package/beautifulsoup4-4.9.1.dist-info/METADATA deleted file mode 100644 index 1b4a564..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/METADATA +++ /dev/null @@ -1,131 +0,0 @@ -Metadata-Version: 2.1 -Name: beautifulsoup4 -Version: 4.9.1 -Summary: Screen-scraping library -Home-page: http://www.crummy.com/software/BeautifulSoup/bs4/ -Author: Leonard Richardson -Author-email: leonardr@segfault.org -License: MIT -Download-URL: http://www.crummy.com/software/BeautifulSoup/bs4/download/ -Platform: UNKNOWN -Classifier: Development Status :: 5 - Production/Stable -Classifier: Intended Audience :: Developers -Classifier: License :: OSI Approved :: MIT License -Classifier: Programming Language :: Python -Classifier: Programming Language :: Python :: 2.7 -Classifier: Programming Language :: Python :: 3 -Classifier: Topic :: Text Processing :: Markup :: HTML -Classifier: Topic :: Text Processing :: Markup :: XML -Classifier: Topic :: Text Processing :: Markup :: SGML -Classifier: Topic :: Software Development :: Libraries :: Python Modules -Description-Content-Type: text/markdown -Requires-Dist: soupsieve (>1.2) -Provides-Extra: html5lib -Requires-Dist: html5lib ; extra == 'html5lib' -Provides-Extra: lxml -Requires-Dist: lxml ; extra == 'lxml' - -Beautiful Soup is a library that makes it easy to scrape information -from web pages. It sits atop an HTML or XML parser, providing Pythonic -idioms for iterating, searching, and modifying the parse tree. - -# Quick start - -``` ->>> from bs4 import BeautifulSoup ->>> soup = BeautifulSoup("

SomebadHTML") ->>> print soup.prettify() - - -

-Some - -bad - -HTML - - -

- - ->>> soup.find(text="bad") -u'bad' ->>> soup.i -HTML -# ->>> soup = BeautifulSoup("SomebadXML", "xml") -# ->>> print soup.prettify() - - -Some - -bad - -XML - - -``` - -To go beyond the basics, [comprehensive documentation is available](http://www.crummy.com/software/BeautifulSoup/bs4/doc/). - -# Links - -* [Homepage](http://www.crummy.com/software/BeautifulSoup/bs4/) -* [Documentation](http://www.crummy.com/software/BeautifulSoup/bs4/doc/) -* [Discussion group](http://groups.google.com/group/beautifulsoup/) -* [Development](https://code.launchpad.net/beautifulsoup/) -* [Bug tracker](https://bugs.launchpad.net/beautifulsoup/) -* [Complete changelog](https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/view/head:/CHANGELOG) - -# Note on Python 2 sunsetting - -Since 2012, Beautiful Soup has been developed as a Python 2 library -which is automatically converted to Python 3 code as necessary. This -makes it impossible to take advantage of some features of Python -3. - -For this reason, I plan to discontinue Beautiful Soup's Python 2 -support at some point after December 31, 2020: one year after the -sunset date for Python 2 itself. Beyond that point, new Beautiful Soup -development will exclusively target Python 3. Of course, older -releases of Beautiful Soup, which support both versions, will continue -to be available. - -# Supporting the project - -If you use Beautiful Soup as part of your professional work, please consider a -[Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme). -This will support many of the free software projects your organization -depends on, not just Beautiful Soup. - -If you use Beautiful Soup for personal projects, the best way to say -thank you is to read -[Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I -wrote about what Beautiful Soup has taught me about software -development. - -# Building the documentation - -The bs4/doc/ directory contains full documentation in Sphinx -format. Run `make html` in that directory to create HTML -documentation. - -# Running the unit tests - -Beautiful Soup supports unit test discovery from the project root directory: - -``` -$ nosetests -``` - -``` -$ python -m unittest discover -s bs4 -``` - -If you checked out the source tree, you should see a script in the -home directory called test-all-versions. This script will run the unit -tests under Python 2, then create a temporary Python 3 conversion of -the source and run the unit tests again under Python 3. - - diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/RECORD b/infra/api/package/beautifulsoup4-4.9.1.dist-info/RECORD deleted file mode 100644 index 689c041..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/RECORD +++ /dev/null @@ -1,45 +0,0 @@ -beautifulsoup4-4.9.1.dist-info/AUTHORS,sha256=uSIdbrBb1sobdXl7VrlUvuvim2dN9kF3MH4Edn0WKGE,2176 -beautifulsoup4-4.9.1.dist-info/COPYING.txt,sha256=pH6lEjYJhGT-C09Vl0NZC1MwVtngD0nsv4Apn6tH4jE,1315 -beautifulsoup4-4.9.1.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4 -beautifulsoup4-4.9.1.dist-info/LICENSE,sha256=ynIn3bnu1syAnhV_Z7Ag543eBjJAAB0RhW-FxJy25CM,1447 -beautifulsoup4-4.9.1.dist-info/METADATA,sha256=cIvKNOIcfpYqN5nITv3UDCtXZF1l_B5cPwpib-Gkjk8,4062 -beautifulsoup4-4.9.1.dist-info/RECORD,, -beautifulsoup4-4.9.1.dist-info/REQUESTED,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 -beautifulsoup4-4.9.1.dist-info/WHEEL,sha256=g4nMs7d-Xl9-xC9XovUrsDHGXt-FT0E17Yqo92DEfvY,92 -beautifulsoup4-4.9.1.dist-info/top_level.txt,sha256=H8VT-IuPWLzQqwG9_eChjXDJ1z0H9RRebdSR90Bjnkw,4 -bs4/__init__.py,sha256=ZM_Revj0Yipo_FRYdxS-rksn3MFXzR3XRyFirnM-h84,31551 -bs4/__pycache__/__init__.cpython-38.pyc,, -bs4/__pycache__/dammit.cpython-38.pyc,, -bs4/__pycache__/diagnose.cpython-38.pyc,, -bs4/__pycache__/element.cpython-38.pyc,, -bs4/__pycache__/formatter.cpython-38.pyc,, -bs4/__pycache__/testing.cpython-38.pyc,, -bs4/builder/__init__.py,sha256=VgrBobApHGLnuA8VCJuumZDP64UiGEzlxJJgLiV--sU,19841 -bs4/builder/__pycache__/__init__.cpython-38.pyc,, -bs4/builder/__pycache__/_html5lib.cpython-38.pyc,, -bs4/builder/__pycache__/_htmlparser.cpython-38.pyc,, -bs4/builder/__pycache__/_lxml.cpython-38.pyc,, -bs4/builder/_html5lib.py,sha256=hDxlzVrAku_eU7zEt4gZ-sAXzG58GvkLfMz6P4zUqoA,18748 -bs4/builder/_htmlparser.py,sha256=PEKGvBcJcf6_78CVwA1_uLxulnmnlRuWH0W2caTzpKk,18406 -bs4/builder/_lxml.py,sha256=e4w91RZi3NII_QYe2e1-EiN_BxQtgJPSRwQ8Xgz41ZA,12234 -bs4/dammit.py,sha256=k_XPB3kbZsHM01ckf9BxCUB2Eu2dIQ3d3DDt7UEv9RA,34130 -bs4/diagnose.py,sha256=HkiiFUWS9KU3sLILDYm8X-Tu0wZRuTdMClqtXPd99go,7761 -bs4/element.py,sha256=N6UNaAICZ0BjUl2VnZWZJz7b-v9W50R1YrCmaZbXw_0,81066 -bs4/formatter.py,sha256=Wayv1d6fUc9BSCa2k9uhvWwm89xCukdtJhyi9Sxvkuc,5654 -bs4/testing.py,sha256=xpDhC4AQaVrvNVog1Lgg8nUtg0Nack0CyEVD-bjAAj0,44897 -bs4/tests/__init__.py,sha256=bdUBDE750n7qNEfue7-3a1fBaUxJlvZMkvJvZa-lbYs,27 -bs4/tests/__pycache__/__init__.cpython-38.pyc,, -bs4/tests/__pycache__/test_builder_registry.cpython-38.pyc,, -bs4/tests/__pycache__/test_docs.cpython-38.pyc,, -bs4/tests/__pycache__/test_html5lib.cpython-38.pyc,, -bs4/tests/__pycache__/test_htmlparser.cpython-38.pyc,, -bs4/tests/__pycache__/test_lxml.cpython-38.pyc,, -bs4/tests/__pycache__/test_soup.cpython-38.pyc,, -bs4/tests/__pycache__/test_tree.cpython-38.pyc,, -bs4/tests/test_builder_registry.py,sha256=pllfRpArh9TYhjjRUiu1wITr9Ryyv4hiaAtRjij-k4E,5582 -bs4/tests/test_docs.py,sha256=FXfz2bGL4Xe0q6duwpmg9hmFiZuU4DVJPNZ0hTb6aH4,1067 -bs4/tests/test_html5lib.py,sha256=eWnLGHek_RO_TMq0Ixpb1RF3BEDrvhenMf2eaEBjjsg,6754 -bs4/tests/test_htmlparser.py,sha256=3294XvFbWVe0AYoTlnLPEDW_a0Om0BKRcsrwlJbxUaI,3941 -bs4/tests/test_lxml.py,sha256=xJr8eDrtHSb_vQw88lYEKyfdM1Hel4-dBaz14vQq78M,4105 -bs4/tests/test_soup.py,sha256=EhE1dhHKyctNu0y2l0ql6FOHg9qliEt8Kh7jfCx1lDw,29303 -bs4/tests/test_tree.py,sha256=W75j1-aDx8qHWzOr_JtRCjDE83nUShFauEzGfoys2k0,88988 diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/REQUESTED b/infra/api/package/beautifulsoup4-4.9.1.dist-info/REQUESTED deleted file mode 100644 index e69de29..0000000 diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/WHEEL b/infra/api/package/beautifulsoup4-4.9.1.dist-info/WHEEL deleted file mode 100644 index b552003..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/WHEEL +++ /dev/null @@ -1,5 +0,0 @@ -Wheel-Version: 1.0 -Generator: bdist_wheel (0.34.2) -Root-Is-Purelib: true -Tag: py3-none-any - diff --git a/infra/api/package/beautifulsoup4-4.9.1.dist-info/top_level.txt b/infra/api/package/beautifulsoup4-4.9.1.dist-info/top_level.txt deleted file mode 100644 index 1315442..0000000 --- a/infra/api/package/beautifulsoup4-4.9.1.dist-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -bs4 diff --git a/infra/api/package/bin/chardetect b/infra/api/package/bin/chardetect deleted file mode 100644 index f63944b..0000000 --- a/infra/api/package/bin/chardetect +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/python3 -# -*- coding: utf-8 -*- -import re -import sys -from chardet.cli.chardetect import main -if __name__ == '__main__': - sys.argv[0] = re.sub(r'(-script\.pyw|\.exe)?$', '', sys.argv[0]) - sys.exit(main()) diff --git a/infra/api/package/bs4/__init__.py b/infra/api/package/bs4/__init__.py deleted file mode 100644 index afaca71..0000000 --- a/infra/api/package/bs4/__init__.py +++ /dev/null @@ -1,777 +0,0 @@ -"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend". - -http://www.crummy.com/software/BeautifulSoup/ - -Beautiful Soup uses a pluggable XML or HTML parser to parse a -(possibly invalid) document into a tree representation. Beautiful Soup -provides methods and Pythonic idioms that make it easy to navigate, -search, and modify the parse tree. - -Beautiful Soup works with Python 2.7 and up. It works better if lxml -and/or html5lib is installed. - -For more than you ever wanted to know about Beautiful Soup, see the -documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ -""" - -__author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.9.1" -__copyright__ = "Copyright (c) 2004-2020 Leonard Richardson" -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = ['BeautifulSoup'] - -import os -import re -import sys -import traceback -import warnings - -from .builder import builder_registry, ParserRejectedMarkup -from .dammit import UnicodeDammit -from .element import ( - CData, - Comment, - DEFAULT_OUTPUT_ENCODING, - Declaration, - Doctype, - NavigableString, - PageElement, - ProcessingInstruction, - PYTHON_SPECIFIC_ENCODINGS, - ResultSet, - Script, - Stylesheet, - SoupStrainer, - Tag, - TemplateString, - ) - -# The very first thing we do is give a useful error if someone is -# running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' - -# Define some custom warnings. -class GuessedAtParserWarning(UserWarning): - """The warning issued when BeautifulSoup has to guess what parser to - use -- probably because no parser was specified in the constructor. - """ - -class MarkupResemblesLocatorWarning(UserWarning): - """The warning issued when BeautifulSoup is given 'markup' that - actually looks like a resource locator -- a URL or a path to a file - on disk. - """ - - -class BeautifulSoup(Tag): - """A data structure representing a parsed HTML or XML document. - - Most of the methods you'll call on a BeautifulSoup object are inherited from - PageElement or Tag. - - Internally, this class defines the basic interface called by the - tree builders when converting an HTML/XML document into a data - structure. The interface abstracts away the differences between - parsers. To write a new tree builder, you'll need to understand - these methods as a whole. - - These methods will be called by the BeautifulSoup constructor: - * reset() - * feed(markup) - - The tree builder may call these methods from its feed() implementation: - * handle_starttag(name, attrs) # See note about return value - * handle_endtag(name) - * handle_data(data) # Appends to the current data node - * endData(containerClass) # Ends the current data node - - No matter how complicated the underlying parser is, you should be - able to build a tree using 'start tag' events, 'end tag' events, - 'data' events, and "done with data" events. - - If you encounter an empty-element tag (aka a self-closing tag, - like HTML's
tag), call handle_starttag and then - handle_endtag. - """ - - # Since BeautifulSoup subclasses Tag, it's possible to treat it as - # a Tag with a .name. This name makes it clear the BeautifulSoup - # object isn't a real markup tag. - ROOT_TAG_NAME = '[document]' - - # If the end-user gives no indication which tree builder they - # want, look for one with these features. - DEFAULT_BUILDER_FEATURES = ['html', 'fast'] - - # A string containing all ASCII whitespace characters, used in - # endData() to detect data chunks that seem 'empty'. - ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" - - def __init__(self, markup="", features=None, builder=None, - parse_only=None, from_encoding=None, exclude_encodings=None, - element_classes=None, **kwargs): - """Constructor. - - :param markup: A string or a file-like object representing - markup to be parsed. - - :param features: Desirable features of the parser to be - used. This may be the name of a specific parser ("lxml", - "lxml-xml", "html.parser", or "html5lib") or it may be the - type of markup to be used ("html", "html5", "xml"). It's - recommended that you name a specific parser, so that - Beautiful Soup gives you the same results across platforms - and virtual environments. - - :param builder: A TreeBuilder subclass to instantiate (or - instance to use) instead of looking one up based on - `features`. You only need to use this if you've implemented a - custom TreeBuilder. - - :param parse_only: A SoupStrainer. Only parts of the document - matching the SoupStrainer will be considered. This is useful - when parsing part of a document that would otherwise be too - large to fit into memory. - - :param from_encoding: A string indicating the encoding of the - document to be parsed. Pass this in if Beautiful Soup is - guessing wrongly about the document's encoding. - - :param exclude_encodings: A list of strings indicating - encodings known to be wrong. Pass this in if you don't know - the document's encoding but you know Beautiful Soup's guess is - wrong. - - :param element_classes: A dictionary mapping BeautifulSoup - classes like Tag and NavigableString, to other classes you'd - like to be instantiated instead as the parse tree is - built. This is useful for subclassing Tag or NavigableString - to modify default behavior. - - :param kwargs: For backwards compatibility purposes, the - constructor accepts certain keyword arguments used in - Beautiful Soup 3. None of these arguments do anything in - Beautiful Soup 4; they will result in a warning and then be - ignored. - - Apart from this, any keyword arguments passed into the - BeautifulSoup constructor are propagated to the TreeBuilder - constructor. This makes it possible to configure a - TreeBuilder by passing in arguments, not just by saying which - one to use. - """ - if 'convertEntities' in kwargs: - del kwargs['convertEntities'] - warnings.warn( - "BS4 does not respect the convertEntities argument to the " - "BeautifulSoup constructor. Entities are always converted " - "to Unicode characters.") - - if 'markupMassage' in kwargs: - del kwargs['markupMassage'] - warnings.warn( - "BS4 does not respect the markupMassage argument to the " - "BeautifulSoup constructor. The tree builder is responsible " - "for any necessary markup massage.") - - if 'smartQuotesTo' in kwargs: - del kwargs['smartQuotesTo'] - warnings.warn( - "BS4 does not respect the smartQuotesTo argument to the " - "BeautifulSoup constructor. Smart quotes are always converted " - "to Unicode characters.") - - if 'selfClosingTags' in kwargs: - del kwargs['selfClosingTags'] - warnings.warn( - "BS4 does not respect the selfClosingTags argument to the " - "BeautifulSoup constructor. The tree builder is responsible " - "for understanding self-closing tags.") - - if 'isHTML' in kwargs: - del kwargs['isHTML'] - warnings.warn( - "BS4 does not respect the isHTML argument to the " - "BeautifulSoup constructor. Suggest you use " - "features='lxml' for HTML and features='lxml-xml' for " - "XML.") - - def deprecated_argument(old_name, new_name): - if old_name in kwargs: - warnings.warn( - 'The "%s" argument to the BeautifulSoup constructor ' - 'has been renamed to "%s."' % (old_name, new_name)) - value = kwargs[old_name] - del kwargs[old_name] - return value - return None - - parse_only = parse_only or deprecated_argument( - "parseOnlyThese", "parse_only") - - from_encoding = from_encoding or deprecated_argument( - "fromEncoding", "from_encoding") - - if from_encoding and isinstance(markup, str): - warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") - from_encoding = None - - self.element_classes = element_classes or dict() - - # We need this information to track whether or not the builder - # was specified well enough that we can omit the 'you need to - # specify a parser' warning. - original_builder = builder - original_features = features - - if isinstance(builder, type): - # A builder class was passed in; it needs to be instantiated. - builder_class = builder - builder = None - elif builder is None: - if isinstance(features, str): - features = [features] - if features is None or len(features) == 0: - features = self.DEFAULT_BUILDER_FEATURES - builder_class = builder_registry.lookup(*features) - if builder_class is None: - raise FeatureNotFound( - "Couldn't find a tree builder with the features you " - "requested: %s. Do you need to install a parser library?" - % ",".join(features)) - - # At this point either we have a TreeBuilder instance in - # builder, or we have a builder_class that we can instantiate - # with the remaining **kwargs. - if builder is None: - builder = builder_class(**kwargs) - if not original_builder and not ( - original_features == builder.NAME or - original_features in builder.ALTERNATE_NAMES - ): - if builder.is_xml: - markup_type = "XML" - else: - markup_type = "HTML" - - # This code adapted from warnings.py so that we get the same line - # of code as our warnings.warn() call gets, even if the answer is wrong - # (as it may be in a multithreading situation). - caller = None - try: - caller = sys._getframe(1) - except ValueError: - pass - if caller: - globals = caller.f_globals - line_number = caller.f_lineno - else: - globals = sys.__dict__ - line_number= 1 - filename = globals.get('__file__') - if filename: - fnl = filename.lower() - if fnl.endswith((".pyc", ".pyo")): - filename = filename[:-1] - if filename: - # If there is no filename at all, the user is most likely in a REPL, - # and the warning is not necessary. - values = dict( - filename=filename, - line_number=line_number, - parser=builder.NAME, - markup_type=markup_type - ) - warnings.warn( - self.NO_PARSER_SPECIFIED_WARNING % values, - GuessedAtParserWarning, stacklevel=2 - ) - else: - if kwargs: - warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.") - - self.builder = builder - self.is_xml = builder.is_xml - self.known_xml = self.is_xml - self._namespaces = dict() - self.parse_only = parse_only - - self.builder.initialize_soup(self) - - if hasattr(markup, 'read'): # It's a file-type object. - markup = markup.read() - elif len(markup) <= 256 and ( - (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, str) and not '<' in markup) - ): - # Print out warnings for a couple beginner problems - # involving passing non-markup to Beautiful Soup. - # Beautiful Soup will still parse the input as markup, - # just in case that's what the user really wants. - if (isinstance(markup, str) - and not os.path.supports_unicode_filenames): - possible_filename = markup.encode("utf8") - else: - possible_filename = markup - is_file = False - try: - is_file = os.path.exists(possible_filename) - except Exception as e: - # This is almost certainly a problem involving - # characters not valid in filenames on this - # system. Just let it go. - pass - if is_file: - warnings.warn( - '"%s" looks like a filename, not markup. You should' - ' probably open this file and pass the filehandle into' - ' Beautiful Soup.' % self._decode_markup(markup), - MarkupResemblesLocatorWarning - ) - self._check_markup_is_url(markup) - - rejections = [] - success = False - for (self.markup, self.original_encoding, self.declared_html_encoding, - self.contains_replacement_characters) in ( - self.builder.prepare_markup( - markup, from_encoding, exclude_encodings=exclude_encodings)): - self.reset() - try: - self._feed() - success = True - break - except ParserRejectedMarkup as e: - rejections.append(e) - pass - - if not success: - other_exceptions = [str(e) for e in rejections] - raise ParserRejectedMarkup( - "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions) - ) - - # Clear out the markup and remove the builder's circular - # reference to this object. - self.markup = None - self.builder.soup = None - - def __copy__(self): - """Copy a BeautifulSoup object by converting the document to a string and parsing it again.""" - copy = type(self)( - self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' - ) - - # Although we encoded the tree to UTF-8, that may not have - # been the encoding of the original markup. Set the copy's - # .original_encoding to reflect the original object's - # .original_encoding. - copy.original_encoding = self.original_encoding - return copy - - def __getstate__(self): - # Frequently a tree builder can't be pickled. - d = dict(self.__dict__) - if 'builder' in d and not self.builder.picklable: - d['builder'] = None - return d - - @classmethod - def _decode_markup(cls, markup): - """Ensure `markup` is bytes so it's safe to send into warnings.warn. - - TODO: warnings.warn had this problem back in 2010 but it might not - anymore. - """ - if isinstance(markup, bytes): - decoded = markup.decode('utf-8', 'replace') - else: - decoded = markup - return decoded - - @classmethod - def _check_markup_is_url(cls, markup): - """Error-handling method to raise a warning if incoming markup looks - like a URL. - - :param markup: A string. - """ - if isinstance(markup, bytes): - space = b' ' - cant_start_with = (b"http:", b"https:") - elif isinstance(markup, str): - space = ' ' - cant_start_with = ("http:", "https:") - else: - return - - if any(markup.startswith(prefix) for prefix in cant_start_with): - if not space in markup: - warnings.warn( - '"%s" looks like a URL. Beautiful Soup is not an' - ' HTTP client. You should probably use an HTTP client like' - ' requests to get the document behind the URL, and feed' - ' that document to Beautiful Soup.' % cls._decode_markup( - markup - ), - MarkupResemblesLocatorWarning - ) - - def _feed(self): - """Internal method that parses previously set markup, creating a large - number of Tag and NavigableString objects. - """ - # Convert the document to Unicode. - self.builder.reset() - - self.builder.feed(self.markup) - # Close out any unfinished strings and close all the open tags. - self.endData() - while self.currentTag.name != self.ROOT_TAG_NAME: - self.popTag() - - def reset(self): - """Reset this object to a state as though it had never parsed any - markup. - """ - Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) - self.hidden = 1 - self.builder.reset() - self.current_data = [] - self.currentTag = None - self.tagStack = [] - self.preserve_whitespace_tag_stack = [] - self.string_container_stack = [] - self.pushTag(self) - - def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, - sourceline=None, sourcepos=None, **kwattrs): - """Create a new Tag associated with this BeautifulSoup object. - - :param name: The name of the new Tag. - :param namespace: The URI of the new Tag's XML namespace, if any. - :param prefix: The prefix for the new Tag's XML namespace, if any. - :param attrs: A dictionary of this Tag's attribute values; can - be used instead of `kwattrs` for attributes like 'class' - that are reserved words in Python. - :param sourceline: The line number where this tag was - (purportedly) found in its source document. - :param sourcepos: The character position within `sourceline` where this - tag was (purportedly) found. - :param kwattrs: Keyword arguments for the new Tag's attribute values. - - """ - kwattrs.update(attrs) - return self.element_classes.get(Tag, Tag)( - None, self.builder, name, namespace, nsprefix, kwattrs, - sourceline=sourceline, sourcepos=sourcepos - ) - - def string_container(self, base_class=None): - container = base_class or NavigableString - - # There may be a general override of NavigableString. - container = self.element_classes.get( - container, container - ) - - # On top of that, we may be inside a tag that needs a special - # container class. - if self.string_container_stack: - container = self.builder.string_containers.get( - self.string_container_stack[-1].name, container - ) - return container - - def new_string(self, s, subclass=None): - """Create a new NavigableString associated with this BeautifulSoup - object. - """ - container = self.string_container(subclass) - return container(s) - - def insert_before(self, successor): - """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement - it because there is nothing before or after it in the parse tree. - """ - raise NotImplementedError("BeautifulSoup objects don't support insert_before().") - - def insert_after(self, successor): - """This method is part of the PageElement API, but `BeautifulSoup` doesn't implement - it because there is nothing before or after it in the parse tree. - """ - raise NotImplementedError("BeautifulSoup objects don't support insert_after().") - - def popTag(self): - """Internal method called by _popToTag when a tag is closed.""" - tag = self.tagStack.pop() - if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: - self.preserve_whitespace_tag_stack.pop() - if self.string_container_stack and tag == self.string_container_stack[-1]: - self.string_container_stack.pop() - #print("Pop", tag.name) - if self.tagStack: - self.currentTag = self.tagStack[-1] - return self.currentTag - - def pushTag(self, tag): - """Internal method called by handle_starttag when a tag is opened.""" - #print("Push", tag.name) - if self.currentTag is not None: - self.currentTag.contents.append(tag) - self.tagStack.append(tag) - self.currentTag = self.tagStack[-1] - if tag.name in self.builder.preserve_whitespace_tags: - self.preserve_whitespace_tag_stack.append(tag) - if tag.name in self.builder.string_containers: - self.string_container_stack.append(tag) - - def endData(self, containerClass=None): - """Method called by the TreeBuilder when the end of a data segment - occurs. - """ - containerClass = self.string_container(containerClass) - - if self.current_data: - current_data = ''.join(self.current_data) - # If whitespace is not preserved, and this string contains - # nothing but ASCII spaces, replace it with a single space - # or newline. - if not self.preserve_whitespace_tag_stack: - strippable = True - for i in current_data: - if i not in self.ASCII_SPACES: - strippable = False - break - if strippable: - if '\n' in current_data: - current_data = '\n' - else: - current_data = ' ' - - # Reset the data collector. - self.current_data = [] - - # Should we add this string to the tree at all? - if self.parse_only and len(self.tagStack) <= 1 and \ - (not self.parse_only.text or \ - not self.parse_only.search(current_data)): - return - - o = containerClass(current_data) - self.object_was_parsed(o) - - def object_was_parsed(self, o, parent=None, most_recent_element=None): - """Method called by the TreeBuilder to integrate an object into the parse tree.""" - if parent is None: - parent = self.currentTag - if most_recent_element is not None: - previous_element = most_recent_element - else: - previous_element = self._most_recent_element - - next_element = previous_sibling = next_sibling = None - if isinstance(o, Tag): - next_element = o.next_element - next_sibling = o.next_sibling - previous_sibling = o.previous_sibling - if previous_element is None: - previous_element = o.previous_element - - fix = parent.next_element is not None - - o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) - - self._most_recent_element = o - parent.contents.append(o) - - # Check if we are inserting into an already parsed node. - if fix: - self._linkage_fixer(parent) - - def _linkage_fixer(self, el): - """Make sure linkage of this fragment is sound.""" - - first = el.contents[0] - child = el.contents[-1] - descendant = child - - if child is first and el.parent is not None: - # Parent should be linked to first child - el.next_element = child - # We are no longer linked to whatever this element is - prev_el = child.previous_element - if prev_el is not None and prev_el is not el: - prev_el.next_element = None - # First child should be linked to the parent, and no previous siblings. - child.previous_element = el - child.previous_sibling = None - - # We have no sibling as we've been appended as the last. - child.next_sibling = None - - # This index is a tag, dig deeper for a "last descendant" - if isinstance(child, Tag) and child.contents: - descendant = child._last_descendant(False) - - # As the final step, link last descendant. It should be linked - # to the parent's next sibling (if found), else walk up the chain - # and find a parent with a sibling. It should have no next sibling. - descendant.next_element = None - descendant.next_sibling = None - target = el - while True: - if target is None: - break - elif target.next_sibling is not None: - descendant.next_element = target.next_sibling - target.next_sibling.previous_element = child - break - target = target.parent - - def _popToTag(self, name, nsprefix=None, inclusivePop=True): - """Pops the tag stack up to and including the most recent - instance of the given tag. - - :param name: Pop up to the most recent tag with this name. - :param nsprefix: The namespace prefix that goes with `name`. - :param inclusivePop: It this is false, pops the tag stack up - to but *not* including the most recent instqance of the - given tag. - """ - #print("Popping to %s" % name) - if name == self.ROOT_TAG_NAME: - # The BeautifulSoup object itself can never be popped. - return - - most_recently_popped = None - - stack_size = len(self.tagStack) - for i in range(stack_size - 1, 0, -1): - t = self.tagStack[i] - if (name == t.name and nsprefix == t.prefix): - if inclusivePop: - most_recently_popped = self.popTag() - break - most_recently_popped = self.popTag() - - return most_recently_popped - - def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None, - sourcepos=None): - """Called by the tree builder when a new tag is encountered. - - :param name: Name of the tag. - :param nsprefix: Namespace prefix for the tag. - :param attrs: A dictionary of attribute values. - :param sourceline: The line number where this tag was found in its - source document. - :param sourcepos: The character position within `sourceline` where this - tag was found. - - If this method returns None, the tag was rejected by an active - SoupStrainer. You should proceed as if the tag had not occurred - in the document. For instance, if this was a self-closing tag, - don't call handle_endtag. - """ - # print("Start tag %s: %s" % (name, attrs)) - self.endData() - - if (self.parse_only and len(self.tagStack) <= 1 - and (self.parse_only.text - or not self.parse_only.search_tag(name, attrs))): - return None - - tag = self.element_classes.get(Tag, Tag)( - self, self.builder, name, namespace, nsprefix, attrs, - self.currentTag, self._most_recent_element, - sourceline=sourceline, sourcepos=sourcepos - ) - if tag is None: - return tag - if self._most_recent_element is not None: - self._most_recent_element.next_element = tag - self._most_recent_element = tag - self.pushTag(tag) - return tag - - def handle_endtag(self, name, nsprefix=None): - """Called by the tree builder when an ending tag is encountered. - - :param name: Name of the tag. - :param nsprefix: Namespace prefix for the tag. - """ - #print("End tag: " + name) - self.endData() - self._popToTag(name, nsprefix) - - def handle_data(self, data): - """Called by the tree builder when a chunk of textual data is encountered.""" - self.current_data.append(data) - - def decode(self, pretty_print=False, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Returns a string or Unicode representation of the parse tree - as an HTML or XML document. - - :param pretty_print: If this is True, indentation will be used to - make the document more readable. - :param eventual_encoding: The encoding of the final document. - If this is None, the document will be a Unicode string. - """ - if self.is_xml: - # Print the XML declaration - encoding_part = '' - if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS: - # This is a special Python encoding; it can't actually - # go into an XML document because it means nothing - # outside of Python. - eventual_encoding = None - if eventual_encoding != None: - encoding_part = ' encoding="%s"' % eventual_encoding - prefix = '\n' % encoding_part - else: - prefix = '' - if not pretty_print: - indent_level = None - else: - indent_level = 0 - return prefix + super(BeautifulSoup, self).decode( - indent_level, eventual_encoding, formatter) - -# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup' -_s = BeautifulSoup -_soup = BeautifulSoup - -class BeautifulStoneSoup(BeautifulSoup): - """Deprecated interface to an XML parser.""" - - def __init__(self, *args, **kwargs): - kwargs['features'] = 'xml' - warnings.warn( - 'The BeautifulStoneSoup class is deprecated. Instead of using ' - 'it, pass features="xml" into the BeautifulSoup constructor.') - super(BeautifulStoneSoup, self).__init__(*args, **kwargs) - - -class StopParsing(Exception): - """Exception raised by a TreeBuilder if it's unable to continue parsing.""" - pass - -class FeatureNotFound(ValueError): - """Exception raised by the BeautifulSoup constructor if no parser with the - requested features is found. - """ - pass - - -#If this file is run as a script, act as an HTML pretty-printer. -if __name__ == '__main__': - import sys - soup = BeautifulSoup(sys.stdin) - print((soup.prettify())) diff --git a/infra/api/package/bs4/builder/__init__.py b/infra/api/package/bs4/builder/__init__.py deleted file mode 100644 index 86759db..0000000 --- a/infra/api/package/bs4/builder/__init__.py +++ /dev/null @@ -1,520 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -from collections import defaultdict -import itertools -import sys -from bs4.element import ( - CharsetMetaAttributeValue, - ContentMetaAttributeValue, - Stylesheet, - Script, - TemplateString, - nonwhitespace_re -) - -__all__ = [ - 'HTMLTreeBuilder', - 'SAXTreeBuilder', - 'TreeBuilder', - 'TreeBuilderRegistry', - ] - -# Some useful features for a TreeBuilder to have. -FAST = 'fast' -PERMISSIVE = 'permissive' -STRICT = 'strict' -XML = 'xml' -HTML = 'html' -HTML_5 = 'html5' - - -class TreeBuilderRegistry(object): - """A way of looking up TreeBuilder subclasses by their name or by desired - features. - """ - - def __init__(self): - self.builders_for_feature = defaultdict(list) - self.builders = [] - - def register(self, treebuilder_class): - """Register a treebuilder based on its advertised features. - - :param treebuilder_class: A subclass of Treebuilder. its .features - attribute should list its features. - """ - for feature in treebuilder_class.features: - self.builders_for_feature[feature].insert(0, treebuilder_class) - self.builders.insert(0, treebuilder_class) - - def lookup(self, *features): - """Look up a TreeBuilder subclass with the desired features. - - :param features: A list of features to look for. If none are - provided, the most recently registered TreeBuilder subclass - will be used. - :return: A TreeBuilder subclass, or None if there's no - registered subclass with all the requested features. - """ - if len(self.builders) == 0: - # There are no builders at all. - return None - - if len(features) == 0: - # They didn't ask for any features. Give them the most - # recently registered builder. - return self.builders[0] - - # Go down the list of features in order, and eliminate any builders - # that don't match every feature. - features = list(features) - features.reverse() - candidates = None - candidate_set = None - while len(features) > 0: - feature = features.pop() - we_have_the_feature = self.builders_for_feature.get(feature, []) - if len(we_have_the_feature) > 0: - if candidates is None: - candidates = we_have_the_feature - candidate_set = set(candidates) - else: - # Eliminate any candidates that don't have this feature. - candidate_set = candidate_set.intersection( - set(we_have_the_feature)) - - # The only valid candidates are the ones in candidate_set. - # Go through the original list of candidates and pick the first one - # that's in candidate_set. - if candidate_set is None: - return None - for candidate in candidates: - if candidate in candidate_set: - return candidate - return None - -# The BeautifulSoup class will take feature lists from developers and use them -# to look up builders in this registry. -builder_registry = TreeBuilderRegistry() - -class TreeBuilder(object): - """Turn a textual document into a Beautiful Soup object tree.""" - - NAME = "[Unknown tree builder]" - ALTERNATE_NAMES = [] - features = [] - - is_xml = False - picklable = False - empty_element_tags = None # A tag will be considered an empty-element - # tag when and only when it has no contents. - - # A value for these tag/attribute combinations is a space- or - # comma-separated list of CDATA, rather than a single CDATA. - DEFAULT_CDATA_LIST_ATTRIBUTES = {} - - # Whitespace should be preserved inside these tags. - DEFAULT_PRESERVE_WHITESPACE_TAGS = set() - - # The textual contents of tags with these names should be - # instantiated with some class other than NavigableString. - DEFAULT_STRING_CONTAINERS = {} - - USE_DEFAULT = object() - - # Most parsers don't keep track of line numbers. - TRACKS_LINE_NUMBERS = False - - def __init__(self, multi_valued_attributes=USE_DEFAULT, - preserve_whitespace_tags=USE_DEFAULT, - store_line_numbers=USE_DEFAULT, - string_containers=USE_DEFAULT, - ): - """Constructor. - - :param multi_valued_attributes: If this is set to None, the - TreeBuilder will not turn any values for attributes like - 'class' into lists. Setting this to a dictionary will - customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES - for an example. - - Internally, these are called "CDATA list attributes", but that - probably doesn't make sense to an end-user, so the argument name - is `multi_valued_attributes`. - - :param preserve_whitespace_tags: A list of tags to treat - the way
 tags are treated in HTML. Tags in this list
-         are immune from pretty-printing; their contents will always be
-         output as-is.
-
-        :param string_containers: A dictionary mapping tag names to
-        the classes that should be instantiated to contain the textual
-        contents of those tags. The default is to use NavigableString
-        for every tag, no matter what the name. You can override the
-        default by changing DEFAULT_STRING_CONTAINERS.
-
-        :param store_line_numbers: If the parser keeps track of the
-         line numbers and positions of the original markup, that
-         information will, by default, be stored in each corresponding
-         `Tag` object. You can turn this off by passing
-         store_line_numbers=False. If the parser you're using doesn't 
-         keep track of this information, then setting store_line_numbers=True
-         will do nothing.
-        """
-        self.soup = None
-        if multi_valued_attributes is self.USE_DEFAULT:
-            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
-        self.cdata_list_attributes = multi_valued_attributes
-        if preserve_whitespace_tags is self.USE_DEFAULT:
-            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
-        self.preserve_whitespace_tags = preserve_whitespace_tags
-        if store_line_numbers == self.USE_DEFAULT:
-            store_line_numbers = self.TRACKS_LINE_NUMBERS
-        self.store_line_numbers = store_line_numbers 
-        if string_containers == self.USE_DEFAULT:
-            string_containers = self.DEFAULT_STRING_CONTAINERS
-        self.string_containers = string_containers
-        
-    def initialize_soup(self, soup):
-        """The BeautifulSoup object has been initialized and is now
-        being associated with the TreeBuilder.
-
-        :param soup: A BeautifulSoup object.
-        """
-        self.soup = soup
-        
-    def reset(self):
-        """Do any work necessary to reset the underlying parser
-        for a new document.
-
-        By default, this does nothing.
-        """
-        pass
-
-    def can_be_empty_element(self, tag_name):
-        """Might a tag with this name be an empty-element tag?
-
-        The final markup may or may not actually present this tag as
-        self-closing.
-
-        For instance: an HTMLBuilder does not consider a 

tag to be - an empty-element tag (it's not in - HTMLBuilder.empty_element_tags). This means an empty

tag - will be presented as "

", not "

" or "

". - - The default implementation has no opinion about which tags are - empty-element tags, so a tag will be presented as an - empty-element tag if and only if it has no children. - "" will become "", and "bar" will - be left alone. - - :param tag_name: The name of a markup tag. - """ - if self.empty_element_tags is None: - return True - return tag_name in self.empty_element_tags - - def feed(self, markup): - """Run some incoming markup through some parsing process, - populating the `BeautifulSoup` object in self.soup. - - This method is not implemented in TreeBuilder; it must be - implemented in subclasses. - - :return: None. - """ - raise NotImplementedError() - - def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None, exclude_encodings=None): - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - :param markup: Some markup -- probably a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - - By default, the only strategy is to parse the markup - as-is. See `LXMLTreeBuilderForXML` and - `HTMLParserTreeBuilder` for implementations that take into - account the quirks of particular parsers. - """ - yield markup, None, None, False - - def test_fragment_to_document(self, fragment): - """Wrap an HTML fragment to make it look like a document. - - Different parsers do this differently. For instance, lxml - introduces an empty tag, and html5lib - doesn't. Abstracting this away lets us write simple tests - which run HTML fragments through the parser and compare the - results against other HTML fragments. - - This method should not be used outside of tests. - - :param fragment: A string -- fragment of HTML. - :return: A string -- a full HTML document. - """ - return fragment - - def set_up_substitutions(self, tag): - """Set up any substitutions that will need to be performed on - a `Tag` when it's output as a string. - - By default, this does nothing. See `HTMLTreeBuilder` for a - case where this is used. - - :param tag: A `Tag` - :return: Whether or not a substitution was performed. - """ - return False - - def _replace_cdata_list_attribute_values(self, tag_name, attrs): - """When an attribute value is associated with a tag that can - have multiple values for that attribute, convert the string - value to a list of strings. - - Basically, replaces class="foo bar" with class=["foo", "bar"] - - NOTE: This method modifies its input in place. - - :param tag_name: The name of a tag. - :param attrs: A dictionary containing the tag's attributes. - Any appropriate attribute values will be modified in place. - """ - if not attrs: - return attrs - if self.cdata_list_attributes: - universal = self.cdata_list_attributes.get('*', []) - tag_specific = self.cdata_list_attributes.get( - tag_name.lower(), None) - for attr in list(attrs.keys()): - if attr in universal or (tag_specific and attr in tag_specific): - # We have a "class"-type attribute whose string - # value is a whitespace-separated list of - # values. Split it into a list. - value = attrs[attr] - if isinstance(value, str): - values = nonwhitespace_re.findall(value) - else: - # html5lib sometimes calls setAttributes twice - # for the same tag when rearranging the parse - # tree. On the second call the attribute value - # here is already a list. If this happens, - # leave the value alone rather than trying to - # split it again. - values = value - attrs[attr] = values - return attrs - -class SAXTreeBuilder(TreeBuilder): - """A Beautiful Soup treebuilder that listens for SAX events. - - This is not currently used for anything, but it demonstrates - how a simple TreeBuilder would work. - """ - - def feed(self, markup): - raise NotImplementedError() - - def close(self): - pass - - def startElement(self, name, attrs): - attrs = dict((key[1], value) for key, value in list(attrs.items())) - #print("Start %s, %r" % (name, attrs)) - self.soup.handle_starttag(name, attrs) - - def endElement(self, name): - #print("End %s" % name) - self.soup.handle_endtag(name) - - def startElementNS(self, nsTuple, nodeName, attrs): - # Throw away (ns, nodeName) for now. - self.startElement(nodeName, attrs) - - def endElementNS(self, nsTuple, nodeName): - # Throw away (ns, nodeName) for now. - self.endElement(nodeName) - #handler.endElementNS((ns, node.nodeName), node.nodeName) - - def startPrefixMapping(self, prefix, nodeValue): - # Ignore the prefix for now. - pass - - def endPrefixMapping(self, prefix): - # Ignore the prefix for now. - # handler.endPrefixMapping(prefix) - pass - - def characters(self, content): - self.soup.handle_data(content) - - def startDocument(self): - pass - - def endDocument(self): - pass - - -class HTMLTreeBuilder(TreeBuilder): - """This TreeBuilder knows facts about HTML. - - Such as which tags are empty-element tags. - """ - - empty_element_tags = set([ - # These are from HTML5. - 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - - # These are from earlier versions of HTML and are removed in HTML5. - 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer' - ]) - - # The HTML standard defines these as block-level elements. Beautiful - # Soup does not treat these elements differently from other elements, - # but it may do so eventually, and this information is available if - # you need to use it. - block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"]) - - # The HTML standard defines an unusual content model for these tags. - # We represent this by using a string class other than NavigableString - # inside these tags. - # - # I made this list by going through the HTML spec - # (https://html.spec.whatwg.org/#metadata-content) and looking for - # "metadata content" elements that can contain strings. - # - # TODO: Arguably

as a -# string. -# -# XXX This code can be removed once most Python 3 users are on 3.2.3. -if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT: - import re - attrfind_tolerant = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') - HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant - - locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) - BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend - - from html.parser import tagfind, attrfind - - def parse_starttag(self, i): - self.__starttag_text = None - endpos = self.check_for_whole_start_tag(i) - if endpos < 0: - return endpos - rawdata = self.rawdata - self.__starttag_text = rawdata[i:endpos] - - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] - match = tagfind.match(rawdata, i+1) - assert match, 'unexpected call to parse_starttag()' - k = match.end() - self.lasttag = tag = rawdata[i+1:k].lower() - while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) - if not m: - break - attrname, rest, attrvalue = m.group(1, 2, 3) - if not rest: - attrvalue = None - elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ - attrvalue[:1] == '"' == attrvalue[-1:]: - attrvalue = attrvalue[1:-1] - if attrvalue: - attrvalue = self.unescape(attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = m.end() - - end = rawdata[k:endpos].strip() - if end not in (">", "/>"): - lineno, offset = self.getpos() - if "\n" in self.__starttag_text: - lineno = lineno + self.__starttag_text.count("\n") - offset = len(self.__starttag_text) \ - - self.__starttag_text.rfind("\n") - else: - offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos - if end.endswith('/>'): - # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) - else: - self.handle_starttag(tag, attrs) - if tag in self.CDATA_CONTENT_ELEMENTS: - self.set_cdata_mode(tag) - return endpos - - def set_cdata_mode(self, elem): - self.cdata_elem = elem.lower() - self.interesting = re.compile(r'' % self.cdata_elem, re.I) - - BeautifulSoupHTMLParser.parse_starttag = parse_starttag - BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode - - CONSTRUCTOR_TAKES_STRICT = True diff --git a/infra/api/package/bs4/builder/_lxml.py b/infra/api/package/bs4/builder/_lxml.py deleted file mode 100644 index 432a2c8..0000000 --- a/infra/api/package/bs4/builder/_lxml.py +++ /dev/null @@ -1,332 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -__all__ = [ - 'LXMLTreeBuilderForXML', - 'LXMLTreeBuilder', - ] - -try: - from collections.abc import Callable # Python 3.6 -except ImportError as e: - from collections import Callable - -from io import BytesIO -from io import StringIO -from lxml import etree -from bs4.element import ( - Comment, - Doctype, - NamespacedAttribute, - ProcessingInstruction, - XMLProcessingInstruction, -) -from bs4.builder import ( - FAST, - HTML, - HTMLTreeBuilder, - PERMISSIVE, - ParserRejectedMarkup, - TreeBuilder, - XML) -from bs4.dammit import EncodingDetector - -LXML = 'lxml' - -def _invert(d): - "Invert a dictionary." - return dict((v,k) for k, v in list(d.items())) - -class LXMLTreeBuilderForXML(TreeBuilder): - DEFAULT_PARSER_CLASS = etree.XMLParser - - is_xml = True - processing_instruction_class = XMLProcessingInstruction - - NAME = "lxml-xml" - ALTERNATE_NAMES = ["xml"] - - # Well, it's permissive by XML parser standards. - features = [NAME, LXML, XML, FAST, PERMISSIVE] - - CHUNK_SIZE = 512 - - # This namespace mapping is specified in the XML Namespace - # standard. - DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') - - DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) - - # NOTE: If we parsed Element objects and looked at .sourceline, - # we'd be able to see the line numbers from the original document. - # But instead we build an XMLParser or HTMLParser object to serve - # as the target of parse messages, and those messages don't include - # line numbers. - # See: https://bugs.launchpad.net/lxml/+bug/1846906 - - def initialize_soup(self, soup): - """Let the BeautifulSoup object know about the standard namespace - mapping. - - :param soup: A `BeautifulSoup`. - """ - super(LXMLTreeBuilderForXML, self).initialize_soup(soup) - self._register_namespaces(self.DEFAULT_NSMAPS) - - def _register_namespaces(self, mapping): - """Let the BeautifulSoup object know about namespaces encountered - while parsing the document. - - This might be useful later on when creating CSS selectors. - - :param mapping: A dictionary mapping namespace prefixes to URIs. - """ - for key, value in list(mapping.items()): - if key and key not in self.soup._namespaces: - # Let the BeautifulSoup object know about a new namespace. - # If there are multiple namespaces defined with the same - # prefix, the first one in the document takes precedence. - self.soup._namespaces[key] = value - - def default_parser(self, encoding): - """Find the default parser for the given encoding. - - :param encoding: A string. - :return: Either a parser object or a class, which - will be instantiated with default arguments. - """ - if self._default_parser is not None: - return self._default_parser - return etree.XMLParser( - target=self, strip_cdata=False, recover=True, encoding=encoding) - - def parser_for(self, encoding): - """Instantiate an appropriate parser for the given encoding. - - :param encoding: A string. - :return: A parser object such as an `etree.XMLParser`. - """ - # Use the default parser. - parser = self.default_parser(encoding) - - if isinstance(parser, Callable): - # Instantiate the parser with default arguments - parser = parser( - target=self, strip_cdata=False, recover=True, encoding=encoding - ) - return parser - - def __init__(self, parser=None, empty_element_tags=None, **kwargs): - # TODO: Issue a warning if parser is present but not a - # callable, since that means there's no way to create new - # parsers for different encodings. - self._default_parser = parser - if empty_element_tags is not None: - self.empty_element_tags = set(empty_element_tags) - self.soup = None - self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] - super(LXMLTreeBuilderForXML, self).__init__(**kwargs) - - def _getNsTag(self, tag): - # Split the namespace URL out of a fully-qualified lxml tag - # name. Copied from lxml's src/lxml/sax.py. - if tag[0] == '{': - return tuple(tag[1:].split('}', 1)) - else: - return (None, tag) - - def prepare_markup(self, markup, user_specified_encoding=None, - exclude_encodings=None, - document_declared_encoding=None): - """Run any preliminary steps necessary to make incoming markup - acceptable to the parser. - - lxml really wants to get a bytestring and convert it to - Unicode itself. So instead of using UnicodeDammit to convert - the bytestring to Unicode using different encodings, this - implementation uses EncodingDetector to iterate over the - encodings, and tell lxml to try to parse the document as each - one in turn. - - :param markup: Some markup -- hopefully a bytestring. - :param user_specified_encoding: The user asked to try this encoding. - :param document_declared_encoding: The markup itself claims to be - in this encoding. - :param exclude_encodings: The user asked _not_ to try any of - these encodings. - - :yield: A series of 4-tuples: - (markup, encoding, declared encoding, - has undergone character replacement) - - Each 4-tuple represents a strategy for converting the - document to Unicode and parsing it. Each strategy will be tried - in turn. - """ - is_html = not self.is_xml - if is_html: - self.processing_instruction_class = ProcessingInstruction - else: - self.processing_instruction_class = XMLProcessingInstruction - - if isinstance(markup, str): - # We were given Unicode. Maybe lxml can parse Unicode on - # this system? - yield markup, None, document_declared_encoding, False - - if isinstance(markup, str): - # No, apparently not. Convert the Unicode to UTF-8 and - # tell lxml to parse it as UTF-8. - yield (markup.encode("utf8"), "utf8", - document_declared_encoding, False) - - try_encodings = [user_specified_encoding, document_declared_encoding] - detector = EncodingDetector( - markup, try_encodings, is_html, exclude_encodings) - for encoding in detector.encodings: - yield (detector.markup, encoding, document_declared_encoding, False) - - def feed(self, markup): - if isinstance(markup, bytes): - markup = BytesIO(markup) - elif isinstance(markup, str): - markup = StringIO(markup) - - # Call feed() at least once, even if the markup is empty, - # or the parser won't be initialized. - data = markup.read(self.CHUNK_SIZE) - try: - self.parser = self.parser_for(self.soup.original_encoding) - self.parser.feed(data) - while len(data) != 0: - # Now call feed() on the rest of the data, chunk by chunk. - data = markup.read(self.CHUNK_SIZE) - if len(data) != 0: - self.parser.feed(data) - self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError) as e: - raise ParserRejectedMarkup(e) - - def close(self): - self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] - - def start(self, name, attrs, nsmap={}): - # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. - attrs = dict(attrs) - nsprefix = None - # Invert each namespace map as it comes in. - if len(nsmap) == 0 and len(self.nsmaps) > 1: - # There are no new namespaces for this tag, but - # non-default namespaces are in play, so we need a - # separate tag stack to know when they end. - self.nsmaps.append(None) - elif len(nsmap) > 0: - # A new namespace mapping has come into play. - - # First, Let the BeautifulSoup object know about it. - self._register_namespaces(nsmap) - - # Then, add it to our running list of inverted namespace - # mappings. - self.nsmaps.append(_invert(nsmap)) - - # Also treat the namespace mapping as a set of attributes on the - # tag, so we can recreate it later. - attrs = attrs.copy() - for prefix, namespace in list(nsmap.items()): - attribute = NamespacedAttribute( - "xmlns", prefix, "http://www.w3.org/2000/xmlns/") - attrs[attribute] = namespace - - # Namespaces are in play. Find any attributes that came in - # from lxml with namespaces attached to their names, and - # turn then into NamespacedAttribute objects. - new_attrs = {} - for attr, value in list(attrs.items()): - namespace, attr = self._getNsTag(attr) - if namespace is None: - new_attrs[attr] = value - else: - nsprefix = self._prefix_for_namespace(namespace) - attr = NamespacedAttribute(nsprefix, attr, namespace) - new_attrs[attr] = value - attrs = new_attrs - - namespace, name = self._getNsTag(name) - nsprefix = self._prefix_for_namespace(namespace) - self.soup.handle_starttag(name, namespace, nsprefix, attrs) - - def _prefix_for_namespace(self, namespace): - """Find the currently active prefix for the given namespace.""" - if namespace is None: - return None - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - return inverted_nsmap[namespace] - return None - - def end(self, name): - self.soup.endData() - completed_tag = self.soup.tagStack[-1] - namespace, name = self._getNsTag(name) - nsprefix = None - if namespace is not None: - for inverted_nsmap in reversed(self.nsmaps): - if inverted_nsmap is not None and namespace in inverted_nsmap: - nsprefix = inverted_nsmap[namespace] - break - self.soup.handle_endtag(name, nsprefix) - if len(self.nsmaps) > 1: - # This tag, or one of its parents, introduced a namespace - # mapping, so pop it off the stack. - self.nsmaps.pop() - - def pi(self, target, data): - self.soup.endData() - self.soup.handle_data(target + ' ' + data) - self.soup.endData(self.processing_instruction_class) - - def data(self, content): - self.soup.handle_data(content) - - def doctype(self, name, pubid, system): - self.soup.endData() - doctype = Doctype.for_name_and_ids(name, pubid, system) - self.soup.object_was_parsed(doctype) - - def comment(self, content): - "Handle comments as Comment objects." - self.soup.endData() - self.soup.handle_data(content) - self.soup.endData(Comment) - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return '\n%s' % fragment - - -class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): - - NAME = LXML - ALTERNATE_NAMES = ["lxml-html"] - - features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] - is_xml = False - processing_instruction_class = ProcessingInstruction - - def default_parser(self, encoding): - return etree.HTMLParser - - def feed(self, markup): - encoding = self.soup.original_encoding - try: - self.parser = self.parser_for(encoding) - self.parser.feed(markup) - self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError) as e: - raise ParserRejectedMarkup(e) - - - def test_fragment_to_document(self, fragment): - """See `TreeBuilder`.""" - return '%s' % fragment diff --git a/infra/api/package/bs4/dammit.py b/infra/api/package/bs4/dammit.py deleted file mode 100644 index ee3708f..0000000 --- a/infra/api/package/bs4/dammit.py +++ /dev/null @@ -1,939 +0,0 @@ -# -*- coding: utf-8 -*- -"""Beautiful Soup bonus library: Unicode, Dammit - -This library converts a bytestream to Unicode through any means -necessary. It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It works best on XML and HTML, but it does not rewrite the -XML or HTML to reflect a new encoding; that's the tree builder's job. -""" -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -import codecs -from html.entities import codepoint2name -import re -import logging -import string - -# Import a library to autodetect character encodings. -chardet_type = None -try: - # First try the fast C implementation. - # PyPI package: cchardet - import cchardet - def chardet_dammit(s): - if isinstance(s, str): - return None - return cchardet.detect(s)['encoding'] -except ImportError: - try: - # Fall back to the pure Python implementation - # Debian package: python-chardet - # PyPI package: chardet - import chardet - def chardet_dammit(s): - if isinstance(s, str): - return None - return chardet.detect(s)['encoding'] - #import chardet.constants - #chardet.constants._debug = 1 - except ImportError: - # No chardet available. - def chardet_dammit(s): - return None - -# Available from http://cjkpython.i18n.org/. -# -# TODO: This doesn't work anymore and the closest thing, iconv_codecs, -# is GPL-licensed. Check whether this is still necessary. -try: - import iconv_codec -except ImportError: - pass - -# Build bytestring and Unicode versions of regular expressions for finding -# a declared encoding inside an XML or HTML document. -xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>' -html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]' -encoding_res = dict() -encoding_res[bytes] = { - 'html' : re.compile(html_meta.encode("ascii"), re.I), - 'xml' : re.compile(xml_encoding.encode("ascii"), re.I), -} -encoding_res[str] = { - 'html' : re.compile(html_meta, re.I), - 'xml' : re.compile(xml_encoding, re.I) -} - -class EntitySubstitution(object): - """The ability to substitute XML or HTML entities for certain characters.""" - - def _populate_class_variables(): - lookup = {} - reverse_lookup = {} - characters_for_re = [] - - # &apos is an XHTML entity and an HTML 5, but not an HTML 4 - # entity. We don't want to use it, but we want to recognize it on the way in. - # - # TODO: Ideally we would be able to recognize all HTML 5 named - # entities, but that's a little tricky. - extra = [(39, 'apos')] - for codepoint, name in list(codepoint2name.items()) + extra: - character = chr(codepoint) - if codepoint not in (34, 39): - # There's no point in turning the quotation mark into - # " or the single quote into ', unless it - # happens within an attribute value, which is handled - # elsewhere. - characters_for_re.append(character) - lookup[character] = name - # But we do want to recognize those entities on the way in and - # convert them to Unicode characters. - reverse_lookup[name] = character - re_definition = "[%s]" % "".join(characters_for_re) - return lookup, reverse_lookup, re.compile(re_definition) - (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER, - CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables() - - CHARACTER_TO_XML_ENTITY = { - "'": "apos", - '"': "quot", - "&": "amp", - "<": "lt", - ">": "gt", - } - - BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|" - "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)" - ")") - - AMPERSAND_OR_BRACKET = re.compile("([<>&])") - - @classmethod - def _substitute_html_entity(cls, matchobj): - """Used with a regular expression to substitute the - appropriate HTML entity for a special character.""" - entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) - return "&%s;" % entity - - @classmethod - def _substitute_xml_entity(cls, matchobj): - """Used with a regular expression to substitute the - appropriate XML entity for a special character.""" - entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)] - return "&%s;" % entity - - @classmethod - def quoted_attribute_value(self, value): - """Make a value into a quoted XML attribute, possibly escaping it. - - Most strings will be quoted using double quotes. - - Bob's Bar -> "Bob's Bar" - - If a string contains double quotes, it will be quoted using - single quotes. - - Welcome to "my bar" -> 'Welcome to "my bar"' - - If a string contains both single and double quotes, the - double quotes will be escaped, and the string will be quoted - using double quotes. - - Welcome to "Bob's Bar" -> "Welcome to "Bob's bar" - """ - quote_with = '"' - if '"' in value: - if "'" in value: - # The string contains both single and double - # quotes. Turn the double quotes into - # entities. We quote the double quotes rather than - # the single quotes because the entity name is - # """ whether this is HTML or XML. If we - # quoted the single quotes, we'd have to decide - # between ' and &squot;. - replace_with = """ - value = value.replace('"', replace_with) - else: - # There are double quotes but no single quotes. - # We can use single quotes to quote the attribute. - quote_with = "'" - return quote_with + value + quote_with - - @classmethod - def substitute_xml(cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. - - :param value: A string to be substituted. The less-than sign - will become <, the greater-than sign will become >, - and any ampersands will become &. If you want ampersands - that appear to be part of an entity definition to be left - alone, use substitute_xml_containing_entities() instead. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - """ - # Escape angle brackets and ampersands. - value = cls.AMPERSAND_OR_BRACKET.sub( - cls._substitute_xml_entity, value) - - if make_quoted_attribute: - value = cls.quoted_attribute_value(value) - return value - - @classmethod - def substitute_xml_containing_entities( - cls, value, make_quoted_attribute=False): - """Substitute XML entities for special XML characters. - - :param value: A string to be substituted. The less-than sign will - become <, the greater-than sign will become >, and any - ampersands that are not part of an entity defition will - become &. - - :param make_quoted_attribute: If True, then the string will be - quoted, as befits an attribute value. - """ - # Escape angle brackets, and ampersands that aren't part of - # entities. - value = cls.BARE_AMPERSAND_OR_BRACKET.sub( - cls._substitute_xml_entity, value) - - if make_quoted_attribute: - value = cls.quoted_attribute_value(value) - return value - - @classmethod - def substitute_html(cls, s): - """Replace certain Unicode characters with named HTML entities. - - This differs from data.encode(encoding, 'xmlcharrefreplace') - in that the goal is to make the result more readable (to those - with ASCII displays) rather than to recover from - errors. There's absolutely nothing wrong with a UTF-8 string - containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that - character with "é" will make it more readable to some - people. - - :param s: A Unicode string. - """ - return cls.CHARACTER_TO_HTML_ENTITY_RE.sub( - cls._substitute_html_entity, s) - - -class EncodingDetector: - """Suggests a number of possible encodings for a bytestring. - - Order of precedence: - - 1. Encodings you specifically tell EncodingDetector to try first - (the override_encodings argument to the constructor). - - 2. An encoding declared within the bytestring itself, either in an - XML declaration (if the bytestring is to be interpreted as an XML - document), or in a tag (if the bytestring is to be - interpreted as an HTML document.) - - 3. An encoding detected through textual analysis by chardet, - cchardet, or a similar external library. - - 4. UTF-8. - - 5. Windows-1252. - """ - def __init__(self, markup, override_encodings=None, is_html=False, - exclude_encodings=None): - """Constructor. - - :param markup: Some markup in an unknown encoding. - :param override_encodings: These encodings will be tried first. - :param is_html: If True, this markup is considered to be HTML. Otherwise - it's assumed to be XML. - :param exclude_encodings: These encodings will not be tried, even - if they otherwise would be. - """ - self.override_encodings = override_encodings or [] - exclude_encodings = exclude_encodings or [] - self.exclude_encodings = set([x.lower() for x in exclude_encodings]) - self.chardet_encoding = None - self.is_html = is_html - self.declared_encoding = None - - # First order of business: strip a byte-order mark. - self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) - - def _usable(self, encoding, tried): - """Should we even bother to try this encoding? - - :param encoding: Name of an encoding. - :param tried: Encodings that have already been tried. This will be modified - as a side effect. - """ - if encoding is not None: - encoding = encoding.lower() - if encoding in self.exclude_encodings: - return False - if encoding not in tried: - tried.add(encoding) - return True - return False - - @property - def encodings(self): - """Yield a number of encodings that might work for this markup. - - :yield: A sequence of strings. - """ - tried = set() - for e in self.override_encodings: - if self._usable(e, tried): - yield e - - # Did the document originally start with a byte-order mark - # that indicated its encoding? - if self._usable(self.sniffed_encoding, tried): - yield self.sniffed_encoding - - # Look within the document for an XML or HTML encoding - # declaration. - if self.declared_encoding is None: - self.declared_encoding = self.find_declared_encoding( - self.markup, self.is_html) - if self._usable(self.declared_encoding, tried): - yield self.declared_encoding - - # Use third-party character set detection to guess at the - # encoding. - if self.chardet_encoding is None: - self.chardet_encoding = chardet_dammit(self.markup) - if self._usable(self.chardet_encoding, tried): - yield self.chardet_encoding - - # As a last-ditch effort, try utf-8 and windows-1252. - for e in ('utf-8', 'windows-1252'): - if self._usable(e, tried): - yield e - - @classmethod - def strip_byte_order_mark(cls, data): - """If a byte-order mark is present, strip it and return the encoding it implies. - - :param data: Some markup. - :return: A 2-tuple (modified data, implied encoding) - """ - encoding = None - if isinstance(data, str): - # Unicode data cannot have a byte-order mark. - return data, encoding - if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16be' - data = data[2:] - elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ - and (data[2:4] != '\x00\x00'): - encoding = 'utf-16le' - data = data[2:] - elif data[:3] == b'\xef\xbb\xbf': - encoding = 'utf-8' - data = data[3:] - elif data[:4] == b'\x00\x00\xfe\xff': - encoding = 'utf-32be' - data = data[4:] - elif data[:4] == b'\xff\xfe\x00\x00': - encoding = 'utf-32le' - data = data[4:] - return data, encoding - - @classmethod - def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): - """Given a document, tries to find its declared encoding. - - An XML encoding is declared at the beginning of the document. - - An HTML encoding is declared in a tag, hopefully near the - beginning of the document. - - :param markup: Some markup. - :param is_html: If True, this markup is considered to be HTML. Otherwise - it's assumed to be XML. - :param search_entire_document: Since an encoding is supposed to declared near the beginning - of the document, most of the time it's only necessary to search a few kilobytes of data. - Set this to True to force this method to search the entire document. - """ - if search_entire_document: - xml_endpos = html_endpos = len(markup) - else: - xml_endpos = 1024 - html_endpos = max(2048, int(len(markup) * 0.05)) - - if isinstance(markup, bytes): - res = encoding_res[bytes] - else: - res = encoding_res[str] - - xml_re = res['xml'] - html_re = res['html'] - declared_encoding = None - declared_encoding_match = xml_re.search(markup, endpos=xml_endpos) - if not declared_encoding_match and is_html: - declared_encoding_match = html_re.search(markup, endpos=html_endpos) - if declared_encoding_match is not None: - declared_encoding = declared_encoding_match.groups()[0] - if declared_encoding: - if isinstance(declared_encoding, bytes): - declared_encoding = declared_encoding.decode('ascii', 'replace') - return declared_encoding.lower() - return None - -class UnicodeDammit: - """A class for detecting the encoding of a *ML document and - converting it to a Unicode string. If the source encoding is - windows-1252, can replace MS smart quotes with their HTML or XML - equivalents.""" - - # This dictionary maps commonly seen values for "charset" in HTML - # meta tags to the corresponding Python codec names. It only covers - # values that aren't in Python's aliases and can't be determined - # by the heuristics in find_codec. - CHARSET_ALIASES = {"macintosh": "mac-roman", - "x-sjis": "shift-jis"} - - ENCODINGS_WITH_SMART_QUOTES = [ - "windows-1252", - "iso-8859-1", - "iso-8859-2", - ] - - def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False, exclude_encodings=[]): - """Constructor. - - :param markup: A bytestring representing markup in an unknown encoding. - :param override_encodings: These encodings will be tried first, - before any sniffing code is run. - - :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted - to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead. - Setting it to 'xml' will convert them to XML entity references, and setting it to 'html' - will convert them to HTML entity references. - :param is_html: If True, this markup is considered to be HTML. Otherwise - it's assumed to be XML. - :param exclude_encodings: These encodings will not be considered, even - if the sniffing code thinks they might make sense. - """ - self.smart_quotes_to = smart_quotes_to - self.tried_encodings = [] - self.contains_replacement_characters = False - self.is_html = is_html - self.log = logging.getLogger(__name__) - self.detector = EncodingDetector( - markup, override_encodings, is_html, exclude_encodings) - - # Short-circuit if the data is in Unicode to begin with. - if isinstance(markup, str) or markup == '': - self.markup = markup - self.unicode_markup = str(markup) - self.original_encoding = None - return - - # The encoding detector may have stripped a byte-order mark. - # Use the stripped markup from this point on. - self.markup = self.detector.markup - - u = None - for encoding in self.detector.encodings: - markup = self.detector.markup - u = self._convert_from(encoding) - if u is not None: - break - - if not u: - # None of the encodings worked. As an absolute last resort, - # try them again with character replacement. - - for encoding in self.detector.encodings: - if encoding != "ascii": - u = self._convert_from(encoding, "replace") - if u is not None: - self.log.warning( - "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER." - ) - self.contains_replacement_characters = True - break - - # If none of that worked, we could at this point force it to - # ASCII, but that would destroy so much data that I think - # giving up is better. - self.unicode_markup = u - if not u: - self.original_encoding = None - - def _sub_ms_char(self, match): - """Changes a MS smart quote character to an XML or HTML - entity, or an ASCII character.""" - orig = match.group(1) - if self.smart_quotes_to == 'ascii': - sub = self.MS_CHARS_TO_ASCII.get(orig).encode() - else: - sub = self.MS_CHARS.get(orig) - if type(sub) == tuple: - if self.smart_quotes_to == 'xml': - sub = '&#x'.encode() + sub[1].encode() + ';'.encode() - else: - sub = '&'.encode() + sub[0].encode() + ';'.encode() - else: - sub = sub.encode() - return sub - - def _convert_from(self, proposed, errors="strict"): - """Attempt to convert the markup to the proposed encoding. - - :param proposed: The name of a character encoding. - """ - proposed = self.find_codec(proposed) - if not proposed or (proposed, errors) in self.tried_encodings: - return None - self.tried_encodings.append((proposed, errors)) - markup = self.markup - # Convert smart quotes to HTML if coming from an encoding - # that might have them. - if (self.smart_quotes_to is not None - and proposed in self.ENCODINGS_WITH_SMART_QUOTES): - smart_quotes_re = b"([\x80-\x9f])" - smart_quotes_compiled = re.compile(smart_quotes_re) - markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) - - try: - #print("Trying to convert document to %s (errors=%s)" % ( - # proposed, errors)) - u = self._to_unicode(markup, proposed, errors) - self.markup = u - self.original_encoding = proposed - except Exception as e: - #print("That didn't work!") - #print(e) - return None - #print("Correct encoding: %s" % proposed) - return self.markup - - def _to_unicode(self, data, encoding, errors="strict"): - """Given a string and its encoding, decodes the string into Unicode. - - :param encoding: The name of an encoding. - """ - return str(data, encoding, errors) - - @property - def declared_html_encoding(self): - """If the markup is an HTML document, returns the encoding declared _within_ - the document. - """ - if not self.is_html: - return None - return self.detector.declared_encoding - - def find_codec(self, charset): - """Convert the name of a character set to a codec name. - - :param charset: The name of a character set. - :return: The name of a codec. - """ - value = (self._codec(self.CHARSET_ALIASES.get(charset, charset)) - or (charset and self._codec(charset.replace("-", ""))) - or (charset and self._codec(charset.replace("-", "_"))) - or (charset and charset.lower()) - or charset - ) - if value: - return value.lower() - return None - - def _codec(self, charset): - if not charset: - return charset - codec = None - try: - codecs.lookup(charset) - codec = charset - except (LookupError, ValueError): - pass - return codec - - - # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities. - MS_CHARS = {b'\x80': ('euro', '20AC'), - b'\x81': ' ', - b'\x82': ('sbquo', '201A'), - b'\x83': ('fnof', '192'), - b'\x84': ('bdquo', '201E'), - b'\x85': ('hellip', '2026'), - b'\x86': ('dagger', '2020'), - b'\x87': ('Dagger', '2021'), - b'\x88': ('circ', '2C6'), - b'\x89': ('permil', '2030'), - b'\x8A': ('Scaron', '160'), - b'\x8B': ('lsaquo', '2039'), - b'\x8C': ('OElig', '152'), - b'\x8D': '?', - b'\x8E': ('#x17D', '17D'), - b'\x8F': '?', - b'\x90': '?', - b'\x91': ('lsquo', '2018'), - b'\x92': ('rsquo', '2019'), - b'\x93': ('ldquo', '201C'), - b'\x94': ('rdquo', '201D'), - b'\x95': ('bull', '2022'), - b'\x96': ('ndash', '2013'), - b'\x97': ('mdash', '2014'), - b'\x98': ('tilde', '2DC'), - b'\x99': ('trade', '2122'), - b'\x9a': ('scaron', '161'), - b'\x9b': ('rsaquo', '203A'), - b'\x9c': ('oelig', '153'), - b'\x9d': '?', - b'\x9e': ('#x17E', '17E'), - b'\x9f': ('Yuml', ''),} - - # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains - # horrors like stripping diacritical marks to turn á into a, but also - # contains non-horrors like turning “ into ". - MS_CHARS_TO_ASCII = { - b'\x80' : 'EUR', - b'\x81' : ' ', - b'\x82' : ',', - b'\x83' : 'f', - b'\x84' : ',,', - b'\x85' : '...', - b'\x86' : '+', - b'\x87' : '++', - b'\x88' : '^', - b'\x89' : '%', - b'\x8a' : 'S', - b'\x8b' : '<', - b'\x8c' : 'OE', - b'\x8d' : '?', - b'\x8e' : 'Z', - b'\x8f' : '?', - b'\x90' : '?', - b'\x91' : "'", - b'\x92' : "'", - b'\x93' : '"', - b'\x94' : '"', - b'\x95' : '*', - b'\x96' : '-', - b'\x97' : '--', - b'\x98' : '~', - b'\x99' : '(TM)', - b'\x9a' : 's', - b'\x9b' : '>', - b'\x9c' : 'oe', - b'\x9d' : '?', - b'\x9e' : 'z', - b'\x9f' : 'Y', - b'\xa0' : ' ', - b'\xa1' : '!', - b'\xa2' : 'c', - b'\xa3' : 'GBP', - b'\xa4' : '$', #This approximation is especially parochial--this is the - #generic currency symbol. - b'\xa5' : 'YEN', - b'\xa6' : '|', - b'\xa7' : 'S', - b'\xa8' : '..', - b'\xa9' : '', - b'\xaa' : '(th)', - b'\xab' : '<<', - b'\xac' : '!', - b'\xad' : ' ', - b'\xae' : '(R)', - b'\xaf' : '-', - b'\xb0' : 'o', - b'\xb1' : '+-', - b'\xb2' : '2', - b'\xb3' : '3', - b'\xb4' : ("'", 'acute'), - b'\xb5' : 'u', - b'\xb6' : 'P', - b'\xb7' : '*', - b'\xb8' : ',', - b'\xb9' : '1', - b'\xba' : '(th)', - b'\xbb' : '>>', - b'\xbc' : '1/4', - b'\xbd' : '1/2', - b'\xbe' : '3/4', - b'\xbf' : '?', - b'\xc0' : 'A', - b'\xc1' : 'A', - b'\xc2' : 'A', - b'\xc3' : 'A', - b'\xc4' : 'A', - b'\xc5' : 'A', - b'\xc6' : 'AE', - b'\xc7' : 'C', - b'\xc8' : 'E', - b'\xc9' : 'E', - b'\xca' : 'E', - b'\xcb' : 'E', - b'\xcc' : 'I', - b'\xcd' : 'I', - b'\xce' : 'I', - b'\xcf' : 'I', - b'\xd0' : 'D', - b'\xd1' : 'N', - b'\xd2' : 'O', - b'\xd3' : 'O', - b'\xd4' : 'O', - b'\xd5' : 'O', - b'\xd6' : 'O', - b'\xd7' : '*', - b'\xd8' : 'O', - b'\xd9' : 'U', - b'\xda' : 'U', - b'\xdb' : 'U', - b'\xdc' : 'U', - b'\xdd' : 'Y', - b'\xde' : 'b', - b'\xdf' : 'B', - b'\xe0' : 'a', - b'\xe1' : 'a', - b'\xe2' : 'a', - b'\xe3' : 'a', - b'\xe4' : 'a', - b'\xe5' : 'a', - b'\xe6' : 'ae', - b'\xe7' : 'c', - b'\xe8' : 'e', - b'\xe9' : 'e', - b'\xea' : 'e', - b'\xeb' : 'e', - b'\xec' : 'i', - b'\xed' : 'i', - b'\xee' : 'i', - b'\xef' : 'i', - b'\xf0' : 'o', - b'\xf1' : 'n', - b'\xf2' : 'o', - b'\xf3' : 'o', - b'\xf4' : 'o', - b'\xf5' : 'o', - b'\xf6' : 'o', - b'\xf7' : '/', - b'\xf8' : 'o', - b'\xf9' : 'u', - b'\xfa' : 'u', - b'\xfb' : 'u', - b'\xfc' : 'u', - b'\xfd' : 'y', - b'\xfe' : 'b', - b'\xff' : 'y', - } - - # A map used when removing rogue Windows-1252/ISO-8859-1 - # characters in otherwise UTF-8 documents. - # - # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in - # Windows-1252. - WINDOWS_1252_TO_UTF8 = { - 0x80 : b'\xe2\x82\xac', # € - 0x82 : b'\xe2\x80\x9a', # ‚ - 0x83 : b'\xc6\x92', # Æ’ - 0x84 : b'\xe2\x80\x9e', # „ - 0x85 : b'\xe2\x80\xa6', # … - 0x86 : b'\xe2\x80\xa0', # † - 0x87 : b'\xe2\x80\xa1', # ‡ - 0x88 : b'\xcb\x86', # ˆ - 0x89 : b'\xe2\x80\xb0', # ‰ - 0x8a : b'\xc5\xa0', # Å  - 0x8b : b'\xe2\x80\xb9', # ‹ - 0x8c : b'\xc5\x92', # Å’ - 0x8e : b'\xc5\xbd', # Ž - 0x91 : b'\xe2\x80\x98', # ‘ - 0x92 : b'\xe2\x80\x99', # ’ - 0x93 : b'\xe2\x80\x9c', # “ - 0x94 : b'\xe2\x80\x9d', # †- 0x95 : b'\xe2\x80\xa2', # • - 0x96 : b'\xe2\x80\x93', # – - 0x97 : b'\xe2\x80\x94', # — - 0x98 : b'\xcb\x9c', # Ëœ - 0x99 : b'\xe2\x84\xa2', # â„¢ - 0x9a : b'\xc5\xa1', # Å¡ - 0x9b : b'\xe2\x80\xba', # › - 0x9c : b'\xc5\x93', # Å“ - 0x9e : b'\xc5\xbe', # ž - 0x9f : b'\xc5\xb8', # Ÿ - 0xa0 : b'\xc2\xa0', #   - 0xa1 : b'\xc2\xa1', # ¡ - 0xa2 : b'\xc2\xa2', # ¢ - 0xa3 : b'\xc2\xa3', # £ - 0xa4 : b'\xc2\xa4', # ¤ - 0xa5 : b'\xc2\xa5', # Â¥ - 0xa6 : b'\xc2\xa6', # ¦ - 0xa7 : b'\xc2\xa7', # § - 0xa8 : b'\xc2\xa8', # ¨ - 0xa9 : b'\xc2\xa9', # © - 0xaa : b'\xc2\xaa', # ª - 0xab : b'\xc2\xab', # « - 0xac : b'\xc2\xac', # ¬ - 0xad : b'\xc2\xad', # ­ - 0xae : b'\xc2\xae', # ® - 0xaf : b'\xc2\xaf', # ¯ - 0xb0 : b'\xc2\xb0', # ° - 0xb1 : b'\xc2\xb1', # ± - 0xb2 : b'\xc2\xb2', # ² - 0xb3 : b'\xc2\xb3', # ³ - 0xb4 : b'\xc2\xb4', # ´ - 0xb5 : b'\xc2\xb5', # µ - 0xb6 : b'\xc2\xb6', # ¶ - 0xb7 : b'\xc2\xb7', # · - 0xb8 : b'\xc2\xb8', # ¸ - 0xb9 : b'\xc2\xb9', # ¹ - 0xba : b'\xc2\xba', # º - 0xbb : b'\xc2\xbb', # » - 0xbc : b'\xc2\xbc', # ¼ - 0xbd : b'\xc2\xbd', # ½ - 0xbe : b'\xc2\xbe', # ¾ - 0xbf : b'\xc2\xbf', # ¿ - 0xc0 : b'\xc3\x80', # À - 0xc1 : b'\xc3\x81', # à - 0xc2 : b'\xc3\x82', #  - 0xc3 : b'\xc3\x83', # à - 0xc4 : b'\xc3\x84', # Ä - 0xc5 : b'\xc3\x85', # Ã… - 0xc6 : b'\xc3\x86', # Æ - 0xc7 : b'\xc3\x87', # Ç - 0xc8 : b'\xc3\x88', # È - 0xc9 : b'\xc3\x89', # É - 0xca : b'\xc3\x8a', # Ê - 0xcb : b'\xc3\x8b', # Ë - 0xcc : b'\xc3\x8c', # ÃŒ - 0xcd : b'\xc3\x8d', # à - 0xce : b'\xc3\x8e', # ÃŽ - 0xcf : b'\xc3\x8f', # à - 0xd0 : b'\xc3\x90', # à - 0xd1 : b'\xc3\x91', # Ñ - 0xd2 : b'\xc3\x92', # Ã’ - 0xd3 : b'\xc3\x93', # Ó - 0xd4 : b'\xc3\x94', # Ô - 0xd5 : b'\xc3\x95', # Õ - 0xd6 : b'\xc3\x96', # Ö - 0xd7 : b'\xc3\x97', # × - 0xd8 : b'\xc3\x98', # Ø - 0xd9 : b'\xc3\x99', # Ù - 0xda : b'\xc3\x9a', # Ú - 0xdb : b'\xc3\x9b', # Û - 0xdc : b'\xc3\x9c', # Ãœ - 0xdd : b'\xc3\x9d', # à - 0xde : b'\xc3\x9e', # Þ - 0xdf : b'\xc3\x9f', # ß - 0xe0 : b'\xc3\xa0', # à - 0xe1 : b'\xa1', # á - 0xe2 : b'\xc3\xa2', # â - 0xe3 : b'\xc3\xa3', # ã - 0xe4 : b'\xc3\xa4', # ä - 0xe5 : b'\xc3\xa5', # Ã¥ - 0xe6 : b'\xc3\xa6', # æ - 0xe7 : b'\xc3\xa7', # ç - 0xe8 : b'\xc3\xa8', # è - 0xe9 : b'\xc3\xa9', # é - 0xea : b'\xc3\xaa', # ê - 0xeb : b'\xc3\xab', # ë - 0xec : b'\xc3\xac', # ì - 0xed : b'\xc3\xad', # í - 0xee : b'\xc3\xae', # î - 0xef : b'\xc3\xaf', # ï - 0xf0 : b'\xc3\xb0', # ð - 0xf1 : b'\xc3\xb1', # ñ - 0xf2 : b'\xc3\xb2', # ò - 0xf3 : b'\xc3\xb3', # ó - 0xf4 : b'\xc3\xb4', # ô - 0xf5 : b'\xc3\xb5', # õ - 0xf6 : b'\xc3\xb6', # ö - 0xf7 : b'\xc3\xb7', # ÷ - 0xf8 : b'\xc3\xb8', # ø - 0xf9 : b'\xc3\xb9', # ù - 0xfa : b'\xc3\xba', # ú - 0xfb : b'\xc3\xbb', # û - 0xfc : b'\xc3\xbc', # ü - 0xfd : b'\xc3\xbd', # ý - 0xfe : b'\xc3\xbe', # þ - } - - MULTIBYTE_MARKERS_AND_SIZES = [ - (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF - (0xe0, 0xef, 3), # 3-byte characters start with E0-EF - (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 - ] - - FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] - LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] - - @classmethod - def detwingle(cls, in_bytes, main_encoding="utf8", - embedded_encoding="windows-1252"): - """Fix characters from one encoding embedded in some other encoding. - - Currently the only situation supported is Windows-1252 (or its - subset ISO-8859-1), embedded in UTF-8. - - :param in_bytes: A bytestring that you suspect contains - characters from multiple encodings. Note that this _must_ - be a bytestring. If you've already converted the document - to Unicode, you're too late. - :param main_encoding: The primary encoding of `in_bytes`. - :param embedded_encoding: The encoding that was used to embed characters - in the main document. - :return: A bytestring in which `embedded_encoding` - characters have been converted to their `main_encoding` - equivalents. - """ - if embedded_encoding.replace('_', '-').lower() not in ( - 'windows-1252', 'windows_1252'): - raise NotImplementedError( - "Windows-1252 and ISO-8859-1 are the only currently supported " - "embedded encodings.") - - if main_encoding.lower() not in ('utf8', 'utf-8'): - raise NotImplementedError( - "UTF-8 is the only currently supported main encoding.") - - byte_chunks = [] - - chunk_start = 0 - pos = 0 - while pos < len(in_bytes): - byte = in_bytes[pos] - if not isinstance(byte, int): - # Python 2.x - byte = ord(byte) - if (byte >= cls.FIRST_MULTIBYTE_MARKER - and byte <= cls.LAST_MULTIBYTE_MARKER): - # This is the start of a UTF-8 multibyte character. Skip - # to the end. - for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: - if byte >= start and byte <= end: - pos += size - break - elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: - # We found a Windows-1252 character! - # Save the string up to this point as a chunk. - byte_chunks.append(in_bytes[chunk_start:pos]) - - # Now translate the Windows-1252 character into UTF-8 - # and add it as another, one-byte chunk. - byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) - pos += 1 - chunk_start = pos - else: - # Go on to the next character. - pos += 1 - if chunk_start == 0: - # The string is unchanged. - return in_bytes - else: - # Store the final chunk. - byte_chunks.append(in_bytes[chunk_start:]) - return b''.join(byte_chunks) - diff --git a/infra/api/package/bs4/diagnose.py b/infra/api/package/bs4/diagnose.py deleted file mode 100644 index 1877acd..0000000 --- a/infra/api/package/bs4/diagnose.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Diagnostic functions, mainly for use when doing tech support.""" - -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -import cProfile -from io import StringIO -from html.parser import HTMLParser -import bs4 -from bs4 import BeautifulSoup, __version__ -from bs4.builder import builder_registry - -import os -import pstats -import random -import tempfile -import time -import traceback -import sys -import cProfile - -def diagnose(data): - """Diagnostic suite for isolating common problems. - - :param data: A string containing markup that needs to be explained. - :return: None; diagnostics are printed to standard output. - """ - print(("Diagnostic running on Beautiful Soup %s" % __version__)) - print(("Python version %s" % sys.version)) - - basic_parsers = ["html.parser", "html5lib", "lxml"] - for name in basic_parsers: - for builder in builder_registry.builders: - if name in builder.features: - break - else: - basic_parsers.remove(name) - print(( - "I noticed that %s is not installed. Installing it may help." % - name)) - - if 'lxml' in basic_parsers: - basic_parsers.append("lxml-xml") - try: - from lxml import etree - print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))) - except ImportError as e: - print( - "lxml is not installed or couldn't be imported.") - - - if 'html5lib' in basic_parsers: - try: - import html5lib - print(("Found html5lib version %s" % html5lib.__version__)) - except ImportError as e: - print( - "html5lib is not installed or couldn't be imported.") - - if hasattr(data, 'read'): - data = data.read() - elif data.startswith("http:") or data.startswith("https:"): - print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)) - print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") - return - else: - try: - if os.path.exists(data): - print(('"%s" looks like a filename. Reading data from the file.' % data)) - with open(data) as fp: - data = fp.read() - except ValueError: - # This can happen on some platforms when the 'filename' is - # too long. Assume it's data and not a filename. - pass - print("") - - for parser in basic_parsers: - print(("Trying to parse your markup with %s" % parser)) - success = False - try: - soup = BeautifulSoup(data, features=parser) - success = True - except Exception as e: - print(("%s could not parse the markup." % parser)) - traceback.print_exc() - if success: - print(("Here's what %s did with the markup:" % parser)) - print((soup.prettify())) - - print(("-" * 80)) - -def lxml_trace(data, html=True, **kwargs): - """Print out the lxml events that occur during parsing. - - This lets you see how lxml parses a document when no Beautiful - Soup code is running. You can use this to determine whether - an lxml-specific problem is in Beautiful Soup's lxml tree builders - or in lxml itself. - - :param data: Some markup. - :param html: If True, markup will be parsed with lxml's HTML parser. - if False, lxml's XML parser will be used. - """ - from lxml import etree - for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): - print(("%s, %4s, %s" % (event, element.tag, element.text))) - -class AnnouncingParser(HTMLParser): - """Subclass of HTMLParser that announces parse events, without doing - anything else. - - You can use this to get a picture of how html.parser sees a given - document. The easiest way to do this is to call `htmlparser_trace`. - """ - - def _p(self, s): - print(s) - - def handle_starttag(self, name, attrs): - self._p("%s START" % name) - - def handle_endtag(self, name): - self._p("%s END" % name) - - def handle_data(self, data): - self._p("%s DATA" % data) - - def handle_charref(self, name): - self._p("%s CHARREF" % name) - - def handle_entityref(self, name): - self._p("%s ENTITYREF" % name) - - def handle_comment(self, data): - self._p("%s COMMENT" % data) - - def handle_decl(self, data): - self._p("%s DECL" % data) - - def unknown_decl(self, data): - self._p("%s UNKNOWN-DECL" % data) - - def handle_pi(self, data): - self._p("%s PI" % data) - -def htmlparser_trace(data): - """Print out the HTMLParser events that occur during parsing. - - This lets you see how HTMLParser parses a document when no - Beautiful Soup code is running. - - :param data: Some markup. - """ - parser = AnnouncingParser() - parser.feed(data) - -_vowels = "aeiou" -_consonants = "bcdfghjklmnpqrstvwxyz" - -def rword(length=5): - "Generate a random word-like string." - s = '' - for i in range(length): - if i % 2 == 0: - t = _consonants - else: - t = _vowels - s += random.choice(t) - return s - -def rsentence(length=4): - "Generate a random sentence-like string." - return " ".join(rword(random.randint(4,9)) for i in list(range(length))) - -def rdoc(num_elements=1000): - """Randomly generate an invalid HTML document.""" - tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] - elements = [] - for i in range(num_elements): - choice = random.randint(0,3) - if choice == 0: - # New tag. - tag_name = random.choice(tag_names) - elements.append("<%s>" % tag_name) - elif choice == 1: - elements.append(rsentence(random.randint(1,4))) - elif choice == 2: - # Close a tag. - tag_name = random.choice(tag_names) - elements.append("" % tag_name) - return "" + "\n".join(elements) + "" - -def benchmark_parsers(num_elements=100000): - """Very basic head-to-head performance benchmark.""" - print(("Comparative parser benchmark on Beautiful Soup %s" % __version__)) - data = rdoc(num_elements) - print(("Generated a large invalid HTML document (%d bytes)." % len(data))) - - for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: - success = False - try: - a = time.time() - soup = BeautifulSoup(data, parser) - b = time.time() - success = True - except Exception as e: - print(("%s could not parse the markup." % parser)) - traceback.print_exc() - if success: - print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a))) - - from lxml import etree - a = time.time() - etree.HTML(data) - b = time.time() - print(("Raw lxml parsed the markup in %.2fs." % (b-a))) - - import html5lib - parser = html5lib.HTMLParser() - a = time.time() - parser.parse(data) - b = time.time() - print(("Raw html5lib parsed the markup in %.2fs." % (b-a))) - -def profile(num_elements=100000, parser="lxml"): - """Use Python's profiler on a randomly generated document.""" - filehandle = tempfile.NamedTemporaryFile() - filename = filehandle.name - - data = rdoc(num_elements) - vars = dict(bs4=bs4, data=data, parser=parser) - cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) - - stats = pstats.Stats(filename) - # stats.strip_dirs() - stats.sort_stats("cumulative") - stats.print_stats('_html5lib|bs4', 50) - -# If this file is run as a script, standard input is diagnosed. -if __name__ == '__main__': - diagnose(sys.stdin.read()) diff --git a/infra/api/package/bs4/element.py b/infra/api/package/bs4/element.py deleted file mode 100644 index c7dc650..0000000 --- a/infra/api/package/bs4/element.py +++ /dev/null @@ -1,2162 +0,0 @@ -# Use of this source code is governed by the MIT license. -__license__ = "MIT" - -try: - from collections.abc import Callable # Python 3.6 -except ImportError as e: - from collections import Callable -import re -import sys -import warnings -try: - import soupsieve -except ImportError as e: - soupsieve = None - warnings.warn( - 'The soupsieve package is not installed. CSS selectors cannot be used.' - ) - -from bs4.formatter import ( - Formatter, - HTMLFormatter, - XMLFormatter, -) - -DEFAULT_OUTPUT_ENCODING = "utf-8" -PY3K = (sys.version_info[0] > 2) - -nonwhitespace_re = re.compile(r"\S+") - -# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on -# the off chance someone imported it for their own use. -whitespace_re = re.compile(r"\s+") - -def _alias(attr): - """Alias one attribute name to another for backward compatibility""" - @property - def alias(self): - return getattr(self, attr) - - @alias.setter - def alias(self): - return setattr(self, attr) - return alias - - -# These encodings are recognized by Python (so PageElement.encode -# could theoretically support them) but XML and HTML don't recognize -# them (so they should not show up in an XML or HTML document as that -# document's encoding). -# -# If an XML document is encoded in one of these encodings, no encoding -# will be mentioned in the XML declaration. If an HTML document is -# encoded in one of these encodings, and the HTML document has a -# tag that mentions an encoding, the encoding will be given as -# the empty string. -# -# Source: -# https://docs.python.org/3/library/codecs.html#python-specific-encodings -PYTHON_SPECIFIC_ENCODINGS = set([ - "idna", - "mbcs", - "oem", - "palmos", - "punycode", - "raw_unicode_escape", - "undefined", - "unicode_escape", - "raw-unicode-escape", - "unicode-escape", - "string-escape", - "string_escape", -]) - - -class NamespacedAttribute(str): - """A namespaced string (e.g. 'xml:lang') that remembers the namespace - ('xml') and the name ('lang') that were used to create it. - """ - - def __new__(cls, prefix, name=None, namespace=None): - if not name: - # This is the default namespace. Its name "has no value" - # per https://www.w3.org/TR/xml-names/#defaulting - name = None - - if name is None: - obj = str.__new__(cls, prefix) - elif prefix is None: - # Not really namespaced. - obj = str.__new__(cls, name) - else: - obj = str.__new__(cls, prefix + ":" + name) - obj.prefix = prefix - obj.name = name - obj.namespace = namespace - return obj - -class AttributeValueWithCharsetSubstitution(str): - """A stand-in object for a character encoding specified in HTML.""" - -class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): - """A generic stand-in for the value of a meta tag's 'charset' attribute. - - When Beautiful Soup parses the markup '', the - value of the 'charset' attribute will be one of these objects. - """ - - def __new__(cls, original_value): - obj = str.__new__(cls, original_value) - obj.original_value = original_value - return obj - - def encode(self, encoding): - """When an HTML document is being encoded to a given encoding, the - value of a meta tag's 'charset' is the name of the encoding. - """ - if encoding in PYTHON_SPECIFIC_ENCODINGS: - return '' - return encoding - - -class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): - """A generic stand-in for the value of a meta tag's 'content' attribute. - - When Beautiful Soup parses the markup: - - - The value of the 'content' attribute will be one of these objects. - """ - - CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M) - - def __new__(cls, original_value): - match = cls.CHARSET_RE.search(original_value) - if match is None: - # No substitution necessary. - return str.__new__(str, original_value) - - obj = str.__new__(cls, original_value) - obj.original_value = original_value - return obj - - def encode(self, encoding): - if encoding in PYTHON_SPECIFIC_ENCODINGS: - return '' - def rewrite(match): - return match.group(1) + encoding - return self.CHARSET_RE.sub(rewrite, self.original_value) - - -class PageElement(object): - """Contains the navigational information for some part of the page: - that is, its current location in the parse tree. - - NavigableString, Tag, etc. are all subclasses of PageElement. - """ - - def setup(self, parent=None, previous_element=None, next_element=None, - previous_sibling=None, next_sibling=None): - """Sets up the initial relations between this element and - other elements. - - :param parent: The parent of this element. - - :param previous_element: The element parsed immediately before - this one. - - :param next_element: The element parsed immediately before - this one. - - :param previous_sibling: The most recently encountered element - on the same level of the parse tree as this one. - - :param previous_sibling: The next element to be encountered - on the same level of the parse tree as this one. - """ - self.parent = parent - - self.previous_element = previous_element - if previous_element is not None: - self.previous_element.next_element = self - - self.next_element = next_element - if self.next_element is not None: - self.next_element.previous_element = self - - self.next_sibling = next_sibling - if self.next_sibling is not None: - self.next_sibling.previous_sibling = self - - if (previous_sibling is None - and self.parent is not None and self.parent.contents): - previous_sibling = self.parent.contents[-1] - - self.previous_sibling = previous_sibling - if previous_sibling is not None: - self.previous_sibling.next_sibling = self - - def format_string(self, s, formatter): - """Format the given string using the given formatter. - - :param s: A string. - :param formatter: A Formatter object, or a string naming one of the standard formatters. - """ - if formatter is None: - return s - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - output = formatter.substitute(s) - return output - - def formatter_for_name(self, formatter): - """Look up or create a Formatter for the given identifier, - if necessary. - - :param formatter: Can be a Formatter object (used as-is), a - function (used as the entity substitution hook for an - XMLFormatter or HTMLFormatter), or a string (used to look - up an XMLFormatter or HTMLFormatter in the appropriate - registry. - """ - if isinstance(formatter, Formatter): - return formatter - if self._is_xml: - c = XMLFormatter - else: - c = HTMLFormatter - if isinstance(formatter, Callable): - return c(entity_substitution=formatter) - return c.REGISTRY[formatter] - - @property - def _is_xml(self): - """Is this element part of an XML tree or an HTML tree? - - This is used in formatter_for_name, when deciding whether an - XMLFormatter or HTMLFormatter is more appropriate. It can be - inefficient, but it should be called very rarely. - """ - if self.known_xml is not None: - # Most of the time we will have determined this when the - # document is parsed. - return self.known_xml - - # Otherwise, it's likely that this element was created by - # direct invocation of the constructor from within the user's - # Python code. - if self.parent is None: - # This is the top-level object. It should have .known_xml set - # from tree creation. If not, take a guess--BS is usually - # used on HTML markup. - return getattr(self, 'is_xml', False) - return self.parent._is_xml - - nextSibling = _alias("next_sibling") # BS3 - previousSibling = _alias("previous_sibling") # BS3 - - def replace_with(self, replace_with): - """Replace this PageElement with another one, keeping the rest of the - tree the same. - - :param replace_with: A PageElement. - :return: `self`, no longer part of the tree. - """ - if self.parent is None: - raise ValueError( - "Cannot replace one element with another when the " - "element to be replaced is not part of a tree.") - if replace_with is self: - return - if replace_with is self.parent: - raise ValueError("Cannot replace a Tag with its parent.") - old_parent = self.parent - my_index = self.parent.index(self) - self.extract(_self_index=my_index) - old_parent.insert(my_index, replace_with) - return self - replaceWith = replace_with # BS3 - - def unwrap(self): - """Replace this PageElement with its contents. - - :return: `self`, no longer part of the tree. - """ - my_parent = self.parent - if self.parent is None: - raise ValueError( - "Cannot replace an element with its contents when that" - "element is not part of a tree.") - my_index = self.parent.index(self) - self.extract(_self_index=my_index) - for child in reversed(self.contents[:]): - my_parent.insert(my_index, child) - return self - replace_with_children = unwrap - replaceWithChildren = unwrap # BS3 - - def wrap(self, wrap_inside): - """Wrap this PageElement inside another one. - - :param wrap_inside: A PageElement. - :return: `wrap_inside`, occupying the position in the tree that used - to be occupied by `self`, and with `self` inside it. - """ - me = self.replace_with(wrap_inside) - wrap_inside.append(me) - return wrap_inside - - def extract(self, _self_index=None): - """Destructively rips this element out of the tree. - - :param _self_index: The location of this element in its parent's - .contents, if known. Passing this in allows for a performance - optimization. - - :return: `self`, no longer part of the tree. - """ - if self.parent is not None: - if _self_index is None: - _self_index = self.parent.index(self) - del self.parent.contents[_self_index] - - #Find the two elements that would be next to each other if - #this element (and any children) hadn't been parsed. Connect - #the two. - last_child = self._last_descendant() - next_element = last_child.next_element - - if (self.previous_element is not None and - self.previous_element is not next_element): - self.previous_element.next_element = next_element - if next_element is not None and next_element is not self.previous_element: - next_element.previous_element = self.previous_element - self.previous_element = None - last_child.next_element = None - - self.parent = None - if (self.previous_sibling is not None - and self.previous_sibling is not self.next_sibling): - self.previous_sibling.next_sibling = self.next_sibling - if (self.next_sibling is not None - and self.next_sibling is not self.previous_sibling): - self.next_sibling.previous_sibling = self.previous_sibling - self.previous_sibling = self.next_sibling = None - return self - - def _last_descendant(self, is_initialized=True, accept_self=True): - """Finds the last element beneath this object to be parsed. - - :param is_initialized: Has `setup` been called on this PageElement - yet? - :param accept_self: Is `self` an acceptable answer to the question? - """ - if is_initialized and self.next_sibling is not None: - last_child = self.next_sibling.previous_element - else: - last_child = self - while isinstance(last_child, Tag) and last_child.contents: - last_child = last_child.contents[-1] - if not accept_self and last_child is self: - last_child = None - return last_child - # BS3: Not part of the API! - _lastRecursiveChild = _last_descendant - - def insert(self, position, new_child): - """Insert a new PageElement in the list of this PageElement's children. - - This works the same way as `list.insert`. - - :param position: The numeric position that should be occupied - in `self.children` by the new PageElement. - :param new_child: A PageElement. - """ - if new_child is None: - raise ValueError("Cannot insert None into a tag.") - if new_child is self: - raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, str) - and not isinstance(new_child, NavigableString)): - new_child = NavigableString(new_child) - - from bs4 import BeautifulSoup - if isinstance(new_child, BeautifulSoup): - # We don't want to end up with a situation where one BeautifulSoup - # object contains another. Insert the children one at a time. - for subchild in list(new_child.contents): - self.insert(position, subchild) - position += 1 - return - position = min(position, len(self.contents)) - if hasattr(new_child, 'parent') and new_child.parent is not None: - # We're 'inserting' an element that's already one - # of this object's children. - if new_child.parent is self: - current_index = self.index(new_child) - if current_index < position: - # We're moving this element further down the list - # of this object's children. That means that when - # we extract this element, our target index will - # jump down one. - position -= 1 - new_child.extract() - - new_child.parent = self - previous_child = None - if position == 0: - new_child.previous_sibling = None - new_child.previous_element = self - else: - previous_child = self.contents[position - 1] - new_child.previous_sibling = previous_child - new_child.previous_sibling.next_sibling = new_child - new_child.previous_element = previous_child._last_descendant(False) - if new_child.previous_element is not None: - new_child.previous_element.next_element = new_child - - new_childs_last_element = new_child._last_descendant(False) - - if position >= len(self.contents): - new_child.next_sibling = None - - parent = self - parents_next_sibling = None - while parents_next_sibling is None and parent is not None: - parents_next_sibling = parent.next_sibling - parent = parent.parent - if parents_next_sibling is not None: - # We found the element that comes next in the document. - break - if parents_next_sibling is not None: - new_childs_last_element.next_element = parents_next_sibling - else: - # The last element of this tag is the last element in - # the document. - new_childs_last_element.next_element = None - else: - next_child = self.contents[position] - new_child.next_sibling = next_child - if new_child.next_sibling is not None: - new_child.next_sibling.previous_sibling = new_child - new_childs_last_element.next_element = next_child - - if new_childs_last_element.next_element is not None: - new_childs_last_element.next_element.previous_element = new_childs_last_element - self.contents.insert(position, new_child) - - def append(self, tag): - """Appends the given PageElement to the contents of this one. - - :param tag: A PageElement. - """ - self.insert(len(self.contents), tag) - - def extend(self, tags): - """Appends the given PageElements to this one's contents. - - :param tags: A list of PageElements. - """ - for tag in tags: - self.append(tag) - - def insert_before(self, *args): - """Makes the given element(s) the immediate predecessor of this one. - - All the elements will have the same parent, and the given elements - will be immediately before this one. - - :param args: One or more PageElements. - """ - parent = self.parent - if parent is None: - raise ValueError( - "Element has no parent, so 'before' has no meaning.") - if any(x is self for x in args): - raise ValueError("Can't insert an element before itself.") - for predecessor in args: - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(predecessor, PageElement): - predecessor.extract() - index = parent.index(self) - parent.insert(index, predecessor) - - def insert_after(self, *args): - """Makes the given element(s) the immediate successor of this one. - - The elements will have the same parent, and the given elements - will be immediately after this one. - - :param args: One or more PageElements. - """ - # Do all error checking before modifying the tree. - parent = self.parent - if parent is None: - raise ValueError( - "Element has no parent, so 'after' has no meaning.") - if any(x is self for x in args): - raise ValueError("Can't insert an element after itself.") - - offset = 0 - for successor in args: - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(successor, PageElement): - successor.extract() - index = parent.index(self) - parent.insert(index+1+offset, successor) - offset += 1 - - def find_next(self, name=None, attrs={}, text=None, **kwargs): - """Find the first PageElement that matches the given criteria and - appears later in the document than this PageElement. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one(self.find_all_next, name, attrs, text, **kwargs) - findNext = find_next # BS3 - - def find_all_next(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Find all PageElements that match the given criteria and appear - later in the document than this PageElement. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet containing PageElements. - """ - return self._find_all(name, attrs, text, limit, self.next_elements, - **kwargs) - findAllNext = find_all_next # BS3 - - def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Find the closest sibling to this PageElement that matches the - given criteria and appears later in the document. - - All find_* methods take a common set of arguments. See the - online documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one(self.find_next_siblings, name, attrs, text, - **kwargs) - findNextSibling = find_next_sibling # BS3 - - def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Find all siblings of this PageElement that match the given criteria - and appear later in the document. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - return self._find_all(name, attrs, text, limit, - self.next_siblings, **kwargs) - findNextSiblings = find_next_siblings # BS3 - fetchNextSiblings = find_next_siblings # BS2 - - def find_previous(self, name=None, attrs={}, text=None, **kwargs): - """Look backwards in the document from this PageElement and find the - first PageElement that matches the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one( - self.find_all_previous, name, attrs, text, **kwargs) - findPrevious = find_previous # BS3 - - def find_all_previous(self, name=None, attrs={}, text=None, limit=None, - **kwargs): - """Look backwards in the document from this PageElement and find all - PageElements that match the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - return self._find_all(name, attrs, text, limit, self.previous_elements, - **kwargs) - findAllPrevious = find_all_previous # BS3 - fetchPrevious = find_all_previous # BS2 - - def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs): - """Returns the closest sibling to this PageElement that matches the - given criteria and appears earlier in the document. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :kwargs: A dictionary of filters on attribute values. - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_one(self.find_previous_siblings, name, attrs, text, - **kwargs) - findPreviousSibling = find_previous_sibling # BS3 - - def find_previous_siblings(self, name=None, attrs={}, text=None, - limit=None, **kwargs): - """Returns all siblings to this PageElement that match the - given criteria and appear earlier in the document. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param text: A filter for a NavigableString with specific text. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - :return: A ResultSet of PageElements. - :rtype: bs4.element.ResultSet - """ - return self._find_all(name, attrs, text, limit, - self.previous_siblings, **kwargs) - findPreviousSiblings = find_previous_siblings # BS3 - fetchPreviousSiblings = find_previous_siblings # BS2 - - def find_parent(self, name=None, attrs={}, **kwargs): - """Find the closest parent of this PageElement that matches the given - criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :kwargs: A dictionary of filters on attribute values. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - # NOTE: We can't use _find_one because findParents takes a different - # set of arguments. - r = None - l = self.find_parents(name, attrs, 1, **kwargs) - if l: - r = l[0] - return r - findParent = find_parent # BS3 - - def find_parents(self, name=None, attrs={}, limit=None, **kwargs): - """Find all parents of this PageElement that match the given criteria. - - All find_* methods take a common set of arguments. See the online - documentation for detailed explanations. - - :param name: A filter on tag name. - :param attrs: A dictionary of filters on attribute values. - :param limit: Stop looking after finding this many results. - :kwargs: A dictionary of filters on attribute values. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self._find_all(name, attrs, None, limit, self.parents, - **kwargs) - findParents = find_parents # BS3 - fetchParents = find_parents # BS2 - - @property - def next(self): - """The PageElement, if any, that was parsed just after this one. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self.next_element - - @property - def previous(self): - """The PageElement, if any, that was parsed just before this one. - - :return: A PageElement. - :rtype: bs4.element.Tag | bs4.element.NavigableString - """ - return self.previous_element - - #These methods do the real heavy lifting. - - def _find_one(self, method, name, attrs, text, **kwargs): - r = None - l = method(name, attrs, text, 1, **kwargs) - if l: - r = l[0] - return r - - def _find_all(self, name, attrs, text, limit, generator, **kwargs): - "Iterates over a generator looking for things that match." - - if text is None and 'string' in kwargs: - text = kwargs['string'] - del kwargs['string'] - - if isinstance(name, SoupStrainer): - strainer = name - else: - strainer = SoupStrainer(name, attrs, text, **kwargs) - - if text is None and not limit and not attrs and not kwargs: - if name is True or name is None: - # Optimization to find all tags. - result = (element for element in generator - if isinstance(element, Tag)) - return ResultSet(strainer, result) - elif isinstance(name, str): - # Optimization to find all tags with a given name. - if name.count(':') == 1: - # This is a name with a prefix. If this is a namespace-aware document, - # we need to match the local name against tag.name. If not, - # we need to match the fully-qualified name against tag.name. - prefix, local_name = name.split(':', 1) - else: - prefix = None - local_name = name - result = (element for element in generator - if isinstance(element, Tag) - and ( - element.name == name - ) or ( - element.name == local_name - and (prefix is None or element.prefix == prefix) - ) - ) - return ResultSet(strainer, result) - results = ResultSet(strainer) - while True: - try: - i = next(generator) - except StopIteration: - break - if i: - found = strainer.search(i) - if found: - results.append(found) - if limit and len(results) >= limit: - break - return results - - #These generators can be used to navigate starting from both - #NavigableStrings and Tags. - @property - def next_elements(self): - """All PageElements that were parsed after this one. - - :yield: A sequence of PageElements. - """ - i = self.next_element - while i is not None: - yield i - i = i.next_element - - @property - def next_siblings(self): - """All PageElements that are siblings of this one but were parsed - later. - - :yield: A sequence of PageElements. - """ - i = self.next_sibling - while i is not None: - yield i - i = i.next_sibling - - @property - def previous_elements(self): - """All PageElements that were parsed before this one. - - :yield: A sequence of PageElements. - """ - i = self.previous_element - while i is not None: - yield i - i = i.previous_element - - @property - def previous_siblings(self): - """All PageElements that are siblings of this one but were parsed - earlier. - - :yield: A sequence of PageElements. - """ - i = self.previous_sibling - while i is not None: - yield i - i = i.previous_sibling - - @property - def parents(self): - """All PageElements that are parents of this PageElement. - - :yield: A sequence of PageElements. - """ - i = self.parent - while i is not None: - yield i - i = i.parent - - @property - def decomposed(self): - """Check whether a PageElement has been decomposed. - - :rtype: bool - """ - return getattr(self, '_decomposed', False) or False - - # Old non-property versions of the generators, for backwards - # compatibility with BS3. - def nextGenerator(self): - return self.next_elements - - def nextSiblingGenerator(self): - return self.next_siblings - - def previousGenerator(self): - return self.previous_elements - - def previousSiblingGenerator(self): - return self.previous_siblings - - def parentGenerator(self): - return self.parents - - -class NavigableString(str, PageElement): - """A Python Unicode string that is part of a parse tree. - - When Beautiful Soup parses the markup penguin, it will - create a NavigableString for the string "penguin". - """ - - PREFIX = '' - SUFFIX = '' - - # We can't tell just by looking at a string whether it's contained - # in an XML document or an HTML document. - - known_xml = None - - def __new__(cls, value): - """Create a new NavigableString. - - When unpickling a NavigableString, this method is called with - the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be - passed in to the superclass's __new__ or the superclass won't know - how to handle non-ASCII characters. - """ - if isinstance(value, str): - u = str.__new__(cls, value) - else: - u = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) - u.setup() - return u - - def __copy__(self): - """A copy of a NavigableString has the same contents and class - as the original, but it is not connected to the parse tree. - """ - return type(self)(self) - - def __getnewargs__(self): - return (str(self),) - - def __getattr__(self, attr): - """text.string gives you text. This is for backwards - compatibility for Navigable*String, but for CData* it lets you - get the string without the CData wrapper.""" - if attr == 'string': - return self - else: - raise AttributeError( - "'%s' object has no attribute '%s'" % ( - self.__class__.__name__, attr)) - - def output_ready(self, formatter="minimal"): - """Run the string through the provided formatter. - - :param formatter: A Formatter object, or a string naming one of the standard formatters. - """ - output = self.format_string(self, formatter) - return self.PREFIX + output + self.SUFFIX - - @property - def name(self): - """Since a NavigableString is not a Tag, it has no .name. - - This property is implemented so that code like this doesn't crash - when run on a mixture of Tag and NavigableString objects: - [x.name for x in tag.children] - """ - return None - - @name.setter - def name(self, name): - """Prevent NavigableString.name from ever being set.""" - raise AttributeError("A NavigableString cannot be given a name.") - - -class PreformattedString(NavigableString): - """A NavigableString not subject to the normal formatting rules. - - This is an abstract class used for special kinds of strings such - as comments (the Comment class) and CDATA blocks (the CData - class). - """ - - PREFIX = '' - SUFFIX = '' - - def output_ready(self, formatter=None): - """Make this string ready for output by adding any subclass-specific - prefix or suffix. - - :param formatter: A Formatter object, or a string naming one - of the standard formatters. The string will be passed into the - Formatter, but only to trigger any side effects: the return - value is ignored. - - :return: The string, with any subclass-specific prefix and - suffix added on. - """ - if formatter is not None: - ignore = self.format_string(self, formatter) - return self.PREFIX + self + self.SUFFIX - -class CData(PreformattedString): - """A CDATA block.""" - PREFIX = '' - -class ProcessingInstruction(PreformattedString): - """A SGML processing instruction.""" - - PREFIX = '' - -class XMLProcessingInstruction(ProcessingInstruction): - """An XML processing instruction.""" - PREFIX = '' - -class Comment(PreformattedString): - """An HTML or XML comment.""" - PREFIX = '' - - -class Declaration(PreformattedString): - """An XML declaration.""" - PREFIX = '' - - -class Doctype(PreformattedString): - """A document type declaration.""" - @classmethod - def for_name_and_ids(cls, name, pub_id, system_id): - """Generate an appropriate document type declaration for a given - public ID and system ID. - - :param name: The name of the document's root element, e.g. 'html'. - :param pub_id: The Formal Public Identifier for this document type, - e.g. '-//W3C//DTD XHTML 1.1//EN' - :param system_id: The system identifier for this document type, - e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd' - - :return: A Doctype. - """ - value = name or '' - if pub_id is not None: - value += ' PUBLIC "%s"' % pub_id - if system_id is not None: - value += ' "%s"' % system_id - elif system_id is not None: - value += ' SYSTEM "%s"' % system_id - - return Doctype(value) - - PREFIX = '\n' - - -class Stylesheet(NavigableString): - """A NavigableString representing an stylesheet (probably - CSS). - - Used to distinguish embedded stylesheets from textual content. - """ - pass - - -class Script(NavigableString): - """A NavigableString representing an executable script (probably - Javascript). - - Used to distinguish executable code from textual content. - """ - pass - - -class TemplateString(NavigableString): - """A NavigableString representing a string found inside an HTML - template embedded in a larger document. - - Used to distinguish such strings from the main body of the document. - """ - pass - - -class Tag(PageElement): - """Represents an HTML or XML tag that is part of a parse tree, along - with its attributes and contents. - - When Beautiful Soup parses the markup penguin, it will - create a Tag object representing the tag. - """ - - def __init__(self, parser=None, builder=None, name=None, namespace=None, - prefix=None, attrs=None, parent=None, previous=None, - is_xml=None, sourceline=None, sourcepos=None, - can_be_empty_element=None, cdata_list_attributes=None, - preserve_whitespace_tags=None - ): - """Basic constructor. - - :param parser: A BeautifulSoup object. - :param builder: A TreeBuilder. - :param name: The name of the tag. - :param namespace: The URI of this Tag's XML namespace, if any. - :param prefix: The prefix for this Tag's XML namespace, if any. - :param attrs: A dictionary of this Tag's attribute values. - :param parent: The PageElement to use as this Tag's parent. - :param previous: The PageElement that was parsed immediately before - this tag. - :param is_xml: If True, this is an XML tag. Otherwise, this is an - HTML tag. - :param sourceline: The line number where this tag was found in its - source document. - :param sourcepos: The character position within `sourceline` where this - tag was found. - :param can_be_empty_element: If True, this tag should be - represented as . If False, this tag should be represented - as . - :param cdata_list_attributes: A list of attributes whose values should - be treated as CDATA if they ever show up on this tag. - :param preserve_whitespace_tags: A list of tag names whose contents - should have their whitespace preserved. - """ - if parser is None: - self.parser_class = None - else: - # We don't actually store the parser object: that lets extracted - # chunks be garbage-collected. - self.parser_class = parser.__class__ - if name is None: - raise ValueError("No value provided for new tag's name.") - self.name = name - self.namespace = namespace - self.prefix = prefix - if ((not builder or builder.store_line_numbers) - and (sourceline is not None or sourcepos is not None)): - self.sourceline = sourceline - self.sourcepos = sourcepos - if attrs is None: - attrs = {} - elif attrs: - if builder is not None and builder.cdata_list_attributes: - attrs = builder._replace_cdata_list_attribute_values( - self.name, attrs) - else: - attrs = dict(attrs) - else: - attrs = dict(attrs) - - # If possible, determine ahead of time whether this tag is an - # XML tag. - if builder: - self.known_xml = builder.is_xml - else: - self.known_xml = is_xml - self.attrs = attrs - self.contents = [] - self.setup(parent, previous) - self.hidden = False - - if builder is None: - # In the absence of a TreeBuilder, use whatever values were - # passed in here. They're probably None, unless this is a copy of some - # other tag. - self.can_be_empty_element = can_be_empty_element - self.cdata_list_attributes = cdata_list_attributes - self.preserve_whitespace_tags = preserve_whitespace_tags - else: - # Set up any substitutions for this tag, such as the charset in a META tag. - builder.set_up_substitutions(self) - - # Ask the TreeBuilder whether this tag might be an empty-element tag. - self.can_be_empty_element = builder.can_be_empty_element(name) - - # Keep track of the list of attributes of this tag that - # might need to be treated as a list. - # - # For performance reasons, we store the whole data structure - # rather than asking the question of every tag. Asking would - # require building a new data structure every time, and - # (unlike can_be_empty_element), we almost never need - # to check this. - self.cdata_list_attributes = builder.cdata_list_attributes - - # Keep track of the names that might cause this tag to be treated as a - # whitespace-preserved tag. - self.preserve_whitespace_tags = builder.preserve_whitespace_tags - - parserClass = _alias("parser_class") # BS3 - - def __copy__(self): - """A copy of a Tag is a new Tag, unconnected to the parse tree. - Its contents are a copy of the old Tag's contents. - """ - clone = type(self)( - None, self.builder, self.name, self.namespace, - self.prefix, self.attrs, is_xml=self._is_xml, - sourceline=self.sourceline, sourcepos=self.sourcepos, - can_be_empty_element=self.can_be_empty_element, - cdata_list_attributes=self.cdata_list_attributes, - preserve_whitespace_tags=self.preserve_whitespace_tags - ) - for attr in ('can_be_empty_element', 'hidden'): - setattr(clone, attr, getattr(self, attr)) - for child in self.contents: - clone.append(child.__copy__()) - return clone - - @property - def is_empty_element(self): - """Is this tag an empty-element tag? (aka a self-closing tag) - - A tag that has contents is never an empty-element tag. - - A tag that has no contents may or may not be an empty-element - tag. It depends on the builder used to create the tag. If the - builder has a designated list of empty-element tags, then only - a tag whose name shows up in that list is considered an - empty-element tag. - - If the builder has no designated list of empty-element tags, - then any tag with no contents is an empty-element tag. - """ - return len(self.contents) == 0 and self.can_be_empty_element - isSelfClosing = is_empty_element # BS3 - - @property - def string(self): - """Convenience property to get the single string within this - PageElement. - - TODO It might make sense to have NavigableString.string return - itself. - - :return: If this element has a single string child, return - value is that string. If this element has one child tag, - return value is the 'string' attribute of the child tag, - recursively. If this element is itself a string, has no - children, or has more than one child, return value is None. - """ - if len(self.contents) != 1: - return None - child = self.contents[0] - if isinstance(child, NavigableString): - return child - return child.string - - @string.setter - def string(self, string): - """Replace this PageElement's contents with `string`.""" - self.clear() - self.append(string.__class__(string)) - - def _all_strings(self, strip=False, types=(NavigableString, CData)): - """Yield all strings of certain classes, possibly stripping them. - - :param strip: If True, all strings will be stripped before being - yielded. - - :types: A tuple of NavigableString subclasses. Any strings of - a subclass not found in this list will be ignored. By - default, this means only NavigableString and CData objects - will be considered. So no comments, processing instructions, - etc. - - :yield: A sequence of strings. - """ - for descendant in self.descendants: - if ( - (types is None and not isinstance(descendant, NavigableString)) - or - (types is not None and type(descendant) not in types)): - continue - if strip: - descendant = descendant.strip() - if len(descendant) == 0: - continue - yield descendant - - strings = property(_all_strings) - - @property - def stripped_strings(self): - """Yield all strings in the document, stripping them first. - - :yield: A sequence of stripped strings. - """ - for string in self._all_strings(True): - yield string - - def get_text(self, separator="", strip=False, - types=(NavigableString, CData)): - """Get all child strings, concatenated using the given separator. - - :param separator: Strings will be concatenated using this separator. - - :param strip: If True, strings will be stripped before being - concatenated. - - :types: A tuple of NavigableString subclasses. Any strings of - a subclass not found in this list will be ignored. By - default, this means only NavigableString and CData objects - will be considered. So no comments, processing instructions, - stylesheets, etc. - - :return: A string. - """ - return separator.join([s for s in self._all_strings( - strip, types=types)]) - getText = get_text - text = property(get_text) - - def decompose(self): - """Recursively destroys this PageElement and its children. - - This element will be removed from the tree and wiped out; so - will everything beneath it. - - The behavior of a decomposed PageElement is undefined and you - should never use one for anything, but if you need to _check_ - whether an element has been decomposed, you can use the - `decomposed` property. - """ - self.extract() - i = self - while i is not None: - n = i.next_element - i.__dict__.clear() - i.contents = [] - i._decomposed = True - i = n - - def clear(self, decompose=False): - """Wipe out all children of this PageElement by calling extract() - on them. - - :param decompose: If this is True, decompose() (a more - destructive method) will be called instead of extract(). - """ - if decompose: - for element in self.contents[:]: - if isinstance(element, Tag): - element.decompose() - else: - element.extract() - else: - for element in self.contents[:]: - element.extract() - - def smooth(self): - """Smooth out this element's children by consolidating consecutive - strings. - - This makes pretty-printed output look more natural following a - lot of operations that modified the tree. - """ - # Mark the first position of every pair of children that need - # to be consolidated. Do this rather than making a copy of - # self.contents, since in most cases very few strings will be - # affected. - marked = [] - for i, a in enumerate(self.contents): - if isinstance(a, Tag): - # Recursively smooth children. - a.smooth() - if i == len(self.contents)-1: - # This is the last item in .contents, and it's not a - # tag. There's no chance it needs any work. - continue - b = self.contents[i+1] - if (isinstance(a, NavigableString) - and isinstance(b, NavigableString) - and not isinstance(a, PreformattedString) - and not isinstance(b, PreformattedString) - ): - marked.append(i) - - # Go over the marked positions in reverse order, so that - # removing items from .contents won't affect the remaining - # positions. - for i in reversed(marked): - a = self.contents[i] - b = self.contents[i+1] - b.extract() - n = NavigableString(a+b) - a.replace_with(n) - - def index(self, element): - """Find the index of a child by identity, not value. - - Avoids issues with tag.contents.index(element) getting the - index of equal elements. - - :param element: Look for this PageElement in `self.contents`. - """ - for i, child in enumerate(self.contents): - if child is element: - return i - raise ValueError("Tag.index: element not in tag") - - def get(self, key, default=None): - """Returns the value of the 'key' attribute for the tag, or - the value given for 'default' if it doesn't have that - attribute.""" - return self.attrs.get(key, default) - - def get_attribute_list(self, key, default=None): - """The same as get(), but always returns a list. - - :param key: The attribute to look for. - :param default: Use this value if the attribute is not present - on this PageElement. - :return: A list of values, probably containing only a single - value. - """ - value = self.get(key, default) - if not isinstance(value, list): - value = [value] - return value - - def has_attr(self, key): - """Does this PageElement have an attribute with the given name?""" - return key in self.attrs - - def __hash__(self): - return str(self).__hash__() - - def __getitem__(self, key): - """tag[key] returns the value of the 'key' attribute for the Tag, - and throws an exception if it's not there.""" - return self.attrs[key] - - def __iter__(self): - "Iterating over a Tag iterates over its contents." - return iter(self.contents) - - def __len__(self): - "The length of a Tag is the length of its list of contents." - return len(self.contents) - - def __contains__(self, x): - return x in self.contents - - def __bool__(self): - "A tag is non-None even if it has no contents." - return True - - def __setitem__(self, key, value): - """Setting tag[key] sets the value of the 'key' attribute for the - tag.""" - self.attrs[key] = value - - def __delitem__(self, key): - "Deleting tag[key] deletes all 'key' attributes for the tag." - self.attrs.pop(key, None) - - def __call__(self, *args, **kwargs): - """Calling a Tag like a function is the same as calling its - find_all() method. Eg. tag('a') returns a list of all the A tags - found within this tag.""" - return self.find_all(*args, **kwargs) - - def __getattr__(self, tag): - """Calling tag.subtag is the same as calling tag.find(name="subtag")""" - #print("Getattr %s.%s" % (self.__class__, tag)) - if len(tag) > 3 and tag.endswith('Tag'): - # BS3: soup.aTag -> "soup.find("a") - tag_name = tag[:-3] - warnings.warn( - '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict( - name=tag_name - ) - ) - return self.find(tag_name) - # We special case contents to avoid recursion. - elif not tag.startswith("__") and not tag == "contents": - return self.find(tag) - raise AttributeError( - "'%s' object has no attribute '%s'" % (self.__class__, tag)) - - def __eq__(self, other): - """Returns true iff this Tag has the same name, the same attributes, - and the same contents (recursively) as `other`.""" - if self is other: - return True - if (not hasattr(other, 'name') or - not hasattr(other, 'attrs') or - not hasattr(other, 'contents') or - self.name != other.name or - self.attrs != other.attrs or - len(self) != len(other)): - return False - for i, my_child in enumerate(self.contents): - if my_child != other.contents[i]: - return False - return True - - def __ne__(self, other): - """Returns true iff this Tag is not identical to `other`, - as defined in __eq__.""" - return not self == other - - def __repr__(self, encoding="unicode-escape"): - """Renders this PageElement as a string. - - :param encoding: The encoding to use (Python 2 only). - :return: Under Python 2, a bytestring; under Python 3, - a Unicode string. - """ - if PY3K: - # "The return value must be a string object", i.e. Unicode - return self.decode() - else: - # "The return value must be a string object", i.e. a bytestring. - # By convention, the return value of __repr__ should also be - # an ASCII string. - return self.encode(encoding) - - def __unicode__(self): - """Renders this PageElement as a Unicode string.""" - return self.decode() - - def __str__(self): - """Renders this PageElement as a generic string. - - :return: Under Python 2, a UTF-8 bytestring; under Python 3, - a Unicode string. - """ - if PY3K: - return self.decode() - else: - return self.encode() - - if PY3K: - __str__ = __repr__ = __unicode__ - - def encode(self, encoding=DEFAULT_OUTPUT_ENCODING, - indent_level=None, formatter="minimal", - errors="xmlcharrefreplace"): - """Render a bytestring representation of this PageElement and its - contents. - - :param encoding: The destination encoding. - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - :param errors: An error handling strategy such as - 'xmlcharrefreplace'. This value is passed along into - encode() and its value should be one of the constants - defined by Python. - :return: A bytestring. - - """ - # Turn the data structure into Unicode, then encode the - # Unicode. - u = self.decode(indent_level, encoding, formatter) - return u.encode(encoding, errors) - - def decode(self, indent_level=None, - eventual_encoding=DEFAULT_OUTPUT_ENCODING, - formatter="minimal"): - """Render a Unicode representation of this PageElement and its - contents. - - :param indent_level: Each line of the rendering will be - indented this many spaces. Used internally in - recursive calls while pretty-printing. - :param eventual_encoding: The tag is destined to be - encoded into this encoding. This method is _not_ - responsible for performing that encoding. This information - is passed in so that it can be substituted in if the - document contains a tag that mentions the document's - encoding. - :param formatter: A Formatter object, or a string naming one of - the standard formatters. - """ - - # First off, turn a non-Formatter `formatter` into a Formatter - # object. This will stop the lookup from happening over and - # over again. - if not isinstance(formatter, Formatter): - formatter = self.formatter_for_name(formatter) - attributes = formatter.attributes(self) - attrs = [] - for key, val in attributes: - if val is None: - decoded = key - else: - if isinstance(val, list) or isinstance(val, tuple): - val = ' '.join(val) - elif not isinstance(val, str): - val = str(val) - elif ( - isinstance(val, AttributeValueWithCharsetSubstitution) - and eventual_encoding is not None - ): - val = val.encode(eventual_encoding) - - text = formatter.attribute_value(val) - decoded = ( - str(key) + '=' - + formatter.quoted_attribute_value(text)) - attrs.append(decoded) - close = '' - closeTag = '' - - prefix = '' - if self.prefix: - prefix = self.prefix + ":" - - if self.is_empty_element: - close = formatter.void_element_close_prefix or '' - else: - closeTag = '' % (prefix, self.name) - - pretty_print = self._should_pretty_print(indent_level) - space = '' - indent_space = '' - if indent_level is not None: - indent_space = (' ' * (indent_level - 1)) - if pretty_print: - space = indent_space - indent_contents = indent_level + 1 - else: - indent_contents = None - contents = self.decode_contents( - indent_contents, eventual_encoding, formatter - ) - - if self.hidden: - # This is the 'document root' object. - s = contents - else: - s = [] - attribute_string = '' - if attrs: - attribute_string = ' ' + ' '.join(attrs) - if indent_level is not None: - # Even if this particular tag is not pretty-printed, - # we should indent up to the start of the tag. - s.append(indent_space) - s.append('<%s%s%s%s>' % ( - prefix, self.name, attribute_string, close)) - if pretty_print: - s.append("\n") - s.append(contents) - if pretty_print and contents and contents[-1] != "\n": - s.append("\n") - if pretty_print and closeTag: - s.append(space) - s.append(closeTag) - if indent_level is not None and closeTag and self.next_sibling: - # Even if this particular tag is not pretty-printed, - # we're now done with the tag, and we should add a - # newline if appropriate. - s.append("\n") - s = ''.join(s) - return s - - def _should_pretty_print(self, indent_level): - """Should this tag be pretty-printed? - - Most of them should, but some (such as
 in HTML
-        documents) should not.
-        """
-        return (
-            indent_level is not None
-            and (
-                not self.preserve_whitespace_tags
-                or self.name not in self.preserve_whitespace_tags
-            )
-        )
-
-    def prettify(self, encoding=None, formatter="minimal"):
-        """Pretty-print this PageElement as a string.
-
-        :param encoding: The eventual encoding of the string. If this is None,
-            a Unicode string will be returned.
-        :param formatter: A Formatter object, or a string naming one of
-            the standard formatters.
-        :return: A Unicode string (if encoding==None) or a bytestring 
-            (otherwise).
-        """
-        if encoding is None:
-            return self.decode(True, formatter=formatter)
-        else:
-            return self.encode(encoding, True, formatter=formatter)
-
-    def decode_contents(self, indent_level=None,
-                       eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-                       formatter="minimal"):
-        """Renders the contents of this tag as a Unicode string.
-
-        :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
-
-        :param eventual_encoding: The tag is destined to be
-           encoded into this encoding. decode_contents() is _not_
-           responsible for performing that encoding. This information
-           is passed in so that it can be substituted in if the
-           document contains a  tag that mentions the document's
-           encoding.
-
-        :param formatter: A Formatter object, or a string naming one of
-            the standard Formatters.
-        """
-        # First off, turn a string formatter into a Formatter object. This
-        # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter):
-            formatter = self.formatter_for_name(formatter)
-
-        pretty_print = (indent_level is not None)
-        s = []
-        for c in self:
-            text = None
-            if isinstance(c, NavigableString):
-                text = c.output_ready(formatter)
-            elif isinstance(c, Tag):
-                s.append(c.decode(indent_level, eventual_encoding,
-                                  formatter))
-            preserve_whitespace = (
-                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
-            )
-            if text and indent_level and not preserve_whitespace:
-                text = text.strip()
-            if text:
-                if pretty_print and not preserve_whitespace:
-                    s.append(" " * (indent_level - 1))
-                s.append(text)
-                if pretty_print and not preserve_whitespace:
-                    s.append("\n")
-        return ''.join(s)
-       
-    def encode_contents(
-        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
-        formatter="minimal"):
-        """Renders the contents of this PageElement as a bytestring.
-
-        :param indent_level: Each line of the rendering will be
-           indented this many spaces. Used internally in
-           recursive calls while pretty-printing.
-
-        :param eventual_encoding: The bytestring will be in this encoding.
-
-        :param formatter: A Formatter object, or a string naming one of
-            the standard Formatters.
-
-        :return: A bytestring.
-        """
-        contents = self.decode_contents(indent_level, encoding, formatter)
-        return contents.encode(encoding)
-
-    # Old method for BS3 compatibility
-    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
-                       prettyPrint=False, indentLevel=0):
-        """Deprecated method for BS3 compatibility."""
-        if not prettyPrint:
-            indentLevel = None
-        return self.encode_contents(
-            indent_level=indentLevel, encoding=encoding)
-
-    #Soup methods
-
-    def find(self, name=None, attrs={}, recursive=True, text=None,
-             **kwargs):
-        """Look in the children of this PageElement and find the first
-        PageElement that matches the given criteria.
-
-        All find_* methods take a common set of arguments. See the online
-        documentation for detailed explanations.
-
-        :param name: A filter on tag name.
-        :param attrs: A dictionary of filters on attribute values.
-        :param recursive: If this is True, find() will perform a
-            recursive search of this PageElement's children. Otherwise,
-            only the direct children will be considered.
-        :param limit: Stop looking after finding this many results.
-        :kwargs: A dictionary of filters on attribute values.
-        :return: A PageElement.
-        :rtype: bs4.element.Tag | bs4.element.NavigableString
-        """
-        r = None
-        l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
-        if l:
-            r = l[0]
-        return r
-    findChild = find #BS2
-
-    def find_all(self, name=None, attrs={}, recursive=True, text=None,
-                 limit=None, **kwargs):
-        """Look in the children of this PageElement and find all
-        PageElements that match the given criteria.
-
-        All find_* methods take a common set of arguments. See the online
-        documentation for detailed explanations.
-
-        :param name: A filter on tag name.
-        :param attrs: A dictionary of filters on attribute values.
-        :param recursive: If this is True, find_all() will perform a
-            recursive search of this PageElement's children. Otherwise,
-            only the direct children will be considered.
-        :param limit: Stop looking after finding this many results.
-        :kwargs: A dictionary of filters on attribute values.
-        :return: A ResultSet of PageElements.
-        :rtype: bs4.element.ResultSet
-        """
-        generator = self.descendants
-        if not recursive:
-            generator = self.children
-        return self._find_all(name, attrs, text, limit, generator, **kwargs)
-    findAll = find_all       # BS3
-    findChildren = find_all  # BS2
-
-    #Generator methods
-    @property
-    def children(self):
-        """Iterate over all direct children of this PageElement.
-
-        :yield: A sequence of PageElements.
-        """
-        # return iter() to make the purpose of the method clear
-        return iter(self.contents)  # XXX This seems to be untested.
-
-    @property
-    def descendants(self):
-        """Iterate over all children of this PageElement in a
-        breadth-first sequence.
-
-        :yield: A sequence of PageElements.
-        """
-        if not len(self.contents):
-            return
-        stopNode = self._last_descendant().next_element
-        current = self.contents[0]
-        while current is not stopNode:
-            yield current
-            current = current.next_element
-
-    # CSS selector code
-    def select_one(self, selector, namespaces=None, **kwargs):
-        """Perform a CSS selection operation on the current element.
-
-        :param selector: A CSS selector.
-
-        :param namespaces: A dictionary mapping namespace prefixes
-           used in the CSS selector to namespace URIs. By default,
-           Beautiful Soup will use the prefixes it encountered while
-           parsing the document.
-
-        :param kwargs: Keyword arguments to be passed into SoupSieve's 
-           soupsieve.select() method.
-
-        :return: A Tag.
-        :rtype: bs4.element.Tag
-        """
-        value = self.select(selector, namespaces, 1, **kwargs)
-        if value:
-            return value[0]
-        return None
-
-    def select(self, selector, namespaces=None, limit=None, **kwargs):
-        """Perform a CSS selection operation on the current element.
-
-        This uses the SoupSieve library.
-
-        :param selector: A string containing a CSS selector.
-
-        :param namespaces: A dictionary mapping namespace prefixes
-           used in the CSS selector to namespace URIs. By default,
-           Beautiful Soup will use the prefixes it encountered while
-           parsing the document.
-
-        :param limit: After finding this number of results, stop looking.
-
-        :param kwargs: Keyword arguments to be passed into SoupSieve's 
-           soupsieve.select() method.
-
-        :return: A ResultSet of Tags.
-        :rtype: bs4.element.ResultSet
-        """
-        if namespaces is None:
-            namespaces = self._namespaces
-        
-        if limit is None:
-            limit = 0
-        if soupsieve is None:
-            raise NotImplementedError(
-                "Cannot execute CSS selectors because the soupsieve package is not installed."
-            )
-            
-        results = soupsieve.select(selector, self, namespaces, limit, **kwargs)
-
-        # We do this because it's more consistent and because
-        # ResultSet.__getattr__ has a helpful error message.
-        return ResultSet(None, results)
-
-    # Old names for backwards compatibility
-    def childGenerator(self):
-        """Deprecated generator."""
-        return self.children
-
-    def recursiveChildGenerator(self):
-        """Deprecated generator."""
-        return self.descendants
-
-    def has_key(self, key):
-        """Deprecated method. This was kind of misleading because has_key()
-        (attributes) was different from __in__ (contents).
-
-        has_key() is gone in Python 3, anyway.
-        """
-        warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
-                key))
-        return self.has_attr(key)
-
-# Next, a couple classes to represent queries and their results.
-class SoupStrainer(object):
-    """Encapsulates a number of ways of matching a markup element (tag or
-    string).
-
-    This is primarily used to underpin the find_* methods, but you can
-    create one yourself and pass it in as `parse_only` to the
-    `BeautifulSoup` constructor, to parse a subset of a large
-    document.
-    """
-
-    def __init__(self, name=None, attrs={}, text=None, **kwargs):
-        """Constructor.
-
-        The SoupStrainer constructor takes the same arguments passed
-        into the find_* methods. See the online documentation for
-        detailed explanations.
-
-        :param name: A filter on tag name.
-        :param attrs: A dictionary of filters on attribute values.
-        :param text: A filter for a NavigableString with specific text.
-        :kwargs: A dictionary of filters on attribute values.
-        """        
-        self.name = self._normalize_search_value(name)
-        if not isinstance(attrs, dict):
-            # Treat a non-dict value for attrs as a search for the 'class'
-            # attribute.
-            kwargs['class'] = attrs
-            attrs = None
-
-        if 'class_' in kwargs:
-            # Treat class_="foo" as a search for the 'class'
-            # attribute, overriding any non-dict value for attrs.
-            kwargs['class'] = kwargs['class_']
-            del kwargs['class_']
-
-        if kwargs:
-            if attrs:
-                attrs = attrs.copy()
-                attrs.update(kwargs)
-            else:
-                attrs = kwargs
-        normalized_attrs = {}
-        for key, value in list(attrs.items()):
-            normalized_attrs[key] = self._normalize_search_value(value)
-
-        self.attrs = normalized_attrs
-        self.text = self._normalize_search_value(text)
-
-    def _normalize_search_value(self, value):
-        # Leave it alone if it's a Unicode string, a callable, a
-        # regular expression, a boolean, or None.
-        if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
-            or isinstance(value, bool) or value is None):
-            return value
-
-        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
-        if isinstance(value, bytes):
-            return value.decode("utf8")
-
-        # If it's listlike, convert it into a list of strings.
-        if hasattr(value, '__iter__'):
-            new_value = []
-            for v in value:
-                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
-                    and not isinstance(v, str)):
-                    # This is almost certainly the user's mistake. In the
-                    # interests of avoiding infinite loops, we'll let
-                    # it through as-is rather than doing a recursive call.
-                    new_value.append(v)
-                else:
-                    new_value.append(self._normalize_search_value(v))
-            return new_value
-
-        # Otherwise, convert it into a Unicode string.
-        # The unicode(str()) thing is so this will do the same thing on Python 2
-        # and Python 3.
-        return str(str(value))
-
-    def __str__(self):
-        """A human-readable representation of this SoupStrainer."""
-        if self.text:
-            return self.text
-        else:
-            return "%s|%s" % (self.name, self.attrs)
-
-    def search_tag(self, markup_name=None, markup_attrs={}):
-        """Check whether a Tag with the given name and attributes would
-        match this SoupStrainer.
-
-        Used prospectively to decide whether to even bother creating a Tag
-        object.
-
-        :param markup_name: A tag name as found in some markup.
-        :param markup_attrs: A dictionary of attributes as found in some markup.
-
-        :return: True if the prospective tag would match this SoupStrainer;
-            False otherwise.
-        """
-        found = None
-        markup = None
-        if isinstance(markup_name, Tag):
-            markup = markup_name
-            markup_attrs = markup
-        call_function_with_tag_data = (
-            isinstance(self.name, Callable)
-            and not isinstance(markup_name, Tag))
-
-        if ((not self.name)
-            or call_function_with_tag_data
-            or (markup and self._matches(markup, self.name))
-            or (not markup and self._matches(markup_name, self.name))):
-            if call_function_with_tag_data:
-                match = self.name(markup_name, markup_attrs)
-            else:
-                match = True
-                markup_attr_map = None
-                for attr, match_against in list(self.attrs.items()):
-                    if not markup_attr_map:
-                        if hasattr(markup_attrs, 'get'):
-                            markup_attr_map = markup_attrs
-                        else:
-                            markup_attr_map = {}
-                            for k, v in markup_attrs:
-                                markup_attr_map[k] = v
-                    attr_value = markup_attr_map.get(attr)
-                    if not self._matches(attr_value, match_against):
-                        match = False
-                        break
-            if match:
-                if markup:
-                    found = markup
-                else:
-                    found = markup_name
-        if found and self.text and not self._matches(found.string, self.text):
-            found = None
-        return found
-
-    # For BS3 compatibility.
-    searchTag = search_tag
-
-    def search(self, markup):
-        """Find all items in `markup` that match this SoupStrainer.
-
-        Used by the core _find_all() method, which is ultimately
-        called by all find_* methods.
-
-        :param markup: A PageElement or a list of them.
-        """
-        # print('looking for %s in %s' % (self, markup))
-        found = None
-        # If given a list of items, scan it for a text element that
-        # matches.
-        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
-            for element in markup:
-                if isinstance(element, NavigableString) \
-                       and self.search(element):
-                    found = element
-                    break
-        # If it's a Tag, make sure its name or attributes match.
-        # Don't bother with Tags if we're searching for text.
-        elif isinstance(markup, Tag):
-            if not self.text or self.name or self.attrs:
-                found = self.search_tag(markup)
-        # If it's text, make sure the text matches.
-        elif isinstance(markup, NavigableString) or \
-                 isinstance(markup, str):
-            if not self.name and not self.attrs and self._matches(markup, self.text):
-                found = markup
-        else:
-            raise Exception(
-                "I don't know how to match against a %s" % markup.__class__)
-        return found
-
-    def _matches(self, markup, match_against, already_tried=None):
-        # print(u"Matching %s against %s" % (markup, match_against))
-        result = False
-        if isinstance(markup, list) or isinstance(markup, tuple):
-            # This should only happen when searching a multi-valued attribute
-            # like 'class'.
-            for item in markup:
-                if self._matches(item, match_against):
-                    return True
-            # We didn't match any particular value of the multivalue
-            # attribute, but maybe we match the attribute value when
-            # considered as a string.
-            if self._matches(' '.join(markup), match_against):
-                return True
-            return False
-        
-        if match_against is True:
-            # True matches any non-None value.
-            return markup is not None
-
-        if isinstance(match_against, Callable):
-            return match_against(markup)
-
-        # Custom callables take the tag as an argument, but all
-        # other ways of matching match the tag name as a string.
-        original_markup = markup
-        if isinstance(markup, Tag):
-            markup = markup.name
-
-        # Ensure that `markup` is either a Unicode string, or None.
-        markup = self._normalize_search_value(markup)
-
-        if markup is None:
-            # None matches None, False, an empty string, an empty list, and so on.
-            return not match_against
-
-        if (hasattr(match_against, '__iter__')
-            and not isinstance(match_against, str)):
-            # We're asked to match against an iterable of items.
-            # The markup must be match at least one item in the
-            # iterable. We'll try each one in turn.
-            #
-            # To avoid infinite recursion we need to keep track of
-            # items we've already seen.
-            if not already_tried:
-                already_tried = set()
-            for item in match_against:
-                if item.__hash__:
-                    key = item
-                else:
-                    key = id(item)
-                if key in already_tried:
-                    continue
-                else:
-                    already_tried.add(key)
-                    if self._matches(original_markup, item, already_tried):
-                        return True
-            else:
-                return False
-        
-        # Beyond this point we might need to run the test twice: once against
-        # the tag's name and once against its prefixed name.
-        match = False
-        
-        if not match and isinstance(match_against, str):
-            # Exact string match
-            match = markup == match_against
-
-        if not match and hasattr(match_against, 'search'):
-            # Regexp match
-            return match_against.search(markup)
-
-        if (not match
-            and isinstance(original_markup, Tag)
-            and original_markup.prefix):
-            # Try the whole thing again with the prefixed tag name.
-            return self._matches(
-                original_markup.prefix + ':' + original_markup.name, match_against
-            )
-
-        return match
-
-
-class ResultSet(list):
-    """A ResultSet is just a list that keeps track of the SoupStrainer
-    that created it."""
-    def __init__(self, source, result=()):
-        """Constructor.
-
-        :param source: A SoupStrainer.
-        :param result: A list of PageElements.
-        """
-        super(ResultSet, self).__init__(result)
-        self.source = source
-
-    def __getattr__(self, key):
-        """Raise a helpful exception to explain a common code fix."""
-        raise AttributeError(
-            "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
-        )
diff --git a/infra/api/package/bs4/formatter.py b/infra/api/package/bs4/formatter.py
deleted file mode 100644
index 2cbab4c..0000000
--- a/infra/api/package/bs4/formatter.py
+++ /dev/null
@@ -1,152 +0,0 @@
-from bs4.dammit import EntitySubstitution
-
-class Formatter(EntitySubstitution):
-    """Describes a strategy to use when outputting a parse tree to a string.
-
-    Some parts of this strategy come from the distinction between
-    HTML4, HTML5, and XML. Others are configurable by the user.
-
-    Formatters are passed in as the `formatter` argument to methods
-    like `PageElement.encode`. Most people won't need to think about
-    formatters, and most people who need to think about them can pass
-    in one of these predefined strings as `formatter` rather than
-    making a new Formatter object:
-
-    For HTML documents:
-     * 'html' - HTML entity substitution for generic HTML documents. (default)
-     * 'html5' - HTML entity substitution for HTML5 documents.
-     * 'minimal' - Only make the substitutions necessary to guarantee
-                   valid HTML.
-     * None - Do not perform any substitution. This will be faster
-              but may result in invalid markup.
-
-    For XML documents:
-     * 'html' - Entity substitution for XHTML documents.
-     * 'minimal' - Only make the substitutions necessary to guarantee
-                   valid XML. (default)
-     * None - Do not perform any substitution. This will be faster
-              but may result in invalid markup.
-    """
-    # Registries of XML and HTML formatters.
-    XML_FORMATTERS = {}
-    HTML_FORMATTERS = {}
-
-    HTML = 'html'
-    XML = 'xml'
-
-    HTML_DEFAULTS = dict(
-        cdata_containing_tags=set(["script", "style"]),
-    )
-
-    def _default(self, language, value, kwarg):
-        if value is not None:
-            return value
-        if language == self.XML:
-            return set()
-        return self.HTML_DEFAULTS[kwarg]
-
-    def __init__(
-            self, language=None, entity_substitution=None,
-            void_element_close_prefix='/', cdata_containing_tags=None,
-    ):
-        """Constructor.
-
-        :param language: This should be Formatter.XML if you are formatting
-           XML markup and Formatter.HTML if you are formatting HTML markup.
-
-        :param entity_substitution: A function to call to replace special
-           characters with XML/HTML entities. For examples, see 
-           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
-        :param void_element_close_prefix: By default, void elements
-           are represented as  (XML rules) rather than 
-           (HTML rules). To get , pass in the empty string.
-        :param cdata_containing_tags: The list of tags that are defined
-           as containing CDATA in this dialect. For example, in HTML,
-           
-
This numeric entity is missing the final semicolon:
- -
a
-
This document contains (do you see it?)
-
This document ends with That attribute value was bogus
-The doctype is invalid because it contains extra whitespace -
That boolean attribute had no value
-
Here's a nonexistent entity: &#foo; (do you see it?)
-
This document ends before the entity finishes: > -

Paragraphs shouldn't contain block display elements, but this one does:

you see?

-Multiple values for the same attribute. -
Here's a table
-
-
This tag contains nothing but whitespace:
-

This p tag is cut off by

the end of the blockquote tag
-
Here's a nested table:
foo
This table contains bare markup
- -
This document contains a surprise doctype
- -
Tag name contains Unicode characters
- - -""" - - -class SoupTest(unittest.TestCase): - - @property - def default_builder(self): - return default_builder - - def soup(self, markup, **kwargs): - """Build a Beautiful Soup object from markup.""" - builder = kwargs.pop('builder', self.default_builder) - return BeautifulSoup(markup, builder=builder, **kwargs) - - def document_for(self, markup, **kwargs): - """Turn an HTML fragment into a document. - - The details depend on the builder. - """ - return self.default_builder(**kwargs).test_fragment_to_document(markup) - - def assertSoupEquals(self, to_parse, compare_parsed_to=None): - builder = self.default_builder - obj = BeautifulSoup(to_parse, builder=builder) - if compare_parsed_to is None: - compare_parsed_to = to_parse - - self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) - - def assertConnectedness(self, element): - """Ensure that next_element and previous_element are properly - set for all descendants of the given element. - """ - earlier = None - for e in element.descendants: - if earlier: - self.assertEqual(e, earlier.next_element) - self.assertEqual(earlier, e.previous_element) - earlier = e - - def linkage_validator(self, el, _recursive_call=False): - """Ensure proper linkage throughout the document.""" - descendant = None - # Document element should have no previous element or previous sibling. - # It also shouldn't have a next sibling. - if el.parent is None: - assert el.previous_element is None,\ - "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( - el, el.previous_element, None - ) - assert el.previous_sibling is None,\ - "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( - el, el.previous_sibling, None - ) - assert el.next_sibling is None,\ - "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( - el, el.next_sibling, None - ) - - idx = 0 - child = None - last_child = None - last_idx = len(el.contents) - 1 - for child in el.contents: - descendant = None - - # Parent should link next element to their first child - # That child should have no previous sibling - if idx == 0: - if el.parent is not None: - assert el.next_element is child,\ - "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format( - el, el.next_element, child - ) - assert child.previous_element is el,\ - "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format( - child, child.previous_element, el - ) - assert child.previous_sibling is None,\ - "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format( - child, child.previous_sibling, None - ) - - # If not the first child, previous index should link as sibling to this index - # Previous element should match the last index or the last bubbled up descendant - else: - assert child.previous_sibling is el.contents[idx - 1],\ - "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format( - child, child.previous_sibling, el.contents[idx - 1] - ) - assert el.contents[idx - 1].next_sibling is child,\ - "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - el.contents[idx - 1], el.contents[idx - 1].next_sibling, child - ) - - if last_child is not None: - assert child.previous_element is last_child,\ - "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format( - child, child.previous_element, last_child, child.parent.contents - ) - assert last_child.next_element is child,\ - "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - last_child, last_child.next_element, child - ) - - if isinstance(child, Tag) and child.contents: - descendant = self.linkage_validator(child, True) - # A bubbled up descendant should have no next siblings - assert descendant.next_sibling is None,\ - "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - descendant, descendant.next_sibling, None - ) - - # Mark last child as either the bubbled up descendant or the current child - if descendant is not None: - last_child = descendant - else: - last_child = child - - # If last child, there are non next siblings - if idx == last_idx: - assert child.next_sibling is None,\ - "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - child, child.next_sibling, None - ) - idx += 1 - - child = descendant if descendant is not None else child - if child is None: - child = el - - if not _recursive_call and child is not None: - target = el - while True: - if target is None: - assert child.next_element is None, \ - "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - child, child.next_element, None - ) - break - elif target.next_sibling is not None: - assert child.next_element is target.next_sibling, \ - "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format( - child, child.next_element, target.next_sibling - ) - break - target = target.parent - - # We are done, so nothing to return - return None - else: - # Return the child to the recursive caller - return child - - -class HTMLTreeBuilderSmokeTest(object): - - """A basic test of a treebuilder's competence. - - Any HTML treebuilder, present or future, should be able to pass - these tests. With invalid markup, there's room for interpretation, - and different parsers can handle it differently. But with the - markup in these tests, there's not much room for interpretation. - """ - - def test_empty_element_tags(self): - """Verify that all HTML4 and HTML5 empty element (aka void element) tags - are handled correctly. - """ - for name in [ - 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', - 'spacer', 'frame' - ]: - soup = self.soup("") - new_tag = soup.new_tag(name) - self.assertEqual(True, new_tag.is_empty_element) - - def test_special_string_containers(self): - soup = self.soup( - "" - ) - assert isinstance(soup.style.string, Stylesheet) - assert isinstance(soup.script.string, Script) - - soup = self.soup( - "" - ) - assert isinstance(soup.style.string, Stylesheet) - # The contents of the style tag resemble an HTML comment, but - # it's not treated as a comment. - self.assertEqual("", soup.style.string) - assert isinstance(soup.style.string, Stylesheet) - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - tree = self.soup("foo") - dumped = pickle.dumps(tree, 2) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.__class__, BeautifulSoup) - self.assertEqual(loaded.decode(), tree.decode()) - - def assertDoctypeHandled(self, doctype_fragment): - """Assert that a given doctype string is handled correctly.""" - doctype_str, soup = self._document_with_doctype(doctype_fragment) - - # Make sure a Doctype object was created. - doctype = soup.contents[0] - self.assertEqual(doctype.__class__, Doctype) - self.assertEqual(doctype, doctype_fragment) - self.assertEqual( - soup.encode("utf8")[:len(doctype_str)], - doctype_str - ) - - # Make sure that the doctype was correctly associated with the - # parse tree and that the rest of the document parsed. - self.assertEqual(soup.p.contents[0], 'foo') - - def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"): - """Generate and parse a document with the given doctype.""" - doctype = '' % (doctype_string, doctype_fragment) - markup = doctype + '\n

foo

' - soup = self.soup(markup) - return doctype.encode("utf8"), soup - - def test_normal_doctypes(self): - """Make sure normal, everyday HTML doctypes are handled correctly.""" - self.assertDoctypeHandled("html") - self.assertDoctypeHandled( - 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') - - def test_empty_doctype(self): - soup = self.soup("") - doctype = soup.contents[0] - self.assertEqual("", doctype.strip()) - - def test_mixed_case_doctype(self): - # A lowercase or mixed-case doctype becomes a Doctype. - for doctype_fragment in ("doctype", "DocType"): - doctype_str, soup = self._document_with_doctype( - "html", doctype_fragment - ) - - # Make sure a Doctype object was created and that the DOCTYPE - # is uppercase. - doctype = soup.contents[0] - self.assertEqual(doctype.__class__, Doctype) - self.assertEqual(doctype, "html") - self.assertEqual( - soup.encode("utf8")[:len(doctype_str)], - b"" - ) - - # Make sure that the doctype was correctly associated with the - # parse tree and that the rest of the document parsed. - self.assertEqual(soup.p.contents[0], 'foo') - - def test_public_doctype_with_url(self): - doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' - self.assertDoctypeHandled(doctype) - - def test_system_doctype(self): - self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') - - def test_namespaced_system_doctype(self): - # We can handle a namespaced doctype with a system ID. - self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') - - def test_namespaced_public_doctype(self): - # Test a namespaced doctype with a public id. - self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') - - def test_real_xhtml_document(self): - """A real XHTML document should come out more or less the same as it went in.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8").replace(b"\n", b""), - markup.replace(b"\n", b"")) - - def test_namespaced_html(self): - """When a namespaced XML document is parsed as HTML it should - be treated as HTML with weird tag names. - """ - markup = b"""content""" - soup = self.soup(markup) - self.assertEqual(2, len(soup.find_all("ns1:foo"))) - - def test_processing_instruction(self): - # We test both Unicode and bytestring to verify that - # process_markup correctly sets processing_instruction_class - # even when the markup is already Unicode and there is no - # need to process anything. - markup = """""" - soup = self.soup(markup) - self.assertEqual(markup, soup.decode()) - - markup = b"""""" - soup = self.soup(markup) - self.assertEqual(markup, soup.encode("utf8")) - - def test_deepcopy(self): - """Make sure you can copy the tree builder. - - This is important because the builder is part of a - BeautifulSoup object, and we want to be able to copy that. - """ - copy.deepcopy(self.default_builder) - - def test_p_tag_is_never_empty_element(self): - """A

tag is never designated as an empty-element tag. - - Even if the markup shows it as an empty-element tag, it - shouldn't be presented that way. - """ - soup = self.soup("

") - self.assertFalse(soup.p.is_empty_element) - self.assertEqual(str(soup.p), "

") - - def test_unclosed_tags_get_closed(self): - """A tag that's not closed by the end of the document should be closed. - - This applies to all tags except empty-element tags. - """ - self.assertSoupEquals("

", "

") - self.assertSoupEquals("", "") - - self.assertSoupEquals("
", "
") - - def test_br_is_always_empty_element_tag(self): - """A
tag is designated as an empty-element tag. - - Some parsers treat

as one
tag, some parsers as - two tags, but it should always be an empty-element tag. - """ - soup = self.soup("

") - self.assertTrue(soup.br.is_empty_element) - self.assertEqual(str(soup.br), "
") - - def test_nested_formatting_elements(self): - self.assertSoupEquals("") - - def test_double_head(self): - html = ''' - - -Ordinary HEAD element test - - - -Hello, world! - - -''' - soup = self.soup(html) - self.assertEqual("text/javascript", soup.find('script')['type']) - - def test_comment(self): - # Comments are represented as Comment objects. - markup = "

foobaz

" - self.assertSoupEquals(markup) - - soup = self.soup(markup) - comment = soup.find(text="foobar") - self.assertEqual(comment.__class__, Comment) - - # The comment is properly integrated into the tree. - foo = soup.find(text="foo") - self.assertEqual(comment, foo.next_element) - baz = soup.find(text="baz") - self.assertEqual(comment, baz.previous_element) - - def test_preserved_whitespace_in_pre_and_textarea(self): - """Whitespace must be preserved in
 and "
-        self.assertSoupEquals(pre_markup)
-        self.assertSoupEquals(textarea_markup)
-
-        soup = self.soup(pre_markup)
-        self.assertEqual(soup.pre.prettify(), pre_markup)
-
-        soup = self.soup(textarea_markup)
-        self.assertEqual(soup.textarea.prettify(), textarea_markup)
-
-        soup = self.soup("")
-        self.assertEqual(soup.textarea.prettify(), "")
-
-    def test_nested_inline_elements(self):
-        """Inline elements can be nested indefinitely."""
-        b_tag = "Inside a B tag"
-        self.assertSoupEquals(b_tag)
-
-        nested_b_tag = "

A nested tag

" - self.assertSoupEquals(nested_b_tag) - - double_nested_b_tag = "

A doubly nested tag

" - self.assertSoupEquals(nested_b_tag) - - def test_nested_block_level_elements(self): - """Block elements can be nested.""" - soup = self.soup('

Foo

') - blockquote = soup.blockquote - self.assertEqual(blockquote.p.b.string, 'Foo') - self.assertEqual(blockquote.b.string, 'Foo') - - def test_correctly_nested_tables(self): - """One table can go inside another one.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_multivalued_attribute_with_whitespace(self): - # Whitespace separating the values of a multi-valued attribute - # should be ignored. - - markup = '
' - soup = self.soup(markup) - self.assertEqual(['foo', 'bar'], soup.div['class']) - - # If you search by the literal name of the class it's like the whitespace - # wasn't there. - self.assertEqual(soup.div, soup.find('div', class_="foo bar")) - - def test_deeply_nested_multivalued_attribute(self): - # html5lib can set the attributes of the same tag many times - # as it rearranges the tree. This has caused problems with - # multivalued attributes. - markup = '
' - soup = self.soup(markup) - self.assertEqual(["css"], soup.div.div['class']) - - def test_multivalued_attribute_on_html(self): - # html5lib uses a different API to set the attributes ot the - # tag. This has caused problems with multivalued - # attributes. - markup = '' - soup = self.soup(markup) - self.assertEqual(["a", "b"], soup.html['class']) - - def test_angle_brackets_in_attribute_values_are_escaped(self): - self.assertSoupEquals('', '') - - def test_strings_resembling_character_entity_references(self): - # "&T" and "&p" look like incomplete character entities, but they are - # not. - self.assertSoupEquals( - "

• AT&T is in the s&p 500

", - "

\u2022 AT&T is in the s&p 500

" - ) - - def test_apos_entity(self): - self.assertSoupEquals( - "

Bob's Bar

", - "

Bob's Bar

", - ) - - def test_entities_in_foreign_document_encoding(self): - # “ and ” are invalid numeric entities referencing - # Windows-1252 characters. - references a character common - # to Windows-1252 and Unicode, and ☃ references a - # character only found in Unicode. - # - # All of these entities should be converted to Unicode - # characters. - markup = "

“Hello” -☃

" - soup = self.soup(markup) - self.assertEqual("“Hello†-☃", soup.p.string) - - def test_entities_in_attributes_converted_to_unicode(self): - expect = '

' - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - self.assertSoupEquals('

', expect) - - def test_entities_in_text_converted_to_unicode(self): - expect = '

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - self.assertSoupEquals("

piñata

", expect) - - def test_quot_entity_converted_to_quotation_mark(self): - self.assertSoupEquals("

I said "good day!"

", - '

I said "good day!"

') - - def test_out_of_range_entity(self): - expect = "\N{REPLACEMENT CHARACTER}" - self.assertSoupEquals("�", expect) - self.assertSoupEquals("�", expect) - self.assertSoupEquals("�", expect) - - def test_multipart_strings(self): - "Mostly to prevent a recurrence of a bug in the html5lib treebuilder." - soup = self.soup("

\nfoo

") - self.assertEqual("p", soup.h2.string.next_element.name) - self.assertEqual("p", soup.p.name) - self.assertConnectedness(soup) - - def test_empty_element_tags(self): - """Verify consistent handling of empty-element tags, - no matter how they come in through the markup. - """ - self.assertSoupEquals('


', "


") - self.assertSoupEquals('


', "


") - - def test_head_tag_between_head_and_body(self): - "Prevent recurrence of a bug in the html5lib treebuilder." - content = """ - - foo - -""" - soup = self.soup(content) - self.assertNotEqual(None, soup.html.body) - self.assertConnectedness(soup) - - def test_multiple_copies_of_a_tag(self): - "Prevent recurrence of a bug in the html5lib treebuilder." - content = """ - - - - - -""" - soup = self.soup(content) - self.assertConnectedness(soup.article) - - def test_basic_namespaces(self): - """Parsers don't need to *understand* namespaces, but at the - very least they should not choke on namespaces or lose - data.""" - - markup = b'4' - soup = self.soup(markup) - self.assertEqual(markup, soup.encode()) - html = soup.html - self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) - self.assertEqual( - 'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) - self.assertEqual( - 'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) - - def test_multivalued_attribute_value_becomes_list(self): - markup = b'' - soup = self.soup(markup) - self.assertEqual(['foo', 'bar'], soup.a['class']) - - # - # Generally speaking, tests below this point are more tests of - # Beautiful Soup than tests of the tree builders. But parsers are - # weird, so we run these tests separately for every tree builder - # to detect any differences between them. - # - - def test_can_parse_unicode_document(self): - # A seemingly innocuous document... but it's in Unicode! And - # it contains characters that can't be represented in the - # encoding found in the declaration! The horror! - markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - soup = self.soup(markup) - self.assertEqual('Sacr\xe9 bleu!', soup.body.string) - - def test_soupstrainer(self): - """Parsers should be able to work with SoupStrainers.""" - strainer = SoupStrainer("b") - soup = self.soup("A bold statement", - parse_only=strainer) - self.assertEqual(soup.decode(), "bold") - - def test_single_quote_attribute_values_become_double_quotes(self): - self.assertSoupEquals("", - '') - - def test_attribute_values_with_nested_quotes_are_left_alone(self): - text = """a""" - self.assertSoupEquals(text) - - def test_attribute_values_with_double_nested_quotes_get_quoted(self): - text = """a""" - soup = self.soup(text) - soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' - self.assertSoupEquals( - soup.foo.decode(), - """a""") - - def test_ampersand_in_attribute_value_gets_escaped(self): - self.assertSoupEquals('', - '') - - self.assertSoupEquals( - 'foo', - 'foo') - - def test_escaped_ampersand_in_attribute_value_is_left_alone(self): - self.assertSoupEquals('') - - def test_entities_in_strings_converted_during_parsing(self): - # Both XML and HTML entities are converted to Unicode characters - # during parsing. - text = "

<<sacré bleu!>>

" - expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" - self.assertSoupEquals(text, expected) - - def test_smart_quotes_converted_on_the_way_in(self): - # Microsoft smart quotes are converted to Unicode characters during - # parsing. - quote = b"

\x91Foo\x92

" - soup = self.soup(quote) - self.assertEqual( - soup.p.string, - "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") - - def test_non_breaking_spaces_converted_on_the_way_in(self): - soup = self.soup("  ") - self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) - - def test_entities_converted_on_the_way_out(self): - text = "

<<sacré bleu!>>

" - expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") - soup = self.soup(text) - self.assertEqual(soup.p.encode("utf-8"), expected) - - def test_real_iso_latin_document(self): - # Smoke test of interrelated functionality, using an - # easy-to-understand document. - - # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. - unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' - - # That's because we're going to encode it into ISO-Latin-1, and use - # that to test. - iso_latin_html = unicode_html.encode("iso-8859-1") - - # Parse the ISO-Latin-1 HTML. - soup = self.soup(iso_latin_html) - # Encode it to UTF-8. - result = soup.encode("utf-8") - - # What do we expect the result to look like? Well, it would - # look like unicode_html, except that the META tag would say - # UTF-8 instead of ISO-Latin-1. - expected = unicode_html.replace("ISO-Latin-1", "utf-8") - - # And, of course, it would be in UTF-8, not Unicode. - expected = expected.encode("utf-8") - - # Ta-da! - self.assertEqual(result, expected) - - def test_real_shift_jis_document(self): - # Smoke test to make sure the parser can handle a document in - # Shift-JIS encoding, without choking. - shift_jis_html = ( - b'
'
-            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
-            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
-            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
-            b'
') - unicode_html = shift_jis_html.decode("shift-jis") - soup = self.soup(unicode_html) - - # Make sure the parse tree is correctly encoded to various - # encodings. - self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) - self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) - - def test_real_hebrew_document(self): - # A real-world test to make sure we can convert ISO-8859-9 (a - # Hebrew encoding) to UTF-8. - hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality

Hebrew (ISO 8859-8) in Visual Directionality

\xed\xe5\xec\xf9' - soup = self.soup( - hebrew_document, from_encoding="iso8859-8") - # Some tree builders call it iso8859-8, others call it iso-8859-9. - # That's not a difference we really care about. - assert soup.original_encoding in ('iso8859-8', 'iso-8859-8') - self.assertEqual( - soup.encode('utf-8'), - hebrew_document.decode("iso8859-8").encode("utf-8")) - - def test_meta_tag_reflects_current_encoding(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - - # Here's a document incorporating that meta tag. - shift_jis_html = ( - '\n%s\n' - '' - 'Shift-JIS markup goes here.') % meta_tag - soup = self.soup(shift_jis_html) - - # Parse the document, and the charset is seemingly unaffected. - parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) - content = parsed_meta['content'] - self.assertEqual('text/html; charset=x-sjis', content) - - # But that value is actually a ContentMetaAttributeValue object. - self.assertTrue(isinstance(content, ContentMetaAttributeValue)) - - # And it will take on a value that reflects its current - # encoding. - self.assertEqual('text/html; charset=utf8', content.encode("utf8")) - - # For the rest of the story, see TestSubstitutions in - # test_tree.py. - - def test_html5_style_meta_tag_reflects_current_encoding(self): - # Here's the tag saying that a document is - # encoded in Shift-JIS. - meta_tag = ('') - - # Here's a document incorporating that meta tag. - shift_jis_html = ( - '\n%s\n' - '' - 'Shift-JIS markup goes here.') % meta_tag - soup = self.soup(shift_jis_html) - - # Parse the document, and the charset is seemingly unaffected. - parsed_meta = soup.find('meta', id="encoding") - charset = parsed_meta['charset'] - self.assertEqual('x-sjis', charset) - - # But that value is actually a CharsetMetaAttributeValue object. - self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) - - # And it will take on a value that reflects its current - # encoding. - self.assertEqual('utf8', charset.encode("utf8")) - - def test_python_specific_encodings_not_used_in_charset(self): - # You can encode an HTML document using a Python-specific - # encoding, but that encoding won't be mentioned _inside_ the - # resulting document. Instead, the document will appear to - # have no encoding. - for markup in [ - b'' - b'' - ]: - soup = self.soup(markup) - for encoding in PYTHON_SPECIFIC_ENCODINGS: - if encoding in ( - 'idna', 'mbcs', 'oem', 'undefined', - 'string_escape', 'string-escape' - ): - # For one reason or another, these will raise an - # exception if we actually try to use them, so don't - # bother. - continue - encoded = soup.encode(encoding) - assert b'meta charset=""' in encoded - assert encoding.encode("ascii") not in encoded - - def test_tag_with_no_attributes_can_have_attributes_added(self): - data = self.soup("text") - data.a['foo'] = 'bar' - self.assertEqual('text', data.a.decode()) - - def test_worst_case(self): - """Test the worst case (currently) for linking issues.""" - - soup = self.soup(BAD_DOCUMENT) - self.linkage_validator(soup) - - -class XMLTreeBuilderSmokeTest(object): - - def test_pickle_and_unpickle_identity(self): - # Pickling a tree, then unpickling it, yields a tree identical - # to the original. - tree = self.soup("foo") - dumped = pickle.dumps(tree, 2) - loaded = pickle.loads(dumped) - self.assertEqual(loaded.__class__, BeautifulSoup) - self.assertEqual(loaded.decode(), tree.decode()) - - def test_docstring_generated(self): - soup = self.soup("") - self.assertEqual( - soup.encode(), b'\n') - - def test_xml_declaration(self): - markup = b"""\n""" - soup = self.soup(markup) - self.assertEqual(markup, soup.encode("utf8")) - - def test_python_specific_encodings_not_used_in_xml_declaration(self): - # You can encode an XML document using a Python-specific - # encoding, but that encoding won't be mentioned _inside_ the - # resulting document. - markup = b"""\n""" - soup = self.soup(markup) - for encoding in PYTHON_SPECIFIC_ENCODINGS: - if encoding in ( - 'idna', 'mbcs', 'oem', 'undefined', - 'string_escape', 'string-escape' - ): - # For one reason or another, these will raise an - # exception if we actually try to use them, so don't - # bother. - continue - encoded = soup.encode(encoding) - assert b'' in encoded - assert encoding.encode("ascii") not in encoded - - def test_processing_instruction(self): - markup = b"""\n""" - soup = self.soup(markup) - self.assertEqual(markup, soup.encode("utf8")) - - def test_real_xhtml_document(self): - """A real XHTML document should come out *exactly* the same as it went in.""" - markup = b""" - - -Hello. -Goodbye. -""" - soup = self.soup(markup) - self.assertEqual( - soup.encode("utf-8"), markup) - - def test_nested_namespaces(self): - doc = b""" - - - - - -""" - soup = self.soup(doc) - self.assertEqual(doc, soup.encode()) - - def test_formatter_processes_script_tag_for_xml_documents(self): - doc = """ - -""" - soup = BeautifulSoup(doc, "lxml-xml") - # lxml would have stripped this while parsing, but we can add - # it later. - soup.script.string = 'console.log("< < hey > > ");' - encoded = soup.encode() - self.assertTrue(b"< < hey > >" in encoded) - - def test_can_parse_unicode_document(self): - markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' - soup = self.soup(markup) - self.assertEqual('Sacr\xe9 bleu!', soup.root.string) - - def test_popping_namespaced_tag(self): - markup = 'b2012-07-02T20:33:42Zcd' - soup = self.soup(markup) - self.assertEqual( - str(soup.rss), markup) - - def test_docstring_includes_correct_encoding(self): - soup = self.soup("") - self.assertEqual( - soup.encode("latin1"), - b'\n') - - def test_large_xml_document(self): - """A large XML document should come out the same as it went in.""" - markup = (b'\n' - + b'0' * (2**12) - + b'') - soup = self.soup(markup) - self.assertEqual(soup.encode("utf-8"), markup) - - - def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): - self.assertSoupEquals("

", "

") - self.assertSoupEquals("

foo

") - - def test_namespaces_are_preserved(self): - markup = 'This tag is in the a namespaceThis tag is in the b namespace' - soup = self.soup(markup) - root = soup.root - self.assertEqual("http://example.com/", root['xmlns:a']) - self.assertEqual("http://example.net/", root['xmlns:b']) - - def test_closing_namespaced_tag(self): - markup = '

20010504

' - soup = self.soup(markup) - self.assertEqual(str(soup.p), markup) - - def test_namespaced_attributes(self): - markup = '' - soup = self.soup(markup) - self.assertEqual(str(soup.foo), markup) - - def test_namespaced_attributes_xml_namespace(self): - markup = 'bar' - soup = self.soup(markup) - self.assertEqual(str(soup.foo), markup) - - def test_find_by_prefixed_name(self): - doc = """ -foo - bar - baz - -""" - soup = self.soup(doc) - - # There are three tags. - self.assertEqual(3, len(soup.find_all('tag'))) - - # But two of them are ns1:tag and one of them is ns2:tag. - self.assertEqual(2, len(soup.find_all('ns1:tag'))) - self.assertEqual(1, len(soup.find_all('ns2:tag'))) - - self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) - self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) - - def test_copy_tag_preserves_namespace(self): - xml = """ -""" - - soup = self.soup(xml) - tag = soup.document - duplicate = copy.copy(tag) - - # The two tags have the same namespace prefix. - self.assertEqual(tag.prefix, duplicate.prefix) - - def test_worst_case(self): - """Test the worst case (currently) for linking issues.""" - - soup = self.soup(BAD_DOCUMENT) - self.linkage_validator(soup) - - -class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): - """Smoke test for a tree builder that supports HTML5.""" - - def test_real_xhtml_document(self): - # Since XHTML is not HTML5, HTML5 parsers are not tested to handle - # XHTML documents in any particular way. - pass - - def test_html_tags_have_namespace(self): - markup = "" - soup = self.soup(markup) - self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) - - def test_svg_tags_have_namespace(self): - markup = '' - soup = self.soup(markup) - namespace = "http://www.w3.org/2000/svg" - self.assertEqual(namespace, soup.svg.namespace) - self.assertEqual(namespace, soup.circle.namespace) - - - def test_mathml_tags_have_namespace(self): - markup = '5' - soup = self.soup(markup) - namespace = 'http://www.w3.org/1998/Math/MathML' - self.assertEqual(namespace, soup.math.namespace) - self.assertEqual(namespace, soup.msqrt.namespace) - - def test_xml_declaration_becomes_comment(self): - markup = '' - soup = self.soup(markup) - self.assertTrue(isinstance(soup.contents[0], Comment)) - self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?') - self.assertEqual("html", soup.contents[0].next_element.name) - -def skipIf(condition, reason): - def nothing(test, *args, **kwargs): - return None - - def decorator(test_item): - if condition: - return nothing - else: - return test_item - - return decorator diff --git a/infra/api/package/bs4/tests/__init__.py b/infra/api/package/bs4/tests/__init__.py deleted file mode 100644 index 142c8cc..0000000 --- a/infra/api/package/bs4/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"The beautifulsoup tests." diff --git a/infra/api/package/bs4/tests/test_builder_registry.py b/infra/api/package/bs4/tests/test_builder_registry.py deleted file mode 100644 index 90cad82..0000000 --- a/infra/api/package/bs4/tests/test_builder_registry.py +++ /dev/null @@ -1,147 +0,0 @@ -"""Tests of the builder registry.""" - -import unittest -import warnings - -from bs4 import BeautifulSoup -from bs4.builder import ( - builder_registry as registry, - HTMLParserTreeBuilder, - TreeBuilderRegistry, -) - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError: - HTML5LIB_PRESENT = False - -try: - from bs4.builder import ( - LXMLTreeBuilderForXML, - LXMLTreeBuilder, - ) - LXML_PRESENT = True -except ImportError: - LXML_PRESENT = False - - -class BuiltInRegistryTest(unittest.TestCase): - """Test the built-in registry with the default builders registered.""" - - def test_combination(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('fast', 'html'), - LXMLTreeBuilder) - - if LXML_PRESENT: - self.assertEqual(registry.lookup('permissive', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('strict', 'html'), - HTMLParserTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib', 'html'), - HTML5TreeBuilder) - - def test_lookup_by_markup_type(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('html'), LXMLTreeBuilder) - self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML) - else: - self.assertEqual(registry.lookup('xml'), None) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html'), HTML5TreeBuilder) - else: - self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder) - - def test_named_library(self): - if LXML_PRESENT: - self.assertEqual(registry.lookup('lxml', 'xml'), - LXMLTreeBuilderForXML) - self.assertEqual(registry.lookup('lxml', 'html'), - LXMLTreeBuilder) - if HTML5LIB_PRESENT: - self.assertEqual(registry.lookup('html5lib'), - HTML5TreeBuilder) - - self.assertEqual(registry.lookup('html.parser'), - HTMLParserTreeBuilder) - - def test_beautifulsoup_constructor_does_lookup(self): - - with warnings.catch_warnings(record=True) as w: - # This will create a warning about not explicitly - # specifying a parser, but we'll ignore it. - - # You can pass in a string. - BeautifulSoup("", features="html") - # Or a list of strings. - BeautifulSoup("", features=["html", "fast"]) - - # You'll get an exception if BS can't find an appropriate - # builder. - self.assertRaises(ValueError, BeautifulSoup, - "", features="no-such-feature") - -class RegistryTest(unittest.TestCase): - """Test the TreeBuilderRegistry class in general.""" - - def setUp(self): - self.registry = TreeBuilderRegistry() - - def builder_for_features(self, *feature_list): - cls = type('Builder_' + '_'.join(feature_list), - (object,), {'features' : feature_list}) - - self.registry.register(cls) - return cls - - def test_register_with_no_features(self): - builder = self.builder_for_features() - - # Since the builder advertises no features, you can't find it - # by looking up features. - self.assertEqual(self.registry.lookup('foo'), None) - - # But you can find it by doing a lookup with no features, if - # this happens to be the only registered builder. - self.assertEqual(self.registry.lookup(), builder) - - def test_register_with_features_makes_lookup_succeed(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('foo'), builder) - self.assertEqual(self.registry.lookup('bar'), builder) - - def test_lookup_fails_when_no_builder_implements_feature(self): - builder = self.builder_for_features('foo', 'bar') - self.assertEqual(self.registry.lookup('baz'), None) - - def test_lookup_gets_most_recent_registration_when_no_feature_specified(self): - builder1 = self.builder_for_features('foo') - builder2 = self.builder_for_features('bar') - self.assertEqual(self.registry.lookup(), builder2) - - def test_lookup_fails_when_no_tree_builders_registered(self): - self.assertEqual(self.registry.lookup(), None) - - def test_lookup_gets_most_recent_builder_supporting_all_features(self): - has_one = self.builder_for_features('foo') - has_the_other = self.builder_for_features('bar') - has_both_early = self.builder_for_features('foo', 'bar', 'baz') - has_both_late = self.builder_for_features('foo', 'bar', 'quux') - lacks_one = self.builder_for_features('bar') - has_the_other = self.builder_for_features('foo') - - # There are two builders featuring 'foo' and 'bar', but - # the one that also features 'quux' was registered later. - self.assertEqual(self.registry.lookup('foo', 'bar'), - has_both_late) - - # There is only one builder featuring 'foo', 'bar', and 'baz'. - self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'), - has_both_early) - - def test_lookup_fails_when_cannot_reconcile_requested_features(self): - builder1 = self.builder_for_features('foo', 'bar') - builder2 = self.builder_for_features('foo', 'baz') - self.assertEqual(self.registry.lookup('bar', 'baz'), None) diff --git a/infra/api/package/bs4/tests/test_docs.py b/infra/api/package/bs4/tests/test_docs.py deleted file mode 100644 index 5b9f677..0000000 --- a/infra/api/package/bs4/tests/test_docs.py +++ /dev/null @@ -1,36 +0,0 @@ -"Test harness for doctests." - -# pylint: disable-msg=E0611,W0142 - -__metaclass__ = type -__all__ = [ - 'additional_tests', - ] - -import atexit -import doctest -import os -#from pkg_resources import ( -# resource_filename, resource_exists, resource_listdir, cleanup_resources) -import unittest - -DOCTEST_FLAGS = ( - doctest.ELLIPSIS | - doctest.NORMALIZE_WHITESPACE | - doctest.REPORT_NDIFF) - - -# def additional_tests(): -# "Run the doc tests (README.txt and docs/*, if any exist)" -# doctest_files = [ -# os.path.abspath(resource_filename('bs4', 'README.txt'))] -# if resource_exists('bs4', 'docs'): -# for name in resource_listdir('bs4', 'docs'): -# if name.endswith('.txt'): -# doctest_files.append( -# os.path.abspath( -# resource_filename('bs4', 'docs/%s' % name))) -# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS) -# atexit.register(cleanup_resources) -# return unittest.TestSuite(( -# doctest.DocFileSuite(*doctest_files, **kwargs))) diff --git a/infra/api/package/bs4/tests/test_html5lib.py b/infra/api/package/bs4/tests/test_html5lib.py deleted file mode 100644 index b77659b..0000000 --- a/infra/api/package/bs4/tests/test_html5lib.py +++ /dev/null @@ -1,190 +0,0 @@ -"""Tests to ensure that the html5lib tree builder generates good trees.""" - -import warnings - -try: - from bs4.builder import HTML5TreeBuilder - HTML5LIB_PRESENT = True -except ImportError as e: - HTML5LIB_PRESENT = False -from bs4.element import SoupStrainer -from bs4.testing import ( - HTML5TreeBuilderSmokeTest, - SoupTest, - skipIf, -) - -@skipIf( - not HTML5LIB_PRESENT, - "html5lib seems not to be present, not testing its tree builder.") -class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): - """See ``HTML5TreeBuilderSmokeTest``.""" - - @property - def default_builder(self): - return HTML5TreeBuilder - - def test_soupstrainer(self): - # The html5lib tree builder does not support SoupStrainers. - strainer = SoupStrainer("b") - markup = "

A bold statement.

" - with warnings.catch_warnings(record=True) as w: - soup = self.soup(markup, parse_only=strainer) - self.assertEqual( - soup.decode(), self.document_for(markup)) - - self.assertTrue( - "the html5lib tree builder doesn't support parse_only" in - str(w[0].message)) - - def test_correctly_nested_tables(self): - """html5lib inserts tags where other parsers don't.""" - markup = ('' - '' - "') - - self.assertSoupEquals( - markup, - '
Here's another table:" - '' - '' - '
foo
Here\'s another table:' - '
foo
' - '
') - - self.assertSoupEquals( - "" - "" - "
Foo
Bar
Baz
") - - def test_xml_declaration_followed_by_doctype(self): - markup = ''' - - - - - -

foo

- -''' - soup = self.soup(markup) - # Verify that we can reach the

tag; this means the tree is connected. - self.assertEqual(b"

foo

", soup.p.encode()) - - def test_reparented_markup(self): - markup = '

foo

\n

bar

' - soup = self.soup(markup) - self.assertEqual("

foo

\n

bar

", soup.body.decode()) - self.assertEqual(2, len(soup.find_all('p'))) - - - def test_reparented_markup_ends_with_whitespace(self): - markup = '

foo

\n

bar

\n' - soup = self.soup(markup) - self.assertEqual("

foo

\n

bar

\n", soup.body.decode()) - self.assertEqual(2, len(soup.find_all('p'))) - - def test_reparented_markup_containing_identical_whitespace_nodes(self): - """Verify that we keep the two whitespace nodes in this - document distinct when reparenting the adjacent tags. - """ - markup = '
' - soup = self.soup(markup) - space1, space2 = soup.find_all(string=' ') - tbody1, tbody2 = soup.find_all('tbody') - assert space1.next_element is tbody1 - assert tbody2.next_element is space2 - - def test_reparented_markup_containing_children(self): - markup = '' - soup = self.soup(markup) - noscript = soup.noscript - self.assertEqual("target", noscript.next_element) - target = soup.find(string='target') - - # The 'aftermath' string was duplicated; we want the second one. - final_aftermath = soup.find_all(string='aftermath')[-1] - - # The