From 65027e0d28a42346d6004c212222a9208e2720f0 Mon Sep 17 00:00:00 2001
From: Elsie Hupp
Date: Fri, 8 Sep 2023 12:23:07 -0400
Subject: [PATCH] Placate mypy (redux)

Signed-off-by: Elsie Hupp
---
 .pre-commit-config.yaml                      |   2 +-
 poetry.lock                                  | 293 ++++++++++++++----
 pyproject.toml                               |   8 +
 wikiteam3/dumpgenerator/__init__.py          |  26 --
 wikiteam3/dumpgenerator/__main__.py          |  30 +-
 wikiteam3/dumpgenerator/api/__init__.py      |   2 +
 wikiteam3/dumpgenerator/api/api.py           |  66 ++--
 wikiteam3/dumpgenerator/api/get_json.py      |   2 +-
 wikiteam3/dumpgenerator/api/index_check.py   |   5 +-
 wikiteam3/dumpgenerator/api/namespaces.py    |  56 ++--
 wikiteam3/dumpgenerator/api/page_titles.py   |  50 +--
 wikiteam3/dumpgenerator/api/wiki_check.py    |   4 +-
 wikiteam3/dumpgenerator/cli/cli.py           |  59 ++--
 wikiteam3/dumpgenerator/cli/delay.py         |   4 +-
 wikiteam3/dumpgenerator/config.py            |  19 +-
 wikiteam3/dumpgenerator/dump/generator.py    |  49 +--
 wikiteam3/dumpgenerator/dump/image/image.py  |  59 ++--
 .../dumpgenerator/dump/misc/index_php.py     |  10 +-
 .../dumpgenerator/dump/misc/site_info.py     |  71 +++--
 .../dumpgenerator/dump/misc/special_logs.py  |   6 +-
 .../dump/misc/special_version.py             |  10 +-
 .../dump/page/xmlexport/page_xml.py          |   5 +-
 .../dump/page/xmlexport/page_xml_api.py      | 164 ++++++----
 .../dump/page/xmlexport/page_xml_export.py   |  44 ++-
 .../dump/page/xmlrev/xml_revisions.py        | 164 +++++-----
 .../dump/page/xmlrev/xml_revisions_page.py   |   6 +-
 .../dumpgenerator/dump/xmldump/xml_dump.py   |  55 ++--
 .../dumpgenerator/dump/xmldump/xml_header.py |  79 +++--
 .../dump/xmldump/xml_integrity.py            |   4 +-
 .../dump/xmldump/xml_truncate.py             |  15 +-
 wikiteam3/dumpgenerator/log/log_error.py     |   2 +-
 wikiteam3/dumpgenerator/test/test_config.py  |   3 +-
 wikiteam3/gui.py                             |  36 +--
 wikiteam3/uploader.py                        |  59 ++--
 wikiteam3/utils/__init__.py                  |   4 +-
 wikiteam3/utils/domain.py                    |   2 +-
 wikiteam3/utils/login/__init__.py            |   2 +-
 wikiteam3/utils/login/api.py                 |   5 +-
 wikiteam3/utils/login/index.py               |   4 +-
 wikiteam3/utils/monkey_patch.py              |   8 +-
 wikiteam3/utils/user_agent.py                |   6 +-
 wikiteam3/utils/wiki_avoid.py                |   4 +-
 42 files changed, 881 insertions(+), 621 deletions(-)
 mode change 100755 => 100644 wikiteam3/dumpgenerator/__init__.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7ef5649c..571cf10f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,7 +9,7 @@ repos:
     rev: 1.6.0
     hooks:
       - id: poetry-check
-      # - id: poetry-lock
+      - id: poetry-lock
       - id: poetry-export
         args: ["-f", "requirements.txt", "-o", "requirements.txt"]
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/poetry.lock b/poetry.lock
index 0ee98d70..23b1d1e0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,10 +1,9 @@
-# This file is automatically @generated by Poetry and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.

 [[package]]
 name = "atomicwrites"
 version = "1.4.1"
 description = "Atomic file writes."
-category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -15,7 +14,6 @@ files = [ name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -30,11 +28,56 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "black" +version = "23.7.0" +description = "The uncompromising code formatter." +optional = false +python-versions = ">=3.8" +files = [ + {file = "black-23.7.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:5c4bc552ab52f6c1c506ccae05681fab58c3f72d59ae6e6639e8885e94fe2587"}, + {file = "black-23.7.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:552513d5cd5694590d7ef6f46e1767a4df9af168d449ff767b13b084c020e63f"}, + {file = "black-23.7.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:86cee259349b4448adb4ef9b204bb4467aae74a386bce85d56ba4f5dc0da27be"}, + {file = "black-23.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:501387a9edcb75d7ae8a4412bb8749900386eaef258f1aefab18adddea1936bc"}, + {file = "black-23.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb074d8b213749fa1d077d630db0d5f8cc3b2ae63587ad4116e8a436e9bbe995"}, + {file = "black-23.7.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:b5b0ee6d96b345a8b420100b7d71ebfdd19fab5e8301aff48ec270042cd40ac2"}, + {file = "black-23.7.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:893695a76b140881531062d48476ebe4a48f5d1e9388177e175d76234ca247cd"}, + {file = "black-23.7.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:c333286dc3ddca6fdff74670b911cccedacb4ef0a60b34e491b8a67c833b343a"}, + {file = "black-23.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831d8f54c3a8c8cf55f64d0422ee875eecac26f5f649fb6c1df65316b67c8926"}, + {file = "black-23.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:7f3bf2dec7d541b4619b8ce526bda74a6b0bffc480a163fed32eb8b3c9aed8ad"}, + {file = "black-23.7.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:f9062af71c59c004cd519e2fb8f5d25d39e46d3af011b41ab43b9c74e27e236f"}, + {file = "black-23.7.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:01ede61aac8c154b55f35301fac3e730baf0c9cf8120f65a9cd61a81cfb4a0c3"}, + {file = "black-23.7.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:327a8c2550ddc573b51e2c352adb88143464bb9d92c10416feb86b0f5aee5ff6"}, + {file = "black-23.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d1c6022b86f83b632d06f2b02774134def5d4d4f1dac8bef16d90cda18ba28a"}, + {file = "black-23.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:27eb7a0c71604d5de083757fbdb245b1a4fae60e9596514c6ec497eb63f95320"}, + {file = "black-23.7.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:8417dbd2f57b5701492cd46edcecc4f9208dc75529bcf76c514864e48da867d9"}, + {file = "black-23.7.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:47e56d83aad53ca140da0af87678fb38e44fd6bc0af71eebab2d1f59b1acf1d3"}, + {file = "black-23.7.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:25cc308838fe71f7065df53aedd20327969d05671bac95b38fdf37ebe70ac087"}, + {file = "black-23.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:642496b675095d423f9b8448243336f8ec71c9d4d57ec17bf795b67f08132a91"}, + {file 
= "black-23.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:ad0014efc7acf0bd745792bd0d8857413652979200ab924fbf239062adc12491"}, + {file = "black-23.7.0-py3-none-any.whl", hash = "sha256:9fd59d418c60c0348505f2ddf9609c1e1de8e7493eab96198fc89d9f865e7a96"}, + {file = "black-23.7.0.tar.gz", hash = "sha256:022a582720b0d9480ed82576c920a8c1dde97cc38ff11d8d8859b3bd6ca9eedb"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + [[package]] name = "certifi" version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -46,7 +89,6 @@ files = [ name = "cfgv" version = "3.4.0" description = "Validate configuration and produce human readable error messages." -category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -58,7 +100,6 @@ files = [ name = "charset-normalizer" version = "3.2.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -139,11 +180,24 @@ files = [ {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + [[package]] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." -category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -155,7 +209,6 @@ files = [ name = "contextlib2" version = "21.6.0" description = "Backports and enhancements for the contextlib module" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -167,7 +220,6 @@ files = [ name = "distlib" version = "0.3.7" description = "Distribution utilities" -category = "dev" optional = false python-versions = "*" files = [ @@ -179,7 +231,6 @@ files = [ name = "docopt" version = "0.6.2" description = "Pythonic argument parser, that will make you smile" -category = "main" optional = false python-versions = "*" files = [ @@ -190,7 +241,6 @@ files = [ name = "file-read-backwards" version = "2.0.0" description = "Memory efficient way of reading files line-by-line from the end of file" -category = "main" optional = false python-versions = "*" files = [ @@ -200,25 +250,26 @@ files = [ [[package]] name = "filelock" -version = "3.12.2" +version = "3.12.3" description = "A platform independent file lock." 
-category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, - {file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"}, + {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, + {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, ] +[package.dependencies] +typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""} + [package.extras] -docs = ["furo (>=2023.5.20)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] +docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"] [[package]] name = "flake8" version = "3.9.2" description = "the modular source code checker: pep8 pyflakes and co" -category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -231,16 +282,34 @@ mccabe = ">=0.6.0,<0.7.0" pycodestyle = ">=2.7.0,<2.8.0" pyflakes = ">=2.3.0,<2.4.0" +[[package]] +name = "flake8-black" +version = "0.3.6" +description = "flake8 plugin to call black as a code style validator" +optional = false +python-versions = ">=3.7" +files = [ + {file = "flake8-black-0.3.6.tar.gz", hash = "sha256:0dfbca3274777792a5bcb2af887a4cad72c72d0e86c94e08e3a3de151bb41c34"}, + {file = "flake8_black-0.3.6-py3-none-any.whl", hash = "sha256:fe8ea2eca98d8a504f22040d9117347f6b367458366952862ac3586e7d4eeaca"}, +] + +[package.dependencies] +black = ">=22.1.0" +flake8 = ">=3" +tomli = {version = "*", markers = "python_version < \"3.11\""} + +[package.extras] +develop = ["build", "twine"] + [[package]] name = "identify" -version = "2.5.26" +version = "2.5.27" description = "File identification library for Python" -category = "dev" optional = false python-versions = ">=3.8" files = [ - {file = "identify-2.5.26-py2.py3-none-any.whl", hash = "sha256:c22a8ead0d4ca11f1edd6c9418c3220669b3b7533ada0a0ffa6cc0ef85cf9b54"}, - {file = "identify-2.5.26.tar.gz", hash = "sha256:7243800bce2f58404ed41b7c002e53d4d22bcf3ae1b7900c2d7aefd95394bf7f"}, + {file = "identify-2.5.27-py2.py3-none-any.whl", hash = "sha256:fdb527b2dfe24602809b2201e033c2a113d7bdf716db3ca8e3243f735dcecaba"}, + {file = "identify-2.5.27.tar.gz", hash = "sha256:287b75b04a0e22d727bc9a41f0d4f3c1bcada97490fa6eabb5b28f0e9097e733"}, ] [package.extras] @@ -250,7 +319,6 @@ license = ["ukkonen"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" optional = false python-versions = ">=3.5" files = [ @@ -262,7 +330,6 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -274,7 +341,6 @@ files = [ name = "internetarchive" version = "3.5.0" description = "A Python interface to archive.org." 
-category = "main" optional = false python-versions = ">=3.7" files = [ @@ -300,7 +366,6 @@ types = ["tqdm-stubs (>=0.2.0)", "types-colorama", "types-docopt (>=0.6.10,<0.7. name = "jsonpatch" version = "1.33" description = "Apply JSON-Patches (RFC 6902)" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -315,7 +380,6 @@ jsonpointer = ">=1.9" name = "jsonpointer" version = "2.4" description = "Identify specific nodes in a JSON document (RFC 6901)" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -327,7 +391,6 @@ files = [ name = "lxml" version = "4.9.3" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ @@ -435,7 +498,6 @@ source = ["Cython (>=0.29.35)"] name = "mccabe" version = "0.6.1" description = "McCabe checker, plugin for flake8" -category = "dev" optional = false python-versions = "*" files = [ @@ -447,7 +509,6 @@ files = [ name = "mwclient" version = "0.10.1" description = "MediaWiki API client" -category = "main" optional = false python-versions = "*" files = [ @@ -459,11 +520,67 @@ files = [ requests-oauthlib = "*" six = "*" +[[package]] +name = "mypy" +version = "1.5.1" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mypy-1.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f33592ddf9655a4894aef22d134de7393e95fcbdc2d15c1ab65828eee5c66c70"}, + {file = "mypy-1.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:258b22210a4a258ccd077426c7a181d789d1121aca6db73a83f79372f5569ae0"}, + {file = "mypy-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9ec1f695f0c25986e6f7f8778e5ce61659063268836a38c951200c57479cc12"}, + {file = "mypy-1.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:abed92d9c8f08643c7d831300b739562b0a6c9fcb028d211134fc9ab20ccad5d"}, + {file = "mypy-1.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:a156e6390944c265eb56afa67c74c0636f10283429171018446b732f1a05af25"}, + {file = "mypy-1.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6ac9c21bfe7bc9f7f1b6fae441746e6a106e48fc9de530dea29e8cd37a2c0cc4"}, + {file = "mypy-1.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:51cb1323064b1099e177098cb939eab2da42fea5d818d40113957ec954fc85f4"}, + {file = "mypy-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:596fae69f2bfcb7305808c75c00f81fe2829b6236eadda536f00610ac5ec2243"}, + {file = "mypy-1.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:32cb59609b0534f0bd67faebb6e022fe534bdb0e2ecab4290d683d248be1b275"}, + {file = "mypy-1.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:159aa9acb16086b79bbb0016145034a1a05360626046a929f84579ce1666b315"}, + {file = "mypy-1.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f6b0e77db9ff4fda74de7df13f30016a0a663928d669c9f2c057048ba44f09bb"}, + {file = "mypy-1.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26f71b535dfc158a71264e6dc805a9f8d2e60b67215ca0bfa26e2e1aa4d4d373"}, + {file = "mypy-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc3a600f749b1008cc75e02b6fb3d4db8dbcca2d733030fe7a3b3502902f161"}, + {file = "mypy-1.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:26fb32e4d4afa205b24bf645eddfbb36a1e17e995c5c99d6d00edb24b693406a"}, + {file = "mypy-1.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:82cb6193de9bbb3844bab4c7cf80e6227d5225cc7625b068a06d005d861ad5f1"}, + {file = "mypy-1.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4a465ea2ca12804d5b34bb056be3a29dc47aea5973b892d0417c6a10a40b2d65"}, + {file = "mypy-1.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9fece120dbb041771a63eb95e4896791386fe287fefb2837258925b8326d6160"}, + {file = "mypy-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d28ddc3e3dfeab553e743e532fb95b4e6afad51d4706dd22f28e1e5e664828d2"}, + {file = "mypy-1.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:57b10c56016adce71fba6bc6e9fd45d8083f74361f629390c556738565af8eeb"}, + {file = "mypy-1.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:ff0cedc84184115202475bbb46dd99f8dcb87fe24d5d0ddfc0fe6b8575c88d2f"}, + {file = "mypy-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8f772942d372c8cbac575be99f9cc9d9fb3bd95c8bc2de6c01411e2c84ebca8a"}, + {file = "mypy-1.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5d627124700b92b6bbaa99f27cbe615c8ea7b3402960f6372ea7d65faf376c14"}, + {file = "mypy-1.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:361da43c4f5a96173220eb53340ace68cda81845cd88218f8862dfb0adc8cddb"}, + {file = "mypy-1.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:330857f9507c24de5c5724235e66858f8364a0693894342485e543f5b07c8693"}, + {file = "mypy-1.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:c543214ffdd422623e9fedd0869166c2f16affe4ba37463975043ef7d2ea8770"}, + {file = "mypy-1.5.1-py3-none-any.whl", hash = "sha256:f757063a83970d67c444f6e01d9550a7402322af3557ce7630d3c957386fa8f5"}, + {file = "mypy-1.5.1.tar.gz", hash = "sha256:b031b9601f1060bf1281feab89697324726ba0c0bae9d7cd7ab4b690940f0b92"}, +] + +[package.dependencies] +mypy-extensions = ">=1.0.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=4.1.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] +reports = ["lxml"] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." 
+optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + [[package]] name = "nodeenv" version = "1.8.0" description = "Node.js virtual environment builder" -category = "dev" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" files = [ @@ -478,7 +595,6 @@ setuptools = "*" name = "oauthlib" version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" -category = "main" optional = false python-versions = ">=3.6" files = [ @@ -495,7 +611,6 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "packaging" version = "23.1" description = "Core utilities for Python packages" -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -503,11 +618,21 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] +[[package]] +name = "pathspec" +version = "0.11.2" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, + {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, +] + [[package]] name = "platformdirs" version = "3.10.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -521,14 +646,13 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co [[package]] name = "pluggy" -version = "1.2.0" +version = "1.3.0" description = "plugin and hook calling mechanisms for python" -category = "dev" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, - {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, + {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, + {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, ] [package.extras] @@ -539,7 +663,6 @@ testing = ["pytest", "pytest-benchmark"] name = "poster3" version = "0.8.1" description = "Streaming HTTP uploads and multipart/form-data encoding" -category = "main" optional = false python-versions = "*" files = [ @@ -553,7 +676,6 @@ poster3 = ["buildutils", "sphinx"] name = "pre-commit" version = "2.21.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." 
-category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -572,7 +694,6 @@ virtualenv = ">=20.10.0" name = "pre-commit-poetry-export" version = "0.1.2" description = "pre-commit hook to keep requirements.txt updated" -category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -584,7 +705,6 @@ files = [ name = "py" version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -596,7 +716,6 @@ files = [ name = "pycodestyle" version = "2.7.0" description = "Python style guide checker" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -608,7 +727,6 @@ files = [ name = "pyflakes" version = "2.3.1" description = "passive checker of Python programs" -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -620,7 +738,6 @@ files = [ name = "pymarkdown" version = "0.1.4" description = "Evaluate code in markdown" -category = "dev" optional = false python-versions = "*" files = [ @@ -634,7 +751,6 @@ toolz = "*" name = "pymysql" version = "1.1.0" description = "Pure Python MySQL Driver" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -650,7 +766,6 @@ rsa = ["cryptography"] name = "pytest" version = "6.2.5" description = "pytest: simple powerful testing with Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -675,7 +790,6 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm name = "pywikibot" version = "6.6.5" description = "Python MediaWiki Bot Framework" -category = "main" optional = false python-versions = ">=3.5.0" files = [ @@ -712,7 +826,6 @@ wikitextparser = ["wikitextparser (>=0.47.0)", "wikitextparser (>=0.47.5)"] name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" -category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -721,6 +834,7 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -728,8 +842,15 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = 
"PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -746,6 +867,7 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -753,6 +875,7 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", 
hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -762,7 +885,6 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -784,7 +906,6 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -803,7 +924,6 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "schema" version = "0.7.5" description = "Simple data validation library" -category = "main" optional = false python-versions = "*" files = [ @@ -818,7 +938,6 @@ contextlib2 = ">=0.5.5" name = "setuptools" version = "68.1.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "main" optional = false python-versions = ">=3.8" files = [ @@ -835,7 +954,6 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -847,7 +965,6 @@ files = [ name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" -category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -855,11 +972,21 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + [[package]] name = "toolz" version = "0.12.0" description = "List processing tools and functional utilities" -category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -871,7 +998,6 @@ files = [ name = "tqdm" version = "4.66.1" description = "Fast, Extensible Progress Meter" -category = "main" optional = false python-versions = ">=3.7" files = [ @@ -888,11 +1014,46 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "types-requests" +version = "2.31.0.2" +description = "Typing stubs for requests" +optional = false +python-versions = "*" +files = [ + {file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"}, + {file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"}, +] + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-urllib3" +version = "1.26.25.14" +description = "Typing stubs for urllib3" +optional = false +python-versions = "*" +files = [ + {file = "types-urllib3-1.26.25.14.tar.gz", 
hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, + {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, +] + +[[package]] +name = "typing-extensions" +version = "4.7.1" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, +] + [[package]] name = "urllib3" version = "1.26.16" description = "HTTP library with thread-safe connection pooling, file post, and more." -category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -907,14 +1068,13 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "virtualenv" -version = "20.24.3" +version = "20.24.4" description = "Virtual Python Environment builder" -category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.24.3-py3-none-any.whl", hash = "sha256:95a6e9398b4967fbcb5fef2acec5efaf9aa4972049d9ae41f95e0972a683fd02"}, - {file = "virtualenv-20.24.3.tar.gz", hash = "sha256:e5c3b4ce817b0b328af041506a2a299418c98747c4b1e68cb7527e74ced23efc"}, + {file = "virtualenv-20.24.4-py3-none-any.whl", hash = "sha256:29c70bb9b88510f6414ac3e55c8b413a1f96239b6b789ca123437d5e892190cb"}, + {file = "virtualenv-20.24.4.tar.gz", hash = "sha256:772b05bfda7ed3b8ecd16021ca9716273ad9f4467c801f27e83ac73430246dca"}, ] [package.dependencies] @@ -923,14 +1083,13 @@ filelock = ">=3.12.2,<4" platformdirs = ">=3.9.1,<4" [package.extras] -docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [[package]] name = "wikitools3" version = "3.0.1" description = "Python package for interacting with a MediaWiki wiki. It is used by WikiTeam for archiving MediaWiki wikis." -category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -944,4 +1103,4 @@ poster3 = ">=0.8.1,<0.9.0" [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "1eee6035c5660e8cba28942140937e2ceb36bf90482e76fa5ddd054efa3c659c" +content-hash = "ebed56288c755209a5da1b75673fdda769a85b22d5f1c26fcb7492d971ffd617" diff --git a/pyproject.toml b/pyproject.toml index 8453bae1..040dbc52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,10 @@ requests = "^2.31.0" flake8 = "^3.9.2" pre-commit = "^2.17.0" pymarkdown = "^0.1.4" +mypy = "^1.5.1" +types-requests = "^2.31.0.2" +# flake8-black may be unnecessary? 
+flake8-black = "^0.3.6"

 [build-system]
 requires = ["poetry-core>=1.0.0"]
@@ -84,3 +88,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.pymarkdown]
 disable-rules = "line-length,no-inline-html"
+
+[tool.mypy]
+check_untyped_defs = true
+ignore_missing_imports = true
diff --git a/wikiteam3/dumpgenerator/__init__.py b/wikiteam3/dumpgenerator/__init__.py
old mode 100755
new mode 100644
index b5da8b1e..e69de29b
--- a/wikiteam3/dumpgenerator/__init__.py
+++ b/wikiteam3/dumpgenerator/__init__.py
@@ -1,26 +0,0 @@
-#!/usr/bin/env python3
-
-# DumpGenerator A generator of dumps for wikis
-# Copyright (C) 2011-2018 WikiTeam developers
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see .
-
-# To learn more, read the documentation:
-# https://github.com/WikiTeam/wikiteam/wiki
-
-
-from wikiteam3.dumpgenerator.dump import DumpGenerator
-
-
-def main():
-    DumpGenerator()
diff --git a/wikiteam3/dumpgenerator/__main__.py b/wikiteam3/dumpgenerator/__main__.py
index 0321cad7..4981f111 100644
--- a/wikiteam3/dumpgenerator/__main__.py
+++ b/wikiteam3/dumpgenerator/__main__.py
@@ -1,6 +1,32 @@
+#!/usr/bin/env python3
+
+# DumpGenerator A generator of dumps for wikis
+# Copyright (C) 2011-2018 WikiTeam developers
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see .
+
+# To learn more, read the documentation:
+# https://github.com/WikiTeam/wikiteam/wiki
+
+
+from wikiteam3.dumpgenerator.dump import DumpGenerator
+
+
+def main():
+    DumpGenerator()
+
+
 if __name__ == "__main__":
     import sys

-    from .__init__ import main
-
     sys.exit(main())
diff --git a/wikiteam3/dumpgenerator/api/__init__.py b/wikiteam3/dumpgenerator/api/__init__.py
index 7d86c175..3748c5e3 100644
--- a/wikiteam3/dumpgenerator/api/__init__.py
+++ b/wikiteam3/dumpgenerator/api/__init__.py
@@ -2,3 +2,5 @@
 from .get_json import getJSON
 from .handle_status_code import handleStatusCode
 from .wiki_check import getWikiEngine
+
+__all__ = [checkAPI, checkRetryAPI, mwGetAPIAndIndex, getJSON, handleStatusCode, getWikiEngine]  # type: ignore
diff --git a/wikiteam3/dumpgenerator/api/api.py b/wikiteam3/dumpgenerator/api/api.py
index 0fa855d0..f3d39488 100644
--- a/wikiteam3/dumpgenerator/api/api.py
+++ b/wikiteam3/dumpgenerator/api/api.py
@@ -1,7 +1,6 @@
 import re
-import time
-from typing import *
-from urllib.parse import urljoin, urlparse, urlunparse
+from typing import Any, Literal, Optional
+from urllib.parse import urljoin, urlparse

 import mwclient
 import requests
@@ -11,7 +10,8 @@
 from .get_json import getJSON


-def checkAPI(api="", session: requests.Session = None):
+# api="", session: requests.Session = None
+def checkAPI(api: str, session: requests.Session):
     """Checking API availability"""
     global cj
     # handle redirects
@@ -34,29 +34,31 @@ def checkAPI(api="", session: requests.Session = None):
                 "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
             )
             return None
-    if "MediaWiki API is not enabled for this site." in r.text:
-        return None
-    try:
-        result = getJSON(r)
-        index = None
-        if result:
-            try:
-                index = (
-                    result["query"]["general"]["server"]
-                    + result["query"]["general"]["script"]
-                )
-                return (True, index, api)
-            except KeyError:
-                print("MediaWiki API seems to work but returned no index URL")
-                return (True, None, api)
-    except ValueError:
-        print(repr(r.text))
-        print("MediaWiki API returned data we could not parse")
-        return None
+    if r is not None:
+        if "MediaWiki API is not enabled for this site."
in r.text: + return None + try: + result = getJSON(r) + index = None + if result: + try: + index = ( + result["query"]["general"]["server"] + + result["query"]["general"]["script"] + ) + return (True, index, api) + except KeyError: + print("MediaWiki API seems to work but returned no index URL") + return (True, None, api) + except ValueError: + print(repr(r.text)) + print("MediaWiki API returned data we could not parse") + return None return None -def mwGetAPIAndIndex(url="", session: requests.Session = None): +# url="" +def mwGetAPIAndIndex(url: str, session: requests.Session): """Returns the MediaWiki API and Index.php""" api = "" @@ -108,18 +110,21 @@ def mwGetAPIAndIndex(url="", session: requests.Session = None): return api, index -def checkRetryAPI(api="", apiclient=False, session: requests.Session = None): +# api="", apiclient=False +def checkRetryAPI(api: str, apiclient: bool, session: requests.Session): """Call checkAPI and mwclient if necessary""" - check = None + check: (tuple[Literal[True], Any, str] | tuple[Literal[True], None, str] | None) try: check = checkAPI(api, session=session) except requests.exceptions.ConnectionError as e: print(f"Connection error: {str(e)}") + check = None if check and apiclient: apiurl = urlparse(api) try: - site = mwclient.Site( + # Returns a value, but we're just checking for an error here + mwclient.Site( apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, @@ -138,13 +143,14 @@ def checkRetryAPI(api="", apiclient=False, session: requests.Session = None): ) try: - site = mwclient.Site( + # Returns a value, but we're just checking for an error here + mwclient.Site( apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=newscheme, pool=session, ) except KeyError: - check = False + check = False # type: ignore - return check, api + return check, api # type: ignore diff --git a/wikiteam3/dumpgenerator/api/get_json.py b/wikiteam3/dumpgenerator/api/get_json.py index 7a3b2273..bd1aa48d 100644 --- a/wikiteam3/dumpgenerator/api/get_json.py +++ b/wikiteam3/dumpgenerator/api/get_json.py @@ -8,6 +8,6 @@ def getJSON(request: requests.Response): # request.encoding = request.apparent_encoding try: return request.json() - except: + except Exception: # Maybe an older API version which did not return correct JSON return {} diff --git a/wikiteam3/dumpgenerator/api/index_check.py b/wikiteam3/dumpgenerator/api/index_check.py index 50ae58c0..d29fa2c9 100644 --- a/wikiteam3/dumpgenerator/api/index_check.py +++ b/wikiteam3/dumpgenerator/api/index_check.py @@ -3,9 +3,10 @@ import requests -def checkIndex(index="", cookies="", session: requests.Session = None): +# index="", cookies="", session=None +def checkIndex(index: str, cookies: str, session: requests.Session): """Checking index.php availability""" - r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) + r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) # type: ignore if r.status_code >= 400: print(f"ERROR: The wiki returned status code HTTP {r.status_code}") return False diff --git a/wikiteam3/dumpgenerator/api/namespaces.py b/wikiteam3/dumpgenerator/api/namespaces.py index b9fbbdeb..93c5f70f 100644 --- a/wikiteam3/dumpgenerator/api/namespaces.py +++ b/wikiteam3/dumpgenerator/api/namespaces.py @@ -1,53 +1,50 @@ import re +import requests + from wikiteam3.dumpgenerator.api import getJSON from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config -def getNamespacesScraper(config: Config = None, session=None): 
+def getNamespacesScraper(config: Config, session: requests.Session): """Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages""" """Function called if no API is available""" namespaces = config.namespaces - namespacenames = {0: ""} # main is 0, no prefix + # namespacenames = {0: ""} # main is 0, no prefix if namespaces: r = session.post( - url=config.index, params={"title": "Special:Allpages"}, timeout=30 + url=config.index, params={"title": "Special:Allpages"}, timeout=30 # type: ignore ) raw = r.text - Delay(config=config, session=session) + Delay(config=config) # [^>]*? to include selected="selected" m = re.compile( r'' ).finditer(raw) if "all" in namespaces: - namespaces = [] - for i in m: - namespaces.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespaces = [int(i.group("namespaceid")) for i in m] + # namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") else: - # check if those namespaces really exist in this wiki - namespaces2 = [] - for i in m: - if int(i.group("namespaceid")) in namespaces: - namespaces2.append(int(i.group("namespaceid"))) - namespacenames[int(i.group("namespaceid"))] = i.group( - "namespacename" - ) + namespaces2 = [ + int(i.group("namespaceid")) + for i in m + if int(i.group("namespaceid")) in namespaces + ] namespaces = namespaces2 else: namespaces = [0] namespaces = list(set(namespaces)) # uniques print("%d namespaces found" % (len(namespaces))) - return namespaces, namespacenames + return namespaces -def getNamespacesAPI(config: Config = None, session=None): +def getNamespacesAPI(config: Config, session: requests.Session): """Uses the API to get the list of namespaces names and ids""" namespaces = config.namespaces - namespacenames = {0: ""} # main is 0, no prefix + # namespacenames = {0: ""} # main is 0, no prefix if namespaces: r = session.get( url=config.api, @@ -60,37 +57,34 @@ def getNamespacesAPI(config: Config = None, session=None): timeout=30, ) result = getJSON(r) - Delay(config=config, session=session) + Delay(config=config) try: nsquery = result["query"]["namespaces"] - except KeyError: + except KeyError as ke: print("Error: could not get namespaces from the API request.") print("HTTP %d" % r.status_code) print(r.text) - return None + raise ke if "all" in namespaces: - namespaces = [] - for i in nsquery.keys(): - if int(i) < 0: # -1: Special, -2: Media, excluding - continue - namespaces.append(int(i)) - namespacenames[int(i)] = nsquery[i]["*"] + namespaces = [int(i) for i in nsquery.keys() if int(i) >= 0] + # -1: Special, -2: Media, excluding + # namespacenames[int(i)] = nsquery[i]["*"] else: # check if those namespaces really exist in this wiki namespaces2 = [] for i in nsquery.keys(): - bi = i + # bi = i i = int(i) if i < 0: # -1: Special, -2: Media, excluding continue if i in namespaces: namespaces2.append(i) - namespacenames[i] = nsquery[bi]["*"] + # namespacenames[i] = nsquery[bi]["*"] namespaces = namespaces2 else: namespaces = [0] namespaces = list(set(namespaces)) # uniques print("%d namespaces found" % (len(namespaces))) - return namespaces, namespacenames + return namespaces diff --git a/wikiteam3/dumpgenerator/api/page_titles.py b/wikiteam3/dumpgenerator/api/page_titles.py index 4e12ba26..d1c9b29e 100644 --- a/wikiteam3/dumpgenerator/api/page_titles.py +++ b/wikiteam3/dumpgenerator/api/page_titles.py @@ -1,9 +1,11 @@ import re -import sys +from typing import List from urllib.parse import urlparse import 
mwclient +import requests from file_read_backwards import FileReadBackwards +from mwclient.page import Page from wikiteam3.dumpgenerator.api.namespaces import ( getNamespacesAPI, @@ -15,10 +17,10 @@ from wikiteam3.utils.monkey_patch import DelaySession -def getPageTitlesAPI(config: Config = None, session=None): +def getPageTitlesAPI(config: Config, session: requests.Session): """Uses the API to get the list of page titles""" titles = [] - namespaces, namespacenames = getNamespacesAPI(config=config, session=session) + namespaces: List[int] = getNamespacesAPI(config=config, session=session) # apply delay to the session for mwclient.Site.allpages() delay_session = DelaySession( @@ -38,10 +40,11 @@ def getPageTitlesAPI(config: Config = None, session=None): scheme=apiurl.scheme, pool=session, ) - for page in site.allpages(namespace=namespace): - title = page.name - titles.append(title) - yield title + for page in site.allpages(namespace=str(namespace)): + if page is Page: + title = page.name + titles.append(title) + yield title if len(titles) != len(set(titles)): print("Probably a loop, switching to next namespace") @@ -50,10 +53,10 @@ def getPageTitlesAPI(config: Config = None, session=None): delay_session.release() -def getPageTitlesScraper(config: Config = None, session=None): +def getPageTitlesScraper(config: Config, session: requests.Session): """Scrape the list of page titles from Special:Allpages""" titles = [] - namespaces, namespacenames = getNamespacesScraper(config=config, session=session) + namespaces = getNamespacesScraper(config=config, session=session) r_title = r'title="(?P[^>]+)">' r_suballpages1 = r'&from=(?P<from>[^>"]+)&to=(?P<to>[^>"]+)">' r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">' @@ -75,7 +78,7 @@ def getPageTitlesScraper(config: Config = None, session=None): elif re.search(r_suballpages3, raw): r_suballpages = r_suballpages3 c = 0 - oldfr = "" + # oldfr = "" checked_suballpages = [] rawacum = raw while r_suballpages and re.search(r_suballpages, raw) and c < deep: @@ -105,10 +108,10 @@ def getPageTitlesScraper(config: Config = None, session=None): if name not in checked_suballpages: # to avoid reload dupe subpages links checked_suballpages.append(name) - Delay(config=config, session=session) + Delay(config=config) # print ('Fetching URL: ', url) r = session.get(url=url, timeout=10) - raw = str(r.text) + raw = r.text raw = cleanHTML(raw) rawacum += raw # merge it after removed junk print( @@ -122,27 +125,26 @@ def getPageTitlesScraper(config: Config = None, session=None): "pages", ) - Delay(config=config, session=session) + Delay(config=config) assert ( currfr is not None ), "re.search found the pattern, but re.finditer fails, why?" 
- oldfr = currfr + # oldfr = currfr c += 1 c = 0 m = re.compile(r_title).finditer(rawacum) for i in m: t = undoHTMLEntities(text=i.group("title")) - if not t.startswith("Special:"): - if t not in titles: - titles.append(t) - c += 1 + if not t.startswith("Special:") and t not in titles: + titles.append(t) + c += 1 print(" %d titles retrieved in the namespace %d" % (c, namespace)) return titles -def getPageTitles(config: Config = None, session=None): +def getPageTitles(config: Config, session: requests.Session): """Get list of page titles""" # http://en.wikipedia.org/wiki/Special:AllPages # http://wiki.archiveteam.org/index.php?title=Special:AllPages @@ -168,7 +170,7 @@ def getPageTitles(config: Config = None, session=None): if config.api: try: titles = getPageTitlesAPI(config=config, session=session) - except: + except Exception: print("Error: could not get page titles from the API") titles = getPageTitlesScraper(config=config, session=session) elif config.index: @@ -193,7 +195,7 @@ def getPageTitles(config: Config = None, session=None): def checkTitleOk( - config: Config = None, + config: Config, ): try: with FileReadBackwards( @@ -208,13 +210,13 @@ def checkTitleOk( lasttitle = frb.readline().strip() if lasttitle == "": lasttitle = frb.readline().strip() - except: + except Exception: lasttitle = "" # probably file does not exists return lasttitle == "--END--" -def readTitles(config: Config = None, session=None, start=None, batch=False): +def readTitles(config: Config, session: requests.Session, start: str, batch: bool): """Read title list from a file, from the title "start" """ if not checkTitleOk(config): getPageTitles(config=config, session=session) @@ -225,7 +227,7 @@ def readTitles(config: Config = None, session=None, start=None, batch=False): titlesfile = open(f"{config.path}/{titlesfilename}", encoding="utf-8") titlelist = [] - seeking = start is not None + seeking = start != "" with titlesfile as f: for line in f: title = line.strip() diff --git a/wikiteam3/dumpgenerator/api/wiki_check.py b/wikiteam3/dumpgenerator/api/wiki_check.py index 93e0465e..b5d9b0d1 100644 --- a/wikiteam3/dumpgenerator/api/wiki_check.py +++ b/wikiteam3/dumpgenerator/api/wiki_check.py @@ -5,13 +5,13 @@ from wikiteam3.utils import getUserAgent -def getWikiEngine(url="", session: requests.Session = None) -> str: +def getWikiEngine(url: str, session: requests.Session) -> str: """Returns the wiki engine of a URL, if known""" if not session: session = requests.Session() # Create a new session session.headers.update({"User-Agent": getUserAgent()}) - r = session.post(url=url, timeout=30) + r = session.post(url=url, timeout=30) # type: ignore if r.status_code == 405 or not r.text: r = session.get(url=url, timeout=120) result = r.text diff --git a/wikiteam3/dumpgenerator/cli/cli.py b/wikiteam3/dumpgenerator/cli/cli.py index 582ca862..bad2e2e3 100644 --- a/wikiteam3/dumpgenerator/cli/cli.py +++ b/wikiteam3/dumpgenerator/cli/cli.py @@ -6,7 +6,7 @@ import queue import re import sys -from typing import * +from typing import Any, Dict, Literal, Tuple import requests import urllib3 @@ -15,10 +15,9 @@ from wikiteam3.dumpgenerator.api.index_check import checkIndex from wikiteam3.dumpgenerator.config import Config, newConfig from wikiteam3.dumpgenerator.version import getVersion -from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text -from wikiteam3.utils.login import uniLogin +from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text, uniLogin +from wikiteam3.utils.user_agent import 
setupUserAgent -from ...utils.user_agent import setupUserAgent from .delay import Delay @@ -223,13 +222,13 @@ def getParameters(params=None) -> Tuple[Config, Dict]: ######################################## # Create session - mod_requests_text(requests) # monkey patch + mod_requests_text(requests) # type: ignore # monkey patch session = requests.Session() # Disable SSL verification if args.insecure: session.verify = False - requests.packages.urllib3.disable_warnings() + urllib3.disable_warnings() print("WARNING: SSL certificate verification disabled") # Custom session retry @@ -241,14 +240,12 @@ def getParameters(params=None) -> Tuple[Config, Dict]: class CustomRetry(Retry): def increment(self, method=None, url=None, *args, **kwargs): if "_pool" in kwargs: - conn = kwargs[ - "_pool" - ] # type: urllib3.connectionpool.HTTPSConnectionPool + conn: urllib3.connectionpool.HTTPSConnectionPool = kwargs["_pool"] if "response" in kwargs: try: # drain conn in advance so that it won't be put back into conn.pool kwargs["response"].drain_conn() - except: + except Exception: pass # Useless, retry happens inside urllib3 # for adapters in session.adapters.values(): @@ -256,12 +253,12 @@ def increment(self, method=None, url=None, *args, **kwargs): # adapters.poolmanager.clear() # Close existing connection so that a new connection will be used - if hasattr(conn, "pool"): + if hasattr(conn, "pool") and conn.pool is not None: pool = conn.pool # type: queue.Queue try: # Don't directly use this, This closes connection pool by making conn.pool = None conn.close() - except: + except Exception: pass conn.pool = pool return super().increment(method=method, url=url, *args, **kwargs) @@ -274,7 +271,8 @@ def sleep(self, response=None): msg = "req retry (%s)" % response.status else: msg = None - Delay(config=None, session=session, msg=msg, delay=backoff) + # config=None + Delay(config=config, msg=msg, delay=backoff) __retries__ = CustomRetry( total=int(args.retries), @@ -292,7 +290,7 @@ def sleep(self, response=None): ) session.mount("https://", HTTPAdapter(max_retries=__retries__)) session.mount("http://", HTTPAdapter(max_retries=__retries__)) - except: + except Exception: # Our urllib3/requests is too old pass @@ -301,7 +299,7 @@ def sleep(self, response=None): if args.cookies: cj.load(args.cookies) print("Using cookies from %s" % args.cookies) - session.cookies = cj + session.cookies = cj # type: ignore # Setup user agent session.headers.update({"User-Agent": getUserAgent()}) @@ -312,17 +310,17 @@ def sleep(self, response=None): session.auth = (args.user, args.password) # Execute meta info params - if args.wiki: - if args.get_wiki_engine: - print(getWikiEngine(url=args.wiki, session=session)) - sys.exit(0) + if args.wiki and args.get_wiki_engine: + print(getWikiEngine(url=args.wiki, session=session)) + sys.exit(0) # Get API and index and verify - api = args.api if args.api else "" - index = args.index if args.index else "" + api: str = args.api or "" + index: str = args.index or "" if api == "" or index == "": if args.wiki: if getWikiEngine(args.wiki, session=session) == "MediaWiki": + index2: str api2, index2 = mwGetAPIAndIndex(args.wiki, session=session) if not api: api = api2 @@ -339,9 +337,12 @@ def sleep(self, response=None): # print (api) # print (index) - index2 = None + index2 = "" - check, checkedapi = False, None + check: ( + tuple[Literal[True], Any, str] | tuple[Literal[True], None, str] | None + ) = False # type: ignore + checkedapi = "" if api: check, checkedapi = checkRetryAPI( api=api, @@ -349,9 
+350,9 @@ def sleep(self, response=None): session=session, ) - if api and check: + if api != "" and check: # Replace the index URL we got from the API check - index2 = check[1] + index2 = str(check[1]) api = checkedapi print("API is OK: ", checkedapi) else: @@ -391,8 +392,10 @@ def sleep(self, response=None): try: index = "/".join(index.split("/")[:-1]) except AttributeError: - index = None - if index and checkIndex(index=index, cookies=args.cookies, session=session): + index = "" + if index != "" and checkIndex( + index=index, cookies=args.cookies, session=session + ): print("index.php is OK") else: print("Error in index.php.") @@ -473,7 +476,7 @@ def sleep(self, response=None): # calculating path, if not defined by user with --path= if not config.path: config.path = "./{}-{}-wikidump".format( - domain2prefix(config=config, session=session), + domain2prefix(config=config), config.date, ) print("No --path argument provided. Defaulting to:") diff --git a/wikiteam3/dumpgenerator/cli/delay.py b/wikiteam3/dumpgenerator/cli/delay.py index 7ebbd021..64e64cd7 100644 --- a/wikiteam3/dumpgenerator/cli/delay.py +++ b/wikiteam3/dumpgenerator/cli/delay.py @@ -1,5 +1,3 @@ -import itertools -import sys import threading import time @@ -21,7 +19,7 @@ def animate(self): time.sleep(0.3) - def __init__(self, config: Config = None, session=None, msg=None, delay=None): + def __init__(self, config: Config, msg=None, delay=None): """Add a delay if configured for that""" self.ellipses: str = "." diff --git a/wikiteam3/dumpgenerator/config.py b/wikiteam3/dumpgenerator/config.py index 21dbff32..97b64424 100644 --- a/wikiteam3/dumpgenerator/config.py +++ b/wikiteam3/dumpgenerator/config.py @@ -19,10 +19,12 @@ } """ +import contextlib import dataclasses import json import sys -from typing import * +from dataclasses import field +from typing import List def _dataclass_from_dict(klass_or_obj, d): @@ -43,7 +45,7 @@ def asdict(self): retries: int = 0 path: str = "" logs: bool = False - date: str = False + date: str = "" # URL params index: str = "" @@ -56,8 +58,8 @@ def asdict(self): xmlrevisions: bool = False xmlrevisions_page: bool = False images: bool = False - namespaces: List[int] = None - exnamespaces: List[int] = None + namespaces: List[int] = field(default_factory=lambda: []) + exnamespaces: List[int] = field(default_factory=lambda: []) api_chunksize: int = 0 # arvlimit, ailimit, etc export: str = "" # Special:Export page name @@ -73,24 +75,21 @@ def newConfig(configDict) -> Config: return _dataclass_from_dict(Config, configDict) -def loadConfig(config: Config = None, configfilename=""): +def loadConfig(config: Config, configfilename=""): """Load config file""" configDict = dataclasses.asdict(config) if config.path: - try: + with contextlib.suppress(Exception): with open(f"{config.path}/{configfilename}", encoding="utf-8") as infile: configDict.update(json.load(infile)) return newConfig(configDict) - except: - pass - print("There is no config file. we can't resume. 
Start a new dump.") sys.exit() -def saveConfig(config: Config = None, configfilename=""): +def saveConfig(config: Config, configfilename=""): """Save config file""" with open(f"{config.path}/{configfilename}", "w", encoding="utf-8") as outfile: diff --git a/wikiteam3/dumpgenerator/dump/generator.py b/wikiteam3/dumpgenerator/dump/generator.py index 80ca3c4f..41fa132d 100644 --- a/wikiteam3/dumpgenerator/dump/generator.py +++ b/wikiteam3/dumpgenerator/dump/generator.py @@ -1,10 +1,12 @@ try: import contextlib - import http.cookiejar + + # import http.cookiejar import os import re import sys import traceback + from typing import List from file_read_backwards import FileReadBackwards @@ -20,7 +22,7 @@ ) sys.exit(1) -from typing import * +from typing import Dict from wikiteam3.dumpgenerator.cli import bye, getParameters, welcome from wikiteam3.dumpgenerator.config import Config, loadConfig, saveConfig @@ -75,7 +77,7 @@ def __init__(params=None): else contextlib.nullcontext() ): print(welcome()) - print(f"Analysing {config.api if config.api else config.index}") + print(f"Analysing {config.api or config.index}") # creating path or resuming if desired c = 2 @@ -124,57 +126,58 @@ def __init__(params=None): bye() @staticmethod - def createNewDump(config: Config = None, other: Dict = None): + def createNewDump(config: Config, other: Dict): + # other: Dict = None # we do lazy title dumping here :) images = [] print("Trying generating a new dump into a new directory...") if config.xml: - generateXMLDump(config=config, session=other["session"]) + generateXMLDump(config=config, resume=False, session=other["session"]) checkXMLIntegrity(config=config, session=other["session"]) if config.images: images += Image.getImageNames(config=config, session=other["session"]) - Image.saveImageNames(config=config, images=images, session=other["session"]) + Image.saveImageNames(config=config, images=images) Image.generateImageDump( config=config, other=other, images=images, session=other["session"] ) if config.logs: saveLogs(config=config, session=other["session"]) + # other: Dict = None @staticmethod - def resumePreviousDump(config: Config = None, other: Dict = None): - images = [] + def resumePreviousDump(config: Config, other: Dict): + images: List[str] = [] print("Resuming previous dump process...") if config.xml: # checking xml dump xmliscomplete = False lastxmltitle = None lastxmlrevid = None - try: + + # Exception means probably file does not exist + with contextlib.suppress(Exception): with FileReadBackwards( "%s/%s-%s-%s.xml" % ( config.path, - domain2prefix(config=config, session=other["session"]), + domain2prefix(config=config), config.date, "current" if config.curonly else "history", ), encoding="utf-8", ) as frb: - for l in frb: - if l.strip() == "</mediawiki>": + for line in frb: + if line.strip() == "</mediawiki>": # xml dump is complete xmliscomplete = True break - if xmlrevid := re.search(r" <id>([^<]+)</id>", l): + if xmlrevid := re.search(r" <id>([^<]+)</id>", line): lastxmlrevid = int(xmlrevid.group(1)) - if xmltitle := re.search(r"<title>([^<]+)", l): + if xmltitle := re.search(r"([^<]+)", line): lastxmltitle = undoHTMLEntities(text=xmltitle.group(1)) break - except: - pass # probably file does not exists - if xmliscomplete: print("XML dump was completed in the previous session") elif lastxmltitle: @@ -190,7 +193,7 @@ def resumePreviousDump(config: Config = None, other: Dict = None): else: # corrupt? only has XML header? print("XML is corrupt? 
Regenerating...") - generateXMLDump(config=config, session=other["session"]) + generateXMLDump(config=config, resume=False, session=other["session"]) if config.images: # load images list @@ -203,7 +206,9 @@ def resumePreviousDump(config: Config = None, other: Dict = None): if os.path.exists(imagesFilePath): with open(imagesFilePath) as f: lines = f.read().splitlines() - images.extend(l.split("\t") for l in lines if re.search(r"\t", l)) + images.extend( + line.split("\t") for line in lines if re.search(r"\t", line) + ) if len(lines) == 0: # empty file lastimage = "--EMPTY--" if not lastimage: @@ -226,16 +231,14 @@ def resumePreviousDump(config: Config = None, other: Dict = None): Image.saveImageNames(config=config, images=images) # checking images directory listdir = [] - try: + with contextlib.suppress(OSError): listdir = os.listdir(f"{config.path}/images") - except OSError: - pass # probably directory does not exist listdir = set(listdir) c_desc = 0 c_images = 0 c_checked = 0 for filename, url, uploader, size, sha1 in images: - lastfilename = filename + # lastfilename = filename if other["filenamelimit"] < len(filename.encode("utf-8")): logerror( config=config, diff --git a/wikiteam3/dumpgenerator/dump/image/image.py b/wikiteam3/dumpgenerator/dump/image/image.py index b79e9ebb..f5eedfc3 100644 --- a/wikiteam3/dumpgenerator/dump/image/image.py +++ b/wikiteam3/dumpgenerator/dump/image/image.py @@ -4,7 +4,7 @@ import sys import time import urllib.parse -from typing import Dict, List, Optional +from typing import Dict, List import requests @@ -20,19 +20,19 @@ class Image: @staticmethod - def getXMLFileDesc(config: Config = None, title="", session=None): + def getXMLFileDesc(config: Config, title: str, session: requests.Session): """Get XML for image description page""" - config.curonly = 1 # tricky to get only the most recent desc + config.curonly = True # tricky to get only the most recent desc return "".join( list(getXMLPage(config=config, title=title, verbose=False, session=session)) ) + # other: Dict = None, + # images: List[List] = None, + # session: requests.Session = None, @staticmethod def generateImageDump( - config: Config = None, - other: Dict = None, - images: List[List] = None, - session: requests.Session = None, + config: Config, other: Dict, images: List[List], session: requests.Session ): """Save files and descriptions using a file list\n Deprecated: `start` is not used anymore.""" @@ -49,7 +49,9 @@ def generateImageDump( bypass_cdn_image_compression: bool = other["bypass_cdn_image_compression"] - def modify_params(params: Optional[Dict] = None) -> Dict: + def modify_params( + params: Dict[str, (str | int)] = {} + ) -> Dict[str, (str | int)]: """bypass Cloudflare Polish (image optimization)""" if params is None: params = {} @@ -101,7 +103,7 @@ def check_response(r: requests.Response) -> None: + "we will not try to download it...", ) else: - Delay(config=config, session=session) + Delay(config=config) original_url = url r = session.head(url=url, params=modify_params(), allow_redirects=True) check_response(r) @@ -116,17 +118,20 @@ def check_response(r: requests.Response) -> None: check_response(r) # Try to fix a broken HTTP to HTTPS redirect - if r.status_code == 404 and original_url_redirected: - if ( + if ( + r.status_code == 404 + and original_url_redirected + and ( original_url.split("://")[0] == "http" and url.split("://")[0] == "https" - ): - url = "https://" + original_url.split("://")[1] - # print 'Maybe a broken http to https redirect, trying ', url - r = session.get( - 
url=url, params=modify_params(), allow_redirects=False - ) - check_response(r) + ) + ): + url = "https://" + original_url.split("://")[1] + # print 'Maybe a broken http to https redirect, trying ', url + r = session.get( + url=url, params=modify_params(), allow_redirects=False + ) + check_response(r) if r.status_code == 200: try: @@ -160,7 +165,7 @@ def check_response(r: requests.Response) -> None: if os.path.isfile(f"{filename3}.desc"): toContinue += 1 else: - Delay(config=config, session=session) + Delay(config=config) # saving description if any title = f"Image:{filename}" try: @@ -231,7 +236,7 @@ def check_response(r: requests.Response) -> None: ) @staticmethod - def getImageNames(config: Config = None, session: requests.Session = None): + def getImageNames(config: Config, session: requests.Session): """Get list of image names""" print(")Retrieving image filenames") @@ -251,7 +256,7 @@ def getImageNames(config: Config = None, session: requests.Session = None): return images @staticmethod - def getImageNamesScraper(config: Config = None, session: requests.Session = None): + def getImageNamesScraper(config: Config, session: requests.Session): """Retrieve file list: filename, url, uploader""" images = [] @@ -268,7 +273,7 @@ def getImageNamesScraper(config: Config = None, session: requests.Session = None timeout=30, ) raw = r.text - Delay(config=config, session=session) + Delay(config=config) # delicate wiki if re.search( r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)", @@ -345,7 +350,7 @@ def getImageNamesScraper(config: Config = None, session: requests.Session = None return images @staticmethod - def getImageNamesAPI(config: Config = None, session: requests.Session = None): + def getImageNamesAPI(config: Config, session: requests.Session): """Retrieve file list: filename, url, uploader, size, sha1""" # # Commented by @yzqzss: # https://www.mediawiki.org/wiki/API:Allpages @@ -377,7 +382,7 @@ def getImageNamesAPI(config: Config = None, session: requests.Session = None): r = session.get(url=config.api, params=params, timeout=30) handleStatusCode(r) jsonimages = getJSON(r) - Delay(config=config, session=session) + Delay(config=config) if "query" in jsonimages: countImages += len(jsonimages["query"]["allimages"]) @@ -465,7 +470,7 @@ def getImageNamesAPI(config: Config = None, session: requests.Session = None): r = session.get(url=config.api, params=params, timeout=30) handleStatusCode(r) jsonimages = getJSON(r) - Delay(config=config, session=session) + Delay(config=config) if "query" not in jsonimages: # if the API doesn't return query data, then we're done @@ -512,7 +517,7 @@ def getImageNamesAPI(config: Config = None, session: requests.Session = None): return images @staticmethod - def saveImageNames(config: Config = None, images: List[List] = None, session=None): + def saveImageNames(config: Config, images: List[List]): """Save image list in a file, including filename, url, uploader, size and sha1""" imagesfilename = "{}-{}-images.txt".format( @@ -545,7 +550,7 @@ def saveImageNames(config: Config = None, images: List[List] = None, session=Non print("Image filenames and URLs saved at...", imagesfilename) @staticmethod - def curateImageURL(config: Config = None, url=""): + def curateImageURL(config: Config, url=""): """Returns an absolute URL for an image, adding the domain if missing""" if config.index: diff --git a/wikiteam3/dumpgenerator/dump/misc/index_php.py b/wikiteam3/dumpgenerator/dump/misc/index_php.py index b2ae3279..ac96adf6 100644 --- 
a/wikiteam3/dumpgenerator/dump/misc/index_php.py +++ b/wikiteam3/dumpgenerator/dump/misc/index_php.py @@ -1,20 +1,22 @@ import os +import requests + from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config from wikiteam3.utils import removeIP -def saveIndexPHP(config: Config = None, session=None): +def saveIndexPHP(config: Config, session: requests.Session): """Save index.php as .html, to preserve license details available at the botom of the page""" if os.path.exists(f"{config.path}/index.html"): print("index.html exists, do not overwrite") else: print("Downloading index.php (Main Page) as index.html") - r = session.post(url=config.index, params=None, timeout=10) - raw = str(r.text) - Delay(config=config, session=session) + r = session.post(url=config.index, params=None, timeout=10) # type: ignore + raw = r.text + Delay(config=config) raw = removeIP(raw=raw) with open(f"{config.path}/index.html", "w", encoding="utf-8") as outfile: outfile.write(raw) diff --git a/wikiteam3/dumpgenerator/dump/misc/site_info.py b/wikiteam3/dumpgenerator/dump/misc/site_info.py index 0a8160f0..a357017b 100644 --- a/wikiteam3/dumpgenerator/dump/misc/site_info.py +++ b/wikiteam3/dumpgenerator/dump/misc/site_info.py @@ -1,58 +1,61 @@ import json import os +import requests + from wikiteam3.dumpgenerator.api import getJSON from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config -def saveSiteInfo(config: Config = None, session=None): +def saveSiteInfo(config: Config, session: requests.Session): """Save a file with site info""" if not config.api: return if os.path.exists(f"{config.path}/siteinfo.json"): print("siteinfo.json exists, do not overwrite") - else: - print("Downloading site info as siteinfo.json") + return + + print("Downloading site info as siteinfo.json") - # MediaWiki 1.13+ + # MediaWiki 1.13+ + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo", + "sinumberingroup": 1, + "format": "json", + }, + timeout=10, + ) + # MediaWiki 1.11-1.12 + if "query" not in getJSON(r): + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "general|namespaces|statistics|dbrepllag|interwikimap", + "format": "json", + }, + timeout=10, + ) + # MediaWiki 1.8-1.10 + if "query" not in getJSON(r): r = session.get( url=config.api, params={ "action": "query", "meta": "siteinfo", - "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo", - "sinumberingroup": 1, + "siprop": "general|namespaces", "format": "json", }, timeout=10, ) - # MediaWiki 1.11-1.12 - if "query" not in getJSON(r): - r = session.get( - url=config.api, - params={ - "action": "query", - "meta": "siteinfo", - "siprop": "general|namespaces|statistics|dbrepllag|interwikimap", - "format": "json", - }, - timeout=10, - ) - # MediaWiki 1.8-1.10 - if "query" not in getJSON(r): - r = session.get( - url=config.api, - params={ - "action": "query", - "meta": "siteinfo", - "siprop": "general|namespaces", - "format": "json", - }, - timeout=10, - ) - result = getJSON(r) - Delay(config=config, session=session) - with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile: - outfile.write(json.dumps(result, indent=4, 
sort_keys=True)) + result = getJSON(r) + Delay(config=config) + with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile: + outfile.write(json.dumps(result, indent=4, sort_keys=True)) diff --git a/wikiteam3/dumpgenerator/dump/misc/special_logs.py b/wikiteam3/dumpgenerator/dump/misc/special_logs.py index 0b35939d..666c8a1e 100644 --- a/wikiteam3/dumpgenerator/dump/misc/special_logs.py +++ b/wikiteam3/dumpgenerator/dump/misc/special_logs.py @@ -1,8 +1,10 @@ +import requests + from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config -def saveLogs(config: Config = None, session=None): +def saveLogs(config: Config, session: requests.Session): """Save Special:Log""" # get all logs from Special:Log """parse @@ -20,4 +22,4 @@ def saveLogs(config: Config = None, session=None): """ - Delay(config=config, session=session) + Delay(config=config) diff --git a/wikiteam3/dumpgenerator/dump/misc/special_version.py b/wikiteam3/dumpgenerator/dump/misc/special_version.py index 55473373..c15e175f 100644 --- a/wikiteam3/dumpgenerator/dump/misc/special_version.py +++ b/wikiteam3/dumpgenerator/dump/misc/special_version.py @@ -1,11 +1,13 @@ import os +import requests + from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config from wikiteam3.utils import removeIP -def saveSpecialVersion(config: Config = None, session=None): +def saveSpecialVersion(config: Config, session: requests.Session): """Save Special:Version as .html, to preserve extensions details""" if os.path.exists(f"{config.path}/SpecialVersion.html"): @@ -13,10 +15,10 @@ def saveSpecialVersion(config: Config = None, session=None): else: print("Downloading Special:Version with extensions and other related info") r = session.post( - url=config.index, params={"title": "Special:Version"}, timeout=10 + url=config.index, params={"title": "Special:Version"}, timeout=10 # type: ignore ) - raw = str(r.text) - Delay(config=config, session=session) + raw = r.text + Delay(config=config) raw = str(removeIP(raw=raw)) with open( f"{config.path}/SpecialVersion.html", "w", encoding="utf-8" diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py index 277b05f9..59d9d6e8 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py @@ -1,10 +1,13 @@ +import requests + from wikiteam3.dumpgenerator.config import Config from .page_xml_api import getXMLPageWithApi from .page_xml_export import getXMLPageWithExport -def getXMLPage(config: Config = None, title="", verbose=True, session=None): +# title="", verbose=True +def getXMLPage(config: Config, title: str, verbose: bool, session: requests.Session): if config.xmlapiexport: return getXMLPageWithApi( config=config, title=title, verbose=verbose, session=session diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py index 9e9b676e..f6a158ae 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py @@ -1,7 +1,7 @@ import re import time import traceback -from typing import * +from typing import Dict import requests @@ -11,58 +11,71 @@ from wikiteam3.dumpgenerator.log import logerror try: - import xml.etree.ElementTree as ET + import xml.etree.ElementTree as ElementTree except ImportError: - import xml.etree.ElementTree as ET + import 
xml.etree.ElementTree as ElementTree import xml.dom.minidom as MD -def reconstructRevisions(root=None): - # print ET.tostring(rev) - page = ET.Element("stub") +def reconstructRevisions(root: ElementTree.Element): + # print ElementTree.tostring(rev) + page = ElementTree.Element("stub") edits = 0 - for rev in ( - root.find("query").find("pages").find("page").find("revisions").findall("rev") - ): + + query: (ElementTree.Element | None) = root.find("query") + if query is None: + raise ValueError("query was none") + pages: (ElementTree.Element | None) = query.find("pages") + if pages is None: + raise ValueError("pages was none") + page_element: (ElementTree.Element | None) = query.find("page") + if page_element is None: + raise ValueError("page was none") + revisions: (ElementTree.Element | None) = page_element.find("revisions") + if revisions is None: + raise ValueError("revisions was none") + for rev in revisions.findall("rev"): try: - rev_ = ET.SubElement(page, "revision") + rev_ = ElementTree.SubElement(page, "revision") # id - ET.SubElement(rev_, "id").text = rev.attrib["revid"] + ElementTree.SubElement(rev_, "id").text = rev.attrib["revid"] # parentid (optional, export-0.7+) if "parentid" in rev.attrib: - ET.SubElement(rev_, "parentid").text = rev.attrib["parentid"] + ElementTree.SubElement(rev_, "parentid").text = rev.attrib["parentid"] # timestamp - ET.SubElement(rev_, "timestamp").text = rev.attrib["timestamp"] + ElementTree.SubElement(rev_, "timestamp").text = rev.attrib["timestamp"] # contributor - contributor = ET.SubElement(rev_, "contributor") + contributor = ElementTree.SubElement(rev_, "contributor") if "userhidden" not in rev.attrib: - ET.SubElement(contributor, "username").text = rev.attrib["user"] - ET.SubElement(contributor, "id").text = rev.attrib["userid"] + ElementTree.SubElement(contributor, "username").text = rev.attrib[ + "user" + ] + ElementTree.SubElement(contributor, "id").text = rev.attrib["userid"] else: contributor.set("deleted", "deleted") # comment (optional) if "commenthidden" in rev.attrib: print("commenthidden") - comment = ET.SubElement(rev_, "comment") + comment = ElementTree.SubElement(rev_, "comment") comment.set("deleted", "deleted") elif "comment" in rev.attrib and rev.attrib["comment"]: # '' is empty - comment = ET.SubElement(rev_, "comment") + comment = ElementTree.SubElement(rev_, "comment") comment.text = rev.attrib["comment"] # minor edit (optional) if "minor" in rev.attrib: - ET.SubElement(rev_, "minor") + ElementTree.SubElement(rev_, "minor") # model and format (optional, export-0.8+) if "contentmodel" in rev.attrib: - ET.SubElement(rev_, "model").text = rev.attrib[ + ElementTree.SubElement(rev_, "model").text = rev.attrib[ "contentmodel" ] # default: 'wikitext' if "contentformat" in rev.attrib: - ET.SubElement(rev_, "format").text = rev.attrib[ + ElementTree.SubElement(rev_, "format").text = rev.attrib[ "contentformat" ] # default: 'text/x-wiki' # text - text = ET.SubElement(rev_, "text") + text = ElementTree.SubElement(rev_, "text") if "texthidden" not in rev.attrib: text.attrib["xml:space"] = "preserve" text.attrib["bytes"] = rev.attrib["size"] @@ -72,24 +85,28 @@ def reconstructRevisions(root=None): text.set("deleted", "deleted") # sha1 if "sha1" in rev.attrib: - sha1 = ET.SubElement(rev_, "sha1") + sha1 = ElementTree.SubElement(rev_, "sha1") sha1.text = rev.attrib["sha1"] elif "sha1hidden" in rev.attrib: - ET.SubElement(rev_, "sha1") # stub + ElementTree.SubElement(rev_, "sha1") # stub edits += 1 except Exception as e: - # 
logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev))) - print(ET.tostring(rev)) + # logerror(config=config, text='Error reconstructing revision, xml:%s' % (ElementTree.tostring(rev))) + print(ElementTree.tostring(rev)) traceback.print_exc() - page = None + page = None # type: ignore edits = 0 raise e return page, edits +# headers: Dict = None, params: Dict = None def getXMLPageCoreWithApi( - headers: Dict = None, params: Dict = None, config: Config = None, session=None + headers: Dict, + params: Dict[str, (str | int)], + config: Config, + session: requests.Session, ): """ """ # just send the API request @@ -101,7 +118,7 @@ def getXMLPageCoreWithApi( increment = 20 # increment every retry while not re.search( - r"" if not config.curonly else r"", xml + r"" if config.curonly else r"", xml ) or re.search(r"", xml): if c > 0 and c < maxretries: wait = ( @@ -114,8 +131,8 @@ def getXMLPageCoreWithApi( time.sleep(wait) # reducing server load requesting smallest chunks (if curonly then # rvlimit = 1 from mother function) - if params["rvlimit"] > 1: - params["rvlimit"] = params["rvlimit"] / 2 # half + if int(params["rvlimit"]) > 1: + params["rvlimit"] = int(params["rvlimit"]) // 2 # half if c >= maxretries: print(" We have retried %d times" % (c)) print( @@ -130,7 +147,7 @@ def getXMLPageCoreWithApi( print(" Saving in the errors log, and skipping...") logerror( config=config, - text=f'Error while retrieving the last revision of "{params["titles" if config.xmlapiexport else "pages"].decode("utf-8")}". Skipping.', + text=f'Error while retrieving the last revision of "{params["titles" if config.xmlapiexport else "pages"]}". Skipping.', # .decode("utf-8") ) raise ExportAbortedError(config.index) # FIXME HANDLE HTTP Errors HERE @@ -149,7 +166,10 @@ def getXMLPageCoreWithApi( return xml -def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=None): +# title="", verbose=True +def getXMLPageWithApi( + config: Config, title: str, verbose: bool, session: requests.Session +): """Get the full history (or current only) of a page using API:Query if params['curonly'] is set, then using export&exportwrap to export """ @@ -170,42 +190,52 @@ def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=Non "rvcontinue": None, "rvlimit": config.api_chunksize, } - firstpartok = False - lastcontinue = None + firstpartok: bool = False + lastcontinue: str = "" numberofedits = 0 ret = "" - continueKey: Optional[str] = None + continueKey: str = "" while True: # in case the last request is not right, saving last time's progress if not firstpartok: try: lastcontinue = params[continueKey] - except: - lastcontinue = None + except Exception: + lastcontinue = "" - xml = getXMLPageCoreWithApi(params=params, config=config, session=session) + xml = getXMLPageCoreWithApi( + headers={}, params=params, config=config, session=session + ) if xml == "": # just return so that we can continue, and getXMLPageCoreWithApi will log the error return try: - root = ET.fromstring(xml.encode("utf-8")) - except: + root = ElementTree.fromstring(xml.encode("utf-8")) + except Exception: continue try: - retpage = root.find("query").find("pages").find("page") - except: + ret_query: (ElementTree.Element | None) = root.find("query") + if ret_query is None: + raise Exception("query was none") + ret_pages: (ElementTree.Element | None) = root.find("pages") + if ret_pages is None: + raise Exception("pages was none") + ret_page = ret_pages.find("page") + if ret_page is None: + continue + 
except Exception: continue - if "missing" in retpage.attrib or "invalid" in retpage.attrib: + if "missing" in ret_page.attrib or "invalid" in ret_page.attrib: print("Page not found") raise PageMissingError(params["titles"], xml) if not firstpartok: try: # build the firstpart by ourselves to improve the memory usage ret = " \n" - ret += " %s\n" % (retpage.attrib["title"]) - ret += " %s\n" % (retpage.attrib["ns"]) - ret += " %s\n" % (retpage.attrib["pageid"]) - except: + ret += " %s\n" % (ret_page.attrib["title"]) + ret += " %s\n" % (ret_page.attrib["ns"]) + ret += " %s\n" % (ret_page.attrib["pageid"]) + except Exception: firstpartok = False continue else: @@ -213,30 +243,34 @@ def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=Non yield ret continueVal = None - if root.find("continue") is not None: + continue_element: (ElementTree.Element | None) = root.find("continue") + query_continue_element: (ElementTree.Element | None) = root.find( + "query-continue" + ) + if continue_element is not None: # uses continue.rvcontinue # MW 1.26+ continueKey = "rvcontinue" - continueVal = root.find("continue").attrib["rvcontinue"] - elif root.find("query-continue") is not None: - revContinue = root.find("query-continue").find("revisions") - assert revContinue is not None, "Should only have revisions continue" - if "rvcontinue" in revContinue.attrib: + continueVal = continue_element.attrib["rvcontinue"] + elif query_continue_element is not None: + rev_continue = query_continue_element.find("revisions") + assert rev_continue is not None, "Should only have revisions continue" + if "rvcontinue" in rev_continue.attrib: # MW 1.21 ~ 1.25 continueKey = "rvcontinue" - continueVal = revContinue.attrib["rvcontinue"] - elif "rvstartid" in revContinue.attrib: + continueVal = rev_continue.attrib["rvcontinue"] + elif "rvstartid" in rev_continue.attrib: # TODO: MW ???? 
continueKey = "rvstartid" - continueVal = revContinue.attrib["rvstartid"] + continueVal = rev_continue.attrib["rvstartid"] else: # blindly assume the first attribute is the continue key # may never happen assert ( - len(revContinue.attrib) > 0 + len(rev_continue.attrib) > 0 ), "Should have at least one attribute" - for continueKey in revContinue.attrib.keys(): - continueVal = revContinue.attrib[continueKey] + for continueKey in rev_continue.attrib.keys(): + continueVal = rev_continue.attrib[continueKey] break if continueVal is not None: params[continueKey] = continueVal @@ -246,7 +280,9 @@ def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=Non # transform the revision rev_, edits = reconstructRevisions(root=root) - xmldom = MD.parseString(b"" + ET.tostring(rev_) + b"") + xmldom = MD.parseString( + b"" + ElementTree.tostring(rev_) + b"" + ) # convert it into text in case it throws MemoryError # delete the first three line and last two line,which is for setting the indent ret += "".join(xmldom.toprettyxml(indent=" ").splitlines(True)[3:-2]) @@ -254,7 +290,7 @@ def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=Non numberofedits += edits if config.curonly or continueVal is None: # no continue break - except: + except Exception: traceback.print_exc() params["rvcontinue"] = lastcontinue ret = "" @@ -267,7 +303,9 @@ def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=Non "export": 1, "exportnowrap": 1, } - xml = getXMLPageCoreWithApi(params=params, config=config, session=session) + xml = getXMLPageCoreWithApi( + headers={}, params=params, config=config, session=session + ) if xml == "": raise ExportAbortedError(config.index) if "" not in xml: diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py index 350dbd36..7d67f55f 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py @@ -1,7 +1,7 @@ import re import sys import time -from typing import * +from typing import Dict import requests @@ -12,8 +12,12 @@ from wikiteam3.utils import uprint +# headers: Dict = None, params: Dict = None def getXMLPageCore( - headers: Dict = None, params: Dict = None, config: Config = None, session=None + headers: Dict, + params: Dict[str, (str | int)], + config: Config, + session: requests.Session, ) -> str: """""" # returns a XML containing params['limit'] revisions (or current only), ending in @@ -37,8 +41,8 @@ def getXMLPageCore( time.sleep(wait) # reducing server load requesting smallest chunks (if curonly then # limit = 1 from mother function) - if params["limit"] > 1: - params["limit"] = params["limit"] / 2 # half + if int(params["limit"]) > 1: + params["limit"] = int(params["limit"]) // 2 # half if c >= maxretries: print(" We have retried %d times" % (c)) print( @@ -52,9 +56,9 @@ def getXMLPageCore( # params['curonly'] should mean that we've already tried this # fallback, because it's set by the following if and passed to # getXMLPageCore - if not config.curonly and "curonly" not in params: + if not config.curonly: # and "curonly" not in params: print(" Trying to save only the last revision for this page...") - params["curonly"] = 1 + params["curonly"] = True logerror( config=config, to_stdout=True, @@ -75,7 +79,7 @@ def getXMLPageCore( try: r = session.post( url=config.index, params=params, headers=headers, timeout=10 - ) + ) # type: ignore handleStatusCode(r) 
xml = r.text except requests.exceptions.ConnectionError as e: @@ -89,7 +93,9 @@ def getXMLPageCore( return xml -def getXMLPageWithExport(config: Config = None, title="", verbose=True, session=None): +def getXMLPageWithExport( + config: Config, title: str, verbose: bool, session: requests.Session +): """Get the full history (or current only) of a page""" truncated = False @@ -97,9 +103,17 @@ def getXMLPageWithExport(config: Config = None, title="", verbose=True, session= title_ = re.sub(" ", "_", title_) # do not convert & into %26, title_ = re.sub('&', '%26', title_) if config.export: - params = {"title": config.export, "pages": title_, "action": "submit"} + params: Dict[str, (str | int)] = { + "title": config.export, + "pages": title_, + "action": "submit", + } else: - params = {"title": "Special:Export", "pages": title_, "action": "submit"} + params = { + "title": "Special:Export", + "pages": title_, + "action": "submit", + } if config.curonly: params["curonly"] = 1 params["limit"] = 1 @@ -114,7 +128,7 @@ def getXMLPageWithExport(config: Config = None, title="", verbose=True, session= if config.templates: params["templates"] = 1 - xml = getXMLPageCore(params=params, config=config, session=session) + xml = getXMLPageCore(headers={}, params=params, config=config, session=session) if xml == "": raise ExportAbortedError(config.index) if "" not in xml: @@ -139,10 +153,12 @@ def getXMLPageWithExport(config: Config = None, title="", verbose=True, session= # get the last timestamp from the acum XML params["offset"] = re.findall(r_timestamp, xml)[-1] try: - xml2 = getXMLPageCore(params=params, config=config, session=session) + xml2 = getXMLPageCore( + headers={}, params=params, config=config, session=session + ) except MemoryError: print("The page's history exceeds our memory, halving limit.") - params["limit"] /= 2 + params["limit"] = int(params["limit"]) // 2 continue # are there more edits in this next XML chunk or no ? @@ -177,7 +193,7 @@ def getXMLPageWithExport(config: Config = None, title="", verbose=True, session= ) except MemoryError: "The page's history exceeds our memory, halving limit." 
- params["limit"] /= 2 + params["limit"] = int(params["limit"]) // 2 continue xml = xml2 edit_count += len(re.findall(r_timestamp, xml)) diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py index 1af38c9c..958072d6 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py @@ -1,14 +1,15 @@ import sys import time -from datetime import datetime -from typing import * +from typing import List from urllib.parse import urlparse import lxml.etree import mwclient import requests +from lxml.etree import _ElementTree as ElementTree +from mwclient.errors import InvalidResponse, MwClientError -from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI +# from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI from wikiteam3.dumpgenerator.api.page_titles import readTitles from wikiteam3.dumpgenerator.config import Config from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import ( @@ -22,9 +23,8 @@ def getXMLRevisionsByAllRevisions( - config: Config = None, - session=None, - site: mwclient.Site = None, + config: Config, + site: mwclient.Site, # = None, nscontinue=None, arvcontinue=None, ): @@ -62,55 +62,7 @@ def getXMLRevisionsByAllRevisions( if _arvcontinue is not None: arvparams["arvcontinue"] = _arvcontinue - if not config.curonly: - # We have to build the XML manually... - # Skip flags, presumably needed to add which is in the schema. - # Also missing: parentid and contentformat. - arvparams[ - "arvprop" - ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags" - print( - "Trying to get wikitext from the allrevisions API and to build the XML" - ) - while True: - try: - arvrequest = site.api(http_method=config.http_method, **arvparams) - except requests.exceptions.HTTPError as e: - if e.response.status_code != 405 or config.http_method != "POST": - raise - print("POST request to the API failed, retrying with GET") - config.http_method = "GET" - continue - except requests.exceptions.ReadTimeout as err: - # Hopefully temporary, just wait a bit and continue with the same request. - # No point putting a limit to retries, we'd need to abort everything. - # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient - # to use the retry adapter we use for our own requests session? - print(f"ERROR: {str(err)}") - print("Sleeping for 20 seconds") - time.sleep(20) - continue - except mwclient.errors.InvalidResponse as e: - if ( - not e.response_text.startswith("") - or config.http_method != "POST" - ): - raise - - print( - "POST request to the API failed (got HTML), retrying with GET" - ) - config.http_method = "GET" - continue - for page in arvrequest["query"]["allrevisions"]: - yield makeXmlFromPage(page, arvparams.get("arvcontinue", "")) - if "continue" in arvrequest: - arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"] - else: - # End of continuation. We are done with this namespace. 
- break - - else: + if config.curonly: # FIXME: this is not curonly, just different strategy to do all revisions # Just cycle through revision IDs and use the XML as is print("Trying to list the revisions and to export them one by one") @@ -189,22 +141,69 @@ def getXMLRevisionsByAllRevisions( ) except requests.exceptions.ReadTimeout as err: # As above - print(f"ERROR: {str(err)}") - print("Sleeping for 20 seconds") + print(f"ERROR: {str(err)}\nSleeping for 20 seconds") time.sleep(20) # But avoid rewriting the same revisions arvrequest["query"]["allrevisions"] = [] + else: + # We have to build the XML manually... + # Skip flags, presumably needed to add which is in the schema. + # Also missing: parentid and contentformat. + arvparams[ + "arvprop" + ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags" + print( + "Trying to get wikitext from the allrevisions API and to build the XML" + ) + while True: + try: + arvrequest = site.api(http_method=config.http_method, **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code != 405 or config.http_method != "POST": + raise + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + continue + except requests.exceptions.ReadTimeout as err: + # Hopefully temporary, just wait a bit and continue with the same request. + # No point putting a limit to retries, we'd need to abort everything. + # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient + # to use the retry adapter we use for our own requests session? + print(f"ERROR: {str(err)}") + print("Sleeping for 20 seconds") + time.sleep(20) + continue + except InvalidResponse as e: + if ( + e.response_text is not None + and not e.response_text.startswith("") + ) or config.http_method != "POST": + raise + + print( + "POST request to the API failed (got HTML), retrying with GET" + ) + config.http_method = "GET" + continue + for page in arvrequest["query"]["allrevisions"]: + yield makeXmlFromPage(page, arvparams.get("arvcontinue", "")) + if "continue" in arvrequest: + arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"] + else: + # End of continuation. We are done with this namespace. + break + def getXMLRevisionsByTitles( - config: Config = None, session=None, site: mwclient.Site = None, start=None + config: Config, session: requests.Session, site: mwclient.Site, start: str ): c = 0 if config.curonly: # The raw XML export in the API gets a title and gives the latest revision. # We could also use the allpages API as generator but let's be consistent. print("Getting titles to export the latest revision for each") - for title in readTitles(config, session=session, start=start): + for title in readTitles(config, session=session, start=start, batch=False): # TODO: respect verbose flag, reuse output from getXMLPage print(f" {title}") # TODO: as we're doing one page and revision at a time, we might @@ -238,7 +237,7 @@ def getXMLRevisionsByTitles( # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). print("Getting titles to export all the revisions of each") - titlelist = [] + titlelist: (str | List[str]) = [] # TODO: Decide a suitable number of a batched request. Careful: # batched responses may not return all revisions. for titlelist in readTitles(config, session=session, start=start, batch=False): @@ -248,9 +247,11 @@ def getXMLRevisionsByTitles( print(f" {title}") # Try and ask everything. 
At least on MediaWiki 1.16, uknown props are discarded: # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} + if titlelist is List: + titlelist = "|".join(titlelist) pparams = { "action": "query", - "titles": "|".join(titlelist), + "titles": titlelist, "prop": "revisions", "rvlimit": config.api_chunksize, "rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags", @@ -263,11 +264,13 @@ def getXMLRevisionsByTitles( print("POST request to the API failed, retrying with GET") config.http_method = "GET" prequest = site.api(http_method=config.http_method, **pparams) - except mwclient.errors.InvalidResponse: + except InvalidResponse: + if titlelist is List: + titlelist = "; ".join(titlelist) logerror( config=config, to_stdout=True, - text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}', + text=f"Error: page inaccessible? Could not export page: {titlelist}", ) continue @@ -279,10 +282,12 @@ def getXMLRevisionsByTitles( try: pages = prequest["query"]["pages"] except KeyError: + if titlelist is List: + titlelist = "; ".join(titlelist) logerror( config=config, to_stdout=True, - text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}', + text=f"Error: page inaccessible? Could not export page: {titlelist}", ) break # Go through the data we got to build the XML. @@ -290,10 +295,12 @@ def getXMLRevisionsByTitles( try: yield makeXmlFromPage(pages[pageid], None) except PageMissingError: + if titlelist is List: + titlelist = "; ".join(titlelist) logerror( config=config, to_stdout=True, - text=f'Error: empty revision from API. Could not export page: {"; ".join(titlelist)}', + text=f"Error: empty revision from API. Could not export page: {titlelist}", ) continue @@ -324,8 +331,12 @@ def getXMLRevisionsByTitles( print(f"\n-> Downloaded {c} pages\n") +# useAllrevision=True, lastPage=None def getXMLRevisions( - config: Config = None, session=None, useAllrevision=True, lastPage=None + config: Config, + session: requests.Session, + useAllrevision: bool, + lastPage: (ElementTree | None), ): # FIXME: actually figure out the various strategies for each MediaWiki version apiurl = urlparse(config.api) @@ -342,7 +353,7 @@ def getXMLRevisions( # Find last title if lastPage is not None: try: - lastNs = int(lastPage.find("ns").text) + lastNs = int(lastPage.find("ns", None).text) lastArvcontinue = lastPage.attrib["arvcontinue"] except Exception: print( @@ -350,43 +361,38 @@ def getXMLRevisions( ) raise nscontinue = lastNs - arvcontinue = lastArvcontinue - if not arvcontinue: - arvcontinue = None + arvcontinue = lastArvcontinue or None else: nscontinue = None arvcontinue = None try: - return getXMLRevisionsByAllRevisions( - config, session, site, nscontinue, arvcontinue - ) - except (KeyError, mwclient.errors.InvalidResponse) as e: - print(e) + return getXMLRevisionsByAllRevisions(config, site, nscontinue, arvcontinue) + except (KeyError, InvalidResponse) as e: # TODO: check whether the KeyError was really for a missing arv API print( - "Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page" + f"{str(e)}/nWarning. Could not use allrevisions. Wiki too old? 
Try to use --xmlrevisions_page" ) sys.exit() else: # Find last title if lastPage is not None: try: - start = lastPage.find("title") + start = lastPage.find("title", None) except Exception: print( f"Failed to find title in last trunk XML: {lxml.etree.tostring(lastPage)}" ) raise else: - start = None + start = "" try: # # Uncomment these lines to raise an KeyError for testing # raise KeyError(999999) # # DO NOT UNCOMMMENT IN RELEASE return getXMLRevisionsByTitles(config, session, site, start) - except mwclient.errors.MwClientError as e: + except MwClientError as e: print(e) print("This mwclient version seems not to work for us. Exiting.") sys.exit() diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py index a249a269..b57d03d2 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py @@ -6,7 +6,7 @@ def makeXmlPageFromRaw(xml, arvcontinue) -> str: """Discard the metadata around a element in string""" - root = etree.XML(xml) + root = etree.XML(text=xml, parser=None) find = etree.XPath("//*[local-name() = 'page']") page = find(root)[0] if arvcontinue is not None: @@ -14,7 +14,7 @@ def makeXmlPageFromRaw(xml, arvcontinue) -> str: # The tag will inherit the namespace, like: # # FIXME: pretty_print doesn't seem to work, only adds a newline - return etree.tostring(page, pretty_print=True, encoding="unicode") + return etree.tostring(page, pretty_print=True, encoding="unicode") # type: ignore def makeXmlFromPage(page: dict, arvcontinue) -> str: @@ -124,4 +124,4 @@ def makeXmlFromPage(page: dict, arvcontinue) -> str: except KeyError as e: print(e) raise PageMissingError(page["title"], e) - return etree.tostring(p, pretty_print=True, encoding="unicode") + return etree.tostring(p, pretty_print=True, encoding="unicode") # type: ignore diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py index 991323dd..d8a46546 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py @@ -1,8 +1,12 @@ import re import sys -from typing import * +from io import TextIOWrapper import lxml.etree +import requests + +# from typing import * +from lxml.etree import _ElementTree as ElementTree from wikiteam3.dumpgenerator.api.page_titles import readTitles from wikiteam3.dumpgenerator.cli import Delay @@ -19,12 +23,14 @@ from wikiteam3.utils import cleanXML, domain2prefix, undoHTMLEntities +# lastPage=None, +# useAllrevisions=False, def doXMLRevisionDump( - config: Config = None, - session=None, - xmlfile=None, - lastPage=None, - useAllrevisions=False, + config: Config, + session: requests.Session, + xmlfile: TextIOWrapper, + lastPage: (ElementTree | None), + useAllrevisions: bool, ): try: r_timestamp = "([^<]+)" @@ -41,16 +47,17 @@ def doXMLRevisionDump( if arvcontinueRe := re.findall(r_arvcontinue, xml): curArvcontinue = arvcontinueRe[0] if lastArvcontinue != curArvcontinue: - Delay(config=config, session=session) + Delay(config=config) lastArvcontinue = curArvcontinue # Due to how generators work, it's expected this may be less xml = cleanXML(xml=xml) xmlfile.write(xml) xmltitle = re.search(r"([^<]+)", xml) - title = undoHTMLEntities(text=xmltitle.group(1)) - print(f"{title}, {numrevs} edits (--xmlrevisions)") - # Delay(config=config, session=session) + if xmltitle is not None: + title = undoHTMLEntities(text=xmltitle[1]) + print(f"{title}, 
{numrevs} edits (--xmlrevisions)") + # Delay(config=config) except AttributeError as e: print(e) print("This API library version is not working") @@ -59,11 +66,13 @@ def doXMLRevisionDump( print(e) -def doXMLExportDump(config: Config = None, session=None, xmlfile=None, lastPage=None): +def doXMLExportDump( + config: Config, session: requests.Session, xmlfile: TextIOWrapper, lastPage=None +): print("\nRetrieving the XML for every page\n") lock = True - start = None + start: str = "" if lastPage is not None: try: start = lastPage.find("title").text @@ -77,18 +86,20 @@ def doXMLExportDump(config: Config = None, session=None, xmlfile=None, lastPage= lock = False c = 1 - for title in readTitles(config, session=session, start=start): - if not title: + for title in readTitles(config, session=session, start=start, batch=False): + if title is not str or title == "": continue if title == start: # start downloading from start, included lock = False if lock: continue - Delay(config=config, session=session) + Delay(config=config) if c % 10 == 0: print(f"\n-> Downloaded {c} pages\n") try: - for xml in getXMLPage(config=config, title=title, session=session): + for xml in getXMLPage( + config=config, verbose=True, title=title, session=session + ): xml = cleanXML(xml=xml) xmlfile.write(xml) except PageMissingError: @@ -104,7 +115,8 @@ def doXMLExportDump(config: Config = None, session=None, xmlfile=None, lastPage= c += 1 -def generateXMLDump(config: Config = None, resume=False, session=None): +# resume=False +def generateXMLDump(config: Config, resume: bool, session: requests.Session): """Generates a XML dump for a list of titles or from revision IDs""" header, config = getXMLHeader(config=config, session=session) @@ -114,9 +126,9 @@ def generateXMLDump(config: Config = None, resume=False, session=None): config.date, "current" if config.curonly else "history", ) - xmlfile = None + xmlfile: TextIOWrapper - lastPage = None + lastPage: (ElementTree | None) = None lastPageChunk = None # start != None, means we are resuming a XML dump if resume: @@ -128,8 +140,9 @@ def generateXMLDump(config: Config = None, resume=False, session=None): resume = False lastPage = None else: - lastPage = parseLastPageChunk(lastPageChunk) - if lastPage is None: + try: + lastPage = parseLastPageChunk(lastPageChunk) + except lxml.etree.LxmlError: print("Failed to parse last page chunk: \n%s" % lastPageChunk) print("Cannot resume, exiting now!") sys.exit(1) diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py index f3602439..e95129a2 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py @@ -1,7 +1,8 @@ +import contextlib import json import re import sys -from typing import * +from typing import Tuple import requests @@ -11,31 +12,29 @@ from wikiteam3.dumpgenerator.log import logerror -def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]: +def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config]: """Retrieve a random page to extract XML headers (namespace info, etc)""" print(config.api) xml = "" disableSpecialExport = config.xmlrevisions or config.xmlapiexport randomtitle = "Main_Page" if disableSpecialExport and config.api and config.api.endswith("api.php"): - try: + with contextlib.suppress(requests.exceptions.RetryError): print("Getting the XML header from the API") # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.8 r = session.get( 
f"{config.api}?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1", timeout=10, ) - xml: str = r.text + xml = r.text # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19 if not re.match(r"\s* Tuple[str, Config]: f"{config.api}?action=query&export=1&format=json&titles={randomtitle}", timeout=10, ) - try: + with contextlib.suppress(KeyError): xml = r.json()["query"]["export"]["*"] - except KeyError: - pass - except requests.exceptions.RetryError: - pass else: try: @@ -72,36 +67,36 @@ def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]: # The does not exist. Not a problem, if we get the . xml = pme.xml except ExportAbortedError: - try: - if config.api: - print("Trying the local name for the Special namespace instead") - r = session.get( - url=config.api, - params={ - "action": "query", - "meta": "siteinfo", - "siprop": "namespaces", - "format": "json", - }, - timeout=120, - ) - config.export = ( - json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export" - ) - xml = "".join( - list( - getXMLPage( - config=config, - title=randomtitle, - verbose=False, - session=session, + with contextlib.suppress(ExportAbortedError): + try: + if config.api: + print("Trying the local name for the Special namespace instead") + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "namespaces", + "format": "json", + }, + timeout=120, + ) + config.export = ( + json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + + ":Export" + ) + xml = "".join( + list( + getXMLPage( + config=config, + title=randomtitle, + verbose=False, + session=session, + ) ) ) - ) - except PageMissingError as pme: - xml = pme.xml - except ExportAbortedError: - pass + except PageMissingError as pme: + xml = pme.xml header = xml.split("")[0] if not re.match(r"\s* Tuple[str, Config]: print(xml) print("XML export on this wiki is broken, quitting.") logerror( - to_stdout=True, text="XML export on this wiki is broken, quitting." 
+ config=config, + to_stdout=True, + text="XML export on this wiki is broken, quitting.", ) sys.exit() return header, config diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py index 5f17d156..819ff29b 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py @@ -1,10 +1,10 @@ -from typing import * +from typing import Iterable from wikiteam3.dumpgenerator.config import Config def checkXMLIntegrity( - config: Config = None, titles: Iterable[str] = None, session=None + config: Config, titles: (Iterable[str] | None) = None, session=None ): """Check XML dump integrity, to detect broken XML chunks""" # TODO: Fix XML Integrity Check diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py index 3cfb5528..fe73be2e 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py @@ -1,9 +1,9 @@ import os from io import StringIO -from typing import * import lxml.etree from file_read_backwards import FileReadBackwards +from lxml.etree import _ElementTree as ElementTree def endsWithNewlines(filename: str) -> int: @@ -60,10 +60,9 @@ def truncateXMLDump(filename: str) -> str: return incomplete_segment -def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]: - try: - parser = lxml.etree.XMLParser(recover=True) - tree = lxml.etree.parse(StringIO(chunk), parser) - return tree.getroot() - except lxml.etree.LxmlError: - return None +def parseLastPageChunk(chunk) -> ElementTree: + parser = lxml.etree.XMLParser(recover=True) + tree = lxml.etree.parse(StringIO(chunk), parser) + return tree.getroot() + # except lxml.etree.LxmlError: + # return None diff --git a/wikiteam3/dumpgenerator/log/log_error.py b/wikiteam3/dumpgenerator/log/log_error.py index 7f18fbf9..5902ac9a 100644 --- a/wikiteam3/dumpgenerator/log/log_error.py +++ b/wikiteam3/dumpgenerator/log/log_error.py @@ -3,7 +3,7 @@ from wikiteam3.dumpgenerator.config import Config -def logerror(config: Config = None, to_stdout=False, text="") -> None: +def logerror(config: Config, to_stdout=False, text="") -> None: """Log error in errors.log""" if text: with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile: diff --git a/wikiteam3/dumpgenerator/test/test_config.py b/wikiteam3/dumpgenerator/test/test_config.py index da9869e5..ce6521a3 100644 --- a/wikiteam3/dumpgenerator/test/test_config.py +++ b/wikiteam3/dumpgenerator/test/test_config.py @@ -25,7 +25,7 @@ def _new_config_from_parameter(params): def get_config(mediawiki_ver, api=True): - assert api == True + assert api == True # type: ignore if mediawiki_ver == "1.16.5": return _new_config_from_parameter( [ @@ -33,3 +33,4 @@ def get_config(mediawiki_ver, api=True): "http://group0.mediawiki.demo.save-web.org/mediawiki-1.16.5/api.php", ] ) + raise ValueError(f"Expected mediawiki_ver '1.16.5'; got {mediawiki_ver}") diff --git a/wikiteam3/gui.py b/wikiteam3/gui.py index e4f2cfec..a3cfb3d9 100644 --- a/wikiteam3/gui.py +++ b/wikiteam3/gui.py @@ -22,7 +22,7 @@ * advanced: batch downloads, upload to Internet Archive or anywhere """ - +import contextlib import os import platform import random @@ -129,7 +129,7 @@ def __init__(self, master): self.button11 = Button( self.labelframe11, text="Check", - command=lambda: threading.start_new_threading(self.checkURL, ()), + command=lambda: threading.start_new_threading(self.checkURL, ()), 
diff --git a/wikiteam3/dumpgenerator/log/log_error.py b/wikiteam3/dumpgenerator/log/log_error.py
index 7f18fbf9..5902ac9a 100644
--- a/wikiteam3/dumpgenerator/log/log_error.py
+++ b/wikiteam3/dumpgenerator/log/log_error.py
@@ -3,7 +3,7 @@
 from wikiteam3.dumpgenerator.config import Config


-def logerror(config: Config = None, to_stdout=False, text="") -> None:
+def logerror(config: Config, to_stdout=False, text="") -> None:
     """Log error in errors.log"""
     if text:
         with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile:
diff --git a/wikiteam3/dumpgenerator/test/test_config.py b/wikiteam3/dumpgenerator/test/test_config.py
index da9869e5..ce6521a3 100644
--- a/wikiteam3/dumpgenerator/test/test_config.py
+++ b/wikiteam3/dumpgenerator/test/test_config.py
@@ -25,7 +25,7 @@ def _new_config_from_parameter(params):


 def get_config(mediawiki_ver, api=True):
-    assert api == True
+    assert api == True  # type: ignore
     if mediawiki_ver == "1.16.5":
         return _new_config_from_parameter(
             [
@@ -33,3 +33,4 @@ def get_config(mediawiki_ver, api=True):
                 "http://group0.mediawiki.demo.save-web.org/mediawiki-1.16.5/api.php",
             ]
         )
+    raise ValueError(f"Expected mediawiki_ver '1.16.5'; got {mediawiki_ver}")
diff --git a/wikiteam3/gui.py b/wikiteam3/gui.py
index e4f2cfec..a3cfb3d9 100644
--- a/wikiteam3/gui.py
+++ b/wikiteam3/gui.py
@@ -22,7 +22,7 @@
  * advanced: batch downloads, upload to Internet Archive or anywhere
 """

-
+import contextlib
 import os
 import platform
 import random
@@ -129,7 +129,7 @@ def __init__(self, master):
         self.button11 = Button(
             self.labelframe11,
             text="Check",
-            command=lambda: threading.start_new_threading(self.checkURL, ()),
+            command=lambda: threading.start_new_threading(self.checkURL, ()),  # type: ignore
             width=5,
         )
         self.button11.grid(row=0, column=3)
@@ -275,14 +275,14 @@ def __init__(self, master):
         self.button21 = Button(
             self.frame2,
             text="Load available dumps",
-            command=lambda: threading.start_new_threading(self.loadAvailableDumps, ()),
+            command=lambda: threading.start_new_threading(self.loadAvailableDumps, ()),  # type: ignore
             width=15,
         )
         self.button21.grid(row=3, column=0)
         self.button23 = Button(
             self.frame2,
             text="Download selection",
-            command=lambda: threading.start_new_threading(self.downloadDump, ()),
+            command=lambda: threading.start_new_threading(self.downloadDump, ()),  # type: ignore
             width=15,
         )
         self.button23.grid(row=3, column=4)
@@ -337,7 +337,7 @@ def checkURL(self):
         ):  # well-constructed URL?, one dot at least, aaaaa.com, but bb.aaaaa.com is allowed too
             if self.optionmenu11var.get() == "api.php":
                 self.msg("Please wait... Checking api.php...")
-                if checkAPI(self.entry11.get()):
+                if checkAPI(self.entry11.get(), None):  # type: ignore
                     self.entry11.config(background="lightgreen")
                     self.msg("api.php is correct!", level="ok")
                 else:
@@ -345,7 +345,7 @@ def checkURL(self):
                     self.msg("api.php is incorrect!", level="error")
             elif self.optionmenu11var.get() == "index.php":
                 self.msg("Please wait... Checking index.php...")
-                if checkIndex(self.entry11.get()):
+                if checkIndex(self.entry11.get(), None):  # type: ignore
                     self.entry11.config(background="lightgreen")
                     self.msg("index.php is OK!", level="ok")
                 else:
@@ -374,7 +374,7 @@ def sumSizes(self, sizes):
     def run(self):
         for _ in range(10):
             time.sleep(0.1)
-            self.value += 10
+            self.value += 10  # type: ignore

         """
         #get parameters selected
@@ -388,7 +388,7 @@ def run(self):

     def msg(self, msg="", level=""):
         levels = {"ok": "lightgreen", "warning": "yellow", "error": "red"}
-        if levels.has_key(level.lower()):
+        if level.lower() in levels:
             print(f"{level.upper()}: {msg}")
             self.status.config(
                 text=f"{level.upper()}: {msg}", background=levels[level.lower()]
@@ -398,9 +398,9 @@ def msg(self, msg="", level=""):
             self.status.config(text=msg, background="grey")

     def treeSortColumn(self, column, reverse=False):
-        l = [(self.tree.set(i, column), i) for i in self.tree.get_children("")]
-        l.sort(reverse=reverse)
-        for index, (val, i) in enumerate(l):
+        line = [(self.tree.set(i, column), i) for i in self.tree.get_children("")]
+        line.sort(reverse=reverse)
+        for index, (val, i) in enumerate(line):
             self.tree.move(i, "", index)
         self.tree.heading(
             column,
@@ -408,7 +408,7 @@ def treeSortColumn(self, column, reverse=False):
         )

     def downloadProgress(self, block_count, block_size, total_size):
-        try:
+        with contextlib.suppress(Exception):
             total_mb = total_size / 1024 / 1024.0
             downloaded = block_count * (block_size / 1024 / 1024.0)
             percent = downloaded / (total_mb / 100.0)
@@ -419,8 +419,6 @@ def downloadProgress(self, block_count, block_size, total_size):
             self.msg(msg, level="ok")
             # sys.stdout.write("%.1f MB of %.1f MB downloaded (%.2f%%)" %(downloaded, total_mb, percent))
             # sys.stdout.flush()
-        except:
-            pass

     def downloadDump(self, event=None):
         if self.block:
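The downloadProgress() change swaps a bare try/except/pass for contextlib.suppress(Exception), which reads as a single block and behaves the same for ordinary exceptions. A self-contained sketch of the equivalence (the values are made up for illustration):

    import contextlib

    total_mb = 0.0  # deliberately zero so the division raises ZeroDivisionError
    downloaded = 1.5

    # Old style: swallow the error with try/except/pass.
    try:
        percent = downloaded / (total_mb / 100.0)
    except Exception:
        pass

    # New style: same effect, no explicit except clause.
    with contextlib.suppress(Exception):
        percent = downloaded / (total_mb / 100.0)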
@@ -452,7 +450,7 @@ def downloadDump(self, event=None):
                     self.dumps[int(item)][5],
                 )
             )
-            f = urllib.urlretrieve(
+            urllib.urlretrieve(  # type: ignore
                 self.dumps[int(item)][5],
                 filepath,
                 reporthook=self.downloadProgress,
@@ -614,11 +612,11 @@ def loadAvailableDumps(self):
             ],
         ]
         wikifarms_r = re.compile(f'({"|".join(wikifarms.keys())})')
-        c = 0
+        # c = 0
         for mirror, url, regexp in self.urls:
             print("Loading data from", mirror, url)
             self.msg(msg=f"Please wait... Loading data from {mirror} {url}")
-            f = urllib.request.urlopen(url)
+            f = urllib.request.urlopen(url)  # type: ignore
             m = re.compile(regexp).finditer(f.read())
             for i in m:
                 filename = i.group("filename")
@@ -628,9 +626,7 @@ def loadAvailableDumps(self):
                 if re.search(wikifarms_r, filename):
                     wikifarm = re.findall(wikifarms_r, filename)[0]
                     wikifarm = wikifarms[wikifarm]
-                size = i.group("size")
-                if not size:
-                    size = "Unknown"
+                size = i.group("size") or "Unknown"
                 date = "Unknown"
                 if re.search(r"\-(\d{8})[\.-]", filename):
                     date = re.findall(r"\-(\d{4})(\d{2})(\d{2})[\.-]", filename)[0]
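In loadAvailableDumps(), urllib.request.urlopen(url).read() returns bytes, so if the patterns kept in self.urls are plain str the finditer() call would raise a TypeError at runtime; the added # type: ignore only hides that from mypy. A possible follow-up, sketched with a hypothetical helper and assuming the mirror pages are UTF-8:

    import re
    import urllib.request

    def iter_dump_links(url: str, regexp: str):
        # Decode the page first so str patterns can be matched against it.
        with urllib.request.urlopen(url) as f:
            html = f.read().decode("utf-8", errors="replace")
        yield from re.compile(regexp).finditer(html)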
diff --git a/wikiteam3/uploader.py b/wikiteam3/uploader.py
index d4b4ede0..cee7ae55 100644
--- a/wikiteam3/uploader.py
+++ b/wikiteam3/uploader.py
@@ -15,12 +15,9 @@
 # along with this program. If not, see <https://www.gnu.org/licenses/>.

 import argparse
-import getopt
 import hashlib
-import os
 import re
 import shutil
-import subprocess
 import time
 import urllib.parse
 from io import BytesIO
@@ -95,6 +92,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             prefix = domain2prefix(Config(api=wiki))
         except KeyError:
             print("ERROR: could not produce the prefix for %s" % wiki)
+            continue

         wikiname = prefix.split("-")[0]
         dumps = []
@@ -163,29 +161,29 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             r = requests.get(url=wiki, params=params, headers=headers)
             if r.status_code < 400:
                 xml = r.text
-        except requests.exceptions.ConnectionError as e:
+        except requests.exceptions.ConnectionError:
             pass

         sitename = ""
         baseurl = ""
         lang = ""
         try:
-            sitename = re.findall(r"sitename=\"([^\"]+)\"", xml)[0]
-        except:
+            sitename = re.findall(r"sitename=\"([^\"]+)\"", xml)[0]  # type: ignore
+        except Exception:
             pass
         try:
-            baseurl = re.findall(r"base=\"([^\"]+)\"", xml)[0]
-        except:
+            baseurl = re.findall(r"base=\"([^\"]+)\"", xml)[0]  # type: ignore
+        except Exception:
             pass
         try:
-            lang = re.findall(r"lang=\"([^\"]+)\"", xml)[0]
-        except:
+            lang = re.findall(r"lang=\"([^\"]+)\"", xml)[0]  # type: ignore
+        except Exception:
             pass
         if not sitename:
             sitename = wikiname
         if not baseurl:
-            baseurl = re.sub(r"(?im)/api\.php", r"", wiki)
+            baseurl = re.sub(r"(?im)/api\.php", r"", wiki)  # type: ignore

         # Convert protocol-relative URLs
         baseurl = re.sub("^//", "https://", baseurl)
         if lang:
@@ -207,7 +205,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             r = requests.get(url=wiki, params=params, headers=headers)
             if r.status_code < 400:
                 xml = r.text
-        except requests.exceptions.ConnectionError as e:
+        except requests.exceptions.ConnectionError:
             pass

         rightsinfourl = ""
@@ -215,7 +213,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
         try:
             rightsinfourl = re.findall(r"rightsinfo url=\"([^\"]+)\"", xml)[0]
             rightsinfotext = re.findall(r"text=\"([^\"]+)\"", xml)[0]
-        except:
+        except Exception:
             pass

         raw = ""
@@ -223,7 +221,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             r = requests.get(url=baseurl, headers=headers)
             if r.status_code < 400:
                 raw = r.text
-        except requests.exceptions.ConnectionError as e:
+        except requests.exceptions.ConnectionError:
             pass

         # or copyright info from #footer in mainpage
@@ -235,13 +233,13 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             rightsinfourl = re.findall(
                 r"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw
             )[0]
-        except:
+        except Exception:
             pass
         try:
             rightsinfotext = re.findall(
                 r"<li id=\"copyright\">([^\n\r]*?)</li>", raw
             )[0]
-        except:
+        except Exception:
             pass
         if rightsinfotext and not rightsinfourl:
             rightsinfourl = baseurl + "#footer"
@@ -260,7 +258,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             if "http" not in logourl:
                 # Probably a relative path, construct the absolute path
                 logourl = urllib.parse.urljoin(wiki, logourl)
-        except:
+        except Exception:
             pass

         # retrieve some info from the wiki
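The sitename/baseurl/lang and rights-info lookups above all repeat the same findall-plus-except pattern; a small helper could express "first match or default" once (hypothetical, not part of this patch):

    import re

    def first_match(pattern: str, text: str, default: str = "") -> str:
        # Return the first capture group of `pattern` in `text`, or `default` if absent.
        m = re.search(pattern, text)
        return m.group(1) if m else default

    # e.g. sitename = first_match(r"sitename=\"([^\"]+)\"", xml)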
@@ -323,7 +321,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             try:
                 item.upload(
                     str(dump),
-                    metadata=md,
+                    metadata=md,  # type: ignore
                     access_key=ia_keys["access"],
                     secret_key=ia_keys["secret"],
                     verbose=True,
@@ -341,12 +339,14 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             # Update metadata
             r = item.modify_metadata(
-                md, access_key=ia_keys["access"], secret_key=ia_keys["secret"]
+                md,  # type: ignore
+                access_key=ia_keys["access"],
+                secret_key=ia_keys["secret"],
             )
-            if r.status_code != 200:
+            if r.status_code != 200:  # type: ignore
                 print("Error when updating metadata")
-                print(r.status_code)
-                print(r.text)
+                print(r.status_code)  # type: ignore
+                print(r.text)  # type: ignore

             print(
                 "You can find it in https://archive.org/details/%s"
                 % (identifier)
             )
@@ -358,11 +358,11 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
         try:
             log(logfile, wiki, dump, "ok")
             if logourl:
-                logo = BytesIO(requests.get(logourl, timeout=10).content)
+                logo = BytesIO(requests.get(logourl, timeout=10).content)  # type: ignore
                 if ".png" in logourl:
                     logoextension = "png"
-                elif logourl.split("."):
-                    logoextension = logourl.split(".")[-1]
+                elif logourl.split("."):  # type: ignore
+                    logoextension = logourl.split(".")[-1]  # type: ignore
                 else:
                     logoextension = "unknown"
                 logoname = "wiki-" + wikiname + "_logo." + logoextension
@@ -388,6 +388,7 @@ def main(params=[]):
 Dumps must be in the same directory and follow the -wikidump.7z/-history.xml.7z format
 as produced by launcher.py (explained in https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump ).
 You need a file named keys.txt with access and secret keys, in two different lines
+You also need py in the same directory as this script.

 Use --help to print this help."""
     )
@@ -409,11 +410,11 @@ def main(params=[]):
     listfile = config.listfile
     try:
         uploadeddumps = [
-            l.split(";")[1]
-            for l in open("uploader-%s.log" % (listfile)).read().strip().splitlines()
-            if len(l.split(";")) > 1
+            line.split(";")[1]
+            for line in open("uploader-%s.log" % (listfile)).read().strip().splitlines()
+            if len(line.split(";")) > 1
         ]
-    except:
+    except Exception:
         pass

     if config.logfile is None:
diff --git a/wikiteam3/utils/__init__.py b/wikiteam3/utils/__init__.py
index f05f8ca9..518f689c 100644
--- a/wikiteam3/utils/__init__.py
+++ b/wikiteam3/utils/__init__.py
@@ -1,7 +1,9 @@
 from .domain import domain2prefix
-from .login import botLogin, clientLogin, fetchLoginToken, indexLogin, uniLogin
+from .login import botLogin, clientLogin, indexLogin, uniLogin
 from .monkey_patch import mod_requests_text
 from .uprint import uprint
 from .user_agent import getUserAgent
 from .util import cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities
 from .wiki_avoid import avoidWikimediaProjects
+
+__all__ = [domain2prefix, botLogin, clientLogin, indexLogin, uniLogin, mod_requests_text, uprint, getUserAgent, cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities, avoidWikimediaProjects]  # type: ignore
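__all__ conventionally lists exported names as strings; with the objects themselves in the list, `from wikiteam3.utils import *` raises a TypeError at runtime, which is likely why the line needs a # type: ignore at all. A string-based variant (same names, one per line) would satisfy both the interpreter and mypy:

    __all__ = [
        "domain2prefix",
        "botLogin",
        "clientLogin",
        "indexLogin",
        "uniLogin",
        "mod_requests_text",
        "uprint",
        "getUserAgent",
        "cleanHTML",
        "cleanXML",
        "removeIP",
        "sha1File",
        "undoHTMLEntities",
        "avoidWikimediaProjects",
    ]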
diff --git a/wikiteam3/utils/domain.py b/wikiteam3/utils/domain.py
index aad0d05d..8a230d86 100644
--- a/wikiteam3/utils/domain.py
+++ b/wikiteam3/utils/domain.py
@@ -3,7 +3,7 @@
 from wikiteam3.dumpgenerator.config import Config


-def domain2prefix(config: Config = None, session=None):
+def domain2prefix(config: Config):
     """Convert domain name to a valid prefix filename."""

     # At this point, both api and index are supposed to be defined
diff --git a/wikiteam3/utils/login/__init__.py b/wikiteam3/utils/login/__init__.py
index f16f2bfe..04734135 100644
--- a/wikiteam3/utils/login/__init__.py
+++ b/wikiteam3/utils/login/__init__.py
@@ -4,7 +4,7 @@

 import requests

-from wikiteam3.utils.login.api import botLogin, clientLogin, fetchLoginToken
+from wikiteam3.utils.login.api import botLogin, clientLogin
 from wikiteam3.utils.login.index import indexLogin


diff --git a/wikiteam3/utils/login/api.py b/wikiteam3/utils/login/api.py
index e1b1f4c3..d87da042 100644
--- a/wikiteam3/utils/login/api.py
+++ b/wikiteam3/utils/login/api.py
@@ -1,6 +1,6 @@
 """ Available since MediaWiki 1.27.
     login to a wiki using username and password (API) """
-from typing import *
+from typing import Optional

 import requests

@@ -15,8 +15,7 @@ def fetchLoginToken(session: requests.Session, api: str) -> Optional[str]:
     data = response.json()
     try:
         token = data["query"]["tokens"]["logintoken"]
-        if type(token) is str:
-            return token
+        return token if type(token) is str else None
     except KeyError:
         print("fetch login token: Oops! Something went wrong -- ", data)
         return None
diff --git a/wikiteam3/utils/login/index.py b/wikiteam3/utils/login/index.py
index 94d332fb..202fe739 100644
--- a/wikiteam3/utils/login/index.py
+++ b/wikiteam3/utils/login/index.py
@@ -1,7 +1,7 @@
 """ Always available login methods.(mw 1.16-1.39)
     Even oler versions of MW may work, but not tested. """
-from typing import *
+from typing import Optional

 import lxml.html
 import requests

@@ -45,7 +45,7 @@ def indexLogin(
         "title": "Special:UserLogin",  # introduced before MW 1.39.
         "force": "",  # introduced before MW 1.39, empty string is OK.
     }
-    r = session.post(index, allow_redirects=False, params=params, data=data)
+    r = session.post(index, allow_redirects=False, params=params, data=data)  # type: ignore
     if r.status_code == 302:
         print("index login: Success! Welcome, ", username, "!")
         return session
diff --git a/wikiteam3/utils/monkey_patch.py b/wikiteam3/utils/monkey_patch.py
index 6abda313..2ad9323d 100644
--- a/wikiteam3/utils/monkey_patch.py
+++ b/wikiteam3/utils/monkey_patch.py
@@ -3,13 +3,13 @@
 from wikiteam3.dumpgenerator.cli.delay import Delay


-def mod_requests_text(requests: requests):
+def mod_requests_text(requests: requests):  # type: ignore
     """Monkey patch `requests.Response.text` to remove BOM"""

     def new_text(self):
         return self.content.lstrip(b"\xef\xbb\xbf").decode(self.encoding)

-    requests.Response.text = property(new_text)
+    requests.Response.text = property(new_text)  # type: ignore


 class DelaySession:
@@ -26,8 +26,8 @@ def hijack(self):
         """Don't forget to call `release()`"""

         def new_send(request, **kwargs):
-            Delay(msg=self.msg, delay=self.delay, config=self.config)
-            return self.old_send(request, **kwargs)
+            Delay(msg=self.msg, delay=self.delay, config=self.config)  # type: ignore
+            return self.old_send(request, **kwargs)  # type: ignore

         self.old_send = self.session.send
         self.session.send = new_send
diff --git a/wikiteam3/utils/user_agent.py b/wikiteam3/utils/user_agent.py
index dd1df20b..eef019ee 100644
--- a/wikiteam3/utils/user_agent.py
+++ b/wikiteam3/utils/user_agent.py
@@ -319,10 +319,10 @@ def getUserAgent():


 def setupUserAgent(session: requests.Session):
-    session._orirequest = session.request
+    session._orirequest = session.request  # type: ignore

     def newrequest(*args, **kwargs):
         session.headers.update({"User-Agent": getUserAgent()})
-        return session._orirequest(*args, **kwargs)
+        return session._orirequest(*args, **kwargs)  # type: ignore

-    session.request = newrequest
+    session.request = newrequest  # type: ignore
diff --git a/wikiteam3/utils/wiki_avoid.py b/wikiteam3/utils/wiki_avoid.py
index c7593fdc..aed5641a 100644
--- a/wikiteam3/utils/wiki_avoid.py
+++ b/wikiteam3/utils/wiki_avoid.py
@@ -1,11 +1,11 @@
 import re
 import sys
-from typing import *
+from typing import Dict

 from wikiteam3.dumpgenerator.config import Config


-def avoidWikimediaProjects(config: Config = None, other: Dict = None):
+def avoidWikimediaProjects(config: Config, other: Dict):
     """Skip Wikimedia projects and redirect to the dumps website"""

     # notice about wikipedia dumps