From b142e2c491528fdf29889f3561ad98a33a3ef778 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 23 Jan 2021 13:26:28 +0900 Subject: [PATCH 01/10] chore: ignore .pytest_cache etc. --- .gitignore | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 143 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ad06151b..769a6d06 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,146 @@ data node_modules .env -__pycache__ \ No newline at end of file + +#################### +# Python.gitignore # +#################### + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ From fcfe6c99e89f38bde530b978f0fe0c8be04d3d6c Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 23 Jan 2021 13:31:21 +0900 Subject: [PATCH 02/10] chore: add linter and formatter --- Pipfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Pipfile b/Pipfile index f1e33d20..857e405c 100644 --- a/Pipfile +++ b/Pipfile @@ -4,6 +4,9 @@ url = "https://pypi.org/simple" verify_ssl = true [dev-packages] +black = "*" +flake8 = "*" +isort = "*" [packages] lxml = "*" From 86ad3a7b8e65245b8d3de0aa4cda0d78d2321ca0 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 23 Jan 2021 13:38:12 +0900 Subject: [PATCH 03/10] refactor: make download.py importable from test code --- mynumbercard_data/download.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mynumbercard_data/download.py b/mynumbercard_data/download.py index b554708c..7f6acb9d 100644 --- a/mynumbercard_data/download.py +++ b/mynumbercard_data/download.py @@ -14,8 +14,6 @@ from docopt import docopt import camelot -args = docopt(__doc__) - def getFileID(filepath: str): return filepath.rsplit('/', 1)[1].replace('.pdf', '').replace('.xlsx', '') @@ -45,7 +43,7 @@ def loadPDF(filepath: str): table.to_csv(fname) -if __name__ == "__main__": +def 
main(args): # url of the mynumber card PDF PDF_URL = "https://www.soumu.go.jp/kojinbango_card/" ABSOLUTE_URL = "https://www.soumu.go.jp/kojinbango_card/" @@ -98,3 +96,8 @@ def loadPDF(filepath: str): # save loaded files data with open(DATA_FILE, 'w', encoding='utf-8') as f: json.dump(loaded, f, indent=2, ensure_ascii=False) + + +if __name__ == "__main__": + args = docopt(__doc__) + main(args) From b8b544984a59be158d63c7ac6b9f1f39e538b660 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 23 Jan 2021 13:38:59 +0900 Subject: [PATCH 04/10] test: getFileID --- tests/mynumbercard_data/test_download.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 tests/mynumbercard_data/test_download.py diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py new file mode 100644 index 00000000..e059773b --- /dev/null +++ b/tests/mynumbercard_data/test_download.py @@ -0,0 +1,17 @@ +"""Tests of download.py""" + +import pytest + +from mynumbercard_data import download + +get_file_id_input = [ + ("https://www.soumu.go.jp/main_content/000728832.pdf", "000728832"), + ("https://www.soumu.go.jp/main_content/000703058.xlsx", "000703058"), +] + + +@pytest.mark.parametrize("filepath,expected", get_file_id_input) +def test_getFileID(filepath, expected): + actual = download.getFileID(filepath) + + assert actual == expected From d131dee33edcde668e697bace7ccc6da9c8278e6 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 23 Jan 2021 17:45:05 +0900 Subject: [PATCH 05/10] test: main (only pdf link, without cache) --- tests/mynumbercard_data/test_download.py | 58 ++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py index e059773b..8552a3c1 100644 --- a/tests/mynumbercard_data/test_download.py +++ b/tests/mynumbercard_data/test_download.py @@ -1,6 +1,11 @@ """Tests of download.py""" +from unittest import TestCase +from unittest.mock import 
mock_open, patch + +import lxml.html import pytest +from docopt import Dict from mynumbercard_data import download @@ -15,3 +20,56 @@ def test_getFileID(filepath, expected): actual = download.getFileID(filepath) assert actual == expected + + +pdf_only_list_item = lxml.html.fromstring( + '
  • ' + "マイナンバーカード交付状況(平成29年5月15日時点)" + 'PDF
  • ' +) + + +class MainTestCase(TestCase): + @patch("mynumbercard_data.download.json") + @patch("mynumbercard_data.download.loadPDF") + @patch("mynumbercard_data.download.lxml.html") + @patch("mynumbercard_data.download.urllib.request") + @patch("mynumbercard_data.download.os") + def test_when_only_pdf_link_without_cache( + self, os, urllib_request, lxml_html, loadPDF, json + ): + args = Dict({"--all": False, "--help": False}) + os.path.exists.return_value = False + tree = lxml_html.fromstring.return_value + tree.xpath.return_value = [pdf_only_list_item] + m = mock_open() + + with patch("builtins.open", m): + download.main(args) + + os.path.exists.assert_called_once_with("./data/loaded_files.json") + urllib_request.urlopen.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + urllib_request.urlopen.return_value.read.assert_called_once_with() + lxml_html.fromstring.assert_called_once_with( + urllib_request.urlopen.return_value.read.return_value + ) + tree.xpath.assert_called_once_with( + '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' + ) + tree.make_links_absolute.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + loadPDF.assert_called_once_with( + "https://www.soumu.go.jp/main_content/000490029.pdf" + ) + m.assert_called_once_with( + "./data/loaded_files.json", "w", encoding="utf-8" + ) + json.dump.assert_called_once_with( + {"000490029": "マイナンバーカード交付状況(平成29年5月15日時点)"}, + m(), + indent=2, + ensure_ascii=False, + ) From fe5d34cdf90bee504d5aa1ab36c6fc265076c0a7 Mon Sep 17 00:00:00 2001 From: ftnext Date: Sat, 23 Jan 2021 20:48:53 +0900 Subject: [PATCH 06/10] test: main (only pdf link, with cache) --- tests/mynumbercard_data/test_download.py | 50 +++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py index 8552a3c1..fe5b3f81 100644 --- a/tests/mynumbercard_data/test_download.py +++ 
b/tests/mynumbercard_data/test_download.py @@ -1,7 +1,7 @@ """Tests of download.py""" from unittest import TestCase -from unittest.mock import mock_open, patch +from unittest.mock import call, mock_open, patch import lxml.html import pytest @@ -73,3 +73,51 @@ def test_when_only_pdf_link_without_cache( indent=2, ensure_ascii=False, ) + + @patch("mynumbercard_data.download.json.dump") + @patch("mynumbercard_data.download.loadPDF") + @patch("mynumbercard_data.download.lxml.html") + @patch("mynumbercard_data.download.urllib.request") + @patch("mynumbercard_data.download.os") + def test_when_only_pdf_link_with_cache( + self, os, urllib_request, lxml_html, loadPDF, json_dump + ): + args = Dict({"--all": False, "--help": False}) + os.path.exists.return_value = True + tree = lxml_html.fromstring.return_value + tree.xpath.return_value = [pdf_only_list_item] + m = mock_open( + read_data='{\n "000490029": "マイナンバーカード交付状況(平成29年5月15日時点)"\n}' + ) + + with patch("builtins.open", m): + download.main(args) + + os.path.exists.assert_called_once_with("./data/loaded_files.json") + urllib_request.urlopen.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + urllib_request.urlopen.return_value.read.assert_called_once_with() + lxml_html.fromstring.assert_called_once_with( + urllib_request.urlopen.return_value.read.return_value + ) + tree.xpath.assert_called_once_with( + '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' + ) + tree.make_links_absolute.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + loadPDF.assert_not_called() + self.assertEqual( + m.call_args_list, + [ + call("./data/loaded_files.json"), + call("./data/loaded_files.json", "w", encoding="utf-8"), + ], + ) + json_dump.assert_called_once_with( + {"000490029": "マイナンバーカード交付状況(平成29年5月15日時点)"}, + m(), + indent=2, + ensure_ascii=False, + ) From 0dd119e4b9296e09bad1e46cedefc4f35abbec51 Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 26 Jan 2021 20:28:35 +0900 Subject: [PATCH 
07/10] test: main (pdf and excel link, without cache) --- tests/mynumbercard_data/test_download.py | 53 ++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py index fe5b3f81..2141964a 100644 --- a/tests/mynumbercard_data/test_download.py +++ b/tests/mynumbercard_data/test_download.py @@ -27,6 +27,15 @@ def test_getFileID(filepath, expected): "マイナンバーカード交付状況(平成29年5月15日時点)" 'PDF' ) +pdf_and_excel_list_item = lxml.html.fromstring( + '
  • マイナンバーカード交付状況(令和2年8月1日現在) ' + 'PDF形式' + ' ' + 'Excel形式' + '
  • ' +) class MainTestCase(TestCase): @@ -121,3 +130,47 @@ def test_when_only_pdf_link_with_cache( indent=2, ensure_ascii=False, ) + + @patch("mynumbercard_data.download.json") + @patch("mynumbercard_data.download.loadPDF") + @patch("mynumbercard_data.download.lxml.html") + @patch("mynumbercard_data.download.urllib.request") + @patch("mynumbercard_data.download.os") + def test_when_pdf_and_excel_link_without_cache( + self, os, urllib_request, lxml_html, loadPDF, json + ): + args = Dict({"--all": False, "--help": False}) + os.path.exists.return_value = False + tree = lxml_html.fromstring.return_value + tree.xpath.return_value = [pdf_and_excel_list_item] + m = mock_open() + + with patch("builtins.open", m): + download.main(args) + + os.path.exists.assert_called_once_with("./data/loaded_files.json") + urllib_request.urlopen.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + urllib_request.urlopen.return_value.read.assert_called_once_with() + lxml_html.fromstring.assert_called_once_with( + urllib_request.urlopen.return_value.read.return_value + ) + tree.make_links_absolute.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + tree.xpath.assert_called_once_with( + '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' + ) + loadPDF.assert_called_once_with( + "https://www.soumu.go.jp/main_content/000703057.pdf" + ) + m.assert_called_once_with( + "./data/loaded_files.json", "w", encoding="utf-8" + ) + json.dump.assert_called_once_with( + {"000703057": "マイナンバーカード交付状況(令和2年8月1日現在) "}, + m(), + indent=2, + ensure_ascii=False, + ) From 0d25c9b855d8029c9c5584c7e10189839981565e Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 26 Jan 2021 20:30:24 +0900 Subject: [PATCH 08/10] format with black --- tests/mynumbercard_data/test_download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py index 2141964a..710f8769 100644 --- 
a/tests/mynumbercard_data/test_download.py +++ b/tests/mynumbercard_data/test_download.py @@ -28,7 +28,7 @@ def test_getFileID(filepath, expected): 'PDF' ) pdf_and_excel_list_item = lxml.html.fromstring( - '
  • マイナンバーカード交付状況(令和2年8月1日現在) ' + "
  • マイナンバーカード交付状況(令和2年8月1日現在) " 'PDF形式' ' ' From bb4d40d75cd82bda26c14002572077c73e3bb357 Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 26 Jan 2021 20:38:09 +0900 Subject: [PATCH 09/10] test: main (pdf and excel link, with cache) --- tests/mynumbercard_data/test_download.py | 48 ++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py index 710f8769..b11089b4 100644 --- a/tests/mynumbercard_data/test_download.py +++ b/tests/mynumbercard_data/test_download.py @@ -174,3 +174,51 @@ def test_when_pdf_and_excel_link_without_cache( indent=2, ensure_ascii=False, ) + + @patch("mynumbercard_data.download.json.dump") + @patch("mynumbercard_data.download.loadPDF") + @patch("mynumbercard_data.download.lxml.html") + @patch("mynumbercard_data.download.urllib.request") + @patch("mynumbercard_data.download.os") + def test_when_pdf_and_excel_link_with_cache( + self, os, urllib_request, lxml_html, loadPDF, json_dump + ): + args = Dict({"--all": False, "--help": False}) + os.path.exists.return_value = True + tree = lxml_html.fromstring.return_value + tree.xpath.return_value = [pdf_and_excel_list_item] + m = mock_open( + read_data='{\n "000703057": "マイナンバーカード交付状況(令和2年8月1日現在) "\n}' + ) + + with patch("builtins.open", m): + download.main(args) + + os.path.exists.assert_called_once_with("./data/loaded_files.json") + urllib_request.urlopen.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + urllib_request.urlopen.return_value.read.assert_called_once_with() + lxml_html.fromstring.assert_called_once_with( + urllib_request.urlopen.return_value.read.return_value + ) + tree.make_links_absolute.assert_called_once_with( + "https://www.soumu.go.jp/kojinbango_card/" + ) + tree.xpath.assert_called_once_with( + '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' + ) + loadPDF.assert_not_called() + self.assertEqual( + m.call_args_list, + [ + 
call("./data/loaded_files.json"), + call("./data/loaded_files.json", "w", encoding="utf-8"), + ], + ) + json_dump.assert_called_once_with( + {"000703057": "マイナンバーカード交付状況(令和2年8月1日現在) "}, + m(), + indent=2, + ensure_ascii=False, + ) From d7107d732eaeb73600b6225eb91a553c1b7b8e3a Mon Sep 17 00:00:00 2001 From: ftnext Date: Tue, 26 Jan 2021 20:40:03 +0900 Subject: [PATCH 10/10] tweak: more real fixture, change order to match implementation --- tests/mynumbercard_data/test_download.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/mynumbercard_data/test_download.py b/tests/mynumbercard_data/test_download.py index b11089b4..df7e3411 100644 --- a/tests/mynumbercard_data/test_download.py +++ b/tests/mynumbercard_data/test_download.py @@ -25,7 +25,8 @@ def test_getFileID(filepath, expected): pdf_only_list_item = lxml.html.fromstring( '
  • ' "マイナンバーカード交付状況(平成29年5月15日時点)" - 'PDF
  • ' + 'PDF' + "" ) pdf_and_excel_list_item = lxml.html.fromstring( "
  • マイナンバーカード交付状況(令和2年8月1日現在) " @@ -64,12 +65,12 @@ def test_when_only_pdf_link_without_cache( lxml_html.fromstring.assert_called_once_with( urllib_request.urlopen.return_value.read.return_value ) - tree.xpath.assert_called_once_with( - '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' - ) tree.make_links_absolute.assert_called_once_with( "https://www.soumu.go.jp/kojinbango_card/" ) + tree.xpath.assert_called_once_with( + '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' + ) loadPDF.assert_called_once_with( "https://www.soumu.go.jp/main_content/000490029.pdf" ) @@ -110,12 +111,12 @@ def test_when_only_pdf_link_with_cache( lxml_html.fromstring.assert_called_once_with( urllib_request.urlopen.return_value.read.return_value ) - tree.xpath.assert_called_once_with( - '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' - ) tree.make_links_absolute.assert_called_once_with( "https://www.soumu.go.jp/kojinbango_card/" ) + tree.xpath.assert_called_once_with( + '//*[@id="contentsWrapper"]/div[2]/div[2]/div[4]/ul/li' + ) loadPDF.assert_not_called() self.assertEqual( m.call_args_list,