From 59ec0833b93a3c3fdec4eec17558e0cfcb066d59 Mon Sep 17 00:00:00 2001 From: Anthony Bretaudeau Date: Thu, 22 Aug 2024 14:28:28 +0200 Subject: [PATCH 1/6] Upgrade ci --- .github/workflows/ci.yml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e28c8bb..b2f20a9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,7 +9,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v5 with: python-version: 3.8 - name: Install Flake8 @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v5 with: python-version: 3.8 - name: Install requirements @@ -54,7 +54,7 @@ jobs: steps: - name: Checkout uses: actions/checkout@v4 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v5 with: python-version: 3.8 - name: Check that the package build works @@ -66,10 +66,15 @@ jobs: runs-on: ubuntu-latest needs: [lint, test, pkg_build] name: Deploy release to Pypi + environment: + name: release + url: https://pypi.org/p/biomaj-download + permissions: + id-token: write steps: - name: Checkout uses: actions/checkout@v4 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v5 with: python-version: 3.8 - name: Python install @@ -78,6 +83,6 @@ jobs: run: python -m build --sdist --wheel --outdir dist/ . 
- name: Publish distribution 📦 to PyPI if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') - uses: pypa/gh-action-pypi-publish@master + uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.pypi_password }} From e49d7dcef4639ea0fc2c01fcea22bffc76d11496 Mon Sep 17 00:00:00 2001 From: Anthony Bretaudeau Date: Thu, 22 Aug 2024 14:31:06 +0200 Subject: [PATCH 2/6] oh that's already done in pypi job --- .github/workflows/ci.yml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2f20a9..688131f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -49,19 +49,6 @@ jobs: run: | LOCAL_IRODS=0 NETWORK=0 pytest -v tests/biomaj_tests.py - pkg_build: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: 3.8 - - name: Check that the package build works - run: | - pip install -U pip setuptools build - python -m build --sdist --wheel --outdir dist/ . - pypi: runs-on: ubuntu-latest needs: [lint, test, pkg_build] From 0cd068d7bb792bf8b2071aafbf371c6db4cc2a3e Mon Sep 17 00:00:00 2001 From: Anthony Bretaudeau Date: Thu, 22 Aug 2024 14:33:08 +0200 Subject: [PATCH 3/6] damn --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 688131f..5ca034e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -51,7 +51,7 @@ jobs: pypi: runs-on: ubuntu-latest - needs: [lint, test, pkg_build] + needs: [lint, test] name: Deploy release to Pypi environment: name: release From 94431559580ca00f340946bae36caf057670eece Mon Sep 17 00:00:00 2001 From: mboudet Date: Fri, 23 Aug 2024 15:42:59 +0200 Subject: [PATCH 4/6] Trying some stuff.. 
--- biomaj_download/download/curl.py | 91 +++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 1 deletion(-) diff --git a/biomaj_download/download/curl.py b/biomaj_download/download/curl.py index 326d662..3d69279 100644 --- a/biomaj_download/download/curl.py +++ b/biomaj_download/download/curl.py @@ -298,8 +298,92 @@ def _append_file_to_download(self, rfile): rfile['url'] = self.url if 'root' not in rfile or not rfile['root']: rfile['root'] = self.rootdir + + if rfile.get('modified_size'): + # Size parsed is inacurate. Try to get a more accurate size with a HEAD query + try: + self.logger.debug('Trying to get a more accurate size for ' + rfile['name']) + head_size = self._estimate_size(rfile) + if head_size: + rfile['size'] = head_size + except Exception as e: + self.logger.error('Exception while trying to get a more accurate size for ' + rfile['name'] + ' - ' + str(e)) + super(CurlDownload, self)._append_file_to_download(rfile) + def _estimate_size(self, rfile): + # Cannot reuse _file_url, since we did not cleanup the name yet + # Mostly pasted for the same stuff in direct download + name = re.sub('//+', '/', rfile['name']) + url = self.url + '/' + rfile['root'] + name + url_elts = url.split('://') + if len(url_elts) == 2: + url_elts[1] = re.sub("/{2,}", "/", url_elts[1]) + full_url = '://'.join(url_elts) + else: + full_url = re.sub("/{2,}", "/", url) + + return self._head_size_call(full_url) + + def _head_size_call(self, full_url): + # Now do a HEAD call on this url + size = 0 + + self._network_configuration() + self.crl.setopt(pycurl.HEADER, True) + self.crl.setopt(pycurl.NOBODY, True) + + try: + self.crl.setopt(pycurl.URL, full_url) + except Exception: + self.crl.setopt(pycurl.URL, full_url.encode('ascii', 'ignore')) + + output = BytesIO() + self.crl.setopt(pycurl.WRITEFUNCTION, output.write) + + try: + self.crl.perform() + errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE)) + if errcode == 405: + # HEAD not supported by the server for this URL 
so we can skip + return 0 + elif errcode not in self.ERRCODE_OK: + msg = 'Error while listing ' + full_url + ' - ' + str(errcode) + self.logger.error(msg) + raise Exception(msg) + except Exception as e: + msg = 'Error while listing ' + full_url + ' - ' + str(e) + self.logger.error(msg) + raise e + + # Figure out what encoding was sent with the response, if any. + # Check against lowercased header name. + encoding = None + if 'content-type' in self.headers: + content_type = self.headers['content-type'].lower() + match = re.search(r'charset=(\S+)', content_type) + if match: + encoding = match.group(1) + if encoding is None: + # Default encoding for HTML is iso-8859-1. + # Other content types may have different default encoding, + # or in case of binary data, may have no encoding at all. + encoding = 'iso-8859-1' + + # lets get the output in a string + result = output.getvalue().decode(encoding) + lines = re.split(r'[\n\r]+', result) + for line in lines: + parts = line.split(':') + if parts[0].strip() == 'Content-Length': + # Not sure if Content-Length is always in bytes + try: + size = int(parts[1].strip()) + except Exception: + size = 0 + return size + return size + def _file_url(self, rfile): # rfile['root'] is set to self.rootdir if needed but may be different. 
# We don't use os.path.join because rfile['name'] may starts with / @@ -519,7 +603,12 @@ def _http_parse_result(self, result): rfile['group'] = '' rfile['user'] = '' if self.http_parse.file_size != -1: - rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1]) + size = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1]) + if not str(size) == foundfile[self.http_parse.file_size - 1]: + # This is an approximation of the real size (conversion to byte) + # We will check later (in match()) if we can get a more accurate size + rfile['modified_size'] = True + rfile['size'] = size else: rfile['size'] = 0 if self.http_parse.file_date != -1: From 744f1964ca3a3725ca1e50054cb325c89bd2c2d3 Mon Sep 17 00:00:00 2001 From: mboudet Date: Fri, 23 Aug 2024 15:59:17 +0200 Subject: [PATCH 5/6] use requests instead --- biomaj_download/download/curl.py | 60 ++++---------------------------- 1 file changed, 6 insertions(+), 54 deletions(-) diff --git a/biomaj_download/download/curl.py b/biomaj_download/download/curl.py index 3d69279..4a924c6 100644 --- a/biomaj_download/download/curl.py +++ b/biomaj_download/download/curl.py @@ -1,4 +1,5 @@ import re +import requests from datetime import datetime import hashlib import time @@ -327,62 +328,13 @@ def _estimate_size(self, rfile): def _head_size_call(self, full_url): # Now do a HEAD call on this url - size = 0 - - self._network_configuration() - self.crl.setopt(pycurl.HEADER, True) - self.crl.setopt(pycurl.NOBODY, True) - - try: - self.crl.setopt(pycurl.URL, full_url) - except Exception: - self.crl.setopt(pycurl.URL, full_url.encode('ascii', 'ignore')) - - output = BytesIO() - self.crl.setopt(pycurl.WRITEFUNCTION, output.write) - try: - self.crl.perform() - errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE)) - if errcode == 405: - # HEAD not supported by the server for this URL so we can skip - return 0 - elif errcode not in self.ERRCODE_OK: - msg = 'Error while listing ' + full_url + ' - ' + 
str(errcode) - self.logger.error(msg) - raise Exception(msg) - except Exception as e: - msg = 'Error while listing ' + full_url + ' - ' + str(e) - self.logger.error(msg) - raise e + size_response = requests.head(full_url, allow_redirects=True) + size = int(size_response.headers.get('content-length', 0)) + return size - # Figure out what encoding was sent with the response, if any. - # Check against lowercased header name. - encoding = None - if 'content-type' in self.headers: - content_type = self.headers['content-type'].lower() - match = re.search(r'charset=(\S+)', content_type) - if match: - encoding = match.group(1) - if encoding is None: - # Default encoding for HTML is iso-8859-1. - # Other content types may have different default encoding, - # or in case of binary data, may have no encoding at all. - encoding = 'iso-8859-1' - - # lets get the output in a string - result = output.getvalue().decode(encoding) - lines = re.split(r'[\n\r]+', result) - for line in lines: - parts = line.split(':') - if parts[0].strip() == 'Content-Length': - # Not sure if Content-Length is always in bytes - try: - size = int(parts[1].strip()) - except Exception: - size = 0 - return size - return size + except Exception: + return 0 def _file_url(self, rfile): # rfile['root'] is set to self.rootdir if needed but may be different. 
From bf7f4f4601f57bcc54e45447f5c09991c6d65425 Mon Sep 17 00:00:00 2001
From: mboudet
Date: Fri, 23 Aug 2024 17:01:11 +0200
Subject: [PATCH 6/6] Manage proxies & auth just in case

---
 biomaj_download/download/curl.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/biomaj_download/download/curl.py b/biomaj_download/download/curl.py
index 4a924c6..27ae122 100644
--- a/biomaj_download/download/curl.py
+++ b/biomaj_download/download/curl.py
@@ -328,8 +328,26 @@ def _estimate_size(self, rfile):
 
     def _head_size_call(self, full_url):
         # Now do a HEAD call on this url
+
+        auth = ()
+        proxies = {}
+
+        if self.credentials is not None:
+            auth = tuple(self.credentials.split(":", 1))
+
+        if self.proxy is not None:
+            proxy = self.proxy
+            if not self.proxy.startswith("http"):
+                proxy = 'http://' + self.proxy
+            if self.proxy_auth is not None:
+                # Only plain http(s):// prefixes are handled; str.replace() returns a new string, so reassign
+                proxy = proxy.replace('http://', 'http://{}@'.format(self.proxy_auth), 1)
+                proxy = proxy.replace('https://', 'https://{}@'.format(self.proxy_auth), 1)
+            proxies['http'] = proxy
+            proxies['https'] = proxy
+
         try:
-            size_response = requests.head(full_url, allow_redirects=True)
+            size_response = requests.head(full_url, allow_redirects=True, auth=auth, proxies=proxies)
             size = int(size_response.headers.get('content-length', 0))
             return size
 