From 59ec0833b93a3c3fdec4eec17558e0cfcb066d59 Mon Sep 17 00:00:00 2001
From: Anthony Bretaudeau <anthony.bretaudeau@inria.fr>
Date: Thu, 22 Aug 2024 14:28:28 +0200
Subject: [PATCH 1/6] Upgrade ci

---
 .github/workflows/ci.yml | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e28c8bb..b2f20a9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,7 +9,7 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - uses: actions/setup-python@v1
+    - uses: actions/setup-python@v5
       with:
         python-version: 3.8
     - name: Install Flake8
@@ -36,7 +36,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@v4
-    - uses: actions/setup-python@v1
+    - uses: actions/setup-python@v5
       with:
         python-version: 3.8
     - name: Install requirements
@@ -54,7 +54,7 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - uses: actions/setup-python@v1
+    - uses: actions/setup-python@v5
       with:
         python-version: 3.8
     - name: Check that the package build works
@@ -66,10 +66,15 @@ jobs:
     runs-on: ubuntu-latest
     needs: [lint, test, pkg_build]
     name: Deploy release to Pypi
+    environment:
+      name: release
+      url: https://pypi.org/p/biomaj-download
+    permissions:
+      id-token: write
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - uses: actions/setup-python@v1
+    - uses: actions/setup-python@v5
       with:
         python-version: 3.8
     - name: Python install
@@ -78,6 +83,6 @@ jobs:
       run: python -m build --sdist --wheel --outdir dist/ .
     - name: Publish distribution 📦 to PyPI
       if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
-      uses: pypa/gh-action-pypi-publish@master
+      uses: pypa/gh-action-pypi-publish@release/v1
       with:
         password: ${{ secrets.pypi_password }}

From e49d7dcef4639ea0fc2c01fcea22bffc76d11496 Mon Sep 17 00:00:00 2001
From: Anthony Bretaudeau <anthony.bretaudeau@inria.fr>
Date: Thu, 22 Aug 2024 14:31:06 +0200
Subject: [PATCH 2/6] oh that's already done in pypi job

---
 .github/workflows/ci.yml | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b2f20a9..688131f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -49,19 +49,6 @@ jobs:
       run: |
         LOCAL_IRODS=0 NETWORK=0 pytest -v tests/biomaj_tests.py
 
-  pkg_build:
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v4
-    - uses: actions/setup-python@v5
-      with:
-        python-version: 3.8
-    - name: Check that the package build works
-      run: |
-        pip install -U pip setuptools build
-        python -m build --sdist --wheel --outdir dist/ .
-
   pypi:
     runs-on: ubuntu-latest
     needs: [lint, test, pkg_build]

From 0cd068d7bb792bf8b2071aafbf371c6db4cc2a3e Mon Sep 17 00:00:00 2001
From: Anthony Bretaudeau <anthony.bretaudeau@inria.fr>
Date: Thu, 22 Aug 2024 14:33:08 +0200
Subject: [PATCH 3/6] Drop removed pkg_build job from pypi job dependencies

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 688131f..5ca034e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -51,7 +51,7 @@ jobs:
 
   pypi:
     runs-on: ubuntu-latest
-    needs: [lint, test, pkg_build]
+    needs: [lint, test]
     name: Deploy release to Pypi
     environment:
       name: release

From 94431559580ca00f340946bae36caf057670eece Mon Sep 17 00:00:00 2001
From: mboudet <mateo.boudet@gmail.com>
Date: Fri, 23 Aug 2024 15:42:59 +0200
Subject: [PATCH 4/6] Refine approximate file sizes with a HEAD request

---
 biomaj_download/download/curl.py | 91 +++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/biomaj_download/download/curl.py b/biomaj_download/download/curl.py
index 326d662..3d69279 100644
--- a/biomaj_download/download/curl.py
+++ b/biomaj_download/download/curl.py
@@ -298,8 +298,92 @@ def _append_file_to_download(self, rfile):
             rfile['url'] = self.url
         if 'root' not in rfile or not rfile['root']:
             rfile['root'] = self.rootdir
+
+        if rfile.get('modified_size'):
+            # Size parsed is inaccurate. Try to get a more accurate size with a HEAD query
+            try:
+                self.logger.debug('Trying to get a more accurate size for ' + rfile['name'])
+                head_size = self._estimate_size(rfile)
+                if head_size:
+                    rfile['size'] = head_size
+            except Exception as e:
+                self.logger.error('Exception while trying to get a more accurate size for ' + rfile['name'] + ' - ' + str(e))
+
         super(CurlDownload, self)._append_file_to_download(rfile)
 
+    def _estimate_size(self, rfile):
+        # Cannot reuse _file_url, since we did not cleanup the name yet
+        # Mostly pasted for the same stuff in direct download
+        name = re.sub('//+', '/', rfile['name'])
+        url = self.url + '/' + rfile['root'] + name
+        url_elts = url.split('://')
+        if len(url_elts) == 2:
+            url_elts[1] = re.sub("/{2,}", "/", url_elts[1])
+            full_url = '://'.join(url_elts)
+        else:
+            full_url = re.sub("/{2,}", "/", url)
+
+        return self._head_size_call(full_url)
+
+    def _head_size_call(self, full_url):
+        # Now do a HEAD call on this url
+        size = 0
+
+        self._network_configuration()
+        self.crl.setopt(pycurl.HEADER, True)
+        self.crl.setopt(pycurl.NOBODY, True)
+
+        try:
+            self.crl.setopt(pycurl.URL, full_url)
+        except Exception:
+            self.crl.setopt(pycurl.URL, full_url.encode('ascii', 'ignore'))
+
+        output = BytesIO()
+        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
+
+        try:
+            self.crl.perform()
+            errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE))
+            if errcode == 405:
+                # HEAD not supported by the server for this URL so we can skip
+                return 0
+            elif errcode not in self.ERRCODE_OK:
+                msg = 'Error while listing ' + full_url + ' - ' + str(errcode)
+                self.logger.error(msg)
+                raise Exception(msg)
+        except Exception as e:
+            msg = 'Error while listing ' + full_url + ' - ' + str(e)
+            self.logger.error(msg)
+            raise e
+
+        # Figure out what encoding was sent with the response, if any.
+        # Check against lowercased header name.
+        encoding = None
+        if 'content-type' in self.headers:
+            content_type = self.headers['content-type'].lower()
+            match = re.search(r'charset=(\S+)', content_type)
+            if match:
+                encoding = match.group(1)
+        if encoding is None:
+            # Default encoding for HTML is iso-8859-1.
+            # Other content types may have different default encoding,
+            # or in case of binary data, may have no encoding at all.
+            encoding = 'iso-8859-1'
+
+        # lets get the output in a string
+        result = output.getvalue().decode(encoding)
+        lines = re.split(r'[\n\r]+', result)
+        for line in lines:
+            parts = line.split(':')
+            if parts[0].strip() == 'Content-Length':
+                # Not sure if Content-Length is always in bytes
+                try:
+                    size = int(parts[1].strip())
+                except Exception:
+                    size = 0
+                return size
+        return size
+
     def _file_url(self, rfile):
         # rfile['root'] is set to self.rootdir if needed but may be different.
         # We don't use os.path.join because rfile['name'] may starts with /
@@ -519,7 +603,12 @@ def _http_parse_result(self, result):
                 rfile['group'] = ''
                 rfile['user'] = ''
                 if self.http_parse.file_size != -1:
-                    rfile['size'] = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
+                    size = humanfriendly.parse_size(foundfile[self.http_parse.file_size - 1])
+                    if not str(size) == foundfile[self.http_parse.file_size - 1]:
+                        # This is an approximation of the real size (conversion to byte)
+                        # We will check later (in match()) if we can get a more accurate size
+                        rfile['modified_size'] = True
+                    rfile['size'] = size
                 else:
                     rfile['size'] = 0
                 if self.http_parse.file_date != -1:

From 744f1964ca3a3725ca1e50054cb325c89bd2c2d3 Mon Sep 17 00:00:00 2001
From: mboudet <mateo.boudet@gmail.com>
Date: Fri, 23 Aug 2024 15:59:17 +0200
Subject: [PATCH 5/6] use requests instead

---
 biomaj_download/download/curl.py | 60 ++++----------------------------
 1 file changed, 6 insertions(+), 54 deletions(-)

diff --git a/biomaj_download/download/curl.py b/biomaj_download/download/curl.py
index 3d69279..4a924c6 100644
--- a/biomaj_download/download/curl.py
+++ b/biomaj_download/download/curl.py
@@ -1,4 +1,5 @@
 import re
+import requests
 from datetime import datetime
 import hashlib
 import time
@@ -327,62 +328,13 @@ def _estimate_size(self, rfile):
 
     def _head_size_call(self, full_url):
         # Now do a HEAD call on this url
-        size = 0
-
-        self._network_configuration()
-        self.crl.setopt(pycurl.HEADER, True)
-        self.crl.setopt(pycurl.NOBODY, True)
-
-        try:
-            self.crl.setopt(pycurl.URL, full_url)
-        except Exception:
-            self.crl.setopt(pycurl.URL, full_url.encode('ascii', 'ignore'))
-
-        output = BytesIO()
-        self.crl.setopt(pycurl.WRITEFUNCTION, output.write)
-
         try:
-            self.crl.perform()
-            errcode = int(self.crl.getinfo(pycurl.RESPONSE_CODE))
-            if errcode == 405:
-                # HEAD not supported by the server for this URL so we can skip
-                return 0
-            elif errcode not in self.ERRCODE_OK:
-                msg = 'Error while listing ' + full_url + ' - ' + str(errcode)
-                self.logger.error(msg)
-                raise Exception(msg)
-        except Exception as e:
-            msg = 'Error while listing ' + full_url + ' - ' + str(e)
-            self.logger.error(msg)
-            raise e
+            size_response = requests.head(full_url, allow_redirects=True)
+            size = int(size_response.headers.get('content-length', 0))
+            return size
 
-        # Figure out what encoding was sent with the response, if any.
-        # Check against lowercased header name.
-        encoding = None
-        if 'content-type' in self.headers:
-            content_type = self.headers['content-type'].lower()
-            match = re.search(r'charset=(\S+)', content_type)
-            if match:
-                encoding = match.group(1)
-        if encoding is None:
-            # Default encoding for HTML is iso-8859-1.
-            # Other content types may have different default encoding,
-            # or in case of binary data, may have no encoding at all.
-            encoding = 'iso-8859-1'
-
-        # lets get the output in a string
-        result = output.getvalue().decode(encoding)
-        lines = re.split(r'[\n\r]+', result)
-        for line in lines:
-            parts = line.split(':')
-            if parts[0].strip() == 'Content-Length':
-                # Not sure if Content-Length is always in bytes
-                try:
-                    size = int(parts[1].strip())
-                except Exception:
-                    size = 0
-                return size
-        return size
+        except Exception:
+            return 0
 
     def _file_url(self, rfile):
         # rfile['root'] is set to self.rootdir if needed but may be different.

From bf7f4f4601f57bcc54e45447f5c09991c6d65425 Mon Sep 17 00:00:00 2001
From: mboudet <mateo.boudet@gmail.com>
Date: Fri, 23 Aug 2024 17:01:11 +0200
Subject: [PATCH 6/6] Manage proxies & auth just in case

---
 biomaj_download/download/curl.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/biomaj_download/download/curl.py b/biomaj_download/download/curl.py
index 4a924c6..27ae122 100644
--- a/biomaj_download/download/curl.py
+++ b/biomaj_download/download/curl.py
@@ -328,8 +328,26 @@ def _estimate_size(self, rfile):
 
     def _head_size_call(self, full_url):
         # Now do a HEAD call on this url
+
+        auth = ()
+        proxies = {}
+        # Split on the first ':' only, so a password containing ':' stays intact
+        if self.credentials is not None:
+            auth = tuple(self.credentials.split(":", 1))
+        # requests expects proxy credentials inside the proxy URL (scheme://auth@host)
+        if self.proxy is not None:
+            proxy = self.proxy
+            if not self.proxy.startswith("http"):
+                proxy = 'http://' + self.proxy
+            if self.proxy_auth is not None:
+                # str.replace returns a new string: the result must be re-assigned
+                proxy = proxy.replace('http://', 'http://{}@'.format(self.proxy_auth))
+                proxy = proxy.replace('https://', 'https://{}@'.format(self.proxy_auth))
+            proxies['http'] = proxy
+            proxies['https'] = proxy
+
         try:
-            size_response = requests.head(full_url, allow_redirects=True)
+            size_response = requests.head(full_url, allow_redirects=True, auth=auth, proxies=proxies)
             size = int(size_response.headers.get('content-length', 0))
             return size