PR validation testing [Do not merge] #1

Closed
wants to merge 22 commits
Commits (22)
6f699f7
Built basic framework for seq retrieval component
mluypaert Mar 6, 2024
33396e2
Updated input params to match single sequence retrieval and added inp…
mluypaert Mar 7, 2024
14090ac
Added code for basic (+ strand) seq retrieval
mluypaert Mar 7, 2024
29c4dff
Refactored faidx to enable more broad seq functions import
mluypaert Mar 7, 2024
048b9f1
Added support for correct negative strand sequence retrieval (and cha…
mluypaert Mar 7, 2024
26fc878
Added additional python pysam and biopython package installation depe…
mluypaert Mar 7, 2024
3965c48
Updated fasta_file option to support remote URLs
mluypaert Mar 8, 2024
0d2400b
Added Make targets for easy python dependency installation
mluypaert Mar 8, 2024
a759e4d
Sorted python requirements alphabetically
mluypaert Mar 8, 2024
47d62a9
Minor docstring update
mluypaert Mar 8, 2024
50ced5c
Added option to reuse local cache
mluypaert Mar 8, 2024
44dbe05
Prevent reads of partial streamed file after interrupts
mluypaert Mar 8, 2024
3228d45
Refactor to improve code readability
mluypaert Mar 11, 2024
bde10e2
Minor refactor to improve code readability
mluypaert Mar 11, 2024
78d4c63
Unused import cleanup
mluypaert Mar 12, 2024
fee42df
Added framework for PR validation through GH actions
mluypaert Mar 12, 2024
b628658
Added python type testing + included in PR validation
mluypaert Mar 12, 2024
d02d0ab
Python type checking bugfix
mluypaert Mar 12, 2024
51fa3aa
Added main app dependencies as dependency for type testing
mluypaert Mar 12, 2024
4fae5a7
Fixed typing errors reported by mypy
mluypaert Mar 12, 2024
4e25e6f
Updated GH actions runner to current Ubuntu LTS (22.04)
mluypaert Mar 12, 2024
b5e4a98
GH actions checkout version bump (to deprecate node.js v16 usage)
mluypaert Mar 12, 2024
46 changes: 46 additions & 0 deletions .github/workflows/PR-validation.yml
@@ -0,0 +1,46 @@
name: PR validation
on:
  pull_request:
    types: [synchronize, opened, reopened, edited]
    branches:
      - main
jobs:
  pipeline-seq-retrieval-container-image-build:
    name: pipeline/seq_retrieval container-image build
    runs-on: ubuntu-22.04
    defaults:
      run:
        shell: bash
        working-directory: ./pipeline/seq_retrieval/
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          sparse-checkout: |
            pipeline/seq_retrieval/
      - name: Build container image
        run: |
          make container-image
  pipeline-seq-retrieval-check-python-typing:
    name: pipeline/seq_retrieval check python typing
    runs-on: ubuntu-22.04
    defaults:
      run:
        shell: bash
        working-directory: ./pipeline/seq_retrieval/
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          sparse-checkout: |
            pipeline/seq_retrieval/
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'
      - name: Check python typing
        run: |
          make check-python-typing
#TODO: add unit testing
#TODO: add integration testing
13 changes: 13 additions & 0 deletions pipeline/seq_retrieval/Dockerfile
@@ -0,0 +1,13 @@
FROM python:3.12-alpine

WORKDIR /usr/src/app

RUN apk add --no-cache build-base zlib-dev bzip2-dev xz-dev

COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY src/ ./

ENTRYPOINT [ "python", "main.py"]
CMD [ "--help" ]
29 changes: 29 additions & 0 deletions pipeline/seq_retrieval/Makefile
@@ -0,0 +1,29 @@
.PHONY: check-venv-active check-python-typing

container-image:
	docker build -t agr_pavi/seq_retrieval .

python-dependencies:
	pip install -r requirements.txt

python-dependencies-update:
	pip install -U -r requirements.txt

check-python-typing: python-dependencies
	pip install -r test/type-testing-requirements.txt
	mypy --install-types --non-interactive src/main.py

check-venv-active:
ifeq ($(VIRTUAL_ENV),)
	@echo 'No active python virtual environment found.'\
	 'Please activate the virtual environment first by running `source venv/bin/activate`,'\
	 'or read README.md for instructions on how to set up a new one.'
	@exit 1
else
	@:
endif

python-dependencies-dev: check-venv-active python-dependencies

python-dependencies-dev-update: check-venv-active python-dependencies-update

check-python-typing-dev: check-venv-active check-python-typing
32 changes: 32 additions & 0 deletions pipeline/seq_retrieval/README.md
@@ -0,0 +1,32 @@
# PAVI Sequence retrieval
This subdirectory contains all code and configs for the PAVI sequence retrieval component.

## Development
To enable isolated local development that does not interfere with the system-wide Python setup,
a virtual environment is used for code development and testing of this component.

To start developing on a new system, create a virtual environment using the following command
(with this directory as your working directory):
```bash
python3.12 -m venv ./venv
```

Then, when developing this component, activate the virtual environment with:
```bash
source venv/bin/activate
```
Once the virtual environment is activated, all packages will be installed into this isolated environment.
To install all Python package dependencies:
```bash
pip install -r requirements.txt
```

To upgrade all already-installed packages to the latest available versions matching the requirements:
```bash
pip install -U -r requirements.txt
```

When you are done developing this component, deactivate the virtual environment with:
```bash
deactivate
```
4 changes: 4 additions & 0 deletions pipeline/seq_retrieval/requirements.txt
@@ -0,0 +1,4 @@
biopython==1.83
click==8.1.*
pysam==0.22.*
requests==2.31.*
98 changes: 98 additions & 0 deletions pipeline/seq_retrieval/src/data_mover/data_file_mover.py
@@ -0,0 +1,98 @@
"""
Moving files to and from remote locations
"""
import os.path
from pathlib import Path
import typing
import requests
from urllib.parse import urlparse, unquote

_stored_files: typing.Dict[str, str] = dict()
_DEFAULT_DIR = '/tmp/pavi/'
_reuse_local_cache = False

def set_local_cache_reuse(reuse: bool):
"""
Set _reuse_local_cache (True or False, default False)
"""
global _reuse_local_cache
_reuse_local_cache = reuse

def is_accessible_url(url: str):
"""
Returns True when provided `url` is an accessible URL
"""
response = requests.head(url)
if response.ok:
return True
else:
return False

def fetch_file(url: str, dest_dir: str = _DEFAULT_DIR, reuse_local_cache: typing.Optional[bool] = None) -> str:
"""
Fetch file from URL, return its local path.
"""
global _stored_files
local_path = None
if url not in _stored_files.keys():
url_components = urlparse(url)
if url_components.scheme == 'file':
filepath = url_components.netloc + url_components.path
local_path = find_local_file(filepath)
else:
local_path = download_from_url(url, dest_dir, reuse_local_cache=reuse_local_cache)
_stored_files[url] = local_path
else:
local_path = _stored_files[url]

return local_path

def find_local_file(path: str):
"""
Find a file locally based on path and return its absolute path.
If no file was found at given path, throws Exception.
"""
if not os.path.exists(path):
raise FileNotFoundError(f"No file found at path '{path}'.")
else:
if not os.path.isfile(path):
raise FileNotFoundError(f"Specified path '{path}' exists but is not a file.")
else:
return Path(path).resolve()

def download_from_url(url: str, dest_dir: str = _DEFAULT_DIR, chunk_size = 10 * 1024, reuse_local_cache: typing.Optional[bool] = None):
if reuse_local_cache == None:
reuse_local_cache = _reuse_local_cache

url_components = urlparse(url)
if url_components.scheme in ['http', 'https']:

if not is_accessible_url(url):
raise ValueError(f"URL {url} is not accessible.")

Path(dest_dir).mkdir(parents=True, exist_ok=True)

filename = unquote(os.path.basename(url_components.path))
local_file_path = os.path.join(dest_dir, filename)

if os.path.exists(local_file_path) and os.path.isfile(local_file_path):
if reuse_local_cache == True:
#Return the local file path without downloading new content
return Path(local_file_path).resolve()
else:
os.remove(local_file_path)

#Download file through streaming to support large files
tmp_file_path = f"{local_file_path}.part"
response = requests.get(url, stream=True)

with open(tmp_file_path, mode="wb") as local_file:
for chunk in response.iter_content(chunk_size=chunk_size):
local_file.write(chunk)

os.rename(tmp_file_path, local_file_path)

return Path(local_file_path).resolve()
else:
#Currently not supported
raise ValueError(f"URL with scheme '{url_components.scheme}' is currently not supported.")
87 changes: 87 additions & 0 deletions pipeline/seq_retrieval/src/main.py
@@ -0,0 +1,87 @@
import click
import json

import data_mover.data_file_mover as data_file_mover
from seq_region import SeqRegion, chain_seq_region_seqs

def validate_strand_param(ctx, param, value):
    """Returns a normalised version of strings representing a strand.
    Negative strand is normalised to '-', positive strand to '+'.
    Throws a click.BadParameter exception if an unrecognised string was provided."""
    POS_CHOICES = ['+', '+1', 'pos']
    NEG_CHOICES = ['-', '-1', 'neg']
    if value in POS_CHOICES:
        return '+'
    elif value in NEG_CHOICES:
        return '-'
    else:
        raise click.BadParameter(f"Must be one of {POS_CHOICES} for positive strand, or {NEG_CHOICES} for negative strand.")

def process_seq_regions_param(ctx, param, value):
    """Parse the seq_regions parameter value and validate its structure.
    Value is expected to be a JSON-formatted list of sequence regions to retrieve.
    Each region must have:
     * a 'start' property indicating the region start (inclusive)
     * an 'end' property indicating the region end (inclusive)
    Throws a click.BadParameter exception if value could not be parsed as JSON or has an invalid structure."""
    seq_regions = None
    try:
        seq_regions = json.loads(value)
    except json.JSONDecodeError:
        raise click.BadParameter("Must be a valid JSON-formatted string.")
    else:
        if not isinstance(seq_regions, list):
            raise click.BadParameter("Must be a valid list (JSON-array) of sequence regions to retrieve.")
        for region in seq_regions:
            if not isinstance(region, dict):
                raise click.BadParameter(f"Region {region} is not a valid dict. All regions in seq_regions list must be valid dicts (JSON-objects).")
            if 'start' not in region.keys():
                raise click.BadParameter(f"Region {region} does not have a 'start' property, which is a required property.")
            if 'end' not in region.keys():
                raise click.BadParameter(f"Region {region} does not have an 'end' property, which is a required property.")
            if not isinstance(region['start'], int):
                raise click.BadParameter(f"'start' property of region {region} is not an integer. All positions must be integers.")
            if not isinstance(region['end'], int):
                raise click.BadParameter(f"'end' property of region {region} is not an integer. All positions must be integers.")

    return seq_regions

@click.command()
@click.option("--seq_id", type=click.STRING, required=True,
              help="The sequence ID to retrieve sequences for.")
@click.option("--seq_strand", type=click.STRING, default='+', callback=validate_strand_param,
              help="The sequence strand to retrieve sequences for.")
@click.option("--seq_regions", type=click.UNPROCESSED, required=True, callback=process_seq_regions_param,
              help="A list of sequence regions to retrieve sequences for.")
@click.option("--fasta_file_url", type=click.STRING, required=True,
              help="""URL to a (faidx-indexed) fasta file to retrieve sequences from.
              Assumes additional index files can be found at `<fasta_file_url>.fai`,
              and at `<fasta_file_url>.gzi` if the fasta file is compressed.
              Use a "file://*" URL for local files or "http(s)://*" for remote files.""")
@click.option("--reuse_local_cache", is_flag=True,
              help="""When set and using a remote `fasta_file_url`, reuse local files
              if a file already exists at the destination path, rather than re-downloading and overwriting it.""")
def main(seq_id, seq_strand, seq_regions, fasta_file_url: str, reuse_local_cache: bool):
    """Main method for sequence retrieval from JBrowse faidx-indexed fasta files.
    Returns a single (transcript) sequence made by concatenating all requested sequence regions
    (in the positional order defined by the specified seq_strand)."""

    click.echo(f"Received request to retrieve sequences for {seq_id}, strand {seq_strand}, seq_regions {seq_regions}!")

    data_file_mover.set_local_cache_reuse(reuse_local_cache)

    seq_region_objs = []
    for region in seq_regions:
        seq_region_objs.append(SeqRegion(seq_id=seq_id, start=region['start'], end=region['end'], strand=seq_strand,
                                         fasta_file_url=fasta_file_url))

    for seq_region in seq_region_objs:
        # Retrieve sequence for region
        seq_region.fetch_seq()

    # Concatenate all regions into a single sequence
    seq_concat = chain_seq_region_seqs(seq_region_objs, seq_strand)
    click.echo(f"\nSeq concat: {seq_concat}")

if __name__ == '__main__':
    main()
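
As a usage illustration, the CLI defined above could be exercised through click's test runner as sketched below. The sequence ID, coordinates and fasta URL are hypothetical placeholders; running `python main.py` with the same arguments (as the Dockerfile's ENTRYPOINT does) is equivalent:

```python
# Hypothetical invocation of the seq_retrieval CLI via click's test runner.
from click.testing import CliRunner

from main import main  # assumes src/ is the working directory or on PYTHONPATH

runner = CliRunner()
result = runner.invoke(main, [
    "--seq_id", "X",            # target sequence name in the fasta file
    "--seq_strand", "neg",      # normalised to '-' by validate_strand_param
    "--seq_regions", '[{"start": 5116799, "end": 5116864}, {"start": 5115556, "end": 5116159}]',
    "--fasta_file_url", "file:///data/example_genome.fna.gz",  # expects .fai (and .gzi) files alongside
])
print(result.output)
```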
2 changes: 2 additions & 0 deletions pipeline/seq_retrieval/src/seq_region/__init__.py
@@ -0,0 +1,2 @@
from .seq_region import SeqRegion
from .seq_region import chain_seq_region_seqs